Job boards (Indeed, LinkedIn, Greenhouse) actively fight scrapers. But their listings are indexed by Google. Instead of maintaining scrapers that break monthly, search Google for indexed job listings and Reddit for who-is-hiring threads. This tutorial builds an aggregator that survives anti-bot updates.
Prerequisites
- Python 3.8+
- A Scavio API key
- Basic familiarity with job-listing data (titles, URLs, descriptions)
Walkthrough
Step 1: Search Google for job listings
Use site: operators to find listings on specific job boards indexed by Google.
import requests, os
# Shared request headers for every Scavio API call. The API key is read from
# the environment so it is never committed to source control; a missing
# SCAVIO_API_KEY raises KeyError at import time.
H = {'x-api-key': os.environ['SCAVIO_API_KEY'], 'Content-Type': 'application/json'}
def find_jobs(role: str, location: str = '') -> list:
query = f'{role} {location} site:greenhouse.io OR site:lever.co OR site:jobs.ashbyhq.com'
resp = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
json={'platform': 'google', 'query': query}, timeout=10)
return [{'title': r.get('title',''), 'url': r.get('link',''), 'snippet': r.get('snippet','')}
for r in resp.json().get('organic', [])]Step 2: Add Reddit hiring thread search
Find who-is-hiring posts on relevant subreddits.
def find_reddit_jobs(role: str) -> list:
resp = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
json={'platform': 'reddit', 'query': f'{role} hiring'}, timeout=10)
threads = resp.json().get('organic', [])
return [{'title': t.get('title',''), 'url': t.get('link',''), 'subreddit': t.get('subreddit',''),
'score': t.get('score', 0)}
for t in threads if 'hiring' in t.get('title','').lower() or 'job' in t.get('title','').lower()]Step 3: Deduplicate and rank
Remove duplicate listings and rank by relevance.
def aggregate_jobs(role: str, location: str = '') -> dict:
    """Combine Google and Reddit results for *role*, dropping duplicate URLs.

    The first occurrence of each URL wins (Google results come first);
    listings without a URL are discarded.

    Returns:
        Summary dict with per-source counts and the deduplicated 'listings'.
    """
    from_google = find_jobs(role, location)
    from_reddit = find_reddit_jobs(role)
    deduped, seen = [], set()
    for listing in from_google + from_reddit:
        link = listing.get('url', '')
        if not link or link in seen:
            continue
        seen.add(link)
        deduped.append(listing)
    return {
        'role': role,
        'location': location,
        'total_found': len(deduped),
        'from_google': len(from_google),
        'from_reddit': len(from_reddit),
        'listings': deduped,
    }
results = aggregate_jobs('senior python developer', 'remote')
print(f"Found {results['total_found']} unique listings")
for job in results['listings'][:5]:
print(f" - {job['title']}")Step 4: Schedule daily updates
Run the aggregator on a schedule and track new listings.
import json
from pathlib import Path
from datetime import date
def daily_job_check(roles: list, location: str = '') -> dict:
today = date.today().isoformat()
all_listings = []
for role in roles:
results = aggregate_jobs(role, location)
all_listings.extend(results['listings'])
# Load previous listings to find new ones
history_file = Path('job_history.json')
seen = set()
if history_file.exists():
seen = set(json.loads(history_file.read_text()).get('urls', []))
new_listings = [j for j in all_listings if j.get('url') not in seen]
# Update history
seen.update(j.get('url','') for j in all_listings)
history_file.write_text(json.dumps({'urls': list(seen), 'last_run': today}))
return {'date': today, 'new_listings': len(new_listings), 'total_tracked': len(seen), 'new': new_listings}Python Example
import requests, os
H = {'x-api-key': os.environ['SCAVIO_API_KEY'], 'Content-Type': 'application/json'}
def find_jobs(role, location=''):
q = f'{role} {location} site:greenhouse.io OR site:lever.co'
r = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
json={'platform': 'google', 'query': q}).json()
return [{'title': x['title'], 'url': x.get('link','')} for x in r.get('organic',[])]JavaScript Example
async function findJobs(role, location = '') {
const q = `${role} ${location} site:greenhouse.io OR site:lever.co`;
const r = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST', headers: {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'},
body: JSON.stringify({platform: 'google', query: q})
});
return (await r.json()).organic?.map(x => ({title: x.title, url: x.link})) || [];
}Expected Output
A job listing aggregator that finds positions from Google-indexed job boards and Reddit hiring threads without maintaining any scrapers.