Common Crawl provides petabytes of free web archive data, but it is months old and requires heavy processing. Live search APIs return fresh results instantly but cost per query. This tutorial combines both into a hybrid pipeline: use Common Crawl for bulk historical analysis and Scavio for real-time freshness. The hybrid approach gives you the scale of Common Crawl with the currency of live search at minimal cost.
Prerequisites
- Python 3.9+ installed
- requests library installed
- A Scavio API key from scavio.dev
- Basic understanding of Common Crawl index
Walkthrough
Step 1: Query the Common Crawl Index API
Search Common Crawl's CDX index to find historical web pages matching your query. The index is free to query and returns URLs of archived pages.
import requests
CC_INDEX_URL = 'https://index.commoncrawl.org/CC-MAIN-2026-04-index'
def search_common_crawl(domain: str, limit: int = 20) -> list:
    """Search the Common Crawl CDX index for archived pages from *domain*.

    Args:
        domain: Domain to search, e.g. 'docs.python.org'.
        limit: Maximum number of index records to request.

    Returns:
        A list of dicts with 'url', 'timestamp', 'status', 'mime' and
        'length' keys; empty list on any non-200 response.
    """
    # Hoisted out of the parsing loop: the original re-imported json on
    # every iteration, which is wasteful and obscures the dependency.
    import json

    resp = requests.get(CC_INDEX_URL,
                        params={'url': f'{domain}/*', 'output': 'json', 'limit': limit},
                        timeout=30)
    if resp.status_code != 200:
        print(f'Common Crawl returned {resp.status_code}')
        return []
    results = []
    # The CDX API returns newline-delimited JSON, one record per line.
    for line in resp.text.strip().split('\n'):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Skip malformed lines (e.g. error text) rather than abort.
            continue
        results.append({
            'url': record.get('url', ''),
            'timestamp': record.get('timestamp', ''),
            'status': record.get('status', ''),
            'mime': record.get('mime', ''),
            'length': record.get('length', 0),
        })
    return results
# Search for pages from a domain
pages = search_common_crawl('docs.python.org')
print(f'Found {len(pages)} archived pages from docs.python.org')
for p in pages[:5]:
print(f' [{p["timestamp"]}] {p["url"][:60]}')

Step 2: Build the hybrid search function
Combine Common Crawl data with live search results. Use Common Crawl for historical coverage and live search for fresh results.
import os
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def live_search(query: str, count: int = 5) -> list:
    """Run a fresh web search via the Scavio API (one paid query).

    Args:
        query: Search query string.
        count: Number of organic results to request.

    Returns:
        A list of dicts with 'url', 'title', 'snippet' and 'source' keys.
    """
    resp = requests.post('https://api.scavio.dev/api/v1/search',
                         headers={'x-api-key': SCAVIO_KEY,
                                  'Content-Type': 'application/json'},
                         json={'query': query, 'country_code': 'us',
                               'num_results': count},
                         # Match the 30s timeout used for Common Crawl; without
                         # it a stalled connection would hang forever.
                         timeout=30)
    return [{'url': r['link'], 'title': r['title'],
             'snippet': r.get('snippet', ''), 'source': 'live_search'}
            for r in resp.json().get('organic_results', [])]
def hybrid_search(query: str, domain: str = None) -> dict:
    """Combine Common Crawl historical data with live results."""
    # Free historical lookup first, if a domain was supplied.
    historical = []
    if domain:
        historical = [
            {'url': rec['url'], 'timestamp': rec['timestamp'],
             'source': 'common_crawl'}
            for rec in search_common_crawl(domain, limit=10)
        ]
    # Paid live search for freshness ($0.005 per query).
    fresh = live_search(query, count=5)
    results = {
        'historical': historical,
        'live': fresh,
        'cost': 0.005,
    }
    results['total'] = len(historical) + len(fresh)
    return results
result = hybrid_search('python 3.14 new features', domain='docs.python.org')
print(f'Historical (Common Crawl): {len(result["historical"])} pages')
print(f'Live (Scavio): {len(result["live"])} results')
print(f'Total: {result["total"]} | Cost: ${result["cost"]}')

Step 3: Analyze coverage gaps between historical and live data
Compare what Common Crawl has versus what live search returns. This reveals content that exists but is not in the archive, and archived content that may have been removed.
def analyze_coverage(domain: str, queries: list) -> dict:
    """Compare Common Crawl's archive of *domain* against live search hits."""
    # Inventory of archived URLs (free to query).
    archived = {rec['url'] for rec in search_common_crawl(domain, limit=50)}
    # URLs surfaced by live search across all queries (paid).
    fresh = set()
    for q in queries:
        for hit in live_search(f'site:{domain} {q}', count=5):
            fresh.add(hit['url'])
    # Set algebra gives the overlap and each side's exclusive content.
    overlap = archived & fresh
    archived_only = archived - fresh
    fresh_only = fresh - archived
    print(f'Domain: {domain}')
    print(f'Common Crawl pages: {len(archived)}')
    print(f'Live search pages: {len(fresh)}')
    print(f'In both: {len(overlap)}')
    print(f'CC only (may be removed): {len(archived_only)}')
    print(f'Live only (new content): {len(fresh_only)}')
    print(f'Cost: {len(queries)} live searches = ${len(queries) * 0.005:.3f}')
    return {'cc_only': archived_only, 'live_only': fresh_only, 'both': overlap}
coverage = analyze_coverage('docs.python.org', ['asyncio', 'pathlib', 'dataclasses'])

Python Example
import requests, os, json
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
CC_INDEX = 'https://index.commoncrawl.org/CC-MAIN-2026-04-index'
def hybrid_search(query, domain=None):
    """Merge free Common Crawl index hits with paid live search results."""
    results = []
    # Common Crawl (free)
    if domain:
        cc = requests.get(CC_INDEX, params={'url': f'{domain}/*', 'output': 'json', 'limit': 10}, timeout=30)
        for line in cc.text.strip().split('\n'):
            try:
                r = json.loads(line)
                results.append({'url': r['url'], 'source': 'common_crawl'})
            # Narrowed from a blanket `except Exception: pass`, which would
            # also hide real bugs: only malformed NDJSON lines and records
            # missing 'url' are expected and safe to skip.
            except (json.JSONDecodeError, KeyError):
                pass
    # Live search ($0.005)
    live = requests.post('https://api.scavio.dev/api/v1/search',
                         headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
                         json={'query': query, 'country_code': 'us', 'num_results': 5},
                         # Timeout added to match the Common Crawl request and
                         # avoid hanging forever on a stalled connection.
                         timeout=30)
    for r in live.json().get('organic_results', []):
        results.append({'url': r['link'], 'title': r['title'], 'source': 'live'})
    print(f'{len(results)} total results (cost: $0.005)')
    return results
hybrid_search('python asyncio', 'docs.python.org')

JavaScript Example
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
async function hybridSearch(query, domain) {
  // Accumulates hits from both sources.
  const results = [];
  // Free historical lookup from the Common Crawl CDX index.
  if (domain) {
    const ccResp = await fetch(`https://index.commoncrawl.org/CC-MAIN-2026-04-index?url=${domain}/*&output=json&limit=10`);
    const ccText = await ccResp.text();
    for (const line of ccText.trim().split('\n')) {
      try {
        results.push({ ...JSON.parse(line), source: 'cc' });
      } catch {
        // Skip malformed NDJSON lines.
      }
    }
  }
  // Paid live search for fresh results ($0.005 per query).
  const liveResp = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({ query, country_code: 'us', num_results: 5 })
  });
  const payload = await liveResp.json();
  for (const hit of payload.organic_results || []) {
    results.push({ url: hit.link, title: hit.title, source: 'live' });
  }
  console.log(`${results.length} results (cost: $0.005)`);
  return results;
}
hybridSearch('python asyncio', 'docs.python.org');

Expected Output
Found 15 archived pages from docs.python.org
[20260115] https://docs.python.org/3/library/asyncio.html
[20260115] https://docs.python.org/3/whatsnew/3.14.html
Historical (Common Crawl): 15 pages
Live (Scavio): 5 results
Total: 20 | Cost: $0.005
Domain: docs.python.org
Common Crawl pages: 50
Live search pages: 12
In both: 8
CC only (may be removed): 42
Live only (new content): 4
Cost: 3 live searches = $0.015