JavaScript-rendered directories like Yelp, Angi, and modern SaaS marketplaces load data dynamically, making traditional scraping fail. You need headless browsers, wait for hydration, and handle anti-bot measures. A faster approach: use search to find indexed listings (Google already rendered the JS for you), then use the extract endpoint for specific pages. Scavio search costs $0.005/request and the extract endpoint pulls rendered content from any URL.
Prerequisites
- Python 3.9+ installed
- requests library installed
- A Scavio API key from scavio.dev
- Basic understanding of web data extraction
Walkthrough
Step 1: Search for indexed directory listings
Google indexes JS-rendered pages. Search for directory listings using site: queries to get pre-rendered titles and snippets without needing a browser.
# Stdlib + HTTP client used throughout the walkthrough.
import os, requests, re
# API key comes from the environment; never hard-code credentials.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
# Search endpoint and shared auth headers reused by every request below.
URL = 'https://api.scavio.dev/api/v1/search'
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def search_directory_listings(directory_domain: str, niche: str, num: int = 10) -> list:
    """Find directory listings via Google's index (already JS-rendered).

    Args:
        directory_domain: Domain to restrict the search to, e.g. 'yelp.com'.
        niche: Niche and location terms, e.g. 'plumber San Francisco'.
        num: Maximum number of organic results to request.

    Returns:
        A list of dicts with 'title', 'url', 'snippet', and 'rich_snippet'.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    resp = requests.post(
        URL,
        headers=H,
        json={'query': f'site:{directory_domain} {niche}',
              'country_code': 'us', 'num_results': num},
        timeout=30,  # don't hang forever on a stalled connection
    )
    resp.raise_for_status()
    results = resp.json().get('organic_results', [])
    return [{
        # 'link' is required — fail loudly if missing; the rest degrade gracefully.
        'title': r.get('title', ''),
        'url': r['link'],
        'snippet': r.get('snippet', ''),
        'rich_snippet': r.get('rich_snippet', {}),
    } for r in results]
# Search Yelp (heavily JS-rendered) — loop-body indentation restored.
yelp_results = search_directory_listings('yelp.com', 'plumber San Francisco')
print(f'Yelp listings found: {len(yelp_results)}')
for r in yelp_results[:3]:
    print(f' {r["title"][:60]}')
    print(f' {r["snippet"][:100]}')

Step 2: Extract full page data with the extract endpoint
For listings where the snippet is not enough, use the Scavio extract endpoint to get the full rendered page content. This handles JavaScript rendering server-side.
EXTRACT_URL = 'https://api.scavio.dev/api/v1/extract'

def extract_page(url: str) -> dict:
    """Extract rendered content from a JS-heavy page.

    Args:
        url: The page URL to render and extract server-side.

    Returns:
        The raw extract-API payload (expects 'content' and 'metadata' keys).

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    # Reuse the shared header dict instead of rebuilding it per call.
    # Rendering can be slow, so allow a generous timeout.
    resp = requests.post(EXTRACT_URL, headers=H, json={'url': url}, timeout=60)
    resp.raise_for_status()
    return resp.json()
def extract_listing_data(listing_url: str) -> dict:
    """Extract structured data from a directory listing page.

    Fetches the rendered page via extract_page, then scrapes a star
    rating and a US-style phone number out of the page text.
    """
    page = extract_page(listing_url)
    body = page.get('content', '')
    meta = page.get('metadata', {})
    # Star rating like "4.8 stars", "4.8/5", or "4.8 rating".
    rating = re.search(r'(\d+\.\d+)\s*(?:star|/5|rating)', body, re.I)
    # US phone number, with or without parentheses/punctuation.
    phone = re.search(r'\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}', body)
    record = {
        'title': meta.get('title', ''),
        'description': meta.get('description', ''),
        'url': listing_url,
        'content_length': len(body),
    }
    record['rating'] = float(rating.group(1)) if rating else None
    record['phone'] = phone.group(0) if phone else None
    return record
# Extract data from a listing — block indentation restored.
if yelp_results:
    listing = extract_listing_data(yelp_results[0]['url'])
    print(f'Extracted: {listing["title"]}')
    print(f' Rating: {listing["rating"]}')
    print(f' Phone: {listing["phone"]}')
    print(f' Content: {listing["content_length"]} chars')

Step 3: Build a pipeline for multiple directories
Search across multiple JS-rendered directories and combine the results. Use search for discovery and selective extract for enrichment.
import time
def pipeline(niche: str, city: str, directories: list) -> list:
    """Search multiple JS-rendered directories, enriching the top hits.

    Args:
        niche: Business niche, e.g. 'electrician'.
        city: City/region appended to each query.
        directories: Directory domains to search, e.g. ['yelp.com'].

    Returns:
        All listings found across directories. The first 5 are enriched
        in place with extracted page data (rating, phone, ...); failed
        extractions get an 'extract_error' key instead.
    """
    all_listings = []
    for domain in directories:
        print(f'Searching {domain}...')
        listings = search_directory_listings(domain, f'{niche} {city}')
        for listing in listings:
            listing['directory'] = domain
        all_listings.extend(listings)
        time.sleep(0.3)  # be polite to the API between directories
    # Enrich top results with extract (selective to save cost).
    # update() mutates the dicts already held by all_listings, so the
    # enriched data is visible in the returned list — no separate
    # accumulator needed (the original built an 'enriched' list it
    # never returned).
    for listing in all_listings[:5]:  # Only extract top 5
        try:
            listing.update(extract_listing_data(listing['url']))
        except Exception as e:
            # Best-effort enrichment: record the failure, keep the listing.
            listing['extract_error'] = str(e)
        time.sleep(0.5)
    return all_listings
# Run the pipeline across three JS-heavy directories — loop indentation restored.
results = pipeline('electrician', 'Seattle WA', ['yelp.com', 'angi.com', 'thumbtack.com'])
print(f'\nTotal listings: {len(results)}')
for r in results[:5]:
    rating = r.get('rating', 'N/A')
    print(f" [{r['directory']}] {r['title'][:50]} (Rating: {rating})")
print(f'\nSearch cost: 3 x $0.005 = $0.015')
print(f'Extract cost: 5 x $0.005 = $0.025')
print(f'Total: $0.040')

Step 4: Handle pagination and rate limiting
For larger extraction jobs, implement pagination through search results and respect rate limits to avoid being throttled.
def paginated_search(domain: str, niche: str, max_pages: int = 3) -> list:
    """Search with pagination for more results.

    Args:
        domain: Directory domain to restrict the search to.
        niche: Niche and location terms.
        max_pages: Maximum number of query variations to issue.

    Returns:
        Deduplicated list of dicts with 'title', 'url', and 'snippet'.

    Raises:
        requests.HTTPError: If the search API responds with an error status.
    """
    all_results = []
    seen_urls = set()  # O(1) dedup instead of rescanning all_results per hit
    for page in range(max_pages):
        # Add page-specific terms to get different results from the index.
        suffixes = ['', 'reviews', 'top rated']
        suffix = suffixes[page] if page < len(suffixes) else f'page {page+1}'
        query = f'site:{domain} {niche} {suffix}'.strip()
        resp = requests.post(URL, headers=H,
                             json={'query': query, 'country_code': 'us',
                                   'num_results': 10},
                             timeout=30)
        resp.raise_for_status()  # surface API errors instead of a KeyError later
        results = resp.json().get('organic_results', [])
        for r in results:
            # Deduplicate by URL across pages.
            if r['link'] not in seen_urls:
                seen_urls.add(r['link'])
                all_results.append({
                    'title': r['title'],
                    'url': r['link'],
                    'snippet': r.get('snippet', ''),
                })
        time.sleep(0.5)
        if len(results) < 5:
            break  # No more results
    return all_results
# Demo: three query variations against Yelp's indexed pages.
results = paginated_search('yelp.com', 'plumber Portland OR', max_pages=3)
print(f'Paginated search: {len(results)} unique listings')
print(f'Cost: 3 pages x $0.005 = $0.015')

Python Example
# Consolidated, copy-paste-ready version of the walkthrough above.
import os, requests, re, time
# API key from the environment; shared headers for both endpoints.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def search_dir(domain, query):
    """Search `domain` via Google's index; return title/url/snippet dicts."""
    resp = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
                         json={'query': f'site:{domain} {query}',
                               'country_code': 'us', 'num_results': 10},
                         timeout=30)
    resp.raise_for_status()  # fail loudly on auth/quota errors
    return [{'title': r['title'], 'url': r['link'], 'snippet': r.get('snippet', '')}
            for r in resp.json().get('organic_results', [])]
def extract(url):
    """Fetch server-side-rendered content for a single URL."""
    resp = requests.post('https://api.scavio.dev/api/v1/extract', headers=H,
                         json={'url': url}, timeout=60)
    resp.raise_for_status()  # fail loudly on auth/quota errors
    return resp.json()
# Quick demo: search Yelp and print the first few titles.
listings = search_dir('yelp.com', 'plumber San Francisco')
print(f'Found {len(listings)} listings')
for l in listings[:3]:
    print(f" {l['title'][:50]}")

JavaScript Example
// Scavio credentials and shared headers (assumes global fetch, Node 18+).
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
const H = { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' };
// Search a directory domain via Google's index; resolves to
// an array of { title, url, snippet } objects.
async function searchDir(domain, query) {
const resp = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST', headers: H,
body: JSON.stringify({ query: `site:${domain} ${query}`, country_code: 'us', num_results: 10 })
});
// A missing organic_results field (e.g. error payload) degrades to [].
return ((await resp.json()).organic_results || []).map(r => ({
title: r.title, url: r.link, snippet: r.snippet || ''
}));
}
// Fetch server-side-rendered content for one URL via the extract endpoint.
async function extract(url) {
const resp = await fetch('https://api.scavio.dev/api/v1/extract', {
method: 'POST', headers: H, body: JSON.stringify({ url })
});
return resp.json();
}
// Demo: search Yelp and print the first three titles.
searchDir('yelp.com', 'plumber San Francisco').then(l => {
console.log(`Found ${l.length} listings`);
l.slice(0, 3).forEach(x => console.log(` ${x.title.slice(0, 50)}`));
});

Expected Output
Searching yelp.com...
Searching angi.com...
Searching thumbtack.com...
Total listings: 24
[yelp.com] Mike's Electric - Seattle, WA (Rating: 4.8)
[yelp.com] Brennan Electric - Licensed Electrician (Rating: 4.9)
[angi.com] Top Electricians in Seattle (Rating: N/A)
[thumbtack.com] Seattle Electrician Services (Rating: 4.7)
Search cost: 3 x $0.005 = $0.015
Extract cost: 5 x $0.005 = $0.025
Total: $0.040