Web scrapers break when they hit CAPTCHAs, IP bans, and rate limits. A structured API returns parsed JSON without any of these issues because the API provider handles browser rendering, proxy rotation, and CAPTCHA solving on their end. This tutorial migrates a CAPTCHA-plagued scraping pipeline to clean API calls, showing the before/after for Google, Amazon, and Reddit.
Prerequisites
- Python 3.8+
- requests library
- A Scavio API key from scavio.dev
- Existing scraping pipeline to migrate (optional)
Walkthrough
Step 1: Compare scraper vs API approaches
Show the problems with scraping and how an API eliminates them.
import os, requests, time
API_KEY = os.environ['SCAVIO_API_KEY']
SH = {'x-api-key': API_KEY, 'Content-Type': 'application/json'}
# --- BEFORE: Scraper approach (common failure modes) ---
# def scrape_google(query):
# try:
# r = requests.get(f'https://www.google.com/search?q={query}',
# headers={'User-Agent': 'Mozilla/5.0'})
# if r.status_code == 429: raise Exception('Rate limited')
# if 'captcha' in r.text.lower(): raise Exception('CAPTCHA triggered')
# # Parse HTML... breaks when Google changes layout
# except Exception as e:
# print(f'Scraper failed: {e}') # This happens constantly
# --- AFTER: API approach (no CAPTCHAs, no parsing) ---
def search(query, platform=None):
    """Run one search through the Scavio API and return its organic results.

    Args:
        query: Search terms to send to the API.
        platform: Optional platform name (for example 'amazon' or 'reddit').
            When falsy, no 'platform' field is sent in the request body.

    Returns:
        The list stored under 'organic_results' in the JSON response, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    body = {'query': query, 'country_code': 'us'}
    if platform:
        body['platform'] = platform
    # requests applies no timeout by default; without one a stalled
    # connection would block the caller indefinitely.
    response = requests.post('https://api.scavio.dev/api/v1/search',
                             headers=SH, json=body, timeout=30)
    # Surface auth/quota/server errors instead of reporting them as
    # "0 results".
    response.raise_for_status()
    return response.json().get('organic_results', [])
results = search('best python framework 2026')
print(f'API returned {len(results)} results. No CAPTCHA. No IP ban. No HTML parsing.')
for r in results[:3]:
print(f' {r["position"]}. {r["title"][:50]}')
Step 2: Migrate Google data extraction
Replace Google scraping with structured API calls.
def migrate_google_pipeline(queries):
    """Run each query through the search API and print a migration summary.

    Before: 50+ lines of scraping code, proxy rotation, CAPTCHA handling.
    After: 5 lines per query.

    A query counts as a failure when the request errors out, the response
    is not valid JSON, or the response carries no organic results.

    Args:
        queries: Sized collection of search strings. May be empty.

    Returns:
        A list with one dict per successful query, holding 'query',
        'results' (number of organic results) and 'top' (the first result's
        title, truncated to 50 characters).
    """
    results = []
    failures = 0
    for query in queries:
        try:
            # A timeout keeps one stalled request from hanging the batch.
            response = requests.post(
                'https://api.scavio.dev/api/v1/search',
                headers=SH,
                json={'query': query, 'country_code': 'us'},
                timeout=30,
            )
            response.raise_for_status()
            organic = response.json().get('organic_results', [])
        except (requests.RequestException, ValueError):
            # Network/HTTP errors and undecodable bodies are recorded as a
            # failed query rather than aborting the remaining queries.
            organic = []
        if organic:
            results.append({'query': query, 'results': len(organic),
                            'top': organic[0]['title'][:50]})
        else:
            failures += 1

    total = len(queries)
    # Guard the percentage against an empty query list.
    success_rate = (total - failures) / total * 100 if total else 0.0
    print('Migrated Google pipeline:')
    print(f' Queries: {total} | Success: {success_rate:.0f}% | Failures: {failures}')
    print(f' Cost: ${total * 0.005:.3f}')
    print(' CAPTCHA blocks: 0 (vs typical 5-15% with scrapers)')
    print(' Lines of code: ~5 per query (vs ~50 with scraping + parsing)')
    return results
queries = ['python web framework', 'serp api 2026', 'best code editor',
'react vs vue', 'machine learning tutorial']
migrate_google_pipeline(queries)
Step 3: Migrate Amazon and Reddit extraction
Replace platform-specific scrapers with platform API parameters.
def migrate_amazon_pipeline(products):
    """Look up each product on Amazon via the search API and print a summary.

    Before: Selenium + CAPTCHA solver + proxy rotation for Amazon.
    After: Same API, platform='amazon'.

    For every product, one line is printed with the (padded/truncated)
    product name, how many of the first three organic results came back, and
    the 'price' of the first result ('N/A' when missing or when there are no
    results).

    Args:
        products: Iterable of product search strings.
    """
    endpoint = 'https://api.scavio.dev/api/v1/search'
    for product in products:
        payload = {'query': product, 'platform': 'amazon', 'country_code': 'us'}
        response = requests.post(endpoint, headers=SH, json=payload)
        top_hits = response.json().get('organic_results', [])[:3]

        top_price = 'N/A'
        if top_hits:
            top_price = top_hits[0].get('price', 'N/A')

        print(f' Amazon: {product[:30]:30} | {len(top_hits)} results | Top: {top_price}')
def migrate_reddit_pipeline(queries):
    """Search Reddit for each query via the search API and print a summary.

    Before: Reddit rate limits + auth + JSON parsing.
    After: Same API, platform='reddit'.

    For every query, one line is printed with the (padded/truncated) query
    and how many of the first three organic results came back.

    Args:
        queries: Iterable of search strings.
    """
    endpoint = 'https://api.scavio.dev/api/v1/search'
    for query in queries:
        payload = {'query': query, 'platform': 'reddit', 'country_code': 'us'}
        response = requests.post(endpoint, headers=SH, json=payload)
        # Only the first three results are counted.
        discussions = response.json().get('organic_results', [])[:3]
        print(f' Reddit: {query[:30]:30} | {len(discussions)} discussions')
print('Amazon migration:')
migrate_amazon_pipeline(['wireless earbuds', 'laptop stand', 'usb hub'])
print('\nReddit migration:')
migrate_reddit_pipeline(['best serp api', 'python web scraping', 'api recommendation'])
Step 4: Compare reliability and cost
Run a reliability test and calculate cost savings.
def reliability_test(queries, platforms):
    """Send every query to every platform and print a reliability/cost report.

    One request is made per (query, platform) pair. A request counts as a
    success only when it completes with HTTP status 200; requests that raise
    a network error or time out are counted as attempted but not successful.

    Args:
        queries: Iterable of search strings.
        platforms: Iterable of platform names. For 'google' no 'platform'
            field is sent in the request body.

    Returns:
        None. The report is printed to stdout.
    """
    total = 0
    success = 0
    start = time.time()
    for query in queries:
        for platform in platforms:
            total += 1
            body = {'query': query, 'country_code': 'us'}
            if platform != 'google':
                body['platform'] = platform
            try:
                # A timeout bounds each request so the elapsed-time figure
                # cannot be blown up by a single stalled connection.
                r = requests.post('https://api.scavio.dev/api/v1/search',
                                  headers=SH, json=body, timeout=30)
            except requests.RequestException:
                # Deliberate best-effort: a failed request is simply not a
                # success. Only request errors are swallowed, so Ctrl-C and
                # programming errors still propagate.
                continue
            if r.status_code == 200:
                success += 1
    elapsed = time.time() - start
    cost = total * 0.005
    # Guard the ratios against an empty queries/platforms combination.
    success_pct = success / total * 100 if total else 0.0
    avg_seconds = elapsed / total if total else 0.0
    print('\n=== Pipeline Reliability Report ===')
    print(f' Requests: {total} | Success: {success} ({success_pct:.0f}%)')
    print(f' Time: {elapsed:.1f}s | Avg: {avg_seconds:.2f}s per request')
    print(f' Cost: ${cost:.3f}')
    print(' CAPTCHA blocks: 0')
    print(' IP bans: 0')
    print(' HTML parsing errors: 0')
    print('\n vs Scraper estimate:')
    print(' Typical scraper success rate: 80-90%')
    print(' Proxy cost: $10-50/month')
    print(' CAPTCHA solver: $1-3/1000 solves')
    print(' Maintenance: 2-5 hours/month fixing broken selectors')
reliability_test(['serp api', 'web scraping'], ['google', 'amazon', 'reddit'])
Python Example
import os, requests
SH = {'x-api-key': os.environ['SCAVIO_API_KEY'], 'Content-Type': 'application/json'}
def pipeline(query, platform=None):
    """Query the search API once and print how many organic results it gave.

    Args:
        query: Search terms to send.
        platform: Optional platform name; when falsy no 'platform' field is
            sent and the printed label falls back to "google".
    """
    payload = {'query': query, 'country_code': 'us'}
    if platform:
        payload['platform'] = platform

    response = requests.post('https://api.scavio.dev/api/v1/search',
                             headers=SH, json=payload)
    organic = response.json().get('organic_results', [])

    label = platform or "google"
    print(f'{label}: {len(organic)} results, 0 CAPTCHAs. Cost: $0.005')
for p in [None, 'amazon', 'reddit']:
pipeline('wireless earbuds', p)
JavaScript Example
const SH = { 'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json' };
/**
 * Query the search API once and log how many organic results came back.
 *
 * @param {string} query - Search terms to send.
 * @param {?string} platform - Optional platform name; when falsy no
 *   `platform` field is sent and the logged label falls back to "google".
 * @returns {Promise<void>}
 */
async function pipeline(query, platform) {
  const payload = { query, country_code: 'us' };
  if (platform) {
    payload.platform = platform;
  }

  const response = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: SH,
    body: JSON.stringify(payload),
  });
  const data = await response.json();

  // A missing organic_results field is treated as zero results.
  const organic = data.organic_results || [];
  console.log(`${platform || 'google'}: ${organic.length} results, 0 CAPTCHAs`);
}
for (const p of [null, 'amazon', 'reddit']) await pipeline('wireless earbuds', p);
Expected Output
API returned 10 results. No CAPTCHA. No IP ban. No HTML parsing.
1. FastAPI - Modern Python Web Framework
2. Django - The web framework for perfectionists
Migrated Google pipeline:
Queries: 5 | Success: 100% | Failures: 0
Cost: $0.025
CAPTCHA blocks: 0
Lines of code: ~5 per query (vs ~50 with scraping + parsing)
=== Pipeline Reliability Report ===
Requests: 6 | Success: 6 (100%)
CAPTCHA blocks: 0
IP bans: 0