n8n HTTP Request nodes that scrape websites break constantly due to HTML changes, CAPTCHAs, and rate limits. Replacing them with structured API calls returns clean JSON, never breaks on layout changes, and eliminates proxy costs. This tutorial migrates common n8n scraping patterns to API calls node by node.
Prerequisites
- n8n instance running
- A Scavio API key from scavio.dev
- Existing n8n workflows with HTTP scraping nodes
- Basic n8n workflow knowledge
Walkthrough
Step 1: Identify scraping nodes to replace
Export your n8n workflow and find HTTP Request nodes that scrape websites.
import json, os, requests
API_KEY = os.environ['SCAVIO_API_KEY']
SH = {'x-api-key': API_KEY, 'Content-Type': 'application/json'}
# Analyze an n8n workflow export for scraping nodes
def _hostname_of(url):
    """Return the lower-cased hostname of *url*, or '' when none can be parsed."""
    from urllib.parse import urlparse

    if not isinstance(url, str):
        return ''
    # NOTE(review): n8n exports appear to prefix expression-valued parameters
    # with '=' (e.g. '=https://...'); strip it so the host still parses.
    candidate = url.strip().lstrip('=')
    try:
        parsed = urlparse(candidate)
        if not parsed.netloc:
            # Scheme-less input such as 'google.com/search': re-parse it as a
            # network location so the host is not mistaken for a path.
            parsed = urlparse('//' + candidate)
        return parsed.hostname or ''
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced brackets).
        return ''


def find_scraping_nodes(workflow_json,
                        sites=('google.com', 'amazon.com', 'reddit.com', 'bing.com')):
    """Find HTTP Request nodes in an n8n workflow export that target known sites.

    A node is reported when its type is 'n8n-nodes-base.httpRequest' and the
    hostname of its 'url' parameter is one of *sites* or a subdomain of one.
    Matching on the hostname (rather than anywhere in the URL text) avoids
    flagging URLs that only mention a site in their path or query string.

    Args:
        workflow_json: Parsed workflow export; nodes are read from its
            'nodes' list (missing key is treated as no nodes).
        sites: Domains considered scraping targets.

    Returns:
        A list of dicts with keys 'name', 'url' and 'type' ('replaceable'),
        in the order the nodes appear in the workflow.
    """
    scraping_nodes = []
    for node in workflow_json.get('nodes', []):
        if node.get('type') != 'n8n-nodes-base.httpRequest':
            continue
        url = node.get('parameters', {}).get('url', '')
        host = _hostname_of(url)
        if any(host == site or host.endswith('.' + site) for site in sites):
            scraping_nodes.append({
                'name': node.get('name', 'unnamed'),
                'url': url,
                'type': 'replaceable',
            })
    return scraping_nodes
# Simulated workflow analysis
sample = {'nodes': [
{'type': 'n8n-nodes-base.httpRequest', 'name': 'Scrape Google', 'parameters': {'url': 'https://google.com/search?q=test'}},
{'type': 'n8n-nodes-base.httpRequest', 'name': 'Scrape Amazon', 'parameters': {'url': 'https://amazon.com/s?k=test'}},
{'type': 'n8n-nodes-base.httpRequest', 'name': 'Internal API', 'parameters': {'url': 'https://api.mycompany.com/data'}},
]}
scrapers = find_scraping_nodes(sample)
print(f'Found {len(scrapers)} scraping nodes to replace:')
for s in scrapers:
print(f' {s["name"]}: {s["url"][:50]}')
Step 2: Create replacement API node configuration
Generate n8n HTTP Request node configs that use the search API instead.
def generate_replacement_node(scraping_node):
    """Build an n8n HTTP Request node config that calls the search API.

    Args:
        scraping_node: Dict with 'url' (the scraped address) and 'name'
            (the original node's name).

    Returns:
        A node dict named '<name> (API)' whose parameters describe a POST to
        the search endpoint with a JSON-encoded body.
    """
    source_url = scraping_node['url']
    label = scraping_node['name']

    # Ordered lookup: the first fragment found in the URL decides the platform.
    # Google maps to None because the request body then omits 'platform'.
    host_to_platform = (
        ('google.com', None),
        ('amazon.com', 'amazon'),
        ('reddit.com', 'reddit'),
    )
    platform = next(
        (target for fragment, target in host_to_platform if fragment in source_url),
        None,
    )

    payload = {'query': '{{ $json.query }}', 'country_code': 'us'}
    if platform:
        payload['platform'] = platform

    request_headers = {
        'x-api-key': '{{ $env.SCAVIO_API_KEY }}',
        'Content-Type': 'application/json',
    }
    request_params = {
        'method': 'POST',
        'url': 'https://api.scavio.dev/api/v1/search',
        'headers': request_headers,
        'body': json.dumps(payload),
        'responseFormat': 'json',
    }
    return {
        'name': f'{label} (API)',
        'type': 'n8n-nodes-base.httpRequest',
        'parameters': request_params,
    }
print('=== Replacement Nodes ===')
for s in scrapers:
replacement = generate_replacement_node(s)
print(f'\n{s["name"]} -> {replacement["name"]}')
print(f' URL: {replacement["parameters"]["url"]}')
print(f' Method: POST (was GET)')
print(f' Response: Clean JSON (was raw HTML)')
Step 3: Test replacement and compare output
Run the replacement API call for each platform and inspect the result count and fields to verify they cover the data your scraping node extracted.
def compare_outputs(query, platform=None, timeout=30):
    """Query the search API and print a summary of what it returned.

    Args:
        query: Search text sent as the 'query' field.
        platform: Optional platform name; when falsy the field is omitted
            from the request and the summary labels the run as "google".
        timeout: Seconds passed to requests as the request timeout, so a
            stalled connection raises instead of blocking indefinitely.

    Prints the result count, the keys of the first result, and a truncated
    sample title, followed by fixed format/cost/reliability notes.
    """
    body = {'query': query, 'country_code': 'us'}
    if platform:
        body['platform'] = platform

    response = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers=SH,
        json=body,
        timeout=timeout,
    )
    data = response.json()
    results = data.get('organic_results', [])

    print(f'\nQuery: "{query}" (platform: {platform or "google"})')
    print(f' Results: {len(results)}')
    print(f' Fields per result: {list(results[0].keys()) if results else "N/A"}')
    if results:
        # `or ""` keeps the slice safe when the title is present but null.
        print(f' Sample: {(results[0].get("title") or "")[:50]}')
    print(f' Format: Structured JSON (no HTML parsing needed)')
    print(f' Cost: $0.005 per query')
    print(f' Reliability: No CAPTCHAs, no proxy needed, no HTML changes')
compare_outputs('wireless earbuds review')
compare_outputs('wireless earbuds', platform='amazon')
compare_outputs('wireless earbuds recommendation', platform='reddit')
print(f'\n=== Migration Summary ===')
print(f' Nodes to replace: {len(scrapers)}')
print(f' Time to migrate: ~10 minutes per node')
print(f' Monthly savings: proxy costs + maintenance time')
Python Example
import os, requests
SH = {'x-api-key': os.environ['SCAVIO_API_KEY'], 'Content-Type': 'application/json'}
# Before: n8n HTTP scraping Google (breaks often)
# After: n8n HTTP Request to API (stable JSON)
# An explicit timeout makes a stalled connection raise instead of hanging the script.
data = requests.post('https://api.scavio.dev/api/v1/search',
                     headers=SH,
                     json={'query': 'wireless earbuds', 'country_code': 'us'},
                     timeout=30).json()
print(f'Results: {len(data.get("organic_results", []))}')
print(f'Format: JSON | No HTML parsing | $0.005/query')
JavaScript Example
const SH = { 'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json' };
const data = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST', headers: SH,
body: JSON.stringify({ query: 'wireless earbuds', country_code: 'us' })
}).then(r => r.json());
console.log(`Results: ${(data.organic_results || []).length}`);
console.log('Format: JSON | No HTML parsing | $0.005/query');
Expected Output
Found 2 scraping nodes to replace:
Scrape Google: https://google.com/search?q=test
Scrape Amazon: https://amazon.com/s?k=test
Query: "wireless earbuds review" (platform: google)
Results: 10
Fields per result: ['title', 'link', 'snippet', 'position']
Format: Structured JSON (no HTML parsing needed)
Cost: $0.005 per query
=== Migration Summary ===
Nodes to replace: 2
Time to migrate: ~10 minutes per node
Monthly savings: proxy costs + maintenance time