Overview
This pipeline helps teams migrate from ScrapingAnt HTML scraping to Scavio structured API calls. For each existing ScrapingAnt job, it creates an equivalent Scavio API call that returns clean JSON instead of raw HTML requiring parsing. The migration runs both paths in parallel during a transition period, comparing outputs to verify the API results match or exceed the scraping results in data quality.
Trigger
Cron schedule (daily during migration, then on-demand)
Schedule
Runs daily during migration period
Workflow Steps
Load ScrapingAnt job definitions
Read the list of existing ScrapingAnt scraping jobs with their target URLs and extraction rules.
Map jobs to Scavio API equivalents
Convert each scraping job to the equivalent Scavio API call (Google search, Amazon search, etc.).
Run both paths in parallel
Execute both the ScrapingAnt job and the Scavio API call for comparison during transition.
Compare output quality
Verify that Scavio API results contain equivalent or better data than the scraped HTML output.
Log migration progress
Track which jobs have been verified and are ready to cut over to API-only mode.
Python Implementation
import requests
import json
from pathlib import Path
from datetime import datetime
# Scavio credential sent in the ``x-api-key`` request header.
# Replace the placeholder with a real key before running.
API_KEY = "your_scavio_api_key"

# One entry per legacy ScrapingAnt job, paired with the Scavio search that
# replaces it. ``required_fields`` names the keys each Scavio result is
# checked for when field coverage is scored.
MIGRATION_JOBS = [
    {
        "id": "job_001",
        "description": "Product prices on Amazon",
        "scrapingant_url": "https://www.amazon.com/s?k=wireless+earbuds",
        "scavio_platform": "amazon",
        "scavio_query": "wireless earbuds",
        "required_fields": ["title", "price", "rating"],
    },
    {
        "id": "job_002",
        "description": "Google search results for SEO tracking",
        "scrapingant_url": "https://www.google.com/search?q=best+crm+software",
        "scavio_platform": "google",
        "scavio_query": "best crm software",
        "required_fields": ["title", "link", "snippet"],
    },
]
def _field_coverage(results: list, required_fields: list[str], sample_size: int) -> float:
    """Return the percentage of truthy required fields across the leading results."""
    checked = 0
    present = 0
    for item in results[:sample_size]:
        for field in required_fields:
            checked += 1
            # A missing key and a falsy value (None, "", 0, []) both count as uncovered.
            if item.get(field):
                present += 1
    ratio = present / checked if checked else 0
    return round(ratio * 100, 1)


def run_scavio(
    platform: str,
    query: str,
    required_fields: list[str],
    *,
    sample_size: int = 5,
) -> dict:
    """Run one Scavio search and score how well its results cover the required fields.

    Args:
        platform: Value sent as ``platform`` in the request body (e.g. "google").
        query: Search query sent as ``query`` in the request body.
        required_fields: Keys every result is expected to carry.
        sample_size: How many leading results are inspected for coverage.
            Expected to be a positive integer; defaults to 5.

    Returns:
        A dict with ``results_count`` (number of entries under ``organic``),
        ``field_coverage`` (percentage, rounded to one decimal) and ``sample``
        (the first result, or ``{}`` when there are none).

    Raises:
        requests.HTTPError: When the API answers with a 4xx/5xx status.
    """
    res = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers={"x-api-key": API_KEY},
        json={"platform": platform, "query": query},
        timeout=15,
    )
    res.raise_for_status()
    # ``or []`` also covers an explicit ``"organic": null`` in the payload,
    # which ``.get("organic", [])`` would pass through as None and crash on.
    results = res.json().get("organic") or []
    return {
        "results_count": len(results),
        "field_coverage": _field_coverage(results, required_fields, sample_size),
        "sample": results[0] if results else {},
    }
def run():
    """Check every migration job against Scavio and write a dated JSON report.

    For each entry in ``MIGRATION_JOBS`` this calls ``run_scavio`` and marks
    the job ready when field coverage is at least 80%. A job whose request
    fails is recorded as not ready, with an ``error`` message, instead of
    aborting the whole run. The report is written to
    ``migration_report_<YYYY-MM-DD>.json`` in the current working directory
    and a summary is printed to stdout.
    """
    # Imported locally because the module-level import only brings in ``datetime``.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated; an aware UTC "now" yields the same date string.
    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    migration_report = []
    for job in MIGRATION_JOBS:
        entry = {"job_id": job["id"], "description": job["description"]}
        try:
            scavio_result = run_scavio(
                job["scavio_platform"],
                job["scavio_query"],
                job["required_fields"],
            )
        except (requests.RequestException, ValueError) as exc:
            # Network/HTTP failures and undecodable JSON bodies: keep going so
            # the remaining jobs are still checked and the report is written.
            entry.update(
                scavio_results=0,
                field_coverage_pct=0,
                ready_to_migrate=False,
                error=str(exc),
            )
        else:
            entry.update(
                scavio_results=scavio_result["results_count"],
                field_coverage_pct=scavio_result["field_coverage"],
                ready_to_migrate=scavio_result["field_coverage"] >= 80,
            )
        migration_report.append(entry)

    ready_count = sum(1 for entry in migration_report if entry["ready_to_migrate"])
    output = {
        "date": date,
        "total_jobs": len(MIGRATION_JOBS),
        "ready_to_migrate": ready_count,
        "jobs": migration_report,
    }
    Path(f"migration_report_{date}.json").write_text(
        json.dumps(output, indent=2), encoding="utf-8"
    )

    print(f"Migration check {date}: {ready_count}/{len(MIGRATION_JOBS)} jobs ready")
    for entry in migration_report:
        status = "READY" if entry["ready_to_migrate"] else "PENDING"
        line = f" [{status}] {entry['description']}: {entry['field_coverage_pct']}% field coverage"
        if "error" in entry:
            line += f" (request failed: {entry['error']})"
        print(line)
if __name__ == "__main__":
    run()
JavaScript Implementation
// Scavio credential sent in the x-api-key request header.
// Replace the placeholder with a real key before running.
const API_KEY = "your_scavio_api_key";

// Searches to validate. `required` lists the keys each result is checked for
// when field coverage is scored.
const JOBS = [
  {
    id: "job_001",
    platform: "amazon",
    query: "wireless earbuds",
    required: ["title", "price"],
  },
  {
    id: "job_002",
    platform: "google",
    query: "best crm software",
    required: ["title", "link", "snippet"],
  },
];
/**
 * Run one Scavio search and score how well its results cover the job's
 * required fields.
 *
 * @param {{id: string, platform: string, query: string, required: string[]}} job
 *   Job definition; `platform` and `query` are sent in the request body.
 * @returns {Promise<{jobId: string, ready: boolean, coverage: number, results: number}>}
 *   `coverage` is a whole-number percentage computed over the first five
 *   results; `ready` is true when coverage is at least 80. A failed request
 *   (network error, timeout, non-2xx status, unreadable body) resolves to
 *   `ready: false` with `coverage` and `results` set to 0.
 */
async function testMigration(job) {
  // `coverage` is included so callers that print it never show "undefined%".
  const failed = { jobId: job.id, ready: false, coverage: 0, results: 0 };

  let organic;
  try {
    const res = await fetch("https://api.scavio.dev/api/v1/search", {
      method: "POST",
      headers: { "x-api-key": API_KEY, "content-type": "application/json" },
      body: JSON.stringify({ platform: job.platform, query: job.query }),
      // Abort after 15 s so one hung request cannot stall the run.
      signal: AbortSignal.timeout(15000),
    });
    if (!res.ok) return failed;
    organic = (await res.json()).organic ?? [];
  } catch (err) {
    // Network failure, timeout, or undecodable body: report it and treat the
    // job as not ready rather than rejecting and stopping the caller's loop.
    console.error(`${job.id}: request failed: ${err.message}`);
    return failed;
  }

  const sample = organic.slice(0, 5);
  const covered = sample.reduce(
    (sum, item) => sum + job.required.filter((field) => item[field]).length,
    0,
  );
  const total = sample.length * job.required.length;
  const coverage = total ? Math.round((covered / total) * 100) : 0;
  return { jobId: job.id, ready: coverage >= 80, coverage, results: organic.length };
}
for (const job of JOBS) {
const result = await testMigration(job);
console.log(`${result.ready ? "READY" : "PENDING"} ${result.jobId}: ${result.coverage}% coverage, ${result.results} results`);
}
Platforms Used
Google
Web search with knowledge graph, PAA, and AI overviews
Amazon
Product search with prices, ratings, and reviews