Not all search APIs produce equally useful RAG results. Snippet length, result freshness, and relevance scoring directly affect the quality of your LLM answers. This tutorial builds a benchmark framework that tests search APIs across the dimensions that matter for RAG: snippet coverage, term relevance, result freshness, and cost per query. The same evaluation criteria can be run against Scavio ($0.005/credit, 6 platforms), Tavily ($30/mo for 10K), SerpAPI ($25/mo for 1K), and other providers.
Prerequisites
- Python 3.9+ installed
- requests library installed
- A Scavio API key from scavio.dev
- Optional: API keys for providers you want to compare (a quick environment check is sketched below)
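Before starting, it helps to confirm the keys are actually exported in your shell. Here is a minimal check, assuming the Scavio key lives in the SCAVIO_API_KEY environment variable as the walkthrough expects; the optional variable names are placeholders, so rename them for whichever providers you plan to compare.
import os

# SCAVIO_API_KEY is the variable the walkthrough reads; the optional names are
# placeholders -- swap in the variables you actually use for other providers.
required = ['SCAVIO_API_KEY']
optional = ['TAVILY_API_KEY', 'SERPAPI_API_KEY']  # hypothetical names

missing = [k for k in required if not os.environ.get(k)]
if missing:
    raise SystemExit(f"Missing required keys: {', '.join(missing)}")
for k in optional:
    if not os.environ.get(k):
        print(f'{k} not set, skipping that provider')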
Walkthrough
Step 1: Define the benchmark test suite
Create a set of diverse queries that cover the RAG scenarios you care about: factual lookups, comparison questions, technical queries, and product searches.
import os, time, requests
from dataclasses import dataclass
@dataclass
class BenchmarkQuery:
query: str
category: str
expected_terms: list # Terms we expect in good results
TEST_SUITE = [
BenchmarkQuery('Python 3.15 release date', 'factual',
['python', '3.15', 'release', '2026']),
BenchmarkQuery('FastAPI vs Django performance 2026', 'comparison',
['fastapi', 'django', 'performance', 'benchmark']),
BenchmarkQuery('how to deploy to Cloudflare Workers', 'technical',
['cloudflare', 'workers', 'deploy', 'wrangler']),
BenchmarkQuery('best noise cancelling headphones 2026', 'product',
['noise', 'cancelling', 'headphones', 'best']),
BenchmarkQuery('React Server Components production patterns', 'technical',
['react', 'server', 'components', 'rsc']),
]
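# Optional: append queries from your own RAG domain so the benchmark reflects
# real traffic. The entry below is purely illustrative -- uncomment to include it.
# TEST_SUITE.append(BenchmarkQuery('LangChain retriever setup', 'technical',
#                                  ['langchain', 'retriever', 'setup']))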
print(f'Benchmark suite: {len(TEST_SUITE)} queries')
for q in TEST_SUITE:
print(f' [{q.category}] {q.query}')
Step 2: Build the evaluation metrics
Measure four quality dimensions: snippet coverage (how much text per result), term relevance (how many expected terms are found), freshness (mentions of 2026), and result count; latency and cost per query are recorded alongside them.
@dataclass
class EvalResult:
query: str
provider: str
result_count: int
avg_snippet_length: float
term_coverage: float # 0-1 how many expected terms found
freshness_score: float # 0-1 mentions of current year
latency_ms: float
cost_per_query: float
def evaluate_results(query: BenchmarkQuery, results: list, provider: str,
latency_ms: float, cost: float) -> EvalResult:
if not results:
return EvalResult(query.query, provider, 0, 0, 0, 0, latency_ms, cost)
# Snippet coverage
snippets = [r.get('snippet', '') for r in results]
avg_len = sum(len(s) for s in snippets) / len(snippets)
# Term relevance
all_text = ' '.join(f"{r.get('title','')} {r.get('snippet','')}" for r in results).lower()
terms_found = sum(1 for t in query.expected_terms if t.lower() in all_text)
term_coverage = terms_found / len(query.expected_terms) if query.expected_terms else 0
# Freshness
fresh_count = sum(1 for r in results if '2026' in f"{r.get('title','')} {r.get('snippet','')}")
freshness = fresh_count / len(results)
return EvalResult(
query=query.query, provider=provider,
result_count=len(results), avg_snippet_length=avg_len,
term_coverage=term_coverage, freshness_score=freshness,
latency_ms=latency_ms, cost_per_query=cost
)
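# Quick sanity check with made-up results (illustrative only, not real API
# output) to show how the metrics behave:
demo_query = BenchmarkQuery('Python 3.15 release date', 'factual',
                            ['python', '3.15', 'release', '2026'])
demo_results = [
    {'title': 'Python 3.15 release schedule', 'snippet': 'Expected during 2026.', 'link': 'https://example.com/a'},
    {'title': 'What is new in Python', 'snippet': 'Draft release notes.', 'link': 'https://example.com/b'},
]
demo = evaluate_results(demo_query, demo_results, 'demo', latency_ms=100.0, cost=0.0)
print(f'Demo: coverage {demo.term_coverage:.0%}, freshness {demo.freshness_score:.0%}')
# Expected: coverage 100% (all four terms appear), freshness 50% (one of the
# two results mentions 2026).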
print('Evaluation metrics defined')
Step 3: Run the Scavio benchmark
Execute all test queries against the Scavio API and collect evaluation metrics. Each query costs $0.005.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def benchmark_scavio(test_suite: list[BenchmarkQuery]) -> list[EvalResult]:
results = []
for bq in test_suite:
start = time.time()
resp = requests.post('https://api.scavio.dev/api/v1/search',
headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
json={'query': bq.query, 'country_code': 'us', 'num_results': 10})
latency = (time.time() - start) * 1000
organic = resp.json().get('organic_results', [])
search_results = [{'title': r['title'], 'snippet': r.get('snippet', ''),
'link': r['link']} for r in organic]
eval_result = evaluate_results(bq, search_results, 'scavio', latency, 0.005)
results.append(eval_result)
time.sleep(0.3)
return results
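# A parallel function for a second provider follows the same pattern: map its
# response into the {'title', 'snippet', 'link'} shape and score it with
# evaluate_results. The endpoint, auth header, request fields, and per-query
# cost below are placeholders -- check your provider's docs for the real values.
def benchmark_other(test_suite: list[BenchmarkQuery], api_key: str) -> list[EvalResult]:
    results = []
    for bq in test_suite:
        start = time.time()
        resp = requests.post('https://api.example-provider.com/search',       # placeholder URL
                             headers={'Authorization': f'Bearer {api_key}'},  # placeholder auth scheme
                             json={'q': bq.query, 'limit': 10})               # placeholder params
        latency = (time.time() - start) * 1000
        raw = resp.json().get('results', [])                                  # placeholder response field
        mapped = [{'title': r.get('title', ''), 'snippet': r.get('content', ''),
                   'link': r.get('url', '')} for r in raw]
        # Use the provider's real per-query cost (e.g. plan price / monthly quota).
        results.append(evaluate_results(bq, mapped, 'other', latency, 0.03))
        time.sleep(0.3)
    return results
# Step 4's benchmark_report can then rank both sets, e.g.
# benchmark_report({'scavio': scavio_results, 'other': benchmark_other(TEST_SUITE, key)})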
scavio_results = benchmark_scavio(TEST_SUITE)
for er in scavio_results:
print(f'[{er.provider}] {er.query[:40]}')
print(f' Results: {er.result_count}, Snippets: {er.avg_snippet_length:.0f} chars')
print(f' Relevance: {er.term_coverage:.0%}, Fresh: {er.freshness_score:.0%}')
print(f' Latency: {er.latency_ms:.0f}ms, Cost: ${er.cost_per_query}')
Step 4: Generate the comparison report
Aggregate results across all queries and providers into a summary report. Rank providers by a composite score weighted toward RAG-relevant metrics.
def benchmark_report(all_results: dict[str, list[EvalResult]]):
print('Search API Benchmark for RAG Quality')
print('=' * 55)
summaries = {}
for provider, results in all_results.items():
n = len(results)
summaries[provider] = {
'avg_results': sum(r.result_count for r in results) / n,
'avg_snippet': sum(r.avg_snippet_length for r in results) / n,
'avg_relevance': sum(r.term_coverage for r in results) / n,
'avg_freshness': sum(r.freshness_score for r in results) / n,
'avg_latency': sum(r.latency_ms for r in results) / n,
'cost_per_query': results[0].cost_per_query,
}
# Composite score: relevance 40%, snippets 25%, freshness 20%, cost 15%
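# Illustrative arithmetic with made-up numbers: relevance 0.80, 160-char
# snippets (160/200 = 0.80), freshness 0.50, and $0.005/query
# (cost_score = 1 - 0.005/0.05 = 0.90) give a composite of
# 0.80*0.4 + 0.80*0.25 + 0.50*0.2 + 0.90*0.15 = 0.755.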
for provider, s in summaries.items():
snippet_score = min(s['avg_snippet'] / 200, 1) # Normalize to 0-1
cost_score = 1 - min(s['cost_per_query'] / 0.05, 1) # Lower is better
composite = (s['avg_relevance'] * 0.4 + snippet_score * 0.25 +
s['avg_freshness'] * 0.2 + cost_score * 0.15)
s['composite'] = composite
# Sort by composite score
ranked = sorted(summaries.items(), key=lambda x: x[1]['composite'], reverse=True)
for rank, (provider, s) in enumerate(ranked, 1):
print(f'\n#{rank} {provider.upper()}')
print(f' Relevance: {s["avg_relevance"]:.0%} Snippets: {s["avg_snippet"]:.0f} chars')
print(f' Freshness: {s["avg_freshness"]:.0%} Latency: {s["avg_latency"]:.0f}ms')
print(f' Cost: ${s["cost_per_query"]}/query Composite: {s["composite"]:.2f}')
benchmark_report({'scavio': scavio_results})
Python Example
import os, time, requests
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def benchmark(queries):
for q in queries:
start = time.time()
resp = requests.post('https://api.scavio.dev/api/v1/search',
headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
json={'query': q, 'country_code': 'us', 'num_results': 10})
latency = (time.time() - start) * 1000
results = resp.json().get('organic_results', [])
snippets = [r.get('snippet', '') for r in results]
avg_len = sum(len(s) for s in snippets) / len(snippets) if snippets else 0
print(f'{q[:40]:40s} | {len(results):2d} results | {avg_len:5.0f} chars | {latency:4.0f}ms')
time.sleep(0.3)
benchmark(['Python 3.15 release date', 'FastAPI vs Django 2026',
'best headphones 2026', 'deploy cloudflare workers'])
JavaScript Example
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
async function benchmark(queries) {
for (const q of queries) {
const start = Date.now();
const resp = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST',
headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
body: JSON.stringify({ query: q, country_code: 'us', num_results: 10 })
});
const latency = Date.now() - start;
const results = (await resp.json()).organic_results || [];
const avgSnippet = results.reduce((s, r) => s + (r.snippet || '').length, 0) / (results.length || 1);
console.log(`${q.slice(0,40).padEnd(40)} | ${results.length} results | ${avgSnippet.toFixed(0)} chars | ${latency}ms`);
}
}
benchmark(['Python 3.15 release', 'FastAPI vs Django', 'best headphones 2026']);
Expected Output
Search API Benchmark for RAG Quality
=======================================================
#1 SCAVIO
Relevance: 85% Snippets: 156 chars
Freshness: 60% Latency: 340ms
Cost: $0.005/query Composite: 0.78
Python 3.15 release date | 10 results | 145 chars | 320ms
FastAPI vs Django 2026 | 10 results | 162 chars | 290ms
best headphones 2026 | 10 results | 158 chars | 310ms