Benchmark search APIs directly inside Claude Code by sending identical queries to each provider and measuring latency, result count, and snippet relevance. This approach lets you evaluate providers in the same environment your agent will use. Scavio serves as both a benchmark participant and the multi-platform baseline since one API key covers Google, YouTube, Reddit, Amazon, and Walmart. This tutorial builds a benchmarking harness that produces a comparison table with p50/p95 latencies and result quality scores.
Prerequisites
- Claude Code or Claude Desktop with MCP configured
- Python 3.8+ installed
- requests library installed
- A Scavio API key from scavio.dev
Walkthrough
Step 1: Set up the benchmark harness
Define the queries to test and the metrics to collect for each API call.
import requests, os, time, statistics
API_KEY = os.environ['SCAVIO_API_KEY']
ENDPOINT = 'https://api.scavio.dev/api/v1/search'
TEST_QUERIES = [
'best crm for startups 2026',
'python web scraping tutorial',
'react vs vue performance comparison',
'how to deploy fastapi on aws',
]
def benchmark_query(query: str) -> dict:
    # Time only the HTTP round trip, not the JSON parsing.
    start = time.monotonic()
    resp = requests.post(ENDPOINT, headers={'x-api-key': API_KEY},
                         json={'platform': 'google', 'query': query}, timeout=15)
    latency = (time.monotonic() - start) * 1000
    data = resp.json()
    results = data.get('organic_results', [])
    return {'query': query, 'latency_ms': round(latency, 1),
            'result_count': len(results), 'status': resp.status_code}
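Before running the full suite, it is worth smoke-testing a single call to confirm the key works and to see the shape of each record. The values in the comment below are placeholders, not real measurements.

# Quick smoke test; the output shown is illustrative, not measured.
print(benchmark_query('python web scraping tutorial'))
# {'query': 'python web scraping tutorial', 'latency_ms': 412.3, 'result_count': 10, 'status': 200}

Step 2: Run multiple iterations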
Execute each query multiple times to get stable latency measurements.
def run_benchmark(queries: list, iterations: int = 3) -> list:
    all_results = []
    for query in queries:
        latencies, counts = [], []
        for _ in range(iterations):
            result = benchmark_query(query)
            latencies.append(result['latency_ms'])
            counts.append(result['result_count'])
            time.sleep(0.5)  # small pause so iterations don't contend with each other
        latencies.sort()
        all_results.append({
            'query': query,
            'p50_ms': round(statistics.median(latencies), 1),
            'p95_ms': round(latencies[int(len(latencies) * 0.95)], 1),
            'avg_results': round(statistics.mean(counts), 1),
        })
    return all_results
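With only three iterations, the p95 is effectively the worst observed run; raise iterations to ten or more if you need a meaningful tail estimate. For longer query lists the serial loop gets slow, so here is a minimal sketch of parallelizing across queries with a thread pool, assuming the API tolerates a few concurrent requests (max_workers=4 is an illustrative choice, and concurrency adds its own contention, so keep it low for clean latency numbers):

from concurrent.futures import ThreadPoolExecutor

def run_benchmark_parallel(queries: list, iterations: int = 3) -> list:
    # Each query's iterations still run serially inside one worker,
    # so per-query medians stay comparable to the serial harness.
    with ThreadPoolExecutor(max_workers=4) as pool:
        return list(pool.map(lambda q: run_benchmark([q], iterations)[0], queries))

Step 3: Score result relevance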
Use a simple keyword-overlap heuristic to score how relevant the returned snippets are to the query.
def relevance_score(query: str, results: list) -> float:
    # Fraction of query terms that appear in each of the top 5 titles/snippets.
    query_terms = set(query.lower().split())
    scores = []
    for r in results[:5]:
        snippet = (r.get('snippet', '') + ' ' + r.get('title', '')).lower()
        overlap = sum(1 for t in query_terms if t in snippet)
        scores.append(overlap / max(len(query_terms), 1))
    return round(sum(scores) / max(len(scores), 1), 2)

# Add relevance to benchmark:
def benchmark_with_relevance(query: str) -> dict:
    resp = requests.post(ENDPOINT, headers={'x-api-key': API_KEY},
                         json={'platform': 'google', 'query': query}, timeout=15)
    data = resp.json()
    results = data.get('organic_results', [])
    return {'query': query, 'relevance': relevance_score(query, results), 'count': len(results)}
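To get one row per query with both latency percentiles and a relevance score, matching the table described under Expected Output, you can merge the Step 2 and Step 3 helpers. A small sketch (it costs one extra request per query; the merged relevance key name is just a convention used here):

def benchmark_full(queries: list, iterations: int = 3) -> list:
    # Combine latency percentiles with the keyword-overlap relevance score.
    rows = run_benchmark(queries, iterations)
    for row in rows:
        row['relevance'] = benchmark_with_relevance(row['query'])['relevance']
    return rows

Step 4: Generate the comparison table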
Print a formatted table comparing latency, result count, and relevance scores.
def print_benchmark_table(results: list):
    print(f'{"Query":<40} {"p50 (ms)":>10} {"p95 (ms)":>10} {"Results":>8}')
    print('-' * 72)
    for r in results:
        print(f'{r["query"][:38]:<40} {r["p50_ms"]:>10} {r["p95_ms"]:>10} {r["avg_results"]:>8}')
    latencies = [r['p50_ms'] for r in results]
    print(f'\nOverall p50: {statistics.median(latencies):.1f}ms')

results = run_benchmark(TEST_QUERIES)
print_benchmark_table(results)
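Since one Scavio key covers several platforms, the same harness extends to a per-platform comparison by varying the platform field. A sketch, assuming non-Google platforms return results in the same response shape (verify against the API docs and adjust the parsing if they differ):

def benchmark_platforms(query: str, platforms=('google', 'youtube', 'reddit')):
    # One timed request per platform, same query, same key.
    rows = []
    for platform in platforms:
        start = time.monotonic()
        resp = requests.post(ENDPOINT, headers={'x-api-key': API_KEY},
                             json={'platform': platform, 'query': query}, timeout=15)
        rows.append({'platform': platform, 'status': resp.status_code,
                     'latency_ms': round((time.monotonic() - start) * 1000, 1)})
    return rows

Python Example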
import requests, os, time, statistics

H = {'x-api-key': os.environ['SCAVIO_API_KEY']}

def bench(query, n=3):
    times, count = [], 0
    for _ in range(n):
        t = time.monotonic()
        r = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
                          json={'platform': 'google', 'query': query}, timeout=15)
        times.append((time.monotonic() - t) * 1000)
        count = len(r.json().get('organic_results', []))
    return {'query': query, 'p50': round(statistics.median(times), 1), 'count': count}

for q in ['best crm 2026', 'python web scraping']:
    print(bench(q))

JavaScript Example
const H = {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'};

async function bench(query, n = 3) {
  const times = [];
  let count = 0;
  for (let i = 0; i < n; i++) {
    const t = performance.now();
    const r = await fetch('https://api.scavio.dev/api/v1/search', {
      method: 'POST', headers: H, body: JSON.stringify({platform: 'google', query})
    });
    times.push(performance.now() - t);
    count = ((await r.json()).organic_results || []).length;
  }
  times.sort((a, b) => a - b);
  return {query, p50: times[Math.floor(times.length / 2)].toFixed(1), count};
}

bench('best crm 2026').then(console.log);

Expected Output
A benchmark table showing p50/p95 latency in milliseconds and result count for each test query against the Scavio API, with relevance scores available from the Step 3 helpers.
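For orientation, the table format looks like the following; the numbers are placeholders, not measured values.

Query                                      p50 (ms)   p95 (ms)  Results
------------------------------------------------------------------------
best crm for startups 2026                    412.3      598.7       10
python web scraping tutorial                  387.1      542.0        9
react vs vue performance comparison           405.8      611.2       10
how to deploy fastapi on aws                  398.4      575.9       10

Overall p50: 402.1ms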