Toutes les API de recherche ne produisent pas des résultats RAG équivalents. La longueur des extraits, la fraîcheur des résultats et le score de pertinence impactent directement la qualité des réponses de votre LLM. Ce tutoriel construit un cadre de benchmark qui teste les API de recherche selon des dimensions importantes pour le RAG : couverture des extraits, fraîcheur des résultats, pertinence des titres et coût par résultat utile. Testez Scavio ($0.005/crédit, 6 plateformes), Tavily ($30/mois pour 10K), SerpAPI ($25/mois pour 1K) et d'autres en utilisant les mêmes critères d'évaluation.
Prérequis
- Python 3.9+ installé
- bibliothèque requests installée
- Une clé API Scavio provenant de scavio.dev
- Facultatif : clés API pour les fournisseurs que vous souhaitez comparer
Parcours
Étape 1: Définir la suite de tests de benchmark
Créer un ensemble de requêtes variées qui testent différents scénarios RAG : recherches factuelles, questions de comparaison, requêtes techniques et événements actuels.
import os, time, requests
from dataclasses import dataclass, field
@dataclass
class BenchmarkQuery:
    """One benchmark query plus the terms a good result set should mention."""

    query: str  # search string sent to the provider as-is
    category: str  # scenario label, e.g. 'factual', 'comparison', 'technical'
    expected_terms: list[str]  # Terms we expect in good results
# Benchmark queries, one or more per scenario category (factual, comparison,
# technical, product). The lower-case terms are what the evaluation step
# looks for in the returned titles and snippets.
TEST_SUITE = [
    BenchmarkQuery(
        query='Python 3.15 release date',
        category='factual',
        expected_terms=['python', '3.15', 'release', '2026'],
    ),
    BenchmarkQuery(
        query='FastAPI vs Django performance 2026',
        category='comparison',
        expected_terms=['fastapi', 'django', 'performance', 'benchmark'],
    ),
    BenchmarkQuery(
        query='how to deploy to Cloudflare Workers',
        category='technical',
        expected_terms=['cloudflare', 'workers', 'deploy', 'wrangler'],
    ),
    BenchmarkQuery(
        query='best noise cancelling headphones 2026',
        category='product',
        expected_terms=['noise', 'cancelling', 'headphones', 'best'],
    ),
    BenchmarkQuery(
        query='React Server Components production patterns',
        category='technical',
        expected_terms=['react', 'server', 'components', 'rsc'],
    ),
]
print(f'Benchmark suite: {len(TEST_SUITE)} queries')
for q in TEST_SUITE:
    print(f' [{q.category}] {q.query}')
Étape 2: Construire les métriques d'évaluation
Mesurer quatre dimensions : couverture des extraits (quantité de texte par résultat), pertinence des termes (termes attendus trouvés), fraîcheur (mentions 2026) et nombre de résultats.
@dataclass
class EvalResult:
    """Quality metrics for one query run against one search provider."""

    query: str
    provider: str
    result_count: int
    avg_snippet_length: float  # mean snippet length, in characters
    term_coverage: float  # 0-1 how many expected terms found
    freshness_score: float  # 0-1 share of results mentioning the freshness marker
    latency_ms: float
    cost_per_query: float


def evaluate_results(query: 'BenchmarkQuery', results: list, provider: str,
                     latency_ms: float, cost: float, *,
                     fresh_marker: str = '2026') -> EvalResult:
    """Score one provider's results for one benchmark query.

    Args:
        query: The benchmark query; its ``query`` and ``expected_terms``
            attributes are read.
        results: Result dicts; the 'title' and 'snippet' keys are read and
            may be missing or null.
        provider: Provider label copied into the returned record.
        latency_ms: Measured request latency, copied into the record.
        cost: Cost of the query, copied into the record.
        fresh_marker: Substring whose presence in a result's title or
            snippet marks that result as fresh (defaults to the year '2026').

    Returns:
        An EvalResult. With no results, every quality metric is 0.
    """
    if not results:
        return EvalResult(query.query, provider, 0, 0, 0, 0, latency_ms, cost)

    # `or ''` also covers keys that are present but null: a null snippet
    # would crash len(), and a null title would be rendered as the text
    # "None" and leak into term matching.
    fields = [(r.get('title') or '', r.get('snippet') or '') for r in results]
    texts = [f'{title} {snippet}' for title, snippet in fields]

    # Snippet coverage
    avg_len = sum(len(snippet) for _, snippet in fields) / len(fields)

    # Term relevance: case-insensitive substring match over all titles and snippets
    all_text = ' '.join(texts).lower()
    terms_found = sum(1 for t in query.expected_terms if t.lower() in all_text)
    term_coverage = terms_found / len(query.expected_terms) if query.expected_terms else 0

    # Freshness: share of results that mention the marker
    fresh_count = sum(1 for text in texts if fresh_marker in text)
    freshness = fresh_count / len(results)

    return EvalResult(
        query=query.query, provider=provider,
        result_count=len(results), avg_snippet_length=avg_len,
        term_coverage=term_coverage, freshness_score=freshness,
        latency_ms=latency_ms, cost_per_query=cost,
    )
print('Evaluation metrics defined')
Étape 3: Exécuter le benchmark Scavio
Exécuter toutes les requêtes de test contre l'API Scavio et collecter les métriques d'évaluation. Chaque requête coûte $0.005.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def benchmark_scavio(test_suite: list[BenchmarkQuery]) -> list[EvalResult]:
    """Run every query in *test_suite* against the Scavio search API and score it.

    Each query is POSTed to the search endpoint, the 'organic_results' list of
    the JSON response is reduced to title/snippet/link dicts, and the result
    is scored with ``evaluate_results`` at a recorded cost of 0.005 per query.

    Returns:
        One EvalResult per query, in suite order.

    Raises:
        requests.RequestException: on timeout, connection failure, or an
            HTTP error status.
    """
    results = []
    for bq in test_suite:
        # perf_counter is monotonic; wall-clock time can jump mid-request.
        start = time.perf_counter()
        resp = requests.post(
            'https://api.scavio.dev/api/v1/search',
            headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
            json={'query': bq.query, 'country_code': 'us', 'num_results': 10},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        )
        latency = (time.perf_counter() - start) * 1000
        # Fail loudly on auth/quota errors instead of scoring an error body
        # as "zero results", which would silently skew the benchmark.
        resp.raise_for_status()
        organic = resp.json().get('organic_results', [])
        # .get() so a partial result entry does not abort the whole run.
        search_results = [
            {
                'title': r.get('title', ''),
                'snippet': r.get('snippet', ''),
                'link': r.get('link', ''),
            }
            for r in organic
        ]
        results.append(evaluate_results(bq, search_results, 'scavio', latency, 0.005))
        # Short pause between requests (presumably to stay under a rate limit).
        time.sleep(0.3)
    return results
scavio_results = benchmark_scavio(TEST_SUITE)
for er in scavio_results:
    print(f'[{er.provider}] {er.query[:40]}')
    print(f' Results: {er.result_count}, Snippets: {er.avg_snippet_length:.0f} chars')
    print(f' Relevance: {er.term_coverage:.0%}, Fresh: {er.freshness_score:.0%}')
    print(f' Latency: {er.latency_ms:.0f}ms, Cost: ${er.cost_per_query}')
Étape 4: Générer le rapport de comparaison
Agréger les résultats de toutes les requêtes et tous les fournisseurs dans un rapport récapitulatif. Classer les fournisseurs par un score composite pondéré en faveur des métriques pertinentes pour le RAG.
def benchmark_report(all_results: dict[str, list['EvalResult']]):
    """Print a ranked comparison of providers from their per-query results.

    For each provider the per-query metrics are averaged, then combined into
    a composite score: relevance 40%, snippet length 25%, freshness 20%,
    cost 15%. Providers are printed best-first.

    Args:
        all_results: Maps provider name to its list of per-query results.
            Providers with an empty list cannot be averaged; they are left
            out of the ranking and named at the end of the report.
    """
    print('Search API Benchmark for RAG Quality')
    print('=' * 55)

    summaries = {}
    skipped = []
    for provider, results in all_results.items():
        n = len(results)
        if n == 0:
            # Averaging would divide by zero and results[0] would not exist.
            skipped.append(provider)
            continue
        summaries[provider] = {
            'avg_results': sum(r.result_count for r in results) / n,
            'avg_snippet': sum(r.avg_snippet_length for r in results) / n,
            'avg_relevance': sum(r.term_coverage for r in results) / n,
            'avg_freshness': sum(r.freshness_score for r in results) / n,
            'avg_latency': sum(r.latency_ms for r in results) / n,
            # Taken from the first result only, i.e. treated as a flat price.
            'cost_per_query': results[0].cost_per_query,
        }

    # Composite score: relevance 40%, snippets 25%, freshness 20%, cost 15%
    for s in summaries.values():
        snippet_score = min(s['avg_snippet'] / 200, 1)  # Normalize to 0-1, capped at 200 chars
        cost_score = 1 - min(s['cost_per_query'] / 0.05, 1)  # Lower is better
        s['composite'] = (s['avg_relevance'] * 0.4 + snippet_score * 0.25 +
                          s['avg_freshness'] * 0.2 + cost_score * 0.15)

    # Sort by composite score, best first
    ranked = sorted(summaries.items(), key=lambda item: item[1]['composite'], reverse=True)
    for rank, (provider, s) in enumerate(ranked, 1):
        print(f'\n#{rank} {provider.upper()}')
        print(f' Relevance: {s["avg_relevance"]:.0%} Snippets: {s["avg_snippet"]:.0f} chars')
        print(f' Freshness: {s["avg_freshness"]:.0%} Latency: {s["avg_latency"]:.0f}ms')
        print(f' Cost: ${s["cost_per_query"]}/query Composite: {s["composite"]:.2f}')

    if skipped:
        names = ', '.join(skipped)
        print(f'\nNo results recorded for: {names}')
benchmark_report({'scavio': scavio_results})
Exemple Python
import os, time, requests
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def benchmark(queries):
    """Send each query to the Scavio search API and print one summary line per query.

    The line shows the query (truncated/padded to 40 characters), the number
    of organic results, the mean snippet length in characters and the request
    latency in milliseconds.

    Raises:
        requests.RequestException: on timeout, connection failure, or an
            HTTP error status.
    """
    for q in queries:
        # perf_counter is monotonic; wall-clock time can jump mid-request.
        start = time.perf_counter()
        resp = requests.post(
            'https://api.scavio.dev/api/v1/search',
            headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
            json={'query': q, 'country_code': 'us', 'num_results': 10},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        )
        latency = (time.perf_counter() - start) * 1000
        # Surface HTTP errors instead of reporting them as "0 results".
        resp.raise_for_status()
        results = resp.json().get('organic_results', [])
        # `or ''` also covers a snippet key that is present but null.
        snippets = [r.get('snippet') or '' for r in results]
        avg_len = sum(len(s) for s in snippets) / len(snippets) if snippets else 0
        print(f'{q[:40]:40s} | {len(results):2d} results | {avg_len:5.0f} chars | {latency:4.0f}ms')
        # Short pause between requests (presumably to stay under a rate limit).
        time.sleep(0.3)
benchmark(['Python 3.15 release date', 'FastAPI vs Django 2026',
'best headphones 2026', 'deploy cloudflare workers'])
Exemple JavaScript
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
/**
 * Send each query to the Scavio search API and log one summary line per
 * query: the query (truncated/padded to 40 chars), the number of organic
 * results, the mean snippet length in characters and the latency in ms.
 *
 * @param {string[]} queries - Search strings to benchmark, run sequentially.
 * @returns {Promise<void>}
 * @throws {Error} If the API answers with a non-2xx HTTP status.
 */
async function benchmark(queries) {
  for (const q of queries) {
    const start = Date.now();
    const resp = await fetch('https://api.scavio.dev/api/v1/search', {
      method: 'POST',
      headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
      body: JSON.stringify({ query: q, country_code: 'us', num_results: 10 })
    });
    const latency = Date.now() - start;
    // fetch() only rejects on network failure; without this check an HTTP
    // error body would be reported as "0 results".
    if (!resp.ok) {
      throw new Error(`Scavio search failed for "${q}": HTTP ${resp.status}`);
    }
    const results = (await resp.json()).organic_results || [];
    // `|| 1` keeps the average at 0 instead of NaN when there are no results.
    const avgSnippet = results.reduce((s, r) => s + (r.snippet || '').length, 0) / (results.length || 1);
    console.log(`${q.slice(0,40).padEnd(40)} | ${results.length} results | ${avgSnippet.toFixed(0)} chars | ${latency}ms`);
  }
}
benchmark(['Python 3.15 release', 'FastAPI vs Django', 'best headphones 2026']);
Sortie attendue
Search API Benchmark for RAG Quality
=======================================================
#1 SCAVIO
Relevance: 85% Snippets: 156 chars
Freshness: 60% Latency: 340ms
Cost: $0.005/query Composite: 0.78
Python 3.15 release date | 10 results | 145 chars | 320ms
FastAPI vs Django 2026 | 10 results | 162 chars | 290ms
best headphones 2026 | 10 results | 158 chars | 310ms