Overview
RAG pipelines are only as good as the search results feeding them. This workflow benchmarks search quality by running a curated set of questions through your search API, comparing retrieved results against known-good answers, and scoring retrieval precision. Run it weekly to catch search quality regressions before they degrade your RAG output.
Trigger
Weekly on Monday 6 AM, or on-demand before deploying RAG changes.
Schedule
Weekly
Workflow Steps
Load Benchmark Dataset
Read the curated Q&A dataset with questions, expected answer snippets, and expected source domains.
Run Search Queries
For each question, call Scavio search API and collect top 10 organic results.
Score Retrieval Quality
Check if expected source domains appear in results. Score snippet overlap with expected answers.
Detect Regressions
Compare current scores against last week's baseline. Flag any queries with significant quality drops.
Output Benchmark Report
Generate a report with pass/fail per query, overall precision score, and regression alerts.
Python Implementation
import requests, os, json
from pathlib import Path
from difflib import SequenceMatcher
# Scavio API credentials and shared request headers. The key is read from the
# environment so it never lands in source control; a missing key fails fast
# with a KeyError at import time.
API_KEY = os.environ["SCAVIO_API_KEY"]
H = {"x-api-key": API_KEY, "Content-Type": "application/json"}
# Curated benchmark set: each entry pairs a query with the domains a good
# result page should surface and a snippet phrase expected in the results.
# NOTE(review): "github.com/ollama" includes a path, not just a hostname —
# the scorer must match against full URLs for that entry to ever hit.
BENCHMARK = [
{"question": "what is retrieval augmented generation", "expected_domains": ["arxiv.org", "aws.amazon.com"], "expected_snippet": "retrieval augmented generation combines"},
{"question": "langchain search tool setup", "expected_domains": ["python.langchain.com", "docs.langchain.com"], "expected_snippet": "langchain tool integration"},
{"question": "ollama api reference", "expected_domains": ["github.com/ollama", "ollama.com"], "expected_snippet": "ollama api"},
]
def search(query: str) -> list:
    """POST one query to the Scavio search API and return up to the top 10
    organic results (empty list when the response carries none).

    Raises:
        requests.HTTPError: on a non-2xx response, instead of silently
            feeding an error body to the JSON parser.
    """
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers=H,
        json={"query": query, "country_code": "us"},
        timeout=15,
    )
    # Fail loudly on auth/quota/server errors so a broken API key does not
    # masquerade as a benchmark-wide quality regression.
    resp.raise_for_status()
    return resp.json().get("organic_results", [])[:10]
def score_results(results: list, expected_domains: list, expected_snippet: str) -> dict:
    """Score one query's search results against expectations.

    Args:
        results: organic result dicts with optional "link" and "snippet" keys.
        expected_domains: domain (optionally domain/path) strings that should
            appear among the result URLs.
        expected_snippet: phrase expected to appear in at least one snippet.

    Returns:
        Dict with "domain_precision" (fraction of expected domains found) and
        "snippet_similarity" (best fuzzy match ratio), both rounded to 2 dp.
    """
    # Match expected domains against the FULL url: splitting out the hostname
    # breaks on scheme-less links (IndexError) and makes path-qualified
    # expectations like "github.com/ollama" unmatchable.
    links = [r.get("link", "") for r in results]
    domain_hits = sum(1 for d in expected_domains if any(d in link for link in links))
    domain_precision = domain_hits / len(expected_domains) if expected_domains else 0
    # Take the best per-snippet ratio: comparing the needle against one big
    # truncated concatenation dilutes the score toward 0 no matter how good
    # the individual results are.
    needle = expected_snippet.lower()
    snippet_similarity = max(
        (SequenceMatcher(None, needle, (r.get("snippet") or "").lower()).ratio() for r in results),
        default=0.0,
    )
    return {"domain_precision": round(domain_precision, 2), "snippet_similarity": round(snippet_similarity, 2)}
def run_benchmark():
    """Run every benchmark query, print per-query and averaged scores.

    Returns:
        List of per-query score dicts, each tagged with its "question".
    """
    collected = []
    for entry in BENCHMARK:
        question = entry["question"]
        hits = search(question)
        result = score_results(hits, entry["expected_domains"], entry["expected_snippet"])
        result["question"] = question
        collected.append(result)
        print(f"{question}: domain={result['domain_precision']}, snippet={result['snippet_similarity']}")
    count = len(collected)
    avg_domain = sum(s["domain_precision"] for s in collected) / count
    avg_snippet = sum(s["snippet_similarity"] for s in collected) / count
    print(f"\nOverall: domain precision={avg_domain:.2f}, snippet similarity={avg_snippet:.2f}")
    return collected
benchmark = run_benchmark()
JavaScript Implementation
// Shared request headers; the API key comes from the environment so it never
// lands in source control.
const H = {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'};
// Curated benchmark set: each entry pairs a query with the domains a good
// result page should surface and a snippet phrase expected in the results.
// NOTE(review): 'github.com/ollama' includes a path — the scorer must match
// against full URLs for that entry to ever hit.
const BENCHMARK = [
{question:'what is retrieval augmented generation', expectedDomains:['arxiv.org','aws.amazon.com'], expectedSnippet:'retrieval augmented generation combines'},
{question:'langchain search tool setup', expectedDomains:['python.langchain.com','docs.langchain.com'], expectedSnippet:'langchain tool integration'},
{question:'ollama api reference', expectedDomains:['github.com/ollama','ollama.com'], expectedSnippet:'ollama api'},
];
// POST one query to the Scavio search API and return up to the top 10
// organic results (empty array when the response carries none).
// Throws on a non-2xx response instead of silently JSON-parsing an error
// body, so a broken API key doesn't look like a quality regression.
async function search(query) {
  const r = await fetch('https://api.scavio.dev/api/v1/search', {method:'POST', headers:H, body:JSON.stringify({query, country_code:'us'})});
  if (!r.ok) throw new Error(`Scavio search failed with HTTP ${r.status}`);
  return ((await r.json()).organic_results || []).slice(0,10);
}
// Score one query's search results against expectations.
// - domainPrecision: fraction of expectedDomains found among result URLs.
//   Matched against the FULL url (not just the hostname) so path-qualified
//   expectations like 'github.com/ollama' can actually hit.
// - snippetSimilarity: 1 when the expected phrase appears in any single
//   snippet (no truncation, so late results still count), else 0.3 partial
//   credit. Both values are rounded to 2 decimal places.
function scoreResults(results, expectedDomains, expectedSnippet) {
  const links = results.map(r => r.link || '');
  const domainHits = expectedDomains.filter(d => links.some(l => l.includes(d))).length;
  const domainPrecision = expectedDomains.length ? domainHits / expectedDomains.length : 0;
  const needle = expectedSnippet.toLowerCase();
  const found = results.some(r => (r.snippet || '').toLowerCase().includes(needle));
  const snippetMatch = found ? 1 : 0.3;
  return {domainPrecision: Math.round(domainPrecision*100)/100, snippetSimilarity: Math.round(snippetMatch*100)/100};
}
// Run every benchmark query sequentially, log per-query and averaged scores,
// and return the list of per-query score objects (each tagged with its
// question).
async function runBenchmark() {
  const collected = [];
  for (const entry of BENCHMARK) {
    const hits = await search(entry.question);
    const result = scoreResults(hits, entry.expectedDomains, entry.expectedSnippet);
    result.question = entry.question;
    collected.push(result);
    console.log(entry.question+': domain='+result.domainPrecision+', snippet='+result.snippetSimilarity);
  }
  const count = collected.length;
  const avgDomain = collected.reduce((acc, s) => acc + s.domainPrecision, 0) / count;
  const avgSnippet = collected.reduce((acc, s) => acc + s.snippetSimilarity, 0) / count;
  console.log('\nOverall: domain precision='+avgDomain.toFixed(2)+', snippet similarity='+avgSnippet.toFixed(2));
  return collected;
}
await runBenchmark();
Platforms Used
Web search with knowledge graph, PAA, and AI overviews