Workflow

RAG Search Quality Benchmark Pipeline

Workflow to benchmark RAG search retrieval quality by testing search API results against a curated set of questions with known-good answers.

Overview

RAG pipelines are only as good as the search results feeding them. This workflow benchmarks search quality by running a curated set of questions through your search API, comparing retrieved results against known-good answers, and scoring retrieval precision. Run it weekly to catch search quality regressions before they degrade your RAG output.

Trigger

Weekly on Monday at 6 AM, or on demand before deploying RAG changes.

Schedule

Weekly
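
If you drive the weekly trigger with cron, an entry like the following matches the Monday 6 AM schedule. The script and log paths are placeholders; substitute your own.

# minute hour day-of-month month day-of-week
0 6 * * 1 python3 /opt/benchmarks/rag_benchmark.py >> /var/log/rag_benchmark.log 2>&1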

Workflow Steps

1. Load Benchmark Dataset

Read the curated Q&A dataset with questions, expected answer snippets, and expected source domains (a loading sketch follows this list).

2. Run Search Queries

For each question, call the Scavio search API and collect the top 10 organic results.

3. Score Retrieval Quality

Check whether the expected source domains appear in the results, and score snippet overlap against the expected answers.

4. Detect Regressions

Compare current scores against last week's baseline and flag any queries with significant quality drops (a sketch follows the Python implementation).

5. Output Benchmark Report

Generate a report with pass/fail per query, an overall precision score, and regression alerts.
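
Step 1 reads the dataset from a file, while the implementations below hardcode it as a BENCHMARK list for brevity. A minimal loading sketch, assuming the same fields stored in a benchmark.json file (the filename is a placeholder):

import json
from pathlib import Path

def load_benchmark(path: str = "benchmark.json") -> list:
    """Each entry needs question, expected_domains, and expected_snippet keys."""
    return json.loads(Path(path).read_text(encoding="utf-8"))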

Python Implementation

Python
import os
import requests
from difflib import SequenceMatcher

API_KEY = os.environ["SCAVIO_API_KEY"]
H = {"x-api-key": API_KEY, "Content-Type": "application/json"}

BENCHMARK = [
    {"question": "what is retrieval augmented generation", "expected_domains": ["arxiv.org", "aws.amazon.com"], "expected_snippet": "retrieval augmented generation combines"},
    {"question": "langchain search tool setup", "expected_domains": ["python.langchain.com", "docs.langchain.com"], "expected_snippet": "langchain tool integration"},
    {"question": "ollama api reference", "expected_domains": ["github.com/ollama", "ollama.com"], "expected_snippet": "ollama api"},
]

def search(query: str) -> list:
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers=H,
        json={"query": query, "country_code": "us"},
        timeout=15,
    )
    resp.raise_for_status()  # fail loudly on HTTP errors before parsing JSON
    return resp.json().get("organic_results", [])[:10]

def score_results(results: list, expected_domains: list, expected_snippet: str) -> dict:
    # Match expected domains against the full result URLs so path-qualified
    # entries like "github.com/ollama" still match and malformed links cannot
    # raise an IndexError during domain parsing.
    links = [r.get("link", "") for r in results]
    domain_hits = sum(1 for d in expected_domains if any(d in link for link in links))
    domain_precision = domain_hits / len(expected_domains) if expected_domains else 0

    all_snippets = " ".join(r.get("snippet", "") for r in results).lower()
    snippet_similarity = SequenceMatcher(None, expected_snippet.lower(), all_snippets[:500]).ratio()

    return {"domain_precision": round(domain_precision, 2), "snippet_similarity": round(snippet_similarity, 2)}

def run_benchmark():
    scores = []
    for item in BENCHMARK:
        results = search(item["question"])
        score = score_results(results, item["expected_domains"], item["expected_snippet"])
        score["question"] = item["question"]
        scores.append(score)
        print(f"{item['question']}: domain={score['domain_precision']}, snippet={score['snippet_similarity']}")

    avg_domain = sum(s["domain_precision"] for s in scores) / len(scores)
    avg_snippet = sum(s["snippet_similarity"] for s in scores) / len(scores)
    print(f"\nOverall: domain precision={avg_domain:.2f}, snippet similarity={avg_snippet:.2f}")
    return scores

benchmark = run_benchmark()
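
The script above covers steps 1 through 3; steps 4 and 5 need the previous run's scores. A minimal sketch of the baseline comparison and report output, assuming scores are persisted to a local JSON file between runs. The file names and the 0.2 drop threshold are arbitrary choices for illustration, not part of the Scavio API.

import json
from pathlib import Path

BASELINE_FILE = Path("baseline_scores.json")
DROP_THRESHOLD = 0.2  # flag queries whose domain precision fell by more than this

def detect_regressions(scores: list) -> list:
    """Compare this run against last week's saved scores and list regressions."""
    baseline = {}
    if BASELINE_FILE.exists():
        baseline = {s["question"]: s for s in json.loads(BASELINE_FILE.read_text())}
    regressions = []
    for s in scores:
        prev = baseline.get(s["question"])
        if prev and prev["domain_precision"] - s["domain_precision"] > DROP_THRESHOLD:
            regressions.append({
                "question": s["question"],
                "was": prev["domain_precision"],
                "now": s["domain_precision"],
            })
    BASELINE_FILE.write_text(json.dumps(scores, indent=2))  # becomes next run's baseline
    return regressions

report = {"scores": benchmark, "regressions": detect_regressions(benchmark)}
Path("benchmark_report.json").write_text(json.dumps(report, indent=2))
print(f"Report written with {len(report['regressions'])} regression(s) flagged")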

JavaScript Implementation

JavaScript
const H = {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'};

const BENCHMARK = [
  {question:'what is retrieval augmented generation', expectedDomains:['arxiv.org','aws.amazon.com'], expectedSnippet:'retrieval augmented generation combines'},
  {question:'langchain search tool setup', expectedDomains:['python.langchain.com','docs.langchain.com'], expectedSnippet:'langchain tool integration'},
  {question:'ollama api reference', expectedDomains:['github.com/ollama','ollama.com'], expectedSnippet:'ollama api'},
];

async function search(query) {
  const r = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: H,
    body: JSON.stringify({ query, country_code: 'us' }),
  });
  if (!r.ok) throw new Error(`Search failed with status ${r.status}`);
  return ((await r.json()).organic_results || []).slice(0, 10);
}

function scoreResults(results, expectedDomains, expectedSnippet) {
  // Match expected domains against the full result URLs so path-qualified
  // entries like "github.com/ollama" still match.
  const links = results.map(r => r.link || '');
  const domainHits = expectedDomains.filter(d => links.some(l => l.includes(d))).length;
  const domainPrecision = expectedDomains.length ? domainHits / expectedDomains.length : 0;
  // Approximate the Python similarity score with the fraction of
  // expected-snippet words found anywhere in the returned snippets.
  const allSnippets = results.map(r => r.snippet || '').join(' ').toLowerCase();
  const words = expectedSnippet.toLowerCase().split(/\s+/).filter(Boolean);
  const snippetSimilarity = words.length ? words.filter(w => allSnippets.includes(w)).length / words.length : 0;
  return {
    domainPrecision: Math.round(domainPrecision * 100) / 100,
    snippetSimilarity: Math.round(snippetSimilarity * 100) / 100,
  };
}

async function runBenchmark() {
  const scores = [];
  for (const item of BENCHMARK) {
    const results = await search(item.question);
    const score = scoreResults(results, item.expectedDomains, item.expectedSnippet);
    score.question = item.question;
    scores.push(score);
    console.log(item.question+': domain='+score.domainPrecision+', snippet='+score.snippetSimilarity);
  }
  const avgDomain = scores.reduce((s,x)=>s+x.domainPrecision,0)/scores.length;
  const avgSnippet = scores.reduce((s,x)=>s+x.snippetSimilarity,0)/scores.length;
  console.log('\nOverall: domain precision='+avgDomain.toFixed(2)+', snippet similarity='+avgSnippet.toFixed(2));
  return scores;
}

await runBenchmark();

Platforms Used

Google

Web search with knowledge graph, People Also Ask (PAA), and AI overviews
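
The implementations above read only organic_results. If your plan also returns knowledge graph or People Also Ask blocks, they should arrive in the same response payload. The sketch below assumes knowledge_graph and people_also_ask keys, which are guesses at the schema rather than documented fields; inspect a real response before relying on them.

import os
import requests

H = {"x-api-key": os.environ["SCAVIO_API_KEY"], "Content-Type": "application/json"}

def search_full(query: str) -> dict:
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers=H,
        json={"query": query, "country_code": "us"},
        timeout=15,
    )
    resp.raise_for_status()
    data = resp.json()
    return {
        "organic": data.get("organic_results", []),
        # Assumed keys; verify against your actual payload.
        "knowledge_graph": data.get("knowledge_graph"),
        "people_also_ask": data.get("people_also_ask", []),
    }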

Frequently Asked Questions

What does this workflow do?

It benchmarks RAG search quality by running a curated set of questions through your search API, comparing retrieved results against known-good answers, and scoring retrieval precision, so you can catch search quality regressions before they degrade your RAG output.

When does this workflow run?

It runs weekly on Monday at 6 AM, and can also be triggered on demand before deploying RAG changes.

Which platforms does this workflow use?

This workflow uses the following Scavio platform: Google. Each platform is called via the same unified API endpoint.

Can I test this workflow for free?

Yes. Scavio's free tier includes 250 credits per month with no credit card required. That is enough to test and validate this workflow before scaling it.
