Overview
This pipeline adds web search grounding to local LLMs (Ollama, vLLM, llama.cpp) with a TTL-based cache layer to reduce API costs. When a user asks a question, the pipeline checks the cache for recent results. If cached results exist within the TTL window (default 1 hour), they are used without an API call. If not, Scavio is queried and results are cached. This makes local LLM search grounding affordable for high-traffic deployments.
Trigger
On each local LLM query that needs web grounding
Schedule
On-demand per local LLM query
Workflow Steps
Receive query from local LLM
Accept the user's question and determine if web search grounding is needed.
Check search cache
Look up the query in the local cache. If results exist and TTL has not expired, return cached data.
Query Scavio on cache miss
If the cache entry is missing or expired, query the Scavio API for fresh results, including the AI Overview.
Update cache
Store the fresh results in cache with the current timestamp for TTL calculation.
Format context and return
Format search results as a context block for the local LLM prompt.
Python Implementation
import hashlib
import json
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path

import requests
# Scavio API key. Read from the SCAVIO_API_KEY environment variable when it is
# set (so the real key never has to be committed); otherwise fall back to the
# placeholder, which must be replaced before the script can reach the API.
API_KEY = os.environ.get("SCAVIO_API_KEY") or "your_scavio_api_key"

# Directory (relative to the working directory) holding one JSON file per
# cached query. Created at import time; parents=True keeps this working if the
# path is later changed to a nested location.
CACHE_DIR = Path("search_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Maximum age of a cache entry before it is treated as a miss.
CACHE_TTL = timedelta(hours=1)
def cache_key(query: str) -> str:
    """Return the cache file stem for *query*.

    The query is lower-cased and stripped of surrounding whitespace before
    hashing, so queries differing only in case or outer whitespace share one
    cache entry.

    Args:
        query: The raw user query.

    Returns:
        The 32-character hexadecimal MD5 digest of the normalized query.
    """
    normalized = query.lower().strip().encode("utf-8")
    # MD5 is used purely as a filename fingerprint, not for security; saying so
    # keeps this working on builds that restrict MD5 (e.g. FIPS mode) and does
    # not change the digest, so existing cache files remain valid.
    return hashlib.md5(normalized, usedforsecurity=False).hexdigest()
def get_cached(query: str) -> dict | None:
    """Return the cached result for *query* if present and still fresh.

    Args:
        query: The raw user query; it is normalized by ``cache_key``.

    Returns:
        The parsed cache entry (a dict containing at least ``cached_at``), or
        ``None`` when there is no entry, the entry is unreadable or malformed,
        or it is older than ``CACHE_TTL``.
    """
    path = CACHE_DIR / f"{cache_key(query)}.json"
    try:
        # EAFP: read directly instead of exists()-then-read, which races with
        # concurrent deletion.
        data = json.loads(path.read_text())
        cached_at = datetime.fromisoformat(data["cached_at"])
    except (OSError, ValueError, KeyError, TypeError):
        # Missing file, truncated/invalid JSON, or an entry without a usable
        # timestamp: treat as a cache miss so the caller re-fetches.
        return None

    # Entries are written as naive UTC timestamps; attach UTC explicitly so the
    # comparison below never mixes naive and aware datetimes.
    if cached_at.tzinfo is None:
        cached_at = cached_at.replace(tzinfo=timezone.utc)

    if datetime.now(timezone.utc) - cached_at > CACHE_TTL:
        return None
    return data
def search_with_cache(query: str) -> dict:
    """Return search results for *query*, using the on-disk cache when fresh.

    On a cache hit the stored entry is returned with ``"source": "cache"``.
    On a miss the Scavio search endpoint is called, the response is reduced to
    the AI overview text plus the first five organic results, the reduced
    result is written to the cache, and it is returned with
    ``"source": "api"``.

    Args:
        query: The user's search query.

    Returns:
        A dict with keys ``query``, ``ai_overview``, ``results`` (list of
        ``{"title", "snippet"}`` dicts), ``cached_at`` (ISO timestamp, UTC)
        and ``source``.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection failures or timeout.
    """
    cached = get_cached(query)
    if cached is not None:
        return {**cached, "source": "cache"}

    res = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers={"x-api-key": API_KEY},
        json={"platform": "google", "query": query, "ai_overview": True},
        timeout=15,
    )
    res.raise_for_status()
    data = res.json()

    # `or` rather than a .get() default: a key that is present but null would
    # otherwise yield None and break the chained .get() / slice below.
    overview = data.get("ai_overview") or {}
    organic = data.get("organic") or []

    result = {
        "query": query,
        "ai_overview": overview.get("text") or "",
        "results": [
            {"title": r.get("title", ""), "snippet": r.get("snippet", "")}
            for r in organic[:5]
        ],
        # Stored as a naive UTC timestamp (same on-disk format as before)
        # without relying on the deprecated datetime.utcnow().
        "cached_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    }

    # Write to a sibling temp file and rename it into place so a reader never
    # parses a half-written cache entry.
    path = CACHE_DIR / f"{cache_key(query)}.json"
    tmp_path = path.with_name(f"{path.name}.tmp")
    tmp_path.write_text(json.dumps(result, indent=2))
    tmp_path.replace(path)

    return {**result, "source": "api"}
def format_context(search_result: dict) -> str:
    """Render a search result as a plain-text context block for an LLM prompt.

    Args:
        search_result: A dict as returned by ``search_with_cache``; only the
            ``ai_overview`` and ``results`` keys are read.

    Returns:
        Newline-joined text: an ``AI Overview: ...`` line when an overview is
        present, followed by one ``- title: snippet`` line per result. Returns
        an empty string when there is nothing to show.
    """
    parts = []

    overview = search_result.get("ai_overview")
    if overview:
        parts.append(f"AI Overview: {overview}")

    # Tolerate a null "results" value and entries with missing/null fields
    # (e.g. hand-edited or older cache files) instead of raising.
    for r in search_result.get("results") or []:
        title = r.get("title") or ""
        snippet = r.get("snippet") or ""
        parts.append(f"- {title}: {snippet}")

    return "\n".join(parts)
def run():
    """Run a small demo: look up three queries and print where each came from.

    The first query is repeated at the end of the list, so the last lookup
    exercises the cache path once the first one has been stored.
    """
    demo_queries = [
        "latest ollama version 2026",
        "best local llm model 2026",
        "latest ollama version 2026",
    ]
    for query in demo_queries:
        outcome = search_with_cache(query)
        origin = outcome["source"]
        hit_count = len(outcome["results"])
        print(f" [{origin}] {query}: {hit_count} results")
if __name__ == "__main__":
    run()

JavaScript Implementation
// Scavio API key placeholder; replace with a real key before running.
const API_KEY = "your_scavio_api_key";

// Cache entry lifetime in milliseconds (1 hour).
const TTL = 60 * 60 * 1000;

// In-memory cache: normalized query -> { data, ts }.
const cache = new Map();
/**
 * Return search results for `query`, served from the in-memory cache when a
 * fresh entry exists and from the Scavio API otherwise.
 *
 * @param {string} query - The user's search query. It is lower-cased and
 *   trimmed to form the cache key.
 * @returns {Promise<{query: string, aiOverview: string,
 *   results: {title: string, snippet: string}[], source: "cache" | "api"}>}
 * @throws {Error} If the API responds with a non-2xx status, or if the
 *   request is aborted after the 15-second timeout.
 */
async function searchCached(query) {
  const key = query.toLowerCase().trim();
  const cached = cache.get(key);
  if (cached && Date.now() - cached.ts < TTL) {
    return { ...cached.data, source: "cache" };
  }

  const res = await fetch("https://api.scavio.dev/api/v1/search", {
    method: "POST",
    headers: { "x-api-key": API_KEY, "content-type": "application/json" },
    body: JSON.stringify({ platform: "google", query, ai_overview: true }),
    // Match the Python implementation's 15-second request timeout.
    signal: AbortSignal.timeout(15000),
  });
  // Fail loudly on HTTP errors; otherwise an error body would be parsed as an
  // empty result and cached for the full TTL.
  if (!res.ok) {
    throw new Error(`Scavio search failed: HTTP ${res.status}`);
  }

  const data = await res.json();
  const result = {
    query,
    aiOverview: data.ai_overview?.text ?? "",
    results: (data.organic ?? [])
      .slice(0, 5)
      .map((r) => ({ title: r.title ?? "", snippet: r.snippet ?? "" })),
  };
  cache.set(key, { data: result, ts: Date.now() });
  return { ...result, source: "api" };
}
for (const q of ["latest ollama version 2026", "best local llm 2026", "latest ollama version 2026"]) {
const r = await searchCached(q);
console.log(`[${r.source}] ${q}: ${r.results.length} results`);
}

Platforms Used
Web search with knowledge graph, People Also Ask (PAA), and AI overviews