Local LLMs running on llama.cpp, Ollama, or vLLM are powerful but frozen in time. They hallucinate current events, recent releases, and live data because their training data has a cutoff. Adding a search API gives them real-time grounding: before answering, the LLM searches the web and uses fresh results as context. This tutorial works with any OpenAI-compatible local LLM endpoint. Cost: $0.005 per grounded answer.
Prerequisites
- A local LLM running (Ollama, llama.cpp server, or vLLM)
- Python 3.9+ installed
- requests library installed
- A Scavio API key from scavio.dev
Walkthrough
Step 1: Connect to your local LLM
Set up the connection to your local LLM. Works with any OpenAI-compatible endpoint (Ollama, llama.cpp server, vLLM).
import requests

# Common local LLM endpoints:
#   Ollama:    http://localhost:11434/v1/chat/completions
#   llama.cpp: http://localhost:8080/v1/chat/completions
#   vLLM:      http://localhost:8000/v1/chat/completions
LLM_URL = 'http://localhost:11434/v1/chat/completions'  # Ollama default
LLM_MODEL = 'llama3'  # or 'mistral', 'codellama', etc.


def ask_llm(messages: list, max_tokens: int = 512) -> str:
    """Send a chat request to the local OpenAI-compatible endpoint.

    Args:
        messages: Chat messages in OpenAI format ({'role': ..., 'content': ...}).
        max_tokens: Cap on the number of tokens the model may generate.

    Returns:
        The assistant's reply text.

    Raises:
        requests.HTTPError: If the endpoint returns a non-2xx status.
        requests.RequestException: On connection failure or timeout.
    """
    resp = requests.post(LLM_URL, json={
        'model': LLM_MODEL,
        'messages': messages,
        'max_tokens': max_tokens,
        'temperature': 0.3  # low temperature for factual, grounded answers
    }, timeout=120)
    # Fail loudly on HTTP errors instead of raising a confusing KeyError
    # when the error body has no 'choices' field.
    resp.raise_for_status()
    return resp.json()['choices'][0]['message']['content']


# Test connection
try:
    answer = ask_llm([{'role': 'user', 'content': 'Say hello in one word.'}], max_tokens=10)
    print(f'LLM connected: {answer}')
except Exception as e:
    print(f'LLM connection error: {e}')
print('Make sure Ollama/llama.cpp is running.')

Step 2: Add the search grounding function
Build a function that searches the web and formats results as context for the LLM. The LLM only sees the search snippets, not the full pages.
import os

# Fails fast with a KeyError at startup if the key is missing.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']


def search_context(query: str, count: int = 5) -> str:
    """Search the web and return formatted context for the LLM.

    Args:
        query: Natural-language search query.
        count: Number of organic results to request.

    Returns:
        A numbered, source-attributed context string the LLM can cite
        as [1], [2], ..., or a fallback message when nothing is found.

    Raises:
        requests.HTTPError: If the search API returns a non-2xx status.
        requests.RequestException: On connection failure or timeout.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': query, 'country_code': 'us', 'num_results': count},
        # Bug fix: requests has NO default timeout — without one a stalled
        # search API call hangs this function forever.
        timeout=30)
    resp.raise_for_status()
    results = resp.json().get('organic_results', [])
    if not results:
        return 'No search results found.'
    context = 'Search results (use these to answer accurately):\n\n'
    for i, r in enumerate(results, 1):
        context += f'[{i}] {r["title"]}\n'
        context += f' {r.get("snippet", "")}\n'
        context += f' Source: {r["link"]}\n\n'
    return context


# Test
ctx = search_context('Python 3.14 release date')
print(ctx[:300])

Step 3: Build the grounded answer pipeline
Combine search and LLM into a single function. The LLM receives search context and must cite sources in its answer.
def grounded_answer(question: str) -> dict:
    """Answer a question using a search-grounded local LLM.

    Fetches fresh web snippets for the question, then asks the local
    model to answer strictly from those snippets, citing them as
    [1], [2], etc.

    Returns:
        A dict with the question, the cited answer, a grounded flag,
        and the per-query search cost in dollars.
    """
    # Fetch fresh web context first, then hand it to the model.
    snippets = search_context(question, count=5)
    system_prompt = (
        'You are a helpful assistant. Answer ONLY based on the search results provided. '
        'Cite sources as [1], [2], etc. If the search results do not contain the answer, '
        'say "I could not find this information in the search results."'
    )
    conversation = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': f'{snippets}\nQuestion: {question}'},
    ]
    reply = ask_llm(conversation, max_tokens=512)
    return {
        'question': question,
        'answer': reply,
        'grounded': True,
        'search_cost': 0.005,
    }


# Test with a question that requires current data
result = grounded_answer('What is the latest version of Python?')
print(f'Q: {result["question"]}')
print(f'A: {result["answer"]}')
print(f'Grounded: {result["grounded"]}, Cost: ${result["search_cost"]}')

Step 4: Add smart grounding (only search when needed)
Not every question needs search. Add a check that decides whether to ground with search or answer directly, saving costs.
import re


def needs_grounding(question: str) -> bool:
    """Heuristic: does this question need real-time data?

    Returns True when the question contains a trigger word or phrase
    that suggests time-sensitive or comparative content worth a web
    search; False means the LLM can answer directly for free.

    Bug fix: the original used plain substring checks, so 'now' matched
    "know", 'new' matched "Newton", and 'vs' could match inside longer
    words — triggering paid searches for questions that need none.
    Matching on word boundaries keeps the multi-word triggers working
    while eliminating those false positives.
    """
    grounding_triggers = [
        'latest', 'current', 'today', '2026', '2025', 'now',
        'price', 'cost', 'version', 'release', 'new', 'update',
        'best', 'top', 'compare', 'vs', 'alternative',
        'how much', 'where to', 'who is',
    ]
    q_lower = question.lower()
    return any(
        re.search(r'\b' + re.escape(trigger) + r'\b', q_lower)
        for trigger in grounding_triggers
    )
def smart_answer(question: str) -> dict:
    """Answer with search grounding only when needed.

    Routes time-sensitive questions through grounded_answer (one paid
    search); everything else goes straight to the local LLM for free.
    """
    if not needs_grounding(question):
        # Direct LLM answer (no search cost)
        direct = ask_llm([{'role': 'user', 'content': question}], max_tokens=512)
        return {
            'question': question,
            'answer': direct,
            'grounded': False,
            'search_cost': 0,
        }
    return grounded_answer(question)
# Exercise both routing paths: one free direct answer, one paid grounded one.
test_questions = [
    'What is a Python list comprehension?',
    'What is the latest Python version in 2026?',
]
for q in test_questions:
    result = smart_answer(q)
    mode = 'GROUNDED' if result['grounded'] else 'DIRECT'
    print(f'[{mode}] ${result["search_cost"]} - {q}')
    print(f' {result["answer"][:100]}...')
print()

Python Example
import requests, os

LLM_URL = 'http://localhost:11434/v1/chat/completions'
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']


def search(query, count=5):
    """Return the list of organic web results for *query* from the Scavio API."""
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': query, 'country_code': 'us', 'num_results': count},
        # Bug fix: requests never times out by default — a stalled API
        # call would hang the script forever.
        timeout=30)
    return resp.json().get('organic_results', [])


def grounded_ask(question):
    """Answer *question* with the local LLM, grounded on fresh search snippets."""
    results = search(question)
    ctx = '\n'.join(f'[{i+1}] {r["title"]}: {r.get("snippet","")}' for i, r in enumerate(results))
    resp = requests.post(LLM_URL, json={'model': 'llama3', 'messages': [
        {'role': 'system', 'content': 'Answer from search results. Cite [1],[2].'},
        {'role': 'user', 'content': f'{ctx}\n\nQ: {question}'}], 'max_tokens': 512},
        # Local generation can be slow; allow up to two minutes.
        timeout=120)
    return resp.json()['choices'][0]['message']['content']
print(grounded_ask('latest Python version 2026'))

JavaScript Example
const LLM_URL = 'http://localhost:11434/v1/chat/completions';
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;

/**
 * Answer a question with the local LLM, grounded on fresh web search results.
 * @param {string} question - The user's question.
 * @returns {Promise<string>} The model's cited answer.
 * @throws {Error} If the search API or the LLM endpoint returns a non-2xx status.
 */
async function groundedAsk(question) {
  const searchResp = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({ query: question, country_code: 'us', num_results: 5 })
  });
  // Bug fix: fetch does NOT reject on HTTP errors, so an error body would
  // previously flow into the prompt — fail fast instead.
  if (!searchResp.ok) throw new Error(`Search API error: HTTP ${searchResp.status}`);
  const results = (await searchResp.json()).organic_results || [];
  const ctx = results.map((r, i) => `[${i + 1}] ${r.title}: ${r.snippet || ''}`).join('\n');
  const llmResp = await fetch(LLM_URL, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model: 'llama3', messages: [
      { role: 'system', content: 'Answer from search results. Cite [1],[2].' },
      { role: 'user', content: `${ctx}\n\nQ: ${question}` }], max_tokens: 512 })
  });
  // Same guard for the LLM endpoint: crash with a clear message instead of
  // a TypeError on a missing `.choices` field.
  if (!llmResp.ok) throw new Error(`LLM error: HTTP ${llmResp.status}`);
  return (await llmResp.json()).choices[0].message.content;
}
groundedAsk('latest Python version 2026').then(console.log);

Expected Output
LLM connected: Hello
Search results (use these to answer accurately):
[1] Python Release Python 3.14.0
Python 3.14.0 was released on October 7, 2025...
Source: https://www.python.org/downloads/release/python-3140/
Q: What is the latest version of Python?
A: According to the search results, the latest version of Python is 3.14.0,
released on October 7, 2025 [1].
[DIRECT] $0 - What is a Python list comprehension?
A list comprehension is a concise way to create lists...
[GROUNDED] $0.005 - What is the latest Python version in 2026?
The latest Python version is 3.14.0, released October 2025 [1]...