Run SearXNG, Hermes 3, and Qwen 2.5 as a local AI search stack using Docker Compose, then compare the result quality and latency against a cloud search API. Self-hosted search stacks appeal to privacy-focused teams, but they require ongoing maintenance and produce inconsistent results depending on which SearXNG engines are functioning. This tutorial sets up the full stack, runs comparison queries, and helps you decide where self-hosted works and where an API is the better choice.
Prerequisites
- Docker and Docker Compose installed
- 16GB+ RAM for local model inference
- A Scavio API key from scavio.dev for comparison
- Python 3.8+ installed
Walkthrough
Step 1: Write the Docker Compose file
Define services for SearXNG (metasearch), Ollama with Hermes/Qwen models, and a bridge service.
# docker-compose.yml
# Save this as docker-compose.yml in your project directory.
# NOTE: YAML is indentation-sensitive — the nested keys below must stay indented.
compose_yaml = """
version: '3.8'
services:
  searxng:
    image: searxng/searxng:latest
    ports:
      - '8080:8080'
    volumes:
      - ./searxng:/etc/searxng
    restart: unless-stopped
  ollama:
    image: ollama/ollama:latest
    ports:
      - '11434:11434'
    volumes:
      - ollama_data:/root/.ollama
    restart: unless-stopped
volumes:
  ollama_data:
"""
with open('docker-compose.yml', 'w') as f:
    f.write(compose_yaml)
print('docker-compose.yml written')
print('Run: docker compose up -d')
print('Then: docker exec ollama ollama pull hermes3')
print('Then: docker exec ollama ollama pull qwen2.5')

Step 2: Configure SearXNG engines
Customize SearXNG settings to enable the search engines you need and disable noisy ones.
import os

# SearXNG mounts ./searxng as /etc/searxng (see docker-compose.yml),
# so the settings file must live in that directory.
os.makedirs('searxng', exist_ok=True)
# NOTE: YAML is indentation-sensitive — nested keys and engine entries
# must stay indented or SearXNG will reject the config.
settings_yml = """
use_default_settings: true
server:
  secret_key: 'change-this-to-a-random-string'
search:
  safe_search: 0
  autocomplete: ''
  default_lang: 'en'
engines:
  - name: google
    engine: google
    shortcut: g
    disabled: false
  - name: duckduckgo
    engine: duckduckgo
    shortcut: ddg
    disabled: false
  - name: brave
    engine: brave
    shortcut: br
    disabled: false
"""
with open('searxng/settings.yml', 'w') as f:
    f.write(settings_yml)
print('SearXNG settings written to searxng/settings.yml')

Step 3: Connect Hermes to SearXNG
Build a Python bridge that sends a query to SearXNG, passes results to Hermes/Qwen via Ollama, and returns the grounded answer.
import requests
def searxng_search(query: str) -> list:
    """Query the local SearXNG instance and return up to 5 simplified results.

    Each result is a dict with 'title', 'url', and 'content' keys.
    Raises requests.HTTPError if SearXNG responds with an error status.
    """
    resp = requests.get(
        'http://localhost:8080/search',
        params={'q': query, 'format': 'json'},
        timeout=15,
    )
    # Fail loudly on HTTP errors instead of raising a confusing JSON
    # decode error when SearXNG returns an HTML error page.
    resp.raise_for_status()
    results = resp.json().get('results', [])
    return [
        {
            'title': r.get('title', ''),
            'url': r.get('url', ''),
            'content': r.get('content', ''),
        }
        for r in results[:5]
    ]
def ollama_generate(model: str, prompt: str) -> str:
    """Send a prompt to a local Ollama model and return the generated text.

    Raises requests.HTTPError on an error response (e.g. model not pulled).
    """
    resp = requests.post(
        'http://localhost:11434/api/generate',
        json={'model': model, 'prompt': prompt, 'stream': False},
        timeout=60,
    )
    # Surface HTTP errors (unknown model, server failure) explicitly
    # rather than silently returning an empty string.
    resp.raise_for_status()
    return resp.json().get('response', '')
def local_grounded_search(query: str, model: str = 'hermes3') -> str:
    """Answer *query* with a local LLM, grounded in SearXNG search results."""
    hits = searxng_search(query)
    snippets = [f"{hit['title']}: {hit['content']}" for hit in hits]
    context = '\n'.join(snippets)
    prompt = f'Using this context:\n{context}\n\nAnswer: {query}'
    return ollama_generate(model, prompt)
# print(local_grounded_search('best python frameworks 2026'))

Step 4: Test queries and compare with API
Run the same queries through both the local stack and Scavio, then compare result quality and latency.
import time
# Fail fast at import time if the Scavio key is missing (raises KeyError).
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def scavio_search(query: str) -> list:
    """Run a Google query through the Scavio cloud API; return top 5 organic results."""
    response = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY},
        json={'platform': 'google', 'query': query},
        timeout=10,
    )
    organic = response.json().get('organic_results', [])
    return organic[:5]
def compare(query: str) -> dict:
    """Run *query* through both the local stack and the Scavio API.

    Prints a short summary and returns result counts plus latencies
    (milliseconds) for each backend.
    """
    # Time the local SearXNG stack.
    t0 = time.monotonic()
    local_results = searxng_search(query)
    local_ms = (time.monotonic() - t0) * 1000
    # Time the cloud API.
    t0 = time.monotonic()
    api_results = scavio_search(query)
    api_ms = (time.monotonic() - t0) * 1000
    print(f'Query: {query}')
    print(f' Local: {len(local_results)} results in {local_ms:.0f}ms')
    print(f' API: {len(api_results)} results in {api_ms:.0f}ms')
    return {
        'query': query,
        'local_count': len(local_results),
        'api_count': len(api_results),
        'local_ms': round(local_ms),
        'api_ms': round(api_ms),
    }
# compare('best python web framework 2026')

Step 5: Compare results with API baseline
Evaluate which queries work well locally and which need a cloud API for reliable results.
def evaluate_stack(queries: list) -> dict:
    """Probe each query against the Scavio API and summarize the success rate."""
    outcomes = []
    for query in queries:
        try:
            hits = scavio_search(query)
        except Exception as exc:
            # Record the failure instead of aborting the whole evaluation.
            outcomes.append({'query': query, 'error': str(exc)})
        else:
            outcomes.append({'query': query, 'api_results': len(hits), 'status': 'ok'})
    succeeded = sum(1 for entry in outcomes if entry.get('status') == 'ok')
    print(f'{succeeded}/{len(queries)} queries returned API results')
    print('Local stack: good for privacy, slower, inconsistent engines')
    print('Cloud API: consistent results, structured JSON, multi-platform')
    return {'total': len(queries), 'ok': succeeded, 'results': outcomes}
evaluate_stack(['best crm 2026', 'python async tutorial', 'react vs vue performance'])

Python Example
import requests, os
# Shared auth header for Scavio calls; SCAVIO_API_KEY must be set (raises KeyError otherwise).
H = {'x-api-key': os.environ['SCAVIO_API_KEY']}
# Cloud API comparison baseline
def cloud_search(query):
    """Fetch the top 5 organic Google results for *query* via the Scavio API.

    Raises requests.HTTPError on an error status (bad key, quota exceeded).
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers=H,
        json={'platform': 'google', 'query': query},
        timeout=10,  # without a timeout, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of silently returning [].
    resp.raise_for_status()
    return resp.json().get('organic_results', [])[:5]
# Local SearXNG (when running)
def local_search(query):
    """Fetch the top 5 results from the local SearXNG instance.

    Returns [] when the stack is down or returns a non-JSON response,
    so callers can fall back to the cloud API.
    """
    try:
        resp = requests.get(
            'http://localhost:8080/search',
            params={'q': query, 'format': 'json'},
            timeout=10,
        )
        data = resp.json()
    except (requests.RequestException, ValueError):
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # catch only network errors and malformed JSON.
        return []
    return data.get('results', [])[:5]
print(f'API: {len(cloud_search("best crm 2026"))} results')

JavaScript Example
// Shared request headers for Scavio calls; SCAVIO_API_KEY must be set in the environment.
const H = {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'};
// Fetch the top organic Google results for `query` from the Scavio API.
// Throws on HTTP errors so auth/quota failures don't look like empty results.
async function cloudSearch(query) {
  const r = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST', headers: H, body: JSON.stringify({platform: 'google', query})
  });
  // fetch() resolves even on 4xx/5xx; surface HTTP errors explicitly.
  if (!r.ok) throw new Error(`Scavio API error: ${r.status}`);
  return (await r.json()).organic_results || [];
}
// Query the local SearXNG JSON API; swallow errors so a stopped stack yields [].
async function localSearch(query) {
  const url = `http://localhost:8080/search?q=${encodeURIComponent(query)}&format=json`;
  try {
    const response = await fetch(url);
    const payload = await response.json();
    return payload.results || [];
  } catch {
    return [];
  }
}
cloudSearch('best crm 2026').then(r => console.log('API:', r.length, 'results'));

Expected Output
A Docker Compose stack running SearXNG and Ollama with Hermes/Qwen, with comparison benchmarks against Scavio's cloud API for result quality and latency.