Reduce search API latency in AI agents with four techniques: parallel requests for multi-query workflows, result caching for repeated queries, payload pruning to shrink responses, and connection pooling to eliminate per-request handshake overhead. In typical agent workflows, search calls account for 60-80% of total response time, and even small latency reductions compound across multi-step reasoning chains. This tutorial implements each optimization against the Scavio API and measures the before/after impact.
Prerequisites
- Python 3.8+ installed
- requests library installed
- A Scavio API key from scavio.dev
- An existing agent workflow with search calls
Walkthrough
Step 1: Measure baseline latency
Establish a baseline by timing sequential search calls so you can measure the impact of each optimization.
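The timing helper below already applies one of the four techniques: it routes every call through a shared requests.Session, so the underlying TCP/TLS connection is reused (connection pooling) instead of re-established per request. To see that overhead in isolation, here is a minimal sketch; it reuses the tutorial's endpoint and key, and the absolute numbers depend entirely on your network:

import os, time, requests

URL = 'https://api.scavio.dev/api/v1/search'
HEADERS = {'x-api-key': os.environ['SCAVIO_API_KEY']}
BODY = {'platform': 'google', 'query': 'latency test'}

def avg_ms(call, n=5):
    # Average wall-clock milliseconds over n calls.
    start = time.monotonic()
    for _ in range(n):
        call()
    return (time.monotonic() - start) * 1000 / n

# Module-level requests.post opens a new connection (and TLS handshake) per call:
unpooled = avg_ms(lambda: requests.post(URL, json=BODY, headers=HEADERS, timeout=10))

# A Session keeps the connection alive after the first call:
session = requests.Session()
session.headers.update(HEADERS)
pooled = avg_ms(lambda: session.post(URL, json=BODY, timeout=10))

print(f'unpooled: {unpooled:.0f}ms/call, pooled: {pooled:.0f}ms/call')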
import requests, os, time
from concurrent.futures import ThreadPoolExecutor

API_KEY = os.environ['SCAVIO_API_KEY']

# A single shared Session reuses TCP/TLS connections across calls
# (the connection-pooling technique from the introduction).
SESSION = requests.Session()
SESSION.headers.update({'x-api-key': API_KEY})

def timed_search(query: str) -> tuple:
    start = time.monotonic()
    resp = SESSION.post('https://api.scavio.dev/api/v1/search',
                        json={'platform': 'google', 'query': query}, timeout=10)
    latency = (time.monotonic() - start) * 1000
    return query, round(latency, 1), len(resp.json().get('organic_results', []))
# Baseline: sequential
queries = ['best crm 2026', 'python async tutorial', 'react vs vue']
start = time.monotonic()
for q in queries:
    _, ms, _ = timed_search(q)
    print(f'{q}: {ms}ms')
print(f'Sequential total: {(time.monotonic() - start)*1000:.0f}ms')

Step 2: Parallelize multi-query requests
Use a thread pool to send multiple search requests simultaneously, cutting total wall-clock time by 2-3x.
def parallel_search(queries: list, max_workers: int = 3) -> list:
    start = time.monotonic()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        results = list(pool.map(timed_search, queries))
    total = (time.monotonic() - start) * 1000
    for q, ms, count in results:
        print(f'{q}: {ms}ms ({count} results)')
    print(f'Parallel total: {total:.0f}ms')
    return results

parallel_search(queries)
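If your agent loop is already async, the same fan-out works without managing a pool yourself. A minimal sketch using asyncio.to_thread (Python 3.9+, slightly newer than this tutorial's 3.8 baseline); each blocking timed_search call runs in a worker thread so the event loop stays free:

import asyncio

async def parallel_search_async(queries: list) -> list:
    # Run each blocking call in the default thread executor.
    tasks = [asyncio.to_thread(timed_search, q) for q in queries]
    return await asyncio.gather(*tasks)

# Inside an async agent: results = await parallel_search_async(queries)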
Step 3: Add a result cache with TTL
Cache search results by query string with a time-to-live to avoid redundant API calls for repeated queries.
import hashlib

cache = {}
CACHE_TTL = 300  # seconds

def cached_search(query: str, platform: str = 'google') -> dict:
    key = hashlib.md5(f'{platform}:{query}'.encode()).hexdigest()
    now = time.time()
    if key in cache and now - cache[key]['ts'] < CACHE_TTL:
        return cache[key]['data']
    resp = SESSION.post('https://api.scavio.dev/api/v1/search',
                        json={'platform': platform, 'query': query}, timeout=10)
    data = resp.json()
    cache[key] = {'data': data, 'ts': now}
    return data
# First call: network
start = time.monotonic()
cached_search('best crm 2026')
print(f'First call: {(time.monotonic() - start)*1000:.0f}ms')

# Second call: cache
start = time.monotonic()
cached_search('best crm 2026')
print(f'Cache hit: {(time.monotonic() - start)*1000:.0f}ms')
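One caveat before combining this cache with the thread pool from Step 2: two workers that miss on the same query at the same time will both hit the network. CPython's GIL keeps individual dict operations safe, but deduplicating concurrent misses takes a lock across the whole lookup-and-fetch. A sketch of that variant; locked_search is illustrative and serializes cache misses, which is usually acceptable for an agent's handful of queries:

import threading

cache_lock = threading.Lock()

def locked_search(query: str, platform: str = 'google') -> dict:
    key = hashlib.md5(f'{platform}:{query}'.encode()).hexdigest()
    with cache_lock:
        entry = cache.get(key)
        if entry and time.time() - entry['ts'] < CACHE_TTL:
            return entry['data']
        # Miss: fetch while holding the lock, so concurrent workers
        # asking for the same query wait for a single network call.
        resp = SESSION.post('https://api.scavio.dev/api/v1/search',
                            json={'platform': platform, 'query': query}, timeout=10)
        data = resp.json()
        cache[key] = {'data': data, 'ts': time.time()}
        return data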
Step 4: Prune response payloads
Strip unnecessary fields from search results before passing them into the LLM context; smaller payloads mean fewer tokens for the model to process.
def pruned_search(query: str) -> list:
    data = cached_search(query)
    results = data.get('organic_results', [])
    return [{
        'title': r.get('title', ''),
        'snippet': r.get('snippet', '')[:200],
        'url': r.get('link', ''),
    } for r in results[:5]]

# Compare payload sizes:
import json
full = cached_search('best crm 2026')
pruned = pruned_search('best crm 2026')
print(f'Full response: {len(json.dumps(full))} chars')
print(f'Pruned response: {len(json.dumps(pruned))} chars')
print(f'Reduction: {100 - len(json.dumps(pruned)) * 100 // len(json.dumps(full))}%')
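To translate the character savings into prompt impact, a rough rule of thumb is about four characters per token for English text; the exact count depends on the model's tokenizer, so treat this as an estimate:

def approx_tokens(payload) -> int:
    # ~4 characters per token is a common rough estimate for English text.
    return len(json.dumps(payload)) // 4

print(f'Full: ~{approx_tokens(full)} tokens, pruned: ~{approx_tokens(pruned)} tokens')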
Python Example
import requests, os, time, hashlib
from concurrent.futures import ThreadPoolExecutor

S = requests.Session()  # connection pooling
S.headers.update({'x-api-key': os.environ['SCAVIO_API_KEY']})
cache = {}

def fast_search(query):
    # Serve fresh entries from the cache (5-minute TTL).
    key = hashlib.md5(query.encode()).hexdigest()
    if key in cache and time.time() - cache[key]['ts'] < 300:
        return cache[key]['data']
    data = S.post('https://api.scavio.dev/api/v1/search',
                  json={'platform': 'google', 'query': query}, timeout=10).json()
    cache[key] = {'data': data, 'ts': time.time()}
    return data

def parallel(queries):
    with ThreadPoolExecutor(max_workers=3) as pool:
        return list(pool.map(fast_search, queries))
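Calling the combined pipeline looks like this, for example:

results = parallel(['best crm 2026', 'react tutorial'])
print(f'{len(results)} searches completed')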
JavaScript Example
const cache = new Map();

async function fastSearch(query) {
  const key = query;
  const cached = cache.get(key);
  // Serve fresh entries from the cache (5-minute TTL).
  if (cached && Date.now() - cached.ts < 300000) return cached.data;
  const r = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'},
    body: JSON.stringify({platform: 'google', query})
  });
  const data = await r.json();
  cache.set(key, {data, ts: Date.now()});
  return data;
}

async function parallel(queries) {
  return Promise.all(queries.map(fastSearch));
}

parallel(['best crm 2026', 'react tutorial']).then(r => console.log(r.length + ' searches completed'));
Expected Output
Measurable latency reductions at each step: connection pooling trims per-request handshake overhead, parallel requests cut total wall-clock time by 2-3x, caching eliminates repeated calls entirely, and payload pruning reduces downstream token processing.