Agent Search Quota Optimization
Autonomous agents can burn through search budgets in hours. Cost-aware search: daily budget caps, result caching, query deduplication. Python patterns included.
Exa's free tier gives you 1,000 searches per month. An agent workflow that searches on every user query burns through that in a day. The fix is not upgrading -- it is optimizing: cache results, batch similar queries, and route by intent so cheap queries go to cheap APIs and only high-value semantic queries hit Exa. A cost-routing function can cut search spend by 60-80%.
Why agent search is expensive
Agents search aggressively. A single user question might trigger 3-5 search calls: one for the main query, one for each sub-question the agent generates, one to verify claims. At 4 searches per user query and 100 user queries per day, you need about 12,000 searches per month (4 x 100 x 30). Exa's free tier covers roughly 8% of that. Even paid tiers ($49/month for 8,000 credits on Websets Starter) run out quickly.
Strategy 1: cache results aggressively
import hashlib, json, time, os
from pathlib import Path
# Directory that holds one JSON file per cached query (see cached_search).
CACHE_DIR = Path("search_cache")
# Runs when the module is loaded; exist_ok avoids an error if it is already there.
CACHE_DIR.mkdir(exist_ok=True)
# Default freshness window in seconds, compared against time.time() deltas.
CACHE_TTL = 3600 * 6 # 6 hours -- most search results are stable this long
def cached_search(query: str, search_fn, ttl: int = CACHE_TTL) -> dict:
    """Return search results for *query*, reusing a fresh on-disk copy if any.

    The cache key is the MD5 hex digest of the lower-cased, stripped query, so
    queries that differ only in case or surrounding whitespace share an entry.
    Each entry is a JSON file in ``CACHE_DIR`` holding the query, the time it
    was stored, and the data returned by ``search_fn``.

    Args:
        query: Search string; passed unchanged to ``search_fn`` on a miss.
        search_fn: Callable that takes the query and returns JSON-serializable
            results.
        ttl: Maximum age, in seconds, for a cached entry to count as fresh.

    Returns:
        The search results. When they are a dict, the ``"_from_cache"`` key is
        set to ``True`` on a cache hit and ``False`` after a fresh call.

    Raises:
        TypeError: If the result of ``search_fn`` cannot be encoded as JSON.
    """
    cache_key = hashlib.md5(query.lower().strip().encode()).hexdigest()
    cache_path = CACHE_DIR / f"{cache_key}.json"

    # Read optimistically: a missing, unreadable, or malformed entry is simply
    # a cache miss and gets overwritten by the fresh result below.
    try:
        cached = json.loads(cache_path.read_text())
        is_fresh = time.time() - cached["timestamp"] < ttl
        data = cached["data"]
    except (OSError, ValueError, KeyError, TypeError):
        is_fresh = False
        data = None

    if is_fresh:
        # Flag the object that is actually returned, so callers can see the hit.
        if isinstance(data, dict):
            data["_from_cache"] = True
        return data

    # Cache miss: call the API.
    result = search_fn(query)

    # Persist before tagging so the bookkeeping flag never lands in the file.
    cache_path.write_text(json.dumps({
        "query": query,
        "timestamp": time.time(),
        "data": result,
    }))
    if isinstance(result, dict):
        result["_from_cache"] = False
    return result
# With 6-hour TTL:
# - Same query within 6 hours = free (cache hit)
# - Typical cache hit rate for agent workflows: 40-60%
# - 1,000 queries become ~500 API calls at a 50% hit rate
Strategy 2: batch similar queries
from collections import defaultdict
import re
# Filler phrases agents tend to add; matched on word boundaries so that text
# such as "somewhat isolated" is left alone.
_FILLER_PATTERN = re.compile(
    r"\b(?:please find|search for|look up|what is|tell me about)\b"
)
# An article left at the front once a filler is removed ("what is the best ...").
_LEADING_ARTICLE = re.compile(r"^(?:the|an|a)\s+")


def normalize_query(query: str) -> str:
    """Return a canonical form of *query* for near-duplicate detection.

    The query is lower-cased, whitespace runs are collapsed to single spaces,
    common filler phrases are removed, and a leading article is dropped. The
    result is meant to be used as a comparison key, not sent to a search API.

    Args:
        query: Raw query text as generated by the agent.

    Returns:
        The normalized query; an empty string if nothing remains.
    """
    q = " ".join(query.lower().split())
    q = _FILLER_PATTERN.sub(" ", q)
    # Removing a filler can leave doubled or edge spaces; collapse them again.
    q = " ".join(q.split())
    return _LEADING_ARTICLE.sub("", q)
def deduplicate_agent_queries(queries: list) -> list:
    """Drop queries whose normalized form has already been seen.

    The first occurrence of each normalized query is kept, in its original
    wording and order. A one-line summary of the reduction is printed.
    """
    seen_keys = set()
    kept = []
    for raw in queries:
        key = normalize_query(raw)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(raw)

    print(f"Deduplicated: {len(queries)} -> {len(kept)} queries "
          f"({len(queries) - len(kept)} duplicates removed)")
    return kept
# Sample input: four phrasings an agent might generate for one user question.
# The second and fourth repeat the first behind a leading filler phrase
# ("What is the", "search for"); the third is a differently worded query.
agent_queries = [
"best SERP API pricing 2026",
"What is the best SERP API pricing 2026",
"SERP API pricing comparison",
"search for best SERP API pricing 2026",
]
# After dedup: 2 unique queries instead of 4
Strategy 3: cost-routing by intent
import requests, os
# Indicators must start at a word boundary, and "like" must be a whole word,
# so "likely" or "unlike" do not push a keyword query onto the semantic route.
# The other indicators may carry a suffix ("explained", "concepts").
_SEMANTIC_INTENT = re.compile(
    r"\b(?:similar to|like\b|related to|concept|meaning|explain|analogy|comparable)"
)


def classify_query_intent(query: str) -> str:
    """Classify whether *query* needs semantic search or keyword search.

    Args:
        query: Raw query text; matching is case-insensitive.

    Returns:
        ``"semantic"`` if the query contains one of the semantic indicator
        phrases, otherwise ``"keyword"``.
    """
    if _SEMANTIC_INTENT.search(query.lower()):
        return "semantic"
    return "keyword"
def cost_routed_search(query: str) -> dict:
    """Route *query* to the cheapest API that suits its intent.

    Queries classified as ``"semantic"`` go to ``exa_search``; everything else
    is sent to the Scavio keyword endpoint.

    Args:
        query: Search string to send to the selected API.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        KeyError: If ``SCAVIO_API_KEY`` is not set in the environment.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    if classify_query_intent(query) == "semantic":
        # Exa: $7/1k searches -- best for semantic/meaning-based
        # Only route here when keyword search would miss the point
        return exa_search(query)

    # Scavio: $5/1k -- structured keyword search
    # Handles 80%+ of agent queries
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers={"x-api-key": os.environ["SCAVIO_API_KEY"]},
        json={"query": query, "platform": "google"},
        timeout=10,
    )
    # Fail loudly on auth/quota errors so an error body is never returned (and
    # then cached by a caller) as if it were a search result.
    resp.raise_for_status()
    return resp.json()
def exa_search(query: str) -> dict:
    """Run an Exa neural search for *query* -- use sparingly.

    Args:
        query: Search string; up to 10 results are requested.

    Returns:
        The decoded JSON body of the Exa response.

    Raises:
        KeyError: If ``EXA_API_KEY`` is not set in the environment.
        requests.HTTPError: If Exa answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    resp = requests.post(
        "https://api.exa.ai/search",
        headers={"x-api-key": os.environ["EXA_API_KEY"]},
        json={"query": query, "numResults": 10, "type": "neural"},
        timeout=10,
    )
    # Surface quota/auth failures instead of handing back an error payload
    # that looks like a result.
    resp.raise_for_status()
    return resp.json()
# Routing distribution (typical agent workload):
# 80% keyword -> Scavio at $0.005/query
# 20% semantic -> Exa at $0.007/query
# Blended cost: $0.0054/query vs $0.007/query (all-Exa)
# Savings: 23% on per-query cost + cache savings on top
Full optimization pipeline
class OptimizedSearchClient:
    """Combines caching, dedup, and routing for minimum cost.

    Counters are kept in ``self.stats`` under the keys ``"cache_hits"``,
    ``"api_calls"`` and ``"total_queries"``.

    Args:
        cost_per_call: Flat price in dollars applied to every API call when
            estimating spend in ``print_stats``. It does not distinguish which
            provider served the call.
    """

    def __init__(self, cost_per_call: float = 0.005):
        self.cost_per_call = cost_per_call
        self.stats = {"cache_hits": 0, "api_calls": 0, "total_queries": 0}

    def search(self, query: str) -> dict:
        """Search one query through the cache and update the counters.

        A result counts as a cache hit when it is a dict carrying a truthy
        ``"_from_cache"`` key; anything else is counted as an API call.
        """
        self.stats["total_queries"] += 1
        # Layer 1: cache; on a miss it falls through to the routed search.
        result = cached_search(query, self._routed_search)
        from_cache = isinstance(result, dict) and result.get("_from_cache")
        self.stats["cache_hits" if from_cache else "api_calls"] += 1
        return result

    def batch_search(self, queries: list) -> list:
        """Search multiple queries with dedup + cache + routing.

        Returns one result per query kept by ``deduplicate_agent_queries``, in
        that order, so the list can be shorter than ``queries``.
        """
        unique = deduplicate_agent_queries(queries)
        return [self.search(q) for q in unique]

    def _routed_search(self, query: str) -> dict:
        """Cache-miss handler: delegate to the cost-routing function."""
        return cost_routed_search(query)

    def print_stats(self):
        """Print query totals, hit/call percentages and the estimated cost."""
        total = self.stats["total_queries"]
        hits = self.stats["cache_hits"]
        calls = self.stats["api_calls"]
        # Guard the percentage against division by zero before any query ran.
        denominator = max(total, 1)
        print(f"Total queries: {total}")
        print(f"Cache hits: {hits} ({hits / denominator * 100:.0f}%)")
        print(f"API calls: {calls} ({calls / denominator * 100:.0f}%)")
        print(f"Estimated cost: ${calls * self.cost_per_call:.2f}")
# Usage:
# client = OptimizedSearchClient()
# results = client.batch_search(agent_queries)
# client.print_stats()
Cost savings at scale
- Raw: 12,000 queries/month on Exa = $84/month
- With caching (50% hit rate): 6,000 API calls = $42/month
- With dedup (30% reduction): 4,200 API calls = $29.40/month
- With routing (80% to Scavio): 840 Exa + 3,360 Scavio = $22.68/month
- Total savings: 73% reduction from $84 to $22.68/month
Start with caching -- it is the highest-impact, lowest-effort optimization. Add dedup next if your agent generates repetitive sub-queries. Add cost routing last, only if you genuinely need semantic search for a portion of your queries. Most agent workloads are keyword-based and do not need Exa's neural retrieval at all.