agents · cost · optimization

Agent Search Quota Optimization

Autonomous agents can burn through search budgets in hours. Cost-aware search: daily budget caps, result caching, query deduplication. Python patterns included.

9 min

Exa's free tier gives you 1,000 searches per month. An agent workflow that searches on every user query can burn through that in a few days. The fix is not upgrading -- it is optimizing: cache results, batch and deduplicate similar queries, and route by intent so cheap queries go to cheap APIs and only high-value semantic queries hit Exa. Combined, caching, deduplication, and cost routing can cut search spend by 60-80%.

Why agent search is expensive

Agents search aggressively. A single user question might trigger 3-5 search calls: one for the main query, one for each sub-question the agent generates, one to verify claims. At 4 searches per user query and 100 users per day (one question each), that is 400 searches per day, or about 12,000 searches per month. Exa's free tier covers roughly 8% of that. Even paid tiers ($49/month for 8,000 credits on Websets Starter) run out quickly.

Strategy 1: cache results aggressively

Python
import hashlib, json, time, os
from pathlib import Path

CACHE_DIR = Path("search_cache")
CACHE_DIR.mkdir(exist_ok=True)
CACHE_TTL = 3600 * 6  # 6 hours -- most search results are stable this long


def cached_search(query: str, search_fn, ttl: int = CACHE_TTL) -> dict:
    """Return search results for *query*, served from an on-disk cache when fresh.

    The cache key is the MD5 of the lower-cased, stripped query, so queries
    that differ only in case or surrounding whitespace share one entry.
    Entries are JSON files in ``CACHE_DIR``.

    Args:
        query: The search query text.
        search_fn: Callable taking the query string and returning a
            JSON-serializable result (a dict, per the return annotation).
        ttl: Maximum age in seconds for a cached entry to count as fresh.

    Returns:
        The search result. When it is a dict, the key ``"_from_cache"`` is
        set on it: ``True`` for a cache hit, ``False`` for a fresh API call.

    Raises:
        Whatever ``search_fn`` raises, plus ``TypeError``/``OSError`` if the
        result cannot be serialized or written to the cache directory.
    """
    cache_key = hashlib.md5(query.lower().strip().encode()).hexdigest()
    cache_path = CACHE_DIR / f"{cache_key}.json"

    # Check cache. A missing, unreadable, or malformed entry is treated as a
    # miss so one bad file cannot break every later lookup for this query.
    try:
        cached = json.loads(cache_path.read_text())
        is_fresh = time.time() - cached["timestamp"] < ttl
        data = cached["data"]
    except (OSError, ValueError, KeyError, TypeError):
        is_fresh = False
        data = None

    if is_fresh:
        # Flag the payload that is actually returned, not the wrapper dict,
        # so callers can tell hits from misses.
        if isinstance(data, dict):
            data["_from_cache"] = True
        return data

    # Cache miss: call API
    result = search_fn(query)

    # Store in cache before adding the flag, so the flag is never persisted.
    cache_path.write_text(json.dumps({
        "query": query,
        "timestamp": time.time(),
        "data": result,
    }))

    if isinstance(result, dict):
        result["_from_cache"] = False
    return result

# With 6-hour TTL:
# - Same query within 6 hours = free (cache hit)
# - Typical cache hit rate for agent workflows: 40-60%
# - 1,000 queries (repeats included) become ~500 API calls

Strategy 2: batch similar queries

Python
from collections import defaultdict
import re

# Common filler phrases agents add; matched on word boundaries so they are
# never removed from the middle of another word (e.g. "somewhat isolated").
_FILLER_RE = re.compile(
    r"\b(?:please find|search for|look up|what is|tell me about)\b"
)
# A leading article left behind after filler removal ("what is the best ...").
_LEADING_ARTICLE_RE = re.compile(r"^(?:the|an|a)\s+")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_query(query: str) -> str:
    """Normalize query to reduce near-duplicates.

    Lower-cases the query, collapses every run of whitespace to one space,
    removes common filler phrases, and drops a leading article so that
    "What is the best X" and "search for best X" both normalize to "best x".

    The result is meant to be used as a comparison key only; it is lossy and
    should not be sent to a search API in place of the original query.

    Args:
        query: Raw query text.

    Returns:
        The normalized key (may be empty if the query was only filler).
    """
    q = _WHITESPACE_RE.sub(" ", query.lower().strip())
    # Replace fillers with a space (not "") so neighbouring words never fuse.
    q = _FILLER_RE.sub(" ", q)
    # Removing a filler can leave doubled or edge whitespace; tidy again.
    q = _WHITESPACE_RE.sub(" ", q).strip()
    return _LEADING_ARTICLE_RE.sub("", q)

def deduplicate_agent_queries(queries: list) -> list:
    """Drop queries whose normalized form has already been seen.

    The first occurrence of each normalized form is kept, in its original
    wording and original order. A one-line summary of how many queries were
    removed is printed to stdout.

    Args:
        queries: Raw query strings, possibly containing near-duplicates.

    Returns:
        A new list with the surviving original query strings.
    """
    seen_keys = set()
    kept = []
    for original in queries:
        key = normalize_query(original)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(original)

    removed = len(queries) - len(kept)
    print(f"Deduplicated: {len(queries)} -> {len(kept)} queries "
          f"({removed} duplicates removed)")
    return kept

# Agent generates these queries for one user question:
# Sample input for deduplicate_agent_queries: one underlying question phrased
# four ways.
agent_queries = [
    "best SERP API pricing 2026",  # baseline phrasing
    "What is the best SERP API pricing 2026",  # baseline behind a "What is the" lead-in
    "SERP API pricing comparison",  # different wording
    "search for best SERP API pricing 2026",  # baseline behind a "search for" prefix
]
# After dedup: 2 unique queries instead of 4

Strategy 3: cost-routing by intent

Python
import requests, os

def classify_query_intent(query: str) -> str:
    """Label a query as needing semantic search or plain keyword search.

    The check is a case-insensitive substring match against a fixed list of
    indicator phrases, so an indicator also matches inside longer words
    (for example "like" matches "likely").

    Args:
        query: The search query text.

    Returns:
        "semantic" if any indicator occurs in the query, otherwise "keyword".
    """
    lowered = query.lower()
    semantic_markers = (
        "similar to", "like", "related to", "concept",
        "meaning", "explain", "analogy", "comparable",
    )
    if any(marker in lowered for marker in semantic_markers):
        return "semantic"
    return "keyword"

def cost_routed_search(query: str) -> dict:
    """Route queries to cheapest appropriate API.

    Queries classified as "semantic" go to Exa; everything else goes to the
    Scavio keyword-search endpoint.

    Args:
        query: The search query text.

    Returns:
        The decoded JSON body of the chosen provider's response.

    Raises:
        KeyError: If the required API-key environment variable is not set.
        requests.HTTPError: If the provider answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    intent = classify_query_intent(query)

    if intent == "semantic":
        # Exa: $7/1k searches -- best for semantic/meaning-based
        # Only route here when keyword search would miss the point
        return exa_search(query)

    # Scavio: $5/1k -- structured keyword search
    # Handles 80%+ of agent queries
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers={"x-api-key": os.environ["SCAVIO_API_KEY"]},
        json={"query": query, "platform": "google"},
        timeout=10,
    )
    # Fail loudly on auth/quota/server errors: returning the error body as if
    # it were a result would let a caching layer store it as a valid answer.
    resp.raise_for_status()
    return resp.json()

def exa_search(query: str) -> dict:
    """Exa semantic search -- use sparingly.

    Sends a neural search request for up to 10 results.

    Args:
        query: The search query text.

    Returns:
        The decoded JSON body of Exa's response.

    Raises:
        KeyError: If ``EXA_API_KEY`` is not set in the environment.
        requests.HTTPError: If Exa answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    resp = requests.post(
        "https://api.exa.ai/search",
        headers={"x-api-key": os.environ["EXA_API_KEY"]},
        json={"query": query, "numResults": 10, "type": "neural"},
        timeout=10,
    )
    # Surface quota/auth/server errors instead of handing the error payload
    # back to callers as if it were a search result.
    resp.raise_for_status()
    return resp.json()

# Routing distribution (typical agent workload):
# 80% keyword -> Scavio at $0.005/query
# 20% semantic -> Exa at $0.007/query
# Blended cost: $0.0054/query vs $0.007/query (all-Exa)
# Savings: 23% on per-query cost + cache savings on top

Full optimization pipeline

Python
class OptimizedSearchClient:
    """Combines caching, dedup, and routing for minimum cost.

    Every lookup goes through ``cached_search``; cache misses are forwarded
    to ``cost_routed_search``. Usage counters are kept in ``self.stats``:

    - ``total_queries``: calls to :meth:`search`
    - ``cache_hits``: lookups answered without calling an API
    - ``api_calls``: lookups that reached an API
    - ``semantic_calls``: the subset of ``api_calls`` classified as semantic
    """

    # Per-call prices used by print_stats, taken from the per-1k rates quoted
    # in cost_routed_search ($5/1k keyword, $7/1k semantic).
    KEYWORD_COST_PER_CALL = 0.005
    SEMANTIC_COST_PER_CALL = 0.007

    def __init__(self):
        self.stats = {
            "cache_hits": 0,
            "api_calls": 0,
            "total_queries": 0,
            "semantic_calls": 0,
        }

    def search(self, query: str) -> dict:
        """Run one query through the cache and, on a miss, the routed API.

        Hit/miss accounting is based on whether the search callable was
        actually invoked, so it stays correct regardless of which marker
        keys the returned payload carries.
        """
        self.stats["total_queries"] += 1
        api_queries = []

        def tracked_search(q: str) -> dict:
            # Record that the cache layer fell through to a real API call.
            api_queries.append(q)
            return self._routed_search(q)

        # Layer 1: Cache
        result = cached_search(query, tracked_search)
        if api_queries:
            self.stats["api_calls"] += 1
            # Mirrors the routing decision made inside cost_routed_search so
            # the cost estimate can price the call correctly.
            if classify_query_intent(query) == "semantic":
                self.stats["semantic_calls"] += 1
        else:
            self.stats["cache_hits"] += 1
        return result

    def batch_search(self, queries: list) -> list:
        """Search multiple queries with dedup + cache + routing.

        Returns one result per *unique* query, in first-seen order, so the
        returned list can be shorter than ``queries``.
        """
        unique = deduplicate_agent_queries(queries)
        return [self.search(q) for q in unique]

    def _routed_search(self, query: str) -> dict:
        """Forward a cache miss to the cost-routing function."""
        return cost_routed_search(query)

    def print_stats(self):
        """Print query counts, hit/call percentages, and an estimated cost."""
        total = self.stats["total_queries"]
        hits = self.stats["cache_hits"]
        calls = self.stats["api_calls"]
        semantic = self.stats.get("semantic_calls", 0)
        # Price each route at its own rate instead of assuming every call
        # went to the cheaper keyword API.
        cost = ((calls - semantic) * self.KEYWORD_COST_PER_CALL
                + semantic * self.SEMANTIC_COST_PER_CALL)
        print(f"Total queries: {total}")
        print(f"Cache hits: {hits} ({hits/max(total,1)*100:.0f}%)")
        print(f"API calls: {calls} ({calls/max(total,1)*100:.0f}%)")
        print(f"Estimated cost: ${cost:.2f}")

# Usage:
# client = OptimizedSearchClient()
# results = client.batch_search(agent_queries)
# client.print_stats()

Cost savings at scale

  • Raw: 12,000 queries/month on Exa = $84/month
  • With caching (50% hit rate): 6,000 API calls = $42/month
  • With dedup (30% reduction): 4,200 API calls = $29.40/month
  • With routing (80% to Scavio): 840 Exa + 3,360 Scavio = $22.68/month
  • Total savings: 73% reduction from $84 to $22.68/month

Start with caching -- it is the highest-impact, lowest-effort optimization. Add dedup next if your agent generates repetitive sub-queries. Add cost routing last, only if you genuinely need semantic search for a portion of your queries. Most agent workloads are keyword-based and do not need Exa's neural retrieval at all.