LLMs hallucinate brand information constantly. They invent product features, cite wrong pricing, and confuse competitors. Grounding brand research with live search data fixes this by giving the LLM verified facts to work with. This tutorial builds a brand research pipeline that searches Google for company info, Amazon for product data, and Reddit for community sentiment, all through the Scavio API at $0.005 per search. The code below covers those three sources; YouTube content presence is not implemented here, but it can be added with one more site-restricted query in the same way.
Prerequisites
- Python 3.9+ installed
- requests library installed
- A Scavio API key from scavio.dev
- An LLM API key (OpenAI, Anthropic, or local Ollama)
Walkthrough
Step 1: Build the brand data collection pipeline
Search multiple platforms to collect verified brand data. Each platform provides a different perspective: Google for official info, Amazon for products, Reddit for reputation.
import os, requests, time
# Read the API key from the environment; os.environ[...] raises KeyError
# immediately if SCAVIO_API_KEY is not set.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
# Search endpoint that every query in this walkthrough is POSTed to.
URL = 'https://api.scavio.dev/api/v1/search'
# Headers sent with each search request (API key plus JSON content type).
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def _search(query: str, num_results: int = 5) -> dict:
    """POST one query to the search endpoint and return the decoded JSON body."""
    resp = requests.post(
        URL,
        headers=H,
        json={'query': query, 'country_code': 'us', 'num_results': num_results},
        timeout=30,  # never hang forever on a stalled connection
    )
    # Surface auth/quota/server errors instead of treating them as "no results".
    resp.raise_for_status()
    return resp.json()


def research_brand(brand: str) -> dict:
    """Collect search data about *brand* from Google, Amazon and Reddit.

    Three queries are sent to the same search endpoint: a general
    ``"<brand> company"`` query, and two ``site:``-restricted queries for
    amazon.com and reddit.com. A short pause separates the requests.

    Args:
        brand: Brand or company name to research.

    Returns:
        ``{'brand': brand, 'sources': {...}}`` where ``sources`` has:
          - ``'google'``: ``{'knowledge_graph': dict, 'top_results': list}``
            with at most three ``{'title', 'snippet', 'url'}`` entries;
          - ``'amazon'``: list of ``{'title', 'url'}`` entries;
          - ``'reddit'``: list of ``{'title', 'snippet', 'url'}`` entries.
        Fields missing from a search result are returned as empty strings.

    Raises:
        requests.HTTPError: If any search request returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    data = {'brand': brand, 'sources': {}}

    # Google: official info, knowledge graph
    google = _search(f'{brand} company')
    data['sources']['google'] = {
        'knowledge_graph': google.get('knowledge_graph') or {},
        'top_results': [
            {
                'title': r.get('title', ''),
                'snippet': r.get('snippet', ''),
                'url': r.get('link', ''),
            }
            for r in (google.get('organic_results') or [])[:3]
        ],
    }
    time.sleep(0.3)

    # Amazon: product presence
    amazon = _search(f'site:amazon.com {brand}')
    data['sources']['amazon'] = [
        {'title': r.get('title', ''), 'url': r.get('link', '')}
        for r in amazon.get('organic_results') or []
    ]
    time.sleep(0.3)

    # Reddit: community sentiment
    reddit = _search(f'site:reddit.com {brand} review')
    data['sources']['reddit'] = [
        {
            'title': r.get('title', ''),
            'snippet': r.get('snippet', ''),
            'url': r.get('link', ''),
        }
        for r in reddit.get('organic_results') or []
    ]
    return data
# Run the collection once for a sample brand and print a short summary.
brand_data = research_brand('Notion')
kg = brand_data['sources']['google']['knowledge_graph']
print(f'Brand: {brand_data["brand"]}')
# Falls back to "N/A" when the knowledge graph has no description.
print(f'Description: {kg.get("description", "N/A")}')
print(f'Amazon products: {len(brand_data["sources"]["amazon"])}')
print(f'Reddit discussions: {len(brand_data["sources"]["reddit"])}')
Step 2: Format brand data as LLM context
Structure the collected data into a format the LLM can use effectively. Include source URLs so the LLM can cite its sources.
def format_brand_context(data: dict) -> str:
    """Render collected brand data as plain-text context for an LLM prompt.

    The output has up to four sections: knowledge-graph facts, top Google
    results, Amazon listings and Reddit discussions. Each listed result is
    followed by a ``URL:`` line when a URL is available, so the model can
    cite where a statement came from. Missing sources, and ``None`` or
    absent titles/snippets, are tolerated rather than raising.

    Args:
        data: Mapping with a ``'brand'`` name and a ``'sources'`` dict that
            may contain ``'google'`` (``knowledge_graph`` / ``top_results``),
            ``'amazon'`` and ``'reddit'`` entries.

    Returns:
        The context as a single newline-joined string.

    Raises:
        KeyError: If ``data`` has no ``'brand'`` or ``'sources'`` key.
    """
    sources = data['sources']
    google = sources.get('google') or {}
    lines = [f'Verified brand research data for: {data["brand"]}', '']

    # Knowledge graph data
    kg = google.get('knowledge_graph') or {}
    if kg:
        lines.append('OFFICIAL INFO (Google Knowledge Graph):')
        for key, label in (('description', 'Description'),
                           ('website', 'Website'),
                           ('founded', 'Founded')):
            if kg.get(key):
                lines.append(f' {label}: {kg[key]}')
        lines.append('')

    # Google search results
    lines.append('WEB PRESENCE (Google):')
    for i, r in enumerate(google.get('top_results') or [], 1):
        lines.append(f' [{i}] {r.get("title") or ""}')
        lines.append(f' {(r.get("snippet") or "")[:150]}')
        if r.get('url'):
            lines.append(f' URL: {r["url"]}')
    lines.append('')

    # Amazon products
    amazon = sources.get('amazon') or []
    if amazon:
        # The header reports the full count; only the first three are listed.
        lines.append(f'PRODUCT PRESENCE (Amazon, {len(amazon)} listings):')
        for r in amazon[:3]:
            lines.append(f' - {(r.get("title") or "")[:80]}')
            if r.get('url'):
                lines.append(f' URL: {r["url"]}')
        lines.append('')

    # Reddit sentiment
    reddit = sources.get('reddit') or []
    if reddit:
        lines.append(f'COMMUNITY SENTIMENT (Reddit, {len(reddit)} discussions):')
        for r in reddit[:3]:
            lines.append(f' - {(r.get("title") or "")[:80]}')
            snippet = r.get('snippet') or ''
            if snippet:
                lines.append(f' "{snippet[:120]}"')
            if r.get('url'):
                lines.append(f' URL: {r["url"]}')

    return '\n'.join(lines)
# Turn the data collected in Step 1 into prompt-ready text.
context = format_brand_context(brand_data)
print(context)
Step 3: Ask the LLM with grounded context
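Send the verified brand data to the LLM with instructions to only use the provided sources. This sharply reduces hallucination about brand facts, although the model can still misread or over-generalize from the data it is given, so spot-check important claims against the cited sources.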
def grounded_brand_analysis(brand: str, question: str) -> dict:
    """Answer *question* about *brand* using only freshly collected search data.

    Collects data with ``research_brand``, formats it with
    ``format_brand_context`` and sends it to a chat-completions endpoint
    together with a system prompt that restricts the model to that data.

    Environment variables:
        LLM_URL: Chat-completions endpoint (defaults to a local Ollama URL).
        LLM_MODEL: Model name to request (defaults to ``'llama3'``).
        LLM_API_KEY: Optional; when set it is sent as a Bearer token, which
            hosted OpenAI-compatible endpoints require.

    Args:
        brand: Brand or company name to research.
        question: Question to answer from the collected data.

    Returns:
        Dict with ``'brand'``, ``'question'``, ``'answer'``,
        ``'sources_used'`` and ``'cost'`` keys.

    Raises:
        requests.HTTPError: If the LLM endpoint returns an error status.
        requests.RequestException: On connection failure or timeout.
        KeyError, IndexError: If the response lacks the expected
            ``choices[0].message.content`` structure.
    """
    data = research_brand(brand)
    context = format_brand_context(data)
    messages = [
        {'role': 'system', 'content': (
            'You are a brand research analyst. Answer ONLY based on the verified '
            'data provided below. Do NOT make up facts, pricing, features, or '
            'statistics. If the data does not contain the answer, say so. '
            'Cite sources as [Google], [Amazon], or [Reddit].'
        )},
        {'role': 'user', 'content': f'{context}\n\nQuestion: {question}'},
    ]

    llm_url = os.environ.get('LLM_URL', 'http://localhost:11434/v1/chat/completions')
    headers = {}
    api_key = os.environ.get('LLM_API_KEY')
    if api_key:
        headers['Authorization'] = f'Bearer {api_key}'

    resp = requests.post(
        llm_url,
        headers=headers,
        json={
            'model': os.environ.get('LLM_MODEL', 'llama3'),
            'messages': messages,
            'max_tokens': 512,
        },
        timeout=120,  # generation can be slow, but should not block forever
    )
    # Fail loudly on HTTP errors rather than with an opaque KeyError below.
    resp.raise_for_status()
    answer = resp.json()['choices'][0]['message']['content']

    return {
        'brand': brand,
        'question': question,
        'answer': answer,
        'sources_used': 3,  # google + amazon + reddit
        'cost': 0.015,  # 3 searches
    }
# Ask one grounded question about a sample brand and print the exchange.
result = grounded_brand_analysis('Notion', 'What is Notion and what products do they offer?')
print(f'Q: {result["question"]}')
print(f'A: {result["answer"]}')
print(f'\nSources: {result["sources_used"]} platforms, Cost: ${result["cost"]}')
Step 4: Batch research multiple brands for comparison
Research multiple brands and generate a comparison report. The LLM can accurately compare brands because every fact is grounded in live data.
def compare_brands(brands: list[str], question: str) -> str:
    """Research several brands and ask the LLM to compare them.

    Each brand is researched with ``research_brand`` and formatted with
    ``format_brand_context``; the per-brand contexts are joined with a
    ``---`` separator and sent to a chat-completions endpoint. A usage
    and cost summary is printed before returning.

    Environment variables:
        LLM_URL: Chat-completions endpoint (defaults to a local Ollama URL).
        LLM_MODEL: Model name to request (defaults to ``'llama3'``).
        LLM_API_KEY: Optional; when set it is sent as a Bearer token.

    Args:
        brands: Brand names to compare.
        question: The comparison question.

    Returns:
        The model's answer text.

    Raises:
        requests.HTTPError: If the LLM endpoint returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    all_context = []
    for i, brand in enumerate(brands):
        if i:
            # Pause between brands only; no need to wait after the last one.
            time.sleep(0.5)
        all_context.append(format_brand_context(research_brand(brand)))

    combined = '\n\n---\n\n'.join(all_context)
    messages = [
        {'role': 'system', 'content': (
            'You are a brand research analyst. Compare the brands using ONLY '
            'the verified data provided. Do not invent facts or features. '
            'Cite sources. Be specific about what each brand offers.'
        )},
        {'role': 'user', 'content': f'{combined}\n\nCompare these brands: {question}'},
    ]

    llm_url = os.environ.get('LLM_URL', 'http://localhost:11434/v1/chat/completions')
    headers = {}
    api_key = os.environ.get('LLM_API_KEY')
    if api_key:
        headers['Authorization'] = f'Bearer {api_key}'

    resp = requests.post(
        llm_url,
        headers=headers,
        json={
            'model': os.environ.get('LLM_MODEL', 'llama3'),
            'messages': messages,
            'max_tokens': 1024,
        },
        timeout=120,  # generation can be slow, but should not block forever
    )
    # Fail loudly on HTTP errors rather than with an opaque KeyError below.
    resp.raise_for_status()
    answer = resp.json()['choices'][0]['message']['content']

    cost = len(brands) * 3 * 0.005  # 3 searches per brand
    print(f'Compared {len(brands)} brands using {len(brands) * 3} searches')
    print(f'Cost: ${cost:.3f}')
    return answer
# Compare three sample brands on a single question.
comparison = compare_brands(['Notion', 'Obsidian', 'Coda'],
'Which is best for team collaboration and why?')
print(comparison)
Python Example
import os, requests, time
# Read the API key from the environment; raises KeyError if it is unset.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
# Headers sent with each search request.
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def research_brand(brand):
    """Fetch organic search results for *brand* from Google, Reddit and Amazon.

    Sends one query per platform to the search endpoint (the Reddit and
    Amazon queries use a ``site:`` prefix), prints a one-line count summary
    and returns the raw result lists.

    Args:
        brand: Brand or company name to search for.

    Returns:
        Dict with ``'google'``, ``'reddit'`` and ``'amazon'`` keys, each
        mapping to that query's ``organic_results`` list (empty if absent).

    Raises:
        requests.HTTPError: If a search request returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # Explicit (platform, prefix) pairs instead of inferring the platform
    # from the prefix text.
    searches = (
        ('google', ''),
        ('reddit', 'site:reddit.com '),
        ('amazon', 'site:amazon.com '),
    )
    results = {}
    for platform, prefix in searches:
        resp = requests.post(
            'https://api.scavio.dev/api/v1/search',
            headers=H,
            json={'query': f'{prefix}{brand}', 'country_code': 'us', 'num_results': 5},
            timeout=30,
        )
        # Surface auth/quota errors instead of reporting zero results.
        resp.raise_for_status()
        results[platform] = resp.json().get('organic_results', [])
        time.sleep(0.2)
    print(f'{brand}: {len(results["google"])} google, {len(results["reddit"])} reddit, {len(results["amazon"])} amazon')
    return results
research_brand('Notion')
JavaScript Example
// API key is read from the environment; it is undefined if the variable is unset.
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;

/**
 * Fetch organic search results for a brand from Google, Reddit and Amazon.
 *
 * Sends one POST per platform to the search endpoint, one after another
 * (the Reddit and Amazon queries use a `site:` prefix), logs a one-line
 * count summary and returns the raw result arrays.
 *
 * @param {string} brand - Brand or company name to search for.
 * @returns {Promise<Object>} Object with `google`, `reddit` and `amazon`
 *   arrays (empty when the response has no `organic_results`).
 * @throws {Error} If any search request returns a non-2xx status.
 */
async function researchBrand(brand) {
  const searches = [
    ['google', ''],
    ['reddit', 'site:reddit.com '],
    ['amazon', 'site:amazon.com '],
  ];
  const results = {};
  for (const [platform, prefix] of searches) {
    const resp = await fetch('https://api.scavio.dev/api/v1/search', {
      method: 'POST',
      headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
      body: JSON.stringify({ query: `${prefix}${brand}`, country_code: 'us', num_results: 5 })
    });
    // Without this check an auth or quota error would look like "0 results".
    if (!resp.ok) {
      throw new Error(`Scavio search for ${platform} failed: HTTP ${resp.status}`);
    }
    const body = await resp.json();
    results[platform] = body.organic_results || [];
  }
  console.log(`${brand}: google=${results.google.length}, reddit=${results.reddit.length}, amazon=${results.amazon.length}`);
  return results;
}
researchBrand('Notion');
Expected Output
Brand: Notion
Description: American productivity and note-taking web application
Amazon products: 4
Reddit discussions: 5
Q: What is Notion and what products do they offer?
A: Based on the verified data, Notion is an American productivity and
note-taking web application [Google]. Their Amazon presence includes
Notion-related productivity guides and templates [Amazon]. Reddit
discussions show strong community adoption for team wikis and project
management [Reddit].
Sources: 3 platforms, Cost: $0.015