Les LLM hallucinent constamment des informations sur les marques. Ils inventent des caractéristiques de produits, citent des prix erronés et confondent les concurrents. L'ancrage de la recherche de marque sur des données de recherche en direct corrige cela en fournissant au LLM des faits vérifiés. Ce tutoriel construit un pipeline de recherche de marque qui interroge Google pour les infos entreprise, Amazon pour les données produits, Reddit pour le sentiment communautaire, et YouTube pour la présence de contenu, le tout via l'API Scavio à 0,005 $ par recherche.
Prérequis
- Python 3.9+ installé
- bibliothèque requests installée
- Une clé API Scavio depuis scavio.dev
- Une clé API LLM (OpenAI, Anthropic ou Ollama local)
Parcours
Étape 1: Construire le pipeline de collecte de données de marque
Recherchez sur plusieurs plateformes pour collecter des données de marque vérifiées. Chaque plateforme offre une perspective différente : Google pour les infos officielles, Amazon pour les produits, Reddit pour la réputation.
import os, requests, time
# Scavio API key, read from the SCAVIO_API_KEY environment variable
# (raises KeyError at import time if the variable is not set).
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
# Search endpoint that every query in this tutorial is POSTed to.
URL = 'https://api.scavio.dev/api/v1/search'
# Headers sent with every search request: API key plus JSON content type.
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def research_brand(brand: str) -> dict:
    """Collect live search data about a brand from Google, Amazon and Reddit.

    Sends three searches to the endpoint at ``URL``: a general
    ``"<brand> company"`` query, a ``site:amazon.com`` query and a
    ``site:reddit.com ... review`` query, pausing 0.3 s between calls.

    Args:
        brand: Brand or company name to research.

    Returns:
        ``{'brand': brand, 'sources': {...}}`` where ``sources`` contains:

        * ``google``: ``{'knowledge_graph': dict, 'top_results': list}`` with
          at most three ``{'title', 'snippet', 'url'}`` entries;
        * ``amazon``: list of ``{'title', 'url'}`` entries;
        * ``reddit``: list of ``{'title', 'snippet', 'url'}`` entries.

    Raises:
        requests.HTTPError: If a search request returns an error status.
        requests.Timeout: If a search request does not answer in time.
    """

    def _search(query: str) -> dict:
        # One search call; fail loudly instead of parsing an error payload.
        resp = requests.post(
            URL,
            headers=H,
            json={'query': query, 'country_code': 'us', 'num_results': 5},
            timeout=30,  # without a timeout, requests can block forever
        )
        resp.raise_for_status()
        return resp.json()

    def _entry(item: dict, with_snippet: bool = True) -> dict:
        # Normalise one organic result; tolerate missing or null fields.
        entry = {'title': item.get('title') or ''}
        if with_snippet:
            entry['snippet'] = item.get('snippet') or ''
        entry['url'] = item.get('link') or ''
        return entry

    data = {'brand': brand, 'sources': {}}

    # Google: official info, knowledge graph
    result = _search(f'{brand} company')
    data['sources']['google'] = {
        'knowledge_graph': result.get('knowledge_graph') or {},
        'top_results': [_entry(r) for r in result.get('organic_results', [])[:3]],
    }
    time.sleep(0.3)

    # Amazon: product presence
    result = _search(f'site:amazon.com {brand}')
    data['sources']['amazon'] = [
        _entry(r, with_snippet=False) for r in result.get('organic_results', [])
    ]
    time.sleep(0.3)

    # Reddit: community sentiment
    result = _search(f'site:reddit.com {brand} review')
    data['sources']['reddit'] = [
        _entry(r) for r in result.get('organic_results', [])
    ]
    return data
# Run the collection pipeline for one brand and print a short summary.
brand_data = research_brand('Notion')
sources = brand_data['sources']
kg = sources['google']['knowledge_graph']
print(f'Brand: {brand_data["brand"]}')
print(f'Description: {kg.get("description", "N/A")}')
print(f'Amazon products: {len(sources["amazon"])}')
print(f'Reddit discussions: {len(brand_data["sources"]["reddit"])}')
Étape 2: Formater les données de marque en contexte LLM
Structurez les données collectées dans un format que le LLM peut utiliser efficacement. Incluez les URLs sources afin que le LLM puisse citer ses sources.
def format_brand_context(data: dict) -> str:
    """Render collected brand research as a plain-text context block for an LLM.

    The output has up to four sections: knowledge-graph facts, Google web
    results, Amazon listings and Reddit discussions. Each listed result is
    followed by its source URL when one is available, so the model can cite
    where a statement came from.

    Args:
        data: Dict with a ``'brand'`` name and a ``'sources'`` dict holding
            ``'google'`` (``'knowledge_graph'`` and ``'top_results'``),
            ``'amazon'`` and ``'reddit'`` entries.

    Returns:
        The sections joined with newlines. Empty sections are omitted,
        except the Google header, which is always present.
    """
    sources = data['sources']
    google = sources['google']
    lines = [f'Verified brand research data for: {data["brand"]}', '']

    # Knowledge graph data (may be absent or null in the API response)
    kg = google.get('knowledge_graph') or {}
    if kg:
        lines.append('OFFICIAL INFO (Google Knowledge Graph):')
        for label, key in (
            ('Description', 'description'),
            ('Website', 'website'),
            ('Founded', 'founded'),
        ):
            if kg.get(key):
                lines.append(f' {label}: {kg[key]}')
        lines.append('')

    # Google search results
    lines.append('WEB PRESENCE (Google):')
    for i, r in enumerate(google.get('top_results') or [], 1):
        lines.append(f' [{i}] {r.get("title") or ""}')
        # `or ""` guards against a null snippet, which cannot be sliced.
        lines.append(f' {(r.get("snippet") or "")[:150]}')
        if r.get('url'):
            lines.append(f' URL: {r["url"]}')
    lines.append('')

    # Amazon products
    amazon = sources.get('amazon') or []
    if amazon:
        lines.append(f'PRODUCT PRESENCE (Amazon, {len(amazon)} listings):')
        for r in amazon[:3]:
            lines.append(f' - {(r.get("title") or "")[:80]}')
            if r.get('url'):
                lines.append(f' URL: {r["url"]}')
        lines.append('')

    # Reddit sentiment
    reddit = sources.get('reddit') or []
    if reddit:
        lines.append(f'COMMUNITY SENTIMENT (Reddit, {len(reddit)} discussions):')
        for r in reddit[:3]:
            lines.append(f' - {(r.get("title") or "")[:80]}')
            snippet = r.get('snippet') or ''
            if snippet:
                lines.append(f' "{snippet[:120]}"')
            if r.get('url'):
                lines.append(f' URL: {r["url"]}')
    return '\n'.join(lines)
# Render the data collected above as a plain-text context block.
context = format_brand_context(brand_data)
print(context)
Étape 3: Interroger le LLM avec un contexte ancré
Envoyez les données de marque vérifiées au LLM avec pour instruction de n'utiliser que les sources fournies. Cela élimine les hallucinations sur les faits relatifs à la marque.
def grounded_brand_analysis(brand: str, question: str, model: str = 'llama3') -> dict:
    """Answer a question about a brand using only freshly collected search data.

    Collects data with ``research_brand``, renders it with
    ``format_brand_context`` and sends it to a chat-completions endpoint
    together with a system prompt that forbids invented facts.

    Environment:
        LLM_URL: Chat-completions endpoint; defaults to a local Ollama server.
        LLM_API_KEY: Optional. When set it is sent as a Bearer token
            (NOTE(review): assumes the endpoint uses OpenAI-style auth).

    Args:
        brand: Brand or company name to research.
        question: Question to answer from the collected data.
        model: Model name passed to the endpoint.

    Returns:
        Dict with ``brand``, ``question``, ``answer``, ``sources_used``
        (number of platforms queried) and ``cost`` (search cost in USD).

    Raises:
        requests.HTTPError: If the LLM endpoint returns an error status.
        requests.Timeout: If the LLM endpoint does not answer in time.
    """
    cost_per_search = 0.005  # USD, one search per platform

    data = research_brand(brand)
    context = format_brand_context(data)
    messages = [
        {'role': 'system', 'content': (
            'You are a brand research analyst. Answer ONLY based on the verified '
            'data provided below. Do NOT make up facts, pricing, features, or '
            'statistics. If the data does not contain the answer, say so. '
            'Cite sources as [Google], [Amazon], or [Reddit].'
        )},
        {'role': 'user', 'content': f'{context}\n\nQuestion: {question}'},
    ]

    llm_url = os.environ.get('LLM_URL', 'http://localhost:11434/v1/chat/completions')
    headers = {}
    api_key = os.environ.get('LLM_API_KEY')
    if api_key:
        headers['Authorization'] = f'Bearer {api_key}'

    resp = requests.post(
        llm_url,
        headers=headers,
        json={'model': model, 'messages': messages, 'max_tokens': 512},
        timeout=120,  # generation can be slow, but must not hang forever
    )
    # Surface HTTP errors here rather than as a KeyError on 'choices' below.
    resp.raise_for_status()
    answer = resp.json()['choices'][0]['message']['content']

    # Derive the counts from what was actually collected instead of hard-coding.
    platforms = len(data['sources'])
    return {
        'brand': brand,
        'question': question,
        'answer': answer,
        'sources_used': platforms,
        'cost': round(platforms * cost_per_search, 3),
    }
# Ask one grounded question about a brand and show the exchange.
result = grounded_brand_analysis(
    brand='Notion',
    question='What is Notion and what products do they offer?',
)
print(f'Q: {result["question"]}')
print(f'A: {result["answer"]}')
print(f'\nSources: {result["sources_used"]} platforms, Cost: ${result["cost"]}')
Étape 4: Rechercher par lot plusieurs marques pour comparaison
Recherchez plusieurs marques et générez un rapport de comparaison. Le LLM peut comparer les marques avec précision car chaque fait est ancré dans des données en direct.
def compare_brands(brands: list[str], question: str, model: str = 'llama3') -> str:
    """Research several brands and ask the LLM to compare them.

    Each brand is researched with ``research_brand`` and rendered with
    ``format_brand_context``. The rendered blocks are joined with ``---``
    separators and sent to a chat-completions endpoint in one request.
    Prints the number of searches used and their cost.

    Environment:
        LLM_URL: Chat-completions endpoint; defaults to a local Ollama server.
        LLM_API_KEY: Optional. When set it is sent as a Bearer token
            (NOTE(review): assumes the endpoint uses OpenAI-style auth).

    Args:
        brands: Brand names to compare.
        question: Comparison question to answer.
        model: Model name passed to the endpoint.

    Returns:
        The LLM's answer text.

    Raises:
        requests.HTTPError: If the LLM endpoint returns an error status.
        requests.Timeout: If the LLM endpoint does not answer in time.
    """
    searches_per_brand = 3
    cost_per_search = 0.005  # USD

    all_context = []
    for index, brand in enumerate(brands):
        if index:
            # Pause between brands only; no need to wait after the last one.
            time.sleep(0.5)
        all_context.append(format_brand_context(research_brand(brand)))
    combined = '\n\n---\n\n'.join(all_context)

    messages = [
        {'role': 'system', 'content': (
            'You are a brand research analyst. Compare the brands using ONLY '
            'the verified data provided. Do not invent facts or features. '
            'Cite sources. Be specific about what each brand offers.'
        )},
        {'role': 'user', 'content': f'{combined}\n\nCompare these brands: {question}'},
    ]

    llm_url = os.environ.get('LLM_URL', 'http://localhost:11434/v1/chat/completions')
    headers = {}
    api_key = os.environ.get('LLM_API_KEY')
    if api_key:
        headers['Authorization'] = f'Bearer {api_key}'

    resp = requests.post(
        llm_url,
        headers=headers,
        json={'model': model, 'messages': messages, 'max_tokens': 1024},
        timeout=120,  # generation can be slow, but must not hang forever
    )
    # Surface HTTP errors here rather than as a KeyError on 'choices' below.
    resp.raise_for_status()
    answer = resp.json()['choices'][0]['message']['content']

    total_searches = len(brands) * searches_per_brand
    cost = total_searches * cost_per_search
    print(f'Compared {len(brands)} brands using {total_searches} searches')
    print(f'Cost: ${cost:.3f}')
    return answer
# Compare three brands on the same question.
brands_to_compare = ['Notion', 'Obsidian', 'Coda']
comparison = compare_brands(
    brands_to_compare,
    'Which is best for team collaboration and why?',
)
print(comparison)
Exemple Python
import os, requests, time
# Scavio API key, read from the SCAVIO_API_KEY environment variable
# (raises KeyError if the variable is not set).
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
# Headers sent with every search request: API key plus JSON content type.
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def research_brand(brand):
    """Fetch organic search results for a brand on Google, Reddit and Amazon.

    Runs one search per platform (a plain query, then ``site:reddit.com``
    and ``site:amazon.com`` queries), prints the result counts and returns
    the raw result lists.

    Args:
        brand: Brand or company name to search for.

    Returns:
        Dict mapping ``'google'``, ``'reddit'`` and ``'amazon'`` to the
        ``organic_results`` list of the matching response (empty list when
        the response has none).

    Raises:
        requests.HTTPError: If a search request returns an error status.
        requests.Timeout: If a search request does not answer in time.
    """
    # Explicit (platform, prefix) pairs instead of guessing from substrings.
    platforms = (
        ('google', ''),
        ('reddit', 'site:reddit.com '),
        ('amazon', 'site:amazon.com '),
    )
    results = {}
    for platform, prefix in platforms:
        resp = requests.post(
            'https://api.scavio.dev/api/v1/search',
            headers=H,
            json={'query': f'{prefix}{brand}', 'country_code': 'us', 'num_results': 5},
            timeout=30,  # without a timeout, requests can block forever
        )
        resp.raise_for_status()
        results[platform] = resp.json().get('organic_results', [])
        time.sleep(0.2)
    print(f'{brand}: {len(results["google"])} google, {len(results["reddit"])} reddit, {len(results["amazon"])} amazon')
    return results
research_brand('Notion')
Exemple JavaScript
// Scavio API key, read from the SCAVIO_API_KEY environment variable.
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
/**
 * Fetch organic search results for a brand on Google, Reddit and Amazon.
 *
 * Runs one search per platform sequentially, logs the result counts and
 * returns the raw result arrays.
 *
 * @param {string} brand Brand or company name to search for.
 * @returns {Promise<{google: Array, reddit: Array, amazon: Array}>}
 *   The `organic_results` array per platform (empty when the response has none).
 * @throws {Error} If a search request returns a non-2xx HTTP status.
 */
async function researchBrand(brand) {
  const platforms = [
    ['google', ''],
    ['reddit', 'site:reddit.com '],
    ['amazon', 'site:amazon.com '],
  ];
  const results = {};
  for (const [platform, prefix] of platforms) {
    const resp = await fetch('https://api.scavio.dev/api/v1/search', {
      method: 'POST',
      headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
      body: JSON.stringify({ query: `${prefix}${brand}`, country_code: 'us', num_results: 5 })
    });
    // fetch() resolves on HTTP errors too; without this check a 401/429/500
    // would be silently reported as "0 results".
    if (!resp.ok) {
      throw new Error(`Search request for ${platform} failed with HTTP ${resp.status}`);
    }
    const payload = await resp.json();
    results[platform] = payload.organic_results || [];
  }
  console.log(`${brand}: google=${results.google.length}, reddit=${results.reddit.length}, amazon=${results.amazon.length}`);
  return results;
}
researchBrand('Notion');
Sortie attendue
Brand: Notion
Description: American productivity and note-taking web application
Amazon products: 4
Reddit discussions: 5
Q: What is Notion and what products do they offer?
A: Based on the verified data, Notion is an American productivity and
note-taking web application [Google]. Their Amazon presence includes
Notion-related productivity guides and templates [Amazon]. Reddit
discussions show strong community adoption for team wikis and project
management [Reddit].
Sources: 3 platforms, Cost: $0.015