Tous les résultats de recherche ne se valent pas en termes de fiabilité. Un domaine .gov citant des données primaires est plus fiable qu’un article de blog d’une ferme de contenu. Ce tutoriel construit un pipeline de notation de confiance qui évalue chaque résultat de recherche sur l’autorité de la source, la fraîcheur du contenu et la cohérence des références croisées. Les scores aident les agents d’IA à prioriser les sources fiables et à signaler les sources douteuses. Coût : 0,005 $ par recherche, plus des requêtes de vérification en option.
Prérequis
- Python 3.9+ installé
- bibliothèque requests installée
- Une clé API Scavio depuis scavio.dev
Parcours
Étape 1: Définir les niveaux d’autorité des sources
Classer les domaines en niveaux d’autorité en fonction de leur TLD et de leur réputation connue. Cela fournit un signal de confiance de base.
# Authority tiers, checked in declaration order. 'domains' holds bare TLDs and
# 'known_sites' holds hostnames; a URL matching either gets the tier's score.
AUTHORITY_TIERS = {
    'tier1': {
        'domains': {'gov', 'edu', 'mil'},
        'known_sites': {'reuters.com', 'apnews.com', 'nature.com', 'science.org',
                        'arxiv.org', 'nih.gov', 'cdc.gov', 'who.int'},
        'score': 90,
    },
    'tier2': {
        'domains': set(),
        'known_sites': {'nytimes.com', 'bbc.com', 'washingtonpost.com',
                        'github.com', 'stackoverflow.com', 'docs.python.org',
                        'developer.mozilla.org', 'microsoft.com'},
        'score': 75,
    },
    'tier3': {
        'domains': {'org', 'io'},
        'known_sites': {'medium.com', 'dev.to', 'hackernoon.com', 'reddit.com'},
        'score': 50,
    },
}


def get_authority_score(url: str) -> int:
    """Return the authority score of the site hosting *url*.

    The host is extracted with ``urllib.parse.urlsplit`` (lower-cased, port and
    credentials removed). Tiers in ``AUTHORITY_TIERS`` are tried in declaration
    order; the first tier whose TLD set contains the host's TLD, or whose known
    sites include the host or a parent domain of it (so ``www.nytimes.com``
    matches ``nytimes.com``), supplies the score.

    Args:
        url: Absolute URL, or a scheme-less ``host/path`` string.

    Returns:
        The matching tier's score, or 30 when the host is empty, cannot be
        parsed, or matches no tier.
    """
    from urllib.parse import urlsplit  # local import keeps the snippet self-contained

    unknown_score = 30  # baseline for hosts that match no tier

    target = (url or '').strip()
    if not target:
        return unknown_score
    # urlsplit only recognises a network location after '//', so give
    # scheme-less input such as 'example.com/page' an empty-scheme prefix.
    if '://' not in target:
        target = '//' + target.lstrip('/')
    try:
        host = (urlsplit(target).hostname or '').rstrip('.')
    except ValueError:  # e.g. malformed IPv6 literal
        return unknown_score
    if not host:
        return unknown_score

    tld = host.rsplit('.', 1)[-1]
    for tier in AUTHORITY_TIERS.values():
        if tld in tier['domains']:
            return tier['score']
        if any(host == site or host.endswith('.' + site)
               for site in tier['known_sites']):
            return tier['score']
    return unknown_score
test_urls = ['https://nih.gov/study', 'https://github.com/repo',
'https://randomsite.xyz/blog']
for url in test_urls:
print(f' {url}: authority={get_authority_score(url)}')
Étape 2: Ajouter une notation de fraîcheur
Noter les résultats en fonction de la récence de la publication ou de la mise à jour du contenu. Extraire les dates des extraits et des URL.
import re
from datetime import datetime
def get_freshness_score(snippet: str, url: str,
                        current_year: 'int | None' = None) -> int:
    """Score freshness from 0-100 based on the most recent year mentioned.

    Four-digit years of the form 20xx are searched for in both the snippet and
    the URL; the latest one found is compared with ``current_year``.

    Args:
        snippet: Result snippet text; ``None`` is treated as empty.
        url: Result URL (slugs such as ``/crm-2026`` count as a date hint).
        current_year: Reference year. Defaults to the current calendar year so
            the scoring does not go stale.

    Returns:
        100 if the latest year is the reference year (or later), 70 if it is
        one year old, 40 if two years old, 10 if older, and 20 when no year is
        found at all.
    """
    if current_year is None:
        current_year = datetime.now().year
    text = f"{snippet or ''} {url or ''}"

    # Digit lookarounds stop matches inside longer numbers (e.g. an order id
    # like 120256) while still accepting separators such as '-' or '_'.
    years = [int(y) for y in re.findall(r'(?<!\d)20\d{2}(?!\d)', text)]
    if not years:
        return 20  # no date information found

    # A year in the future (e.g. "Roadmap 2027") counts as current, not stale.
    age = max(current_year - max(years), 0)
    # A separate month-year check is unnecessary: any "May 2026"-style date
    # already contains a year and is handled by the lookup below.
    return {0: 100, 1: 70, 2: 40}.get(age, 10)
test_snippets = [
('Updated May 2026 - Best CRM tools', 'https://site.com/crm-2026'),
('A comprehensive guide from 2024', 'https://site.com/old-guide'),
('Learn Python programming basics', 'https://site.com/python'),
]
for snippet, url in test_snippets:
print(f' freshness={get_freshness_score(snippet, url):3d}: {snippet[:50]}')
Étape 3: Construire le pipeline de notation de confiance composite
Combiner l’autorité, la fraîcheur et la cohérence des références croisées en un seul score de confiance pour chaque résultat de recherche.
import requests, os
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def trust_score_results(query: str) -> list:
    """Search for *query* and return the results ranked by composite trust.

    Each organic result is scored on three signals: authority
    (``get_authority_score``), freshness (``get_freshness_score``) and
    cross-reference consistency (how many other results share more than three
    words of five or more letters with this result's snippet, 20 points each,
    capped at 100). The composite is ``0.4 * authority + 0.3 * freshness +
    0.3 * consistency``, rounded.

    Args:
        query: Search query sent to the Scavio search endpoint.

    Returns:
        A list of dicts with keys ``title`` (truncated to 50 characters),
        ``url``, ``trust_score``, ``authority``, ``freshness`` and
        ``consistency``, sorted by ``trust_score`` in descending order.
        Results without a link are skipped.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On network failure or timeout.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': query, 'country_code': 'us', 'num_results': 10},
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    resp.raise_for_status()

    # A result without a link cannot be scored for authority, so drop it.
    results = [r for r in (resp.json().get('organic_results') or [])
               if r.get('link')]

    # Extract each snippet's keyword set once instead of once per pair.
    snippets = [r.get('snippet') or '' for r in results]
    keyword_sets = [set(re.findall(r'\b\w{5,}\b', s.lower())) for s in snippets]

    scored = []
    for i, (r, snippet, keywords) in enumerate(zip(results, snippets, keyword_sets)):
        authority = get_authority_score(r['link'])
        freshness = get_freshness_score(snippet, r['link'])
        # Cross-reference: how many other results mention similar terms?
        cross_ref = sum(
            1 for j, other in enumerate(keyword_sets)
            if j != i and len(keywords & other) > 3
        )
        consistency = min(cross_ref * 20, 100)
        # Weighted composite
        trust = round(authority * 0.4 + freshness * 0.3 + consistency * 0.3)
        scored.append({
            'title': (r.get('title') or '')[:50], 'url': r['link'],
            'trust_score': trust, 'authority': authority,
            'freshness': freshness, 'consistency': consistency,
        })
    scored.sort(key=lambda item: item['trust_score'], reverse=True)
    return scored
results = trust_score_results('best CRM software 2026')
print(f'{"Score":>5} {"Auth":>5} {"Fresh":>5} {"Cross":>5} Title')
print('-' * 70)
for r in results[:5]:
print(f'{r["trust_score"]:>5} {r["authority"]:>5} {r["freshness"]:>5} '
f'{r["consistency"]:>5} {r["title"]}')
Exemple Python
import requests, os, re
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
KNOWN = {'gov': 90, 'edu': 90, 'github.com': 75, 'stackoverflow.com': 75}
def trust_score(query):
    """Search for *query* and print a quick trust score for each result.

    The score is the truncated mean of an authority value (looked up in
    ``KNOWN`` by hostname, then by TLD, defaulting to 30) and a freshness value
    (100 when the snippet contains '2026', otherwise 40). One line of the form
    ``[score] title`` is printed per organic result; nothing is returned.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On network failure or timeout.
    """
    from urllib.parse import urlsplit  # local import keeps the snippet self-contained

    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': query, 'country_code': 'us', 'num_results': 10},
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    resp.raise_for_status()
    for r in resp.json().get('organic_results') or []:
        # urlsplit strips ports/credentials and lower-cases the host; a missing
        # or malformed link simply scores as an unknown domain.
        try:
            domain = urlsplit(r.get('link') or '').hostname or ''
        except ValueError:
            domain = ''
        tld = domain.rsplit('.', 1)[-1]
        auth = KNOWN.get(domain, KNOWN.get(tld, 30))
        fresh = 100 if '2026' in (r.get('snippet') or '') else 40
        score = int(auth * 0.5 + fresh * 0.5)
        title = (r.get('title') or '')[:50]
        print(f'[{score:3d}] {title}')
trust_score('python best practices 2026')
Exemple JavaScript
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
const KNOWN = { gov: 90, edu: 90, 'github.com': 75, 'stackoverflow.com': 75 };
/**
 * Search for `query` and log a quick trust score for each organic result.
 *
 * The score is the rounded mean of an authority value (looked up in KNOWN by
 * hostname, then by TLD, defaulting to 30) and a freshness value (100 when the
 * snippet contains "2026", otherwise 40).
 *
 * @param {string} query - Search query sent to the Scavio search endpoint.
 * @returns {Promise<void>}
 * @throws {Error} If the API responds with a non-2xx status.
 */
async function trustScore(query) {
  const resp = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({ query, country_code: 'us', num_results: 10 })
  });
  if (!resp.ok) {
    throw new Error(`Scavio search failed: HTTP ${resp.status}`);
  }
  const results = (await resp.json()).organic_results || [];

  // Own-property lookup only: a plain KNOWN[key] would also resolve inherited
  // names such as "constructor" and yield a non-numeric score.
  const known = (key) =>
    Object.prototype.hasOwnProperty.call(KNOWN, key) ? KNOWN[key] : undefined;

  for (const r of results) {
    let domain = '';
    try {
      domain = new URL(r.link).hostname;
    } catch {
      // Missing or malformed link: score it as an unknown domain.
    }
    const tld = domain.split('.').pop();
    const auth = known(domain) ?? known(tld) ?? 30;
    const fresh = (r.snippet || '').includes('2026') ? 100 : 40;
    const title = (r.title || '').slice(0, 50);
    console.log(`[${Math.round(auth * 0.5 + fresh * 0.5)}] ${title}`);
  }
}
trustScore('python best practices 2026');
Sortie attendue
Score Auth Fresh Cross Title
----------------------------------------------------------------------
78 90 100 40 NIH Guidelines on Data Analysis 2026
78 75 100 60 GitHub - python-best-practices: Updated May
69 75 70 60 Stack Overflow: Python 3.14 New Features
54 30 100 40 Best Python Practices 2026 - TechBlog
36 30 40 40 Python Tips and Tricks - randomsite.com