标准 RAG 管道从向量存储中检索上下文,但这些上下文的新鲜度只停留在文档被索引的那一刻。对于有关定价、近期事件或当前排名的查询,陈旧的上下文会导致模型给出看似确定、实则错误的答案。SERP 增强通过添加一条并行检索路径来解决此问题:当查询看起来对时间敏感时,先获取实时搜索结果并将其与向量检索结果合并,然后再传递给 LLM。本教程展示如何将 SERP 增强添加到任何现有的 RAG 管道。通过 Scavio API 进行的每次搜索调用费用为 0.005 美元。
前置条件
- 已安装 Python 3.9+
- 现有的 RAG 管道(任何向量存储)
- 已安装 requests 库
- 来自 scavio.dev 的 Scavio API 密钥
操作指南
步骤 1: 检测时间敏感的查询
构建一个分类器来确定查询是否需要实时数据。有关价格、日期、版本或当前事件的查询应触发 SERP 增强。
import re
# Regex patterns that mark a query as time-sensitive. They are written in
# lower case because needs_live_data() lower-cases the query before matching.
# NOTE: the year pattern only covers 2024-2029.
TIME_SIGNALS = [
    r'\b202[4-9]\b', r'\blatest\b', r'\bcurrent\b', r'\bprice\b',
    r'\bpricing\b', r'\btoday\b', r'\brecent\b', r'\bnew\b',
    r'\bversion\b', r'\brelease\b', r'\bupdate\b',
]


def needs_live_data(query: str) -> bool:
    """Return True when *query* contains any time-sensitivity signal.

    The query is lower-cased once and then tested against every pattern in
    ``TIME_SIGNALS``; the first match short-circuits the scan.

    Args:
        query: The raw user query.

    Returns:
        True if at least one pattern matches, otherwise False.
    """
    query_lower = query.lower()
    return any(re.search(pattern, query_lower) for pattern in TIME_SIGNALS)
# Examples:
for q in ['What is a transformer?', 'Latest Python version 2026', 'Semrush pricing today']:
    print(f'{q}: live_data={needs_live_data(q)}')

步骤 2: 构建 SERP 检索功能
创建一个检索器,以与向量存储相同的格式返回文档,以便两者的结果可以无缝合并。
import requests, os
# Scavio API key, read from the environment. Subscript access on os.environ
# raises KeyError as soon as this line runs if SCAVIO_API_KEY is not set.
API_KEY = os.environ['SCAVIO_API_KEY']
def serp_retrieve(query: str, k: int = 5) -> list:
resp = requests.post('https://api.scavio.dev/api/v1/search',
headers={'x-api-key': API_KEY, 'Content-Type': 'application/json'},
json={'query': query, 'country_code': 'us'})
resp.raise_for_status()
results = resp.json().get('organic_results', [])[:k]
return [{
'content': f'{r["title"]}\n{r.get("snippet", "")}',
'source': r['link'],
'retriever': 'serp',
'position': r['position']
} for r in results]

步骤 3: 合并向量和 SERP 结果
合并两个检索器的结果,按 URL 去除重复项,并将 SERP 结果排在前面,以优先使用最新数据。
def merge_results(vector_docs: list, serp_docs: list, max_docs: int = 10) -> list:
    """Merge SERP and vector documents into one de-duplicated context list.

    SERP documents are placed first so fresh data takes priority, followed by
    the vector documents. A document is dropped when an earlier document has
    already used the same non-empty ``source`` URL. Documents whose ``source``
    is missing or empty cannot be compared by URL, so they are always kept
    rather than being collapsed into a single entry.

    Args:
        vector_docs: Document dicts from the vector store retriever.
        serp_docs: Document dicts from the SERP retriever.
        max_docs: Maximum number of documents to return (default 10).

    Returns:
        A new list holding at most ``max_docs`` documents.
    """
    seen_urls = set()
    merged = []
    # SERP results first (fresh data priority), then vector results.
    for doc in [*serp_docs, *vector_docs]:
        url = doc.get('source', '')
        if url:
            if url in seen_urls:
                continue
            seen_urls.add(url)
        merged.append(doc)
    return merged[:max_docs]
# Example: merge one stale vector hit with live SERP hits for the same topic.
vector_results = [{'content': 'Old pricing data...', 'source': 'https://example.com/old', 'retriever': 'vector'}]
# This performs a real POST to the Scavio search endpoint.
serp_results = serp_retrieve('Semrush pricing 2026')
# merge_results puts the SERP docs ahead of the vector doc.
merged = merge_results(vector_results, serp_results)
for doc in merged:
    print(f'[{doc["retriever"]}] {doc["content"][:60]}...')

步骤 4: 集成到您的 RAG 管道中
将增强逻辑包装到现有的检索步骤中。仅当查询对时间敏感时才调用 SERP API,以将成本降至最低。
def augmented_retrieve(query: str, vector_store, k: int = 5) -> list:
    """Retrieve context for *query*, adding live SERP results when needed.

    Vector results are always fetched. The SERP retriever is called only when
    ``needs_live_data(query)`` is true, so queries that are not time-sensitive
    never trigger a search API call.

    Args:
        query: The user query.
        vector_store: Object exposing ``similarity_search(query, k=...)`` whose
            results provide ``page_content`` and a ``metadata`` mapping.
        k: Number of documents requested from each retriever (default 5).

    Returns:
        A list of document dicts. Vector-derived dicts carry ``content``,
        ``source`` and ``retriever`` keys; for time-sensitive queries the
        list is the output of ``merge_results``.
    """
    # Always get vector results, normalised to the shared dict format.
    vector_docs = vector_store.similarity_search(query, k=k)
    vector_formatted = [
        {
            'content': d.page_content,
            'source': d.metadata.get('source', ''),
            'retriever': 'vector',
        }
        for d in vector_docs
    ]
    if not needs_live_data(query):
        return vector_formatted
    # Time-sensitive query: add SERP results and merge.
    serp_docs = serp_retrieve(query, k=k)
    return merge_results(vector_formatted, serp_docs)
# Usage in your chain:
# docs = augmented_retrieve(user_query, my_vector_store)
# context = '\n\n'.join(d['content'] for d in docs)
# answer = llm(f'Context:\n{context}\n\nQuestion: {user_query}')

Python 示例
import os, re, requests
# Scavio API key, read from the environment. Subscript access on os.environ
# raises KeyError as soon as this line runs if SCAVIO_API_KEY is not set.
API_KEY = os.environ['SCAVIO_API_KEY']
# Compact set of lower-case regex signals for time-sensitive queries.
# NOTE: the year pattern only covers 2024-2029.
TIME_SIGNALS = [r'\b202[4-9]\b', r'\blatest\b', r'\bprice\b', r'\bcurrent\b']


def needs_live_data(query: str) -> bool:
    """Return True when *query* matches any pattern in ``TIME_SIGNALS``.

    Matching is done on the lower-cased query, so it is case-insensitive.
    """
    # Lower-case once instead of once per pattern.
    lowered = query.lower()
    return any(re.search(p, lowered) for p in TIME_SIGNALS)
def serp_retrieve(query: str, k: int = 5, timeout: float = 10.0) -> list:
    """Fetch the top *k* organic search results for *query* from Scavio.

    Args:
        query: Search query sent to the API.
        k: Maximum number of organic results to keep (default 5).
        timeout: Seconds to wait for the HTTP response (default 10.0).
            Without it, requests would wait indefinitely.

    Returns:
        A list of dicts with ``content`` (title and snippet joined by a
        newline), ``source`` (the result link) and ``retriever`` set to
        ``'serp'``. Missing title, snippet or link fields become ``''``.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': API_KEY, 'Content-Type': 'application/json'},
        json={'query': query, 'country_code': 'us'},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of treating an error payload as
    # "no results".
    resp.raise_for_status()
    results = (resp.json().get('organic_results') or [])[:k]
    return [
        {
            'content': f'{r.get("title", "")}\n{r.get("snippet", "")}',
            'source': r.get('link', ''),
            'retriever': 'serp',
        }
        for r in results
    ]
def augmented_rag(query: str, vector_docs: list) -> list:
    """Return the context documents for *query*, SERP results first.

    When ``needs_live_data(query)`` is false, ``vector_docs`` is returned
    unchanged and no search call is made. Otherwise the SERP documents are
    placed in front of the vector documents in a new list; no
    de-duplication is performed.

    Args:
        query: The user query.
        vector_docs: Document dicts already retrieved from the vector store.

    Returns:
        The list of context documents to pass to the LLM.
    """
    if not needs_live_data(query):
        return vector_docs
    serp_docs = serp_retrieve(query)
    return serp_docs + vector_docs
# Demo query: "2026" matches the year pattern in TIME_SIGNALS, so the SERP path runs.
query = 'Semrush pricing 2026'
# The single hand-written dict stands in for documents from a vector store.
result = augmented_rag(query, [{'content': 'old data', 'retriever': 'vector'}])
for r in result:
    print(f'[{r["retriever"]}] {r["content"][:60]}')

JavaScript 示例
// Scavio API key, read from the SCAVIO_API_KEY environment variable
// (undefined when the variable is not set).
const API_KEY = process.env.SCAVIO_API_KEY;
// Case-insensitive patterns that flag a query as time-sensitive.
const TIME_SIGNALS = [
  /\b202[4-9]\b/i,
  /\blatest\b/i,
  /\bprice\b/i,
  /\bcurrent\b/i,
];

/**
 * Report whether the query matches at least one time-sensitivity pattern.
 * @param {string} query - Raw user query.
 * @returns {boolean} true on the first matching pattern, false if none match.
 */
function needsLiveData(query) {
  for (const pattern of TIME_SIGNALS) {
    if (pattern.test(query)) {
      return true;
    }
  }
  return false;
}
/**
 * Fetch the top `k` organic search results for `query` from the Scavio API.
 * @param {string} query - Search query sent to the API.
 * @param {number} [k=5] - Maximum number of organic results to keep.
 * @returns {Promise<Array<{content: string, source: string, retriever: string}>>}
 *   Documents whose content is the title and snippet joined by a newline.
 * @throws {Error} When the API responds with a non-2xx status.
 */
async function serpRetrieve(query, k = 5) {
  const resp = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: { 'x-api-key': API_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({ query, country_code: 'us' }),
  });
  // fetch() resolves even on 4xx/5xx, so surface HTTP errors explicitly
  // instead of treating an error payload as "no results".
  if (!resp.ok) {
    throw new Error(`Scavio search failed: ${resp.status} ${resp.statusText}`);
  }
  const data = await resp.json();
  return (data.organic_results || [])
    .slice(0, k)
    .map((r) => ({
      content: `${r.title}\n${r.snippet || ''}`,
      source: r.link,
      retriever: 'serp',
    }));
}
/**
 * Return the context documents for `query`, with live SERP results first
 * when the query is time-sensitive.
 * @param {string} query - Raw user query.
 * @param {Array<object>} vectorDocs - Documents already retrieved from the vector store.
 * @returns {Promise<Array<object>>} vectorDocs unchanged, or SERP docs followed by vectorDocs.
 */
async function augmentedRag(query, vectorDocs) {
  // Guard clause: queries that are not time-sensitive skip the search call.
  if (!needsLiveData(query)) {
    return vectorDocs;
  }
  const liveDocs = await serpRetrieve(query);
  return [...liveDocs, ...vectorDocs];
}
augmentedRag('Semrush pricing 2026', [{ content: 'old', retriever: 'vector' }])
  .then(docs => docs.forEach(d => console.log(`[${d.retriever}] ${d.content.slice(0, 60)}`)));

预期输出
What is a transformer?: live_data=False
Latest Python version 2026: live_data=True
Semrush pricing today: live_data=True
[serp] Semrush Pricing Plans 2026 - Complete Breakdown...
[serp] Semrush Review: Is the $139.95/mo Pro Plan Worth It?...
[serp] Semrush vs Ahrefs Pricing Comparison (May 2026)...
[vector] old data