YouTube 评论包含大多数工具忽略的产品反馈、观众情绪和内容创意。本教程使用 Scavio 搜索 API 查找 YouTube 视频页面、提取评论数据并返回干净的结构化 JSON,以供分析管道、情绪评分或代理使用。
前置条件
- Python 3.11+ 或 Node.js 20+
- 来自 https://scavio.dev 的 Scavio API 密钥
- 基本熟悉 JSON 解析
- 可选:用于数据分析的 pandas
操作指南
步骤 1: 搜索 YouTube 视频页面
使用 Scavio Search API 和限定站点的查询(site:youtube.com)来查找与您的主题匹配的 YouTube 视频 URL。API 返回页面元数据和内容片段。
import httpx
# Placeholder Scavio API key; the request code below sends it in the x-api-key header.
SCAVIO_API_KEY = "your-api-key"
async def search_youtube_videos(topic: str, max_results: int = 5) -> list[dict]:
    """Search Scavio for YouTube video pages about ``topic``.

    Sends one site-scoped query (``site:youtube.com <topic>``) and keeps only
    the hits whose URL contains ``youtube.com/watch`` or ``youtu.be/``.

    Args:
        topic: Free-text subject appended to the site-scoped query.
        max_results: Value sent as ``num_results`` in the request body.

    Returns:
        One dict per matching hit with the keys ``url``, ``title``,
        ``snippet`` (the hit's ``description`` field) and ``video_id``.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    payload = {
        "query": f"site:youtube.com {topic}",
        "num_results": max_results,
    }
    async with httpx.AsyncClient(timeout=15) as session:
        response = await session.post(
            "https://api.scavio.dev/api/v1/search",
            headers={"x-api-key": SCAVIO_API_KEY},
            json=payload,
        )
        response.raise_for_status()
        hits = response.json().get("results", [])

    def _is_video_link(link: str) -> bool:
        # Full watch pages and short links are the only URL shapes kept.
        return "youtube.com/watch" in link or "youtu.be/" in link

    return [
        {
            "url": hit.get("url", ""),
            "title": hit.get("title", ""),
            "snippet": hit.get("description", ""),
            "video_id": extract_video_id(hit.get("url", "")),
        }
        for hit in hits
        if _is_video_link(hit.get("url", ""))
    ]
def extract_video_id(url: str) -> str:
    """Return the YouTube video ID embedded in ``url``, or ``""`` if none.

    Handles ``...watch?v=<id>`` style URLs (the ``v`` query parameter) and
    ``youtu.be/<id>`` short links.

    Args:
        url: URL string; a scheme is not required.

    Returns:
        The video ID with any trailing query string, fragment or path
        segment removed, or an empty string when no ID is found.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import parse_qs

    # Read the real ``v`` parameter instead of splitting on the first "v="
    # substring: that split also matches e.g. "rev=" and keeps "#fragment".
    query = url.partition("#")[0].partition("?")[2]
    ids = parse_qs(query).get("v")
    if ids:
        return ids[0]

    # Short link: the ID is the path segment right after "youtu.be/".
    # partition() yields "" here when the marker is absent.
    tail = url.partition("youtu.be/")[2]
    for stop in "?&#/":
        tail = tail.partition(stop)[0]
    return tail


# 步骤 2: 从视频页面获取并解析评论数据
搜索与每个视频相关的讨论和评论线程。Scavio API 会抓取页面内容,其中包括已索引页面中可见的评论文本。
async def fetch_video_comments(
    client: httpx.AsyncClient,
    video_id: str,
    video_title: str
) -> list[dict]:
    """Search Scavio for comment discussions about one video.

    Posts the query ``youtube comments "<video_title>"`` and keeps every
    result whose ``description`` is longer than 20 characters. The returned
    text is the search result's description, tagged with ``video_id``.

    Args:
        client: Open ``httpx.AsyncClient`` used to send the request.
        video_id: ID copied into each returned record.
        video_title: Title placed inside double quotes in the search query.

    Returns:
        A list of dicts with the keys ``video_id``, ``source_url``, ``text``
        and ``source_title``.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    resp = await client.post(
        "https://api.scavio.dev/api/v1/search",
        headers={"x-api-key": SCAVIO_API_KEY},
        json={
            # The inner quotes must be escaped; left bare they terminate the
            # f-string early and the snippet fails with a SyntaxError.
            "query": f"youtube comments \"{video_title}\"",
            "num_results": 10,
        },
    )
    resp.raise_for_status()
    results = resp.json().get("results", [])

    comments = []
    for r in results:
        # ``or ""`` also covers a description that is present but null.
        content = r.get("description") or ""
        if len(content) > 20:
            comments.append({
                "video_id": video_id,
                "source_url": r.get("url", ""),
                "text": content,
                "source_title": r.get("title", ""),
            })
    return comments


# 步骤 3: 将输出结构化为干净的 JSON
将视频元数据和评论数据合并到单个结构化 JSON 输出中。添加时间戳并对评论去重,以保持数据干净。
from datetime import datetime
async def extract_youtube_comments(topic: str) -> dict:
    """Collect videos and de-duplicated comment snippets for ``topic``.

    Looks up videos with ``search_youtube_videos``, then calls
    ``fetch_video_comments`` once per video over a shared HTTP client.
    A comment is dropped when the first 100 characters of its text
    (lower-cased) were already seen, including under an earlier video.

    Args:
        topic: Subject passed to the video search.

    Returns:
        A dict with ``topic``, ``extracted_at`` (ISO timestamp from
        ``datetime.now()``), ``total_videos``, ``total_comments`` and
        ``videos`` (one entry per video holding ``video``, ``comments`` and
        ``comment_count``).
    """
    found = await search_youtube_videos(topic)
    per_video = []
    fingerprints = set()

    async with httpx.AsyncClient(timeout=15) as session:
        for entry in found:
            fetched = await fetch_video_comments(
                session, entry["video_id"], entry["title"]
            )
            fresh = []
            for item in fetched:
                fingerprint = item["text"][:100].lower()
                if fingerprint in fingerprints:
                    continue
                fingerprints.add(fingerprint)
                fresh.append(item)
            per_video.append({
                "video": entry,
                "comments": fresh,
                "comment_count": len(fresh),
            })

    report = {"topic": topic, "extracted_at": datetime.now().isoformat()}
    report["total_videos"] = len(found)
    report["total_comments"] = sum(block["comment_count"] for block in per_video)
    report["videos"] = per_video
    return report


# Python 示例
import asyncio
import json
import httpx
from datetime import datetime
# Placeholder Scavio API key; main() sends it in the x-api-key header of each request.
SCAVIO_API_KEY = "your-api-key"
def extract_video_id(url: str) -> str:
    """Return the YouTube video ID embedded in ``url``, or ``""`` if none.

    Handles ``...watch?v=<id>`` style URLs (the ``v`` query parameter) and
    ``youtu.be/<id>`` short links.

    Args:
        url: URL string; a scheme is not required.

    Returns:
        The video ID with any trailing query string, fragment or path
        segment removed, or an empty string when no ID is found.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import parse_qs

    # Read the real ``v`` parameter instead of splitting on the first "v="
    # substring: that split also matches e.g. "rev=" and keeps "#fragment".
    query = url.partition("#")[0].partition("?")[2]
    ids = parse_qs(query).get("v")
    if ids:
        return ids[0]

    # Short link: the ID is the path segment right after "youtu.be/".
    # partition() yields "" here when the marker is absent.
    tail = url.partition("youtu.be/")[2]
    for stop in "?&#/":
        tail = tail.partition(stop)[0]
    return tail
async def main():
    """Run the end-to-end example and print the result as JSON.

    Searches Scavio for YouTube videos about a fixed topic, runs one
    ``youtube comments "<title>"`` search per video, keeps result
    descriptions longer than 20 characters (truncated to 200), and prints a
    summary dict with ``json.dumps``.

    Raises:
        httpx.HTTPStatusError: If either search request returns 4xx/5xx.
    """
    topic = "AI agent frameworks 2026"
    async with httpx.AsyncClient(timeout=15) as client:
        # Find videos
        resp = await client.post(
            "https://api.scavio.dev/api/v1/search",
            headers={"x-api-key": SCAVIO_API_KEY},
            json={"query": f"site:youtube.com {topic}", "num_results": 5}
        )
        # Fail loudly on an error response; without this check a 4xx/5xx
        # body with no "results" key looks like "zero videos found".
        resp.raise_for_status()

        videos = []
        for r in resp.json().get("results", []):
            url = r.get("url", "")
            # Keep youtu.be short links too; extract_video_id handles both.
            if "youtube.com/watch" in url or "youtu.be/" in url:
                videos.append({
                    "url": url,
                    "title": r.get("title", ""),
                    "video_id": extract_video_id(url),
                })

        # Fetch comment discussions
        all_comments = []
        for v in videos:
            resp = await client.post(
                "https://api.scavio.dev/api/v1/search",
                headers={"x-api-key": SCAVIO_API_KEY},
                json={"query": f"youtube comments \"{v['title']}\"", "num_results": 5}
            )
            resp.raise_for_status()
            for r in resp.json().get("results", []):
                if len(r.get("description", "")) > 20:
                    all_comments.append({"video": v["title"], "text": r["description"][:200]})

    output = {
        "topic": topic,
        "extracted_at": datetime.now().isoformat(),
        "videos_found": len(videos),
        "comments_extracted": len(all_comments),
        "data": all_comments
    }
    print(json.dumps(output, indent=2))
asyncio.run(main())
JavaScript 示例
// Placeholder Scavio API key; searchScavio sends it in the x-api-key header.
const SCAVIO_API_KEY = "your-api-key";
/**
 * Return the YouTube video ID embedded in a URL, or "" when none is found.
 *
 * Handles `...watch?v=<id>` URLs (the `v` query parameter) and
 * `youtu.be/<id>` short links.
 *
 * @param {string} url - URL string; a scheme is not required.
 * @returns {string} The video ID without any trailing query string,
 *   fragment or path segment, or an empty string.
 */
function extractVideoId(url) {
  // Read the real `v` parameter instead of splitting on the first "v="
  // substring: that split also matches e.g. "rev=" and keeps "#fragment".
  const query = url.split("#")[0].split("?").slice(1).join("?");
  const id = new URLSearchParams(query).get("v");
  if (id) return id;

  // Short link: the ID is the path segment right after "youtu.be/".
  const marker = "youtu.be/";
  const at = url.indexOf(marker);
  if (at === -1) return "";
  return url.slice(at + marker.length).split(/[?&#/]/)[0];
}
/**
 * POST a search query to the Scavio search endpoint and return the parsed
 * JSON body.
 *
 * @param {string} query - Search query string.
 * @param {number} [numResults=5] - Value sent as `num_results`.
 * @returns {Promise<object>} Parsed JSON response body.
 * @throws {Error} If the response status is not in the 2xx range.
 */
async function searchScavio(query, numResults = 5) {
  const resp = await fetch("https://api.scavio.dev/api/v1/search", {
    method: "POST",
    headers: { "x-api-key": SCAVIO_API_KEY, "Content-Type": "application/json" },
    body: JSON.stringify({ query, num_results: numResults })
  });
  // fetch() resolves on 4xx/5xx too; without this check an error body with
  // no `results` field is indistinguishable from an empty result set.
  if (!resp.ok) {
    throw new Error(`Scavio search failed: ${resp.status} ${resp.statusText}`);
  }
  return resp.json();
}
/**
 * Run the end-to-end example: search for YouTube videos about a fixed
 * topic, run one `youtube comments "<title>"` search per video, keep result
 * descriptions longer than 20 characters (truncated to 200), and print a
 * JSON summary to the console.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const topic = "AI agent frameworks 2026";
  const videoData = await searchScavio("site:youtube.com " + topic);
  const videos = (videoData.results || [])
    // Keep youtu.be short links too; extractVideoId handles both URL shapes.
    .filter(r => {
      const url = r.url || "";
      return url.includes("youtube.com/watch") || url.includes("youtu.be/");
    })
    // Default a missing title to "" so the follow-up query never contains
    // the literal text "undefined".
    .map(r => ({ url: r.url, title: r.title || "", videoId: extractVideoId(r.url) }));

  const allComments = [];
  for (const v of videos) {
    const commData = await searchScavio('youtube comments "' + v.title + '"', 5);
    for (const r of (commData.results || [])) {
      if ((r.description || "").length > 20) {
        allComments.push({ video: v.title, text: r.description.slice(0, 200) });
      }
    }
  }

  console.log(JSON.stringify({
    topic,
    extractedAt: new Date().toISOString(),
    videosFound: videos.length,
    commentsExtracted: allComments.length,
    data: allComments
  }, null, 2));
}
main();
预期输出
{
"topic": "AI agent frameworks 2026",
"extracted_at": "2026-05-17T10:30:00",
"videos_found": 4,
"comments_extracted": 12,
"data": [...]
}