YouTube comments contain product feedback, audience sentiment, and content ideas that most tools ignore. This tutorial uses the Scavio Search API to find YouTube video pages, extract comment data, and return clean structured JSON ready for analysis pipelines, sentiment scoring, or agent consumption.
Prerequisites
- Python 3.11+ or Node.js 20+
- A Scavio API key from https://scavio.dev
- Basic familiarity with JSON parsing
- Optional: pandas for data analysis
Walkthrough
Step 1: Search for YouTube video pages
Use the Scavio Search API with a site-scoped query to find YouTube video URLs matching your topic. The API returns page metadata and content snippets.
import httpx
# Placeholder value: substitute your own Scavio API key before running.
SCAVIO_API_KEY = "your-api-key"
async def search_youtube_videos(topic: str, max_results: int = 5) -> list[dict]:
    """Search Scavio for YouTube video pages about *topic*.

    Sends a ``site:youtube.com``-scoped query and keeps only results whose
    URL contains ``youtube.com/watch`` or ``youtu.be/``.

    Args:
        topic: Free-text search topic appended to the site-scoped query.
        max_results: Value sent as ``num_results``; because filtering
            happens after the search, fewer entries may be returned.

    Returns:
        One dict per matching result with the keys ``url``, ``title``,
        ``snippet`` (the result's ``description`` field) and ``video_id``.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    async with httpx.AsyncClient(timeout=15) as client:
        response = await client.post(
            "https://api.scavio.dev/api/v1/search",
            headers={"x-api-key": SCAVIO_API_KEY},
            json={
                "query": f"site:youtube.com {topic}",
                "num_results": max_results,
            },
        )
        response.raise_for_status()
        hits = response.json().get("results", [])

    found = []
    for hit in hits:
        link = hit.get("url", "")
        # Skip anything that is not a watch page or a short link.
        if "youtube.com/watch" not in link and "youtu.be/" not in link:
            continue
        found.append({
            "url": link,
            "title": hit.get("title", ""),
            "snippet": hit.get("description", ""),
            "video_id": extract_video_id(link),
        })
    return found
def extract_video_id(url: str) -> str:
    """Return the YouTube video ID found in *url*, or ``""`` if there is none.

    Two URL shapes are recognised: a ``v`` query parameter (watch pages)
    and a ``youtu.be/<id>`` short link.

    Args:
        url: The URL to inspect; may be empty.

    Returns:
        The video ID, or an empty string when neither shape matches.
    """
    from urllib.parse import parse_qs

    # Drop the fragment first so "#t=30" never leaks into the ID.
    without_fragment = url.partition("#")[0]

    # Parse the query string properly: a plain substring test for "v="
    # also matches parameters such as "rev=" and returns the wrong value.
    query = without_fragment.partition("?")[2]
    ids = parse_qs(query).get("v")
    if ids:
        return ids[0]

    _, marker, tail = without_fragment.partition("youtu.be/")
    if not marker:
        return ""
    # The ID is the first path segment; cut at any query or path separator.
    for sep in "?&/":
        tail = tail.partition(sep)[0]
    return tail


# Step 2: Fetch and parse comment data from video pages
Search for discussions and comment threads related to each video. The Scavio API captures page content that includes visible comment text from indexed pages.
async def fetch_video_comments(
    client: httpx.AsyncClient,
    video_id: str,
    video_title: str
) -> list[dict]:
    """Search for pages discussing a video and return their snippets.

    Args:
        client: An open ``httpx.AsyncClient`` used for the request.
        video_id: ID copied into every returned record; it is not part of
            the search query.
        video_title: Title placed in double quotes inside the query.

    Returns:
        One dict per result whose ``description`` is longer than 20
        characters, with the keys ``video_id``, ``source_url``, ``text``
        and ``source_title``.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    resp = await client.post(
        "https://api.scavio.dev/api/v1/search",
        headers={"x-api-key": SCAVIO_API_KEY},
        json={
            # Single-quoted f-string so the title can be wrapped in literal
            # double quotes (presumably to request a phrase match).
            "query": f'youtube comments "{video_title}"',
            "num_results": 10
        }
    )
    resp.raise_for_status()
    results = resp.json().get("results", [])
    # Very short snippets are dropped: only descriptions over 20 chars count.
    return [
        {
            "video_id": video_id,
            "source_url": r.get("url", ""),
            "text": r.get("description", ""),
            "source_title": r.get("title", ""),
        }
        for r in results
        if len(r.get("description", "")) > 20
    ]


# Step 3: Structure the output as clean JSON
Combine video metadata and comment data into a single structured JSON output. Add timestamps and deduplication to keep the data clean.
from datetime import datetime
async def extract_youtube_comments(topic: str) -> dict:
    """Collect videos and de-duplicated comment snippets for *topic*.

    Videos are looked up first, then comment snippets are fetched for each
    video one after another over a single shared HTTP client.

    Args:
        topic: Search topic passed to ``search_youtube_videos``.

    Returns:
        A dict with the keys ``topic``, ``extracted_at`` (local-time ISO
        timestamp), ``total_videos``, ``total_comments`` and ``videos``;
        each ``videos`` entry holds ``video``, ``comments`` and
        ``comment_count``.
    """
    videos = await search_youtube_videos(topic)
    per_video = []
    # Shared across all videos, so a snippet already attributed to an
    # earlier video is not repeated under a later one.
    seen_prefixes = set()

    async with httpx.AsyncClient(timeout=15) as client:
        for video in videos:
            fetched = await fetch_video_comments(
                client, video["video_id"], video["title"]
            )
            fresh = []
            for comment in fetched:
                # Duplicates are detected on the lower-cased first 100 chars.
                prefix = comment["text"][:100].lower()
                if prefix in seen_prefixes:
                    continue
                seen_prefixes.add(prefix)
                fresh.append(comment)
            per_video.append({
                "video": video,
                "comments": fresh,
                "comment_count": len(fresh),
            })

    return {
        "topic": topic,
        "extracted_at": datetime.now().isoformat(),
        "total_videos": len(videos),
        "total_comments": sum(entry["comment_count"] for entry in per_video),
        "videos": per_video,
    }


# Python Example
import asyncio
import json
import httpx
from datetime import datetime
# Placeholder value: substitute your own Scavio API key before running.
SCAVIO_API_KEY = "your-api-key"
def extract_video_id(url: str) -> str:
    """Return the YouTube video ID found in *url*, or ``""`` if there is none.

    Two URL shapes are recognised: a ``v`` query parameter (watch pages)
    and a ``youtu.be/<id>`` short link.

    Args:
        url: The URL to inspect; may be empty.

    Returns:
        The video ID, or an empty string when neither shape matches.
    """
    from urllib.parse import parse_qs

    # Drop the fragment first so "#t=30" never leaks into the ID.
    without_fragment = url.partition("#")[0]

    # Parse the query string properly: a plain substring test for "v="
    # also matches parameters such as "rev=" and returns the wrong value.
    query = without_fragment.partition("?")[2]
    ids = parse_qs(query).get("v")
    if ids:
        return ids[0]

    _, marker, tail = without_fragment.partition("youtu.be/")
    if not marker:
        return ""
    # The ID is the first path segment; cut at any query or path separator.
    for sep in "?&/":
        tail = tail.partition(sep)[0]
    return tail
async def main():
    """Run the end-to-end example and print the result as indented JSON.

    Searches for YouTube videos on a fixed topic, runs one follow-up search
    per video for comment discussions, and prints a summary dict with the
    keys ``topic``, ``extracted_at``, ``videos_found``,
    ``comments_extracted`` and ``data``.

    Raises:
        httpx.HTTPStatusError: If any API call answers with an error status.
    """
    topic = "AI agent frameworks 2026"
    async with httpx.AsyncClient(timeout=15) as client:
        # Find videos
        resp = await client.post(
            "https://api.scavio.dev/api/v1/search",
            headers={"x-api-key": SCAVIO_API_KEY},
            json={"query": f"site:youtube.com {topic}", "num_results": 5}
        )
        # Fail loudly on an error status instead of treating the error
        # payload as "no results".
        resp.raise_for_status()
        videos = [
            {"url": r["url"], "title": r.get("title", ""), "video_id": extract_video_id(r["url"])}
            for r in resp.json().get("results", [])
            # Accept short links too; extract_video_id understands both forms.
            if "youtube.com/watch" in r.get("url", "") or "youtu.be/" in r.get("url", "")
        ]
        # Fetch comment discussions
        all_comments = []
        for v in videos:
            resp = await client.post(
                "https://api.scavio.dev/api/v1/search",
                headers={"x-api-key": SCAVIO_API_KEY},
                json={"query": f"youtube comments \"{v['title']}\"", "num_results": 5}
            )
            resp.raise_for_status()
            for r in resp.json().get("results", []):
                # Keep only snippets over 20 chars, truncated to 200 chars.
                if len(r.get("description", "")) > 20:
                    all_comments.append({"video": v["title"], "text": r["description"][:200]})
    output = {
        "topic": topic,
        "extracted_at": datetime.now().isoformat(),
        "videos_found": len(videos),
        "comments_extracted": len(all_comments),
        "data": all_comments
    }
    print(json.dumps(output, indent=2))
asyncio.run(main())
JavaScript Example
// Placeholder value: substitute your own Scavio API key before running.
const SCAVIO_API_KEY = "your-api-key";
/**
 * Return the YouTube video ID found in a URL, or "" if there is none.
 *
 * Two URL shapes are recognised: a `v` query parameter (watch pages) and a
 * `youtu.be/<id>` short link.
 *
 * @param {string} url - The URL to inspect.
 * @returns {string} The video ID, or an empty string when nothing matches.
 */
function extractVideoId(url) {
  // Drop the fragment first so "#t=30" never leaks into the ID.
  const withoutFragment = url.split("#")[0];

  // Parse the query string properly: a plain substring test for "v=" also
  // matches parameters such as "rev=" and returns the wrong value.
  const queryStart = withoutFragment.indexOf("?");
  const query = queryStart === -1 ? "" : withoutFragment.slice(queryStart + 1);
  const id = new URLSearchParams(query).get("v");
  if (id) return id;

  const shortLinkTail = withoutFragment.split("youtu.be/")[1];
  if (shortLinkTail === undefined) return "";
  // The ID is the first path segment; cut at any query or path separator.
  return shortLinkTail.split(/[?&/]/)[0];
}
/**
 * POST a search query to the Scavio Search API and return the parsed JSON.
 *
 * @param {string} query - Search query string.
 * @param {number} [numResults=5] - Value sent as `num_results`.
 * @returns {Promise<object>} The decoded JSON response body.
 * @throws {Error} If the API answers with a non-2xx status.
 */
async function searchScavio(query, numResults = 5) {
  const resp = await fetch("https://api.scavio.dev/api/v1/search", {
    method: "POST",
    headers: { "x-api-key": SCAVIO_API_KEY, "Content-Type": "application/json" },
    body: JSON.stringify({ query, num_results: numResults })
  });
  // fetch() resolves on HTTP error statuses too, so check explicitly rather
  // than handing an error payload to the caller as if it were results.
  if (!resp.ok) {
    throw new Error(`Scavio search failed: HTTP ${resp.status} ${resp.statusText}`);
  }
  return resp.json();
}
/**
 * Run the end-to-end example and print the result as indented JSON.
 *
 * Searches for YouTube videos on a fixed topic, runs one follow-up search per
 * video for comment discussions, and logs an object with the keys `topic`,
 * `extractedAt`, `videosFound`, `commentsExtracted` and `data`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const topic = "AI agent frameworks 2026";
  const videoData = await searchScavio("site:youtube.com " + topic);
  const videos = (videoData.results || [])
    // Accept short links too; extractVideoId understands both forms.
    .filter(r => {
      const url = r.url || "";
      return url.includes("youtube.com/watch") || url.includes("youtu.be/");
    })
    // Default the title so a missing one never becomes the text "undefined"
    // inside the follow-up query.
    .map(r => ({ url: r.url, title: r.title || "", videoId: extractVideoId(r.url) }));
  const allComments = [];
  for (const v of videos) {
    const commData = await searchScavio('youtube comments "' + v.title + '"', 5);
    for (const r of (commData.results || [])) {
      // Keep only snippets over 20 chars, truncated to 200 chars.
      if ((r.description || "").length > 20) {
        allComments.push({ video: v.title, text: r.description.slice(0, 200) });
      }
    }
  }
  console.log(JSON.stringify({
    topic,
    extractedAt: new Date().toISOString(),
    videosFound: videos.length,
    commentsExtracted: allComments.length,
    data: allComments
  }, null, 2));
}
main();
Expected Output
{
"topic": "AI agent frameworks 2026",
"extracted_at": "2026-05-17T10:30:00",
"videos_found": 4,
"comments_extracted": 12,
"data": [...]
}