YouTube Transcripts to MongoDB Knowledge Base Pipeline
Extract YouTube transcripts, chunk them, and store in MongoDB. Build a searchable video KB for RAG pipelines and agent grounding.
YouTube videos contain massive amounts of unstructured knowledge that never appears in text search results. Extracting transcripts and storing them in MongoDB creates a searchable knowledge base from video content that your agents and RAG pipelines can query directly.
The Pipeline
Step 1: Search YouTube for videos on your target topics. Step 2: Extract transcripts from each video. Step 3: Chunk transcripts into searchable segments. Step 4: Store in MongoDB with text indexes. Step 5: Query your video KB alongside web search results.
Step 1: Finding Relevant Videos
import requests, os
# Request headers sent with every search API call in this file.
# The API key is read from the SCAVIO_API_KEY environment variable when this
# line executes, so a missing variable raises KeyError immediately.
H = {"x-api-key": os.environ["SCAVIO_API_KEY"]}
def find_videos(topic, max_results=10):
    """Search YouTube for videos on a topic.

    Args:
        topic: Free-text search query.
        max_results: Maximum number of videos to return.

    Returns:
        A list of dicts with ``title``, ``url``, ``snippet`` and ``channel``
        keys. Fields missing from the API response default to ``""``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers=H,
        json={"platform": "youtube", "query": topic},
        timeout=10,
    )
    # Fail loudly on auth/quota/server errors instead of parsing an error
    # payload and silently returning an empty list.
    response.raise_for_status()
    payload = response.json()

    # ``or []`` also covers an explicit null value for "organic".
    items = payload.get("organic") or []
    return [
        {
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            "channel": item.get("channel", ""),
        }
        for item in items[:max_results]
    ]
# Example run: look up videos for one topic and list what was found.
videos = find_videos("search api tutorial for developers 2026")
for v in videos:
    title, link = v['title'], v['url']
    print(f"{title} - {link}")


# Step 2: Transcript Extraction
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised URL shapes:
      * ``.../watch?v=<id>`` (any host; the ``v`` query parameter is used)
      * ``https://youtu.be/<id>`` (with or without ``www.``)
      * ``https://<youtube host>/shorts/<id>``, ``/embed/<id>``,
        ``/live/<id>`` and ``/v/<id>``

    Args:
        url: The video URL as a string. ``None`` or ``""`` is tolerated.

    Returns:
        The video ID string, or ``None`` when no ID can be found.
    """
    if not url:
        return None

    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    if host.startswith("www."):
        host = host[4:]
    # Non-empty path components, e.g. "/shorts/abc/" -> ["shorts", "abc"].
    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be":
        # Short links carry the ID as the first path component; ignore any
        # trailing components and return None (not "") when it is absent.
        return segments[0] if segments else None

    ids = parse_qs(parsed.query).get("v")
    if ids:
        return ids[0]

    # Path-based forms are only trusted on YouTube hosts so that unrelated
    # sites with an "/embed/..." path do not yield a bogus ID.
    youtube_hosts = ("youtube.com", "youtube-nocookie.com")
    is_youtube = host in youtube_hosts or host.endswith(
        tuple("." + name for name in youtube_hosts)
    )
    path_prefixes = ("shorts", "embed", "live", "v")
    if is_youtube and len(segments) >= 2 and segments[0] in path_prefixes:
        return segments[1]
    return None
def get_transcript(video_url):
    """Get the transcript of a YouTube video.

    Args:
        video_url: URL of the video; the ID is derived with
            ``extract_video_id``.

    Returns:
        A dict with ``video_id``, ``segments`` (the list of transcript
        entries, each a dict with at least a ``text`` key) and ``full_text``
        (all segment texts joined by single spaces). Returns ``None`` when
        no video ID can be extracted or the transcript cannot be fetched.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return None
    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            # Older youtube-transcript-api releases: static helper that
            # returns a list of dicts.
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            # Releases without the static helper expose an instance-level
            # fetch(); to_raw_data() converts its result to the same
            # list-of-dicts shape. Without this branch the AttributeError
            # would be swallowed below and every video reported unavailable.
            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        return {
            "video_id": video_id,
            "segments": transcript,
            "full_text": " ".join(s["text"] for s in transcript),
        }
    except Exception as e:
        # Deliberately best-effort: one video without a transcript must not
        # abort the whole ingestion run.
        print(f"Transcript unavailable for {video_id}: {e}")
        return None


# Step 3: Chunking for Search
def chunk_transcript(transcript_data, chunk_size=500):
    """Split a transcript into fixed-size word chunks with start timestamps.

    Args:
        transcript_data: Dict with ``video_id``, ``full_text`` and
            ``segments`` (each segment a dict with ``text`` and ``start``).
            A falsy value yields an empty list.
        chunk_size: Number of whitespace-separated words per chunk.

    Returns:
        A list of dicts with ``text``, ``video_id``, ``timestamp`` (the
        ``start`` of the segment containing the chunk's first word, or ``0``
        if no segment covers it) and ``chunk_index``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    from bisect import bisect_right
    from itertools import accumulate

    if not transcript_data:
        return []
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    words = transcript_data["full_text"].split()
    segments = transcript_data["segments"]

    # word_ends[k] is the number of words contained in segments[0..k].
    # Counting words (not characters) keeps the mapping exact even when
    # segment texts contain newlines or repeated spaces, because full_text
    # is split on arbitrary whitespace.
    word_ends = list(accumulate(len(seg["text"].split()) for seg in segments))

    chunks = []
    for chunk_index, first_word in enumerate(range(0, len(words), chunk_size)):
        # First segment whose cumulative word count exceeds first_word is
        # the one that contains that word.
        seg_index = bisect_right(word_ends, first_word)
        timestamp = segments[seg_index]["start"] if seg_index < len(segments) else 0
        chunks.append({
            "text": " ".join(words[first_word:first_word + chunk_size]),
            "video_id": transcript_data["video_id"],
            "timestamp": timestamp,
            "chunk_index": chunk_index,
        })
    return chunks


# Step 4: MongoDB Storage
from pymongo import MongoClient
# Connect to the local MongoDB server and select the database/collection
# that hold the transcript chunks.
client = MongoClient("mongodb://localhost:27017")
db = client.video_kb
chunks_col = db.transcript_chunks

# A text index on the "text" field is what makes $text queries possible.
chunks_col.create_index([("text", "text")])
def ingest_video(video_info, transcript):
    """Chunk a transcript, tag each chunk with video metadata, and store it.

    Args:
        video_info: Dict with ``title`` and ``url`` keys and an optional
            ``channel`` key (defaults to ``""``).
        transcript: Transcript data as accepted by ``chunk_transcript``.

    Returns:
        The number of chunks inserted into ``chunks_col`` (``0`` when the
        transcript produced no chunks, in which case nothing is written).
    """
    docs = chunk_transcript(transcript)
    if not docs:
        return 0

    metadata = {
        "video_title": video_info["title"],
        "video_url": video_info["url"],
        "channel": video_info.get("channel", ""),
    }
    for doc in docs:
        doc.update(metadata)

    chunks_col.insert_many(docs)
    return len(docs)
# Walk through every video found for the topic and ingest those that
# actually have a transcript.
for video in videos:
    transcript = get_transcript(video["url"])
    if not transcript:
        continue
    count = ingest_video(video, transcript)
    print(f"Ingested {count} chunks from: {video['title']}")


# Step 5: Querying the Video KB
def search_video_kb(query, limit=5):
    """Search transcript chunks in MongoDB using the text index.

    Args:
        query: Search string passed to MongoDB's ``$text`` operator.
        limit: Maximum number of chunks to return.

    Returns:
        A list of dicts, best match first, each with ``text`` (first 300
        characters of the chunk), ``video`` (title), ``url`` (video URL with
        a ``t=<seconds>`` parameter pointing at the chunk) and ``score``
        (the MongoDB text score).
    """
    cursor = (
        chunks_col.find(
            {"$text": {"$search": query}},
            {"score": {"$meta": "textScore"}},
        )
        .sort([("score", {"$meta": "textScore"})])
        .limit(limit)
    )

    hits = []
    for doc in cursor:
        video_url = doc["video_url"]
        # Watch URLs already have a query string ("?v=..."), short links such
        # as https://youtu.be/<id> do not; pick the separator accordingly so
        # the timestamp parameter always produces a valid URL.
        separator = "&" if "?" in video_url else "?"
        hits.append({
            "text": doc["text"][:300],
            "video": doc["video_title"],
            "url": f"{video_url}{separator}t={int(doc['timestamp'])}",
            "score": doc["score"],
        })
    return hits
# Run one example query against everything ingested so far and print the
# title, score, deep link and a short excerpt for each hit.
results = search_video_kb("how to set up search api")
for r in results:
    print(
        f"{r['video']} (score: {r['score']:.2f})",
        f" {r['url']}",
        f" {r['text'][:100]}...",
        sep="\n",
    )


# Combining Video KB with Web Search
For RAG pipelines, search your video KB first for domain-specific knowledge, then supplement with web search for current information. Video transcripts often contain practical, step-by-step knowledge that blog posts summarize but videos demonstrate in depth.