youtube-mongodb-knowledge-base

YouTube Transcripts to MongoDB Knowledge Base Pipeline

Extract YouTube transcripts, chunk them, and store in MongoDB. Build a searchable video KB for RAG pipelines and agent grounding.

6 min read

YouTube videos contain massive amounts of unstructured knowledge that never appears in text search results. Extracting transcripts and storing them in MongoDB creates a searchable knowledge base from video content that your agents and RAG pipelines can query directly.

The Pipeline

Step 1: Search YouTube for videos on your target topics. Step 2: Extract transcripts from each video. Step 3: Chunk transcripts into searchable segments. Step 4: Store in MongoDB with text indexes. Step 5: Query your video KB alongside web search results.

Step 1: Finding Relevant Videos

Python
import requests, os

# Shared auth header sent with every Scavio API request.  The key is read
# from the SCAVIO_API_KEY environment variable — os.environ[...] raises
# KeyError at import time if it is not set, failing fast on misconfiguration.
H = {"x-api-key": os.environ["SCAVIO_API_KEY"]}

def find_videos(topic, max_results=10):
    """Search YouTube (via the Scavio search API) for videos on a topic.

    Args:
        topic: free-text search query.
        max_results: maximum number of videos to return.

    Returns:
        List of dicts with "title", "url", "snippet" and "channel" keys;
        missing fields default to "".

    Raises:
        requests.HTTPError: if the API responds with a 4xx/5xx status.
    """
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers=H,
        json={"platform": "youtube", "query": topic},
        timeout=10,
    )
    # The original chained .json() directly, so an HTTP error surfaced as a
    # confusing JSON decode failure (or silently parsed an error body).
    # Fail loudly on bad statuses instead.
    resp.raise_for_status()
    data = resp.json()
    return [
        {
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            "channel": item.get("channel", ""),
        }
        for item in data.get("organic", [])[:max_results]
    ]

# Demo: discover candidate videos for the knowledge base and list them.
videos = find_videos("search api tutorial for developers 2026")
for video in videos:
    print(f"{video['title']} - {video['url']}")

Step 2: Transcript Extraction

Python
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs

def extract_video_id(url):
    """Extract the video ID from common YouTube URL shapes.

    Handles, in addition to the classic watch URL:
      * https://youtu.be/<id>                     (short links)
      * https://www.youtube.com/watch?v=<id>
      * https://www.youtube.com/embed/<id>        (embeds)
      * https://www.youtube.com/shorts/<id>       (shorts)
      * https://www.youtube.com/live/<id>         (live streams)

    Returns:
        The video ID string, or None when no ID can be found.
    """
    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    if host == "youtu.be":
        # Short links carry the ID as the entire path: youtu.be/<id>
        return parsed.path.lstrip("/") or None
    # Path-based forms used by embeds, shorts and live streams.
    path_parts = [part for part in parsed.path.split("/") if part]
    if path_parts and path_parts[0] in ("embed", "shorts", "live"):
        return path_parts[1] if len(path_parts) > 1 else None
    # Classic watch URL: the ID lives in the "v" query parameter.
    qs = parse_qs(parsed.query)
    return qs.get("v", [None])[0]

def get_transcript(video_url):
    """Download the transcript for a YouTube video.

    Returns:
        A dict with "video_id", the raw timed "segments" and the
        concatenated "full_text" — or None when the URL has no
        recognizable video ID or the video has no transcript.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return None
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Best-effort: many videos simply have captions disabled; report
        # and skip rather than aborting the whole ingestion run.
        print(f"Transcript unavailable for {video_id}: {e}")
        return None
    full_text = " ".join(seg["text"] for seg in segments)
    return {
        "video_id": video_id,
        "segments": segments,
        "full_text": full_text,
    }

Step 3: Chunking for Search

Python
def chunk_transcript(transcript_data, chunk_size=500):
    """Split a transcript into fixed-size word chunks for text search.

    Args:
        transcript_data: dict from get_transcript() carrying "video_id",
            "segments" (timed caption entries with "text"/"start") and
            "full_text" — or None/falsy, which yields no chunks.
        chunk_size: number of words per chunk.

    Returns:
        List of chunk dicts with "text", "video_id", an approximate start
        "timestamp" (seconds, 0 when it cannot be located) and
        "chunk_index".  Empty list for falsy or word-less transcripts.
    """
    if not transcript_data:
        return []
    words = transcript_data["full_text"].split()
    segments = transcript_data["segments"]
    video_id = transcript_data["video_id"]

    chunks = []
    char_pos = 0   # == len(" ".join(words[:i])) for the current chunk start i
    seg_idx = 0    # persistent cursor into segments (char_pos only grows)
    consumed = 0   # chars covered by segments[:seg_idx] (text + 1 space each)
    for i in range(0, len(words), chunk_size):
        chunk_text = " ".join(words[i:i + chunk_size])
        # Advance the cursor to the first caption whose cumulative character
        # span reaches char_pos; its start time approximates the chunk's
        # timestamp.  The original rebuilt the word-prefix string and rescanned
        # every segment from index 0 for each chunk — O(n^2) in transcript
        # length.  This incremental scan visits each segment once overall.
        while (seg_idx < len(segments)
               and consumed + len(segments[seg_idx]["text"]) + 1 < char_pos):
            consumed += len(segments[seg_idx]["text"]) + 1
            seg_idx += 1
        timestamp = segments[seg_idx]["start"] if seg_idx < len(segments) else 0
        chunks.append({
            "text": chunk_text,
            "video_id": video_id,
            "timestamp": timestamp,
            "chunk_index": i // chunk_size,
        })
        # Account for this chunk's characters plus the joining space.
        char_pos = char_pos + len(chunk_text) + 1 if char_pos else len(chunk_text)
    return chunks

Step 4: MongoDB Storage

Python
from pymongo import MongoClient

# Connect to a local MongoDB instance and select the video knowledge base.
client = MongoClient("mongodb://localhost:27017")
db = client["video_kb"]
chunks_col = db["transcript_chunks"]

# Create a text index on the chunk text so $text/$search queries work
# (used by search_video_kb below).  create_index is idempotent, so
# re-running this script is safe.
chunks_col.create_index([("text", "text")])

def ingest_video(video_info, transcript):
    """Store chunked transcript in MongoDB.

    Args:
        video_info: metadata dict from find_videos() ("title", "url", "channel").
        transcript: transcript dict from get_transcript().

    Returns:
        The number of chunk documents inserted.
    """
    # Attach video-level metadata to every chunk before insertion so each
    # document is self-describing in search results.
    documents = [
        {
            **chunk,
            "video_title": video_info["title"],
            "video_url": video_info["url"],
            "channel": video_info.get("channel", ""),
        }
        for chunk in chunk_transcript(transcript)
    ]
    if documents:
        chunks_col.insert_many(documents)
    return len(documents)

# Ingest all videos for a topic.  Videos without transcripts are skipped:
# get_transcript returns None for those (captions disabled or bad URL).
for video in videos:
    transcript = get_transcript(video["url"])
    if transcript:
        count = ingest_video(video, transcript)
        print(f"Ingested {count} chunks from: {video['title']}")

Step 5: Querying the Video KB

Python
def search_video_kb(query, limit=5):
    """Full-text search over ingested transcript chunks.

    Args:
        query: free-text search string (MongoDB $text syntax).
        limit: maximum number of results to return.

    Returns:
        List of dicts with a 300-char "text" preview, the "video" title,
        a "url" deep link that jumps to the chunk's timestamp, and the
        text-search relevance "score", best matches first.
    """
    cursor = (
        chunks_col.find(
            {"$text": {"$search": query}},
            {"score": {"$meta": "textScore"}},
        )
        .sort([("score", {"$meta": "textScore"})])
        .limit(limit)
    )
    results = []
    for doc in cursor:
        url = doc["video_url"]
        # watch?v=... URLs already have a query string, so the timestamp is
        # appended with "&t=".  youtu.be/<id> short links have none yet and
        # need "?t=" — the previous hard-coded "&" produced broken links.
        sep = "&" if "?" in url else "?"
        results.append({
            "text": doc["text"][:300],
            "video": doc["video_title"],
            "url": f"{url}{sep}t={int(doc['timestamp'])}",
            "score": doc["score"],
        })
    return results

# Search across all ingested video transcripts and print the top matches
# with their relevance score, timestamped link, and a short text preview.
results = search_video_kb("how to set up search api")
for r in results:
    print(f"{r['video']} (score: {r['score']:.2f})")
    print(f"  {r['url']}")
    print(f"  {r['text'][:100]}...")

Combining Video KB with Web Search

For RAG pipelines, search your video KB first for domain-specific knowledge, then supplement with web search for current information. Video transcripts often contain practical, step-by-step knowledge that blog posts summarize but videos demonstrate in depth.