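Build a YouTube knowledge base in MongoDB by searching for topic-relevant videos through the Scavio API, extracting each video's metadata (title, channel, description snippet, views, and duration), storing the results as text-indexed documents, and querying the knowledge base with full-text search. YouTube contains expert knowledge across every domain, but it is locked inside video format. A searchable knowledge base turns the text that describes those videos into queryable documents, making it accessible for research agents, chatbots, and internal tools. Note that this walkthrough indexes search-result metadata only; to make the spoken content itself searchable, add transcript text to each document's searchable_text field.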
Prerequisites
- Python 3.8+ installed
- pymongo and requests libraries installed (pip install pymongo requests)
- MongoDB running locally or Atlas connection string
- A Scavio API key from scavio.dev
Walkthrough
Step 1: Search YouTube topics
Query YouTube through Scavio to find relevant videos on your target topics.
import os, requests
from pymongo import MongoClient
from datetime import datetime
# Credentials and connection settings are read from the environment.
API_KEY = os.environ['SCAVIO_API_KEY']  # required: raises KeyError when unset
MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')  # optional override

# One shared client for the whole script; videos live in youtube_kb.videos.
client = MongoClient(MONGO_URI)
db = client['youtube_kb']
collection = db['videos']
def search_youtube(topic: str) -> list:
    """Search YouTube through the Scavio search endpoint.

    Args:
        topic: Free-text search query, sent as the ``query`` field.

    Returns:
        The ``organic_results`` list from the JSON response, or an empty
        list when that key is missing or null.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the API
            answers with a 4xx/5xx status.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': API_KEY},
        json={'platform': 'youtube', 'query': topic},
        timeout=15,
    )
    resp.raise_for_status()
    # ``or []`` also covers an explicit null value, which ``.get(key, [])``
    # would hand back as None and break len()/iteration in callers.
    return resp.json().get('organic_results') or []
results = search_youtube('python async programming tutorial')
print(f'Found {len(results)} videos')
Step 2: Extract video metadata
Parse the search results to extract title, channel, description, views, and duration for each video.
def extract_metadata(results: list, topic: str) -> list:
    """Convert raw search results into documents ready for MongoDB.

    Args:
        results: Search-result dicts as returned by the search API. ``None``
            or an empty list yields an empty result.
        topic: Label stored on every document under ``topic``.

    Returns:
        One dict per input result with the keys ``title``, ``url``,
        ``channel``, ``description``, ``views``, ``duration``, ``topic``,
        ``indexed_at`` (timezone-aware UTC) and ``searchable_text``.
    """
    # Local import: the file's top-level import only brings in ``datetime``.
    from datetime import timezone

    docs = []
    for r in results or []:
        # Resolve each field once, falling back to the alternate key when the
        # primary one is missing or empty.
        title = r.get('title') or ''
        channel = r.get('channel') or r.get('author') or ''
        description = r.get('snippet') or r.get('description') or ''
        docs.append({
            'title': title,
            'url': r.get('link') or '',
            'channel': channel,
            'description': description,
            'views': r.get('views', ''),
            'duration': r.get('duration', ''),
            'topic': topic,
            # Aware UTC timestamp; datetime.utcnow() is naive and deprecated.
            'indexed_at': datetime.now(timezone.utc),
            # Built from the resolved values so the fallback keys are
            # searchable too, not just 'snippet' and 'channel'.
            'searchable_text': f"{title} {description} {channel}",
        })
    return docs
docs = extract_metadata(results, 'python async')
print(f'Extracted {len(docs)} video documents')
if docs:
    print(f'Sample: {docs[0]["title"][:60]}')
Step 3: Create MongoDB text index
Store the documents in MongoDB and create a text index on the searchable fields for full-text queries.
def index_videos(docs: list) -> int:
    """Upsert video documents into ``collection``, keyed by their URL.

    Args:
        docs: Documents that each carry a ``url`` key. Documents with a
            missing or empty URL are skipped.

    Returns:
        The number of documents that were newly inserted. Documents that
        only updated an existing record are not counted.
    """
    inserted = 0
    for doc in docs or []:
        url = doc.get('url')
        if not url:
            # Without a URL there is no stable key: every such document would
            # match the same {'url': ''} filter and overwrite the previous one.
            continue
        # Upsert to avoid duplicates when the same video shows up again.
        result = collection.update_one({'url': url}, {'$set': doc}, upsert=True)
        # upserted_id is only populated when the filter matched nothing and a
        # new document was created.
        if result.upserted_id is not None:
            inserted += 1
    return inserted
# Build a single compound text index spanning every field that should be
# matched by $text queries.
collection.create_index(
    [(field, 'text') for field in ('searchable_text', 'title', 'description')]
)
count = index_videos(docs)
print(f'Indexed {count} new videos (total: {collection.count_documents({})})')
Step 4: Search the knowledge base
Query your MongoDB knowledge base using full-text search to find relevant videos by content.
def search_kb(query: str, limit: int = 5) -> list:
    """Run a full-text search against the video collection.

    Args:
        query: Search terms passed to MongoDB's ``$text`` operator.
        limit: Maximum number of hits to return.

    Returns:
        A list of dicts with ``title``, ``url``, ``channel`` and ``score``,
        ordered by descending text score. A blank query returns an empty
        list without contacting the database.
    """
    if not query or not query.strip():
        # No search terms to match on, so skip the round trip.
        return []

    text_score = {'$meta': 'textScore'}
    cursor = (
        collection.find({'$text': {'$search': query}}, {'score': text_score})
        .sort([('score', text_score)])
        .limit(limit)
    )
    return [
        {
            'title': doc['title'],
            'url': doc['url'],
            'channel': doc.get('channel', ''),
            'score': doc.get('score', 0),
        }
        for doc in cursor
    ]
results = search_kb('async await python')
for r in results:
    print(f"{r['score']:.1f} | {r['title'][:50]} | {r['channel']}")
Step 5: Automate daily indexing
Build a daily job that re-runs a fixed list of topic searches and adds any newly found videos to the knowledge base.
# Fixed list of search queries that daily_index() runs on every invocation.
DAILY_TOPICS = [
'python best practices 2026',
'machine learning tutorial',
'system design interview',
'cloud architecture patterns',
]
def daily_index() -> dict:
    """Search every topic in ``DAILY_TOPICS`` and index the results.

    Prints one progress line per topic and a final summary line.

    Returns:
        A dict with ``new`` (documents inserted during this run) and
        ``total`` (documents in the collection after the run).
    """
    total_new = 0
    for topic in DAILY_TOPICS:
        results = search_youtube(topic)
        docs = extract_metadata(results, topic)
        new = index_videos(docs)
        total_new += new
        print(f'{topic}: {new} new videos indexed')

    # Counted once, after all topics, so the summary reflects the final state.
    total = collection.count_documents({})
    print(f'Daily index complete: {total_new} new, {total} total')
    return {'new': total_new, 'total': total}
# Run daily via cron: python -c 'from kb import daily_index; daily_index()'
daily_index()
Python Example
import requests, os
from pymongo import MongoClient
H = {'x-api-key': os.environ['SCAVIO_API_KEY']}
client = MongoClient('mongodb://localhost:27017')
db = client['youtube_kb']
def index_topic(topic):
    """Search YouTube for ``topic`` and upsert each result into ``db.videos``.

    Each stored document carries ``title``, ``url`` and ``text`` (title plus
    snippet). Results without a link are skipped because the URL is the
    upsert key.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the API
            answers with a 4xx/5xx status.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers=H,
        json={'platform': 'youtube', 'query': topic},
        timeout=15,  # without a timeout a stalled connection blocks forever
    )
    # Fail loudly instead of treating an error payload as "zero results".
    resp.raise_for_status()

    for r in resp.json().get('organic_results') or []:
        url = r.get('link')
        if not url:
            # All link-less results would otherwise share the key url=None.
            continue
        title = r.get('title', '')
        db.videos.update_one(
            {'url': url},
            {'$set': {
                'title': title,
                'url': url,
                'text': f"{title} {r.get('snippet', '')}",
            }},
            upsert=True,
        )
index_topic('python async tutorial')
print(f'{db.videos.count_documents({})} videos in KB')
JavaScript Example
const H = {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'};
// Search YouTube through the Scavio API.
// Resolves to the `organic_results` array (empty when the key is absent).
// Rejects when the API answers with a non-2xx status, so an error payload
// is not silently reported as "0 videos found".
async function searchYouTube(topic) {
  const r = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: H,
    body: JSON.stringify({platform: 'youtube', query: topic}),
  });
  if (!r.ok) {
    throw new Error(`Scavio search failed: HTTP ${r.status}`);
  }
  const data = await r.json();
  return data.organic_results || [];
}
searchYouTube('python async tutorial').then(r => console.log(r.length + ' videos found'));
Expected Output
A MongoDB-backed knowledge base populated with YouTube video metadata from search results, queryable via full-text search and updated daily.