YouTube videos contain vast amounts of knowledge locked in audio. By collecting transcripts via the Scavio API and indexing them in BigQuery, you can build a searchable knowledge base spanning thousands of videos. BigQuery's SEARCH() function enables full-text search over transcript text, letting you find specific topics, quotes, or instructions mentioned in any indexed video. In this tutorial you collect transcripts, load them into BigQuery, and run full-text queries against them.
Prerequisites
- Python 3.9+ installed
- requests and google-cloud-bigquery libraries installed
- A Scavio API key from scavio.dev, exported as the SCAVIO_API_KEY environment variable (a quick environment check follows this list)
- A Google Cloud project with BigQuery enabled
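Before starting, you can sanity-check the setup. This is a minimal sketch of my own, not part of the walkthrough proper; it only confirms the two libraries import cleanly and SCAVIO_API_KEY is set:

import os

# Fail fast if the API key isn't exported; every snippet below reads this variable.
assert os.environ.get('SCAVIO_API_KEY'), 'Set SCAVIO_API_KEY first'

import requests                    # pip install requests
from google.cloud import bigquery  # pip install google-cloud-bigquery
print('Environment looks ready')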
Walkthrough
Step 1: Collect YouTube transcripts via API
Search YouTube for videos on a topic, extract video IDs, then fetch transcripts for each. Store the raw transcript data for BigQuery loading.
import os, requests, json, time, re
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
URL = 'https://api.scavio.dev/api/v1/search'
def search_youtube_videos(topic: str, num: int = 10) -> list:
resp = requests.post(URL, headers=H,
json={'query': f'{topic} site:youtube.com', 'country_code': 'us', 'num_results': num})
results = resp.json().get('organic_results', [])
videos = []
for r in results:
vid_match = re.search(r'v=([^&]+)', r.get('link', ''))
if vid_match:
videos.append({'video_id': vid_match.group(1), 'title': r['title'], 'url': r['link']})
return videos
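# Hedged helper (an addition, not part of the original flow): the regex above
# only matches watch?v= links. This variant also handles youtu.be short links
# and /shorts/ URLs, should the search results include them.
def extract_video_id(url: str):
    m = re.search(r'(?:v=|youtu\.be/|/shorts/)([\w-]{5,})', url)
    return m.group(1) if m else None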
def get_transcript(video_id: str) -> list:
resp = requests.post(URL, headers=H,
json={'platform': 'youtube', 'action': 'transcript', 'video_id': video_id})
return resp.json().get('transcript', [])
videos = search_youtube_videos('python FastAPI tutorial')
print(f'Found {len(videos)} YouTube videos')
for v in videos[:3]:
print(f' {v["video_id"]}: {v["title"][:50]}')Step 2: Prepare transcript data for BigQuery
Collect the transcripts, flatten each one into a single block of text, and write the records as newline-delimited JSON (JSONL), the format BigQuery load jobs expect.
def collect_transcripts(videos: list) -> list:
records = []
for v in videos:
segments = get_transcript(v['video_id'])
if not segments:
continue
full_text = ' '.join(seg.get('text', '') for seg in segments)
records.append({
'video_id': v['video_id'],
'title': v['title'],
'url': v['url'],
'transcript_text': full_text,
'segment_count': len(segments),
'word_count': len(full_text.split()),
'duration_seconds': sum(seg.get('duration', 0) for seg in segments),
})
time.sleep(0.3)
return records
def save_for_bigquery(records: list, output_file: str = 'transcripts.jsonl'):
with open(output_file, 'w') as f:
for r in records:
f.write(json.dumps(r) + '\n')
print(f'Saved {len(records)} transcripts to {output_file}')
total_words = sum(r['word_count'] for r in records)
print(f'Total words: {total_words:,}')
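# Optional check (my addition): preview the first JSONL record to confirm the
# field names match the BigQuery schema before loading. Call it after
# save_for_bigquery() has written the file.
def preview_first_record(path: str = 'transcripts.jsonl'):
    with open(path) as f:
        print(json.loads(f.readline()))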
transcripts = collect_transcripts(videos[:5])
save_for_bigquery(transcripts)

Step 3: Load into BigQuery and run SEARCH queries
Create a BigQuery table, load the transcript data, and use the SEARCH() function for full-text search across all indexed transcripts.
from google.cloud import bigquery
def setup_bigquery(project_id: str, dataset: str = 'youtube_data'):
client = bigquery.Client(project=project_id)
# Create dataset if needed
dataset_ref = bigquery.Dataset(f'{project_id}.{dataset}')
    client.create_dataset(dataset_ref, exists_ok=True)
    # Create the table (SEARCH() runs without an index; an optional index sketch follows the function)
schema = [
bigquery.SchemaField('video_id', 'STRING'),
bigquery.SchemaField('title', 'STRING'),
bigquery.SchemaField('url', 'STRING'),
bigquery.SchemaField('transcript_text', 'STRING'),
bigquery.SchemaField('segment_count', 'INTEGER'),
bigquery.SchemaField('word_count', 'INTEGER'),
bigquery.SchemaField('duration_seconds', 'FLOAT'),
]
table_ref = f'{project_id}.{dataset}.transcripts'
table = bigquery.Table(table_ref, schema=schema)
client.create_table(table, exists_ok=True)
# Load data
job_config = bigquery.LoadJobConfig(
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
with open('transcripts.jsonl', 'rb') as f:
job = client.load_table_from_file(f, table_ref, job_config=job_config)
job.result()
print(f'Loaded data into {table_ref}')
return client
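# Hedged addition: SEARCH() works without an index, but on large tables a
# search index speeds queries up considerably. This sketch uses BigQuery's
# CREATE SEARCH INDEX DDL; the index name "transcripts_idx" is my choice.
def create_search_index(client, project_id: str, dataset: str = 'youtube_data'):
    client.query(f'''
        CREATE SEARCH INDEX IF NOT EXISTS transcripts_idx
        ON `{project_id}.{dataset}.transcripts`(transcript_text)
    ''').result()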
def search_transcripts(client, project_id: str, query: str):
sql = f"""
SELECT video_id, title, url,
SUBSTR(transcript_text, 1, 200) AS excerpt
FROM `{project_id}.youtube_data.transcripts`
WHERE SEARCH(transcript_text, @query)
LIMIT 10
"""
job_config = bigquery.QueryJobConfig(
query_parameters=[bigquery.ScalarQueryParameter('query', 'STRING', query)])
results = client.query(sql, job_config=job_config)
print(f'Search results for: "{query}"')
for row in results:
print(f' [{row.video_id}] {row.title[:50]}')
print(f' {row.excerpt[:100]}...')
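# Hedged variant (my addition): passing the table alias to SEARCH() scans all
# STRING columns (title, url, transcript_text) rather than a single field.
def search_all_columns(client, project_id: str, query: str):
    sql = f'''
        SELECT video_id, title
        FROM `{project_id}.youtube_data.transcripts` AS t
        WHERE SEARCH(t, @query)
        LIMIT 10
    '''
    job_config = bigquery.QueryJobConfig(
        query_parameters=[bigquery.ScalarQueryParameter('query', 'STRING', query)])
    for row in client.query(sql, job_config=job_config):
        print(f'[{row.video_id}] {row.title[:50]}')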
# Example usage (requires GCP credentials)
# client = setup_bigquery('my-project-id')
# search_transcripts(client, 'my-project-id', 'FastAPI async endpoints')
print('BigQuery transcript search ready')

Python Example
import os, requests, json, time, re
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def collect_transcripts(topic, num=5):
# Search YouTube
resp = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
json={'query': f'{topic} site:youtube.com', 'country_code': 'us', 'num_results': num})
videos = []
for r in resp.json().get('organic_results', []):
m = re.search(r'v=([^&]+)', r.get('link', ''))
if m: videos.append({'id': m.group(1), 'title': r['title']})
# Get transcripts
for v in videos:
resp = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
json={'platform': 'youtube', 'action': 'transcript', 'video_id': v['id']})
segs = resp.json().get('transcript', [])
v['text'] = ' '.join(s.get('text', '') for s in segs)
v['words'] = len(v['text'].split())
print(f'{v["id"]}: {v["words"]} words | {v["title"][:40]}')
time.sleep(0.3)
print(f'Cost: ${len(videos) * 0.010:.3f}')
collect_transcripts('Python FastAPI tutorial')

JavaScript Example
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
async function collectTranscripts(topic) {
const resp = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST',
headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
body: JSON.stringify({ query: `${topic} site:youtube.com`, country_code: 'us', num_results: 5 })
});
const videos = (await resp.json()).organic_results || [];
  for (const v of videos.filter(r => (r.link || '').includes('youtube.com/watch'))) {
const vidId = v.link.match(/v=([^&]+)/)?.[1];
if (!vidId) continue;
const t = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST',
headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
body: JSON.stringify({ platform: 'youtube', action: 'transcript', video_id: vidId })
}).then(r => r.json());
const text = (t.transcript || []).map(s => s.text).join(' ');
console.log(`${vidId}: ${text.split(' ').length} words | ${v.title.slice(0, 40)}`);
}
}
collectTranscripts('Python FastAPI tutorial');

Expected Output
Found 8 YouTube videos
dQw4w9WgXcQ: Python FastAPI Tutorial - Build a REST API in 30
abc123def4: FastAPI vs Django: Complete Comparison 2026
xyz789ghi0: Advanced FastAPI Patterns for Production
Saved 5 transcripts to transcripts.jsonl
Total words: 42,350
Search results for: "FastAPI async endpoints"
[dQw4w9WgXcQ] Python FastAPI Tutorial - Build a REST API in 30
FastAPI uses async def by default for all route handlers...
[abc123def4] FastAPI vs Django: Complete Comparison 2026
When it comes to async endpoints FastAPI has a clear advant...
BigQuery transcript search ready