Training ML models for news classification, summarization, or sentiment analysis requires a large, well-structured corpus of news articles. Web scraping news sites is fragile and legally complex. This tutorial builds a news corpus collection pipeline using the Scavio API to search for news on specific topics, extract article metadata from SERP snippets, deduplicate by URL, and store the corpus in a structured format ready for ML preprocessing. Each topic search costs $0.005.
Prerequisites
- Python 3.9+ installed
- requests library installed
- A Scavio API key from scavio.dev
- A list of news topics to collect
Walkthrough
Step 1: Define topics and search for news articles
Search for recent news on each topic. Use date-restricted queries to ensure freshness and news-specific search patterns.
import hashlib
import json
import os
import time
from collections import Counter
from datetime import datetime
from urllib.parse import urlparse

import requests
# API key is read from the environment so it never lands in source control.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
# Shared headers and endpoint for every Scavio search request.
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
URL = 'https://api.scavio.dev/api/v1/search'
# Topics to collect; each topic triggers one paid search ($0.005 per search).
TOPICS = [
    'artificial intelligence regulation',
    'climate technology startups',
    'semiconductor supply chain',
    'electric vehicle market',
    'cybersecurity breaches 2026',
]
def search_news(topic: str, num: int = 10) -> list:
    """Search Scavio for recent news on *topic* and return article metadata.

    Args:
        topic: Free-text news topic to search for.
        num: Number of SERP results to request (default 10).

    Returns:
        A list of dicts with keys: title, url, snippet, source_domain,
        topic, collected_at.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the request exceeds the 30 s timeout.
    """
    resp = requests.post(
        URL,
        headers=H,
        json={'query': f'{topic} news 2026', 'country_code': 'us', 'num_results': num},
        timeout=30,  # fail fast instead of hanging on a stalled connection
    )
    resp.raise_for_status()  # surface HTTP errors instead of parsing an error body
    results = resp.json().get('organic_results', [])
    # One timestamp per batch: all articles from this search share a collection time.
    collected_at = datetime.now().isoformat()
    articles = []
    for r in results:
        link = r.get('link', '')
        articles.append({
            'title': r.get('title', ''),
            'url': link,
            'snippet': r.get('snippet', ''),
            # urlparse handles schemeless or malformed links gracefully; the old
            # split('/')[2] raised IndexError on links like 'example.com/path'.
            'source_domain': urlparse(link).netloc,
            'topic': topic,
            'collected_at': collected_at,
        })
    return articles
# Smoke-test a single topic before running the full collection loop.
articles = search_news('artificial intelligence regulation')
print(f'Collected {len(articles)} articles on AI regulation')

Step 2: Deduplicate and categorize the corpus
Remove duplicate articles by URL hash, and add basic categorization metadata. Track corpus statistics.
class NewsCorpus:
    """In-memory news corpus with URL-based deduplication and summary stats."""

    def __init__(self):
        self.articles = []      # all accepted article dicts, in insertion order
        self.seen_urls = set()  # md5 hex digests of URLs already in the corpus

    def add_articles(self, new_articles: list) -> int:
        """Add articles not already present (keyed by URL hash).

        Mutates each accepted dict in place, adding 'url_hash' and
        'word_count' (snippet word count). Returns the number added.
        """
        added = 0
        for article in new_articles:
            # md5 is fine here: it is a dedup key, not a security boundary.
            url_hash = hashlib.md5(article['url'].encode()).hexdigest()
            if url_hash in self.seen_urls:
                continue  # duplicate URL — skip
            self.seen_urls.add(url_hash)
            article['url_hash'] = url_hash
            article['word_count'] = len(article['snippet'].split())
            self.articles.append(article)
            added += 1
        return added

    def stats(self) -> dict:
        """Return corpus summary: totals, per-topic counts, top-10 source domains."""
        # Counter replaces the hand-rolled dict.get(k, 0) + 1 frequency loops.
        topics = Counter(a['topic'] for a in self.articles)
        sources = Counter(a['source_domain'] for a in self.articles)
        return {
            'total_articles': len(self.articles),
            'unique_urls': len(self.seen_urls),
            'topics': dict(topics),
            # most_common(10) replaces sorting the whole dict by -count.
            'top_sources': dict(sources.most_common(10)),
        }
# Run one paid search per topic, deduplicating into the shared corpus.
corpus = NewsCorpus()
for topic in TOPICS:
    articles = search_news(topic)
    added = corpus.add_articles(articles)
    print(f'{topic}: +{added} articles')
    time.sleep(0.3)  # brief pause between paid API calls
stats = corpus.stats()
print(f'\nCorpus: {stats["total_articles"]} articles across {len(stats["topics"])} topics')

Step 3: Export the corpus for ML training
Save the corpus in JSONL format, which is the standard input format for most ML training pipelines. Include metadata for filtering.
def export_corpus(corpus: 'NewsCorpus', output_file: str = 'news_corpus.jsonl'):
    """Write the corpus to a JSONL file (one article per line) and print a summary.

    Args:
        corpus: A NewsCorpus (anything with .articles and .stats()).
        output_file: Destination path for the JSONL export.
    """
    stats = corpus.stats()
    with open(output_file, 'w', encoding='utf-8') as f:
        for article in corpus.articles:
            # ensure_ascii=False keeps non-ASCII source text readable in the corpus.
            f.write(json.dumps(article, ensure_ascii=False) + '\n')
    print(f'Exported {stats["total_articles"]} articles to {output_file}')
    print(f'Topics: {", ".join(f"{k} ({v})" for k, v in stats["topics"].items())}')
    print(f'Top sources: {", ".join(list(stats["top_sources"].keys())[:5])}')
    # Derive cost from the corpus itself (one $0.005 search per topic) rather
    # than the module-level TOPICS list, so the figure stays correct when the
    # corpus was built from a different topic set.
    cost = len(stats['topics']) * 0.005
    print(f'Cost: ${cost:.3f}')
export_corpus(corpus)

Python Example
# Standalone, copy-pasteable version of the pipeline above.
import os, requests, json, time, hashlib
# API key from the environment; shared headers for every request.
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def collect_news_corpus(topics, num_per_topic=10):
    """Search each topic via Scavio and return a URL-deduplicated article list.

    Args:
        topics: Iterable of news topic strings.
        num_per_topic: SERP results to request per topic (default 10).

    Returns:
        A list of dicts with keys: title, url, snippet, topic.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    corpus = []
    seen = set()
    for topic in topics:
        resp = requests.post(
            'https://api.scavio.dev/api/v1/search',
            headers=H,
            json={'query': f'{topic} news 2026', 'country_code': 'us',
                  'num_results': num_per_topic},
            timeout=30,  # avoid hanging forever on a stalled connection
        )
        resp.raise_for_status()  # fail loudly on HTTP errors
        for r in resp.json().get('organic_results', []):
            # .get() instead of r['link']/r['title']: SERP entries can omit fields,
            # and direct indexing would raise KeyError mid-collection.
            link = r.get('link', '')
            url_hash = hashlib.md5(link.encode()).hexdigest()
            if url_hash not in seen:
                seen.add(url_hash)
                corpus.append({'title': r.get('title', ''), 'url': link,
                               'snippet': r.get('snippet', ''), 'topic': topic})
        time.sleep(0.3)  # polite pacing between paid searches
    print(f'Corpus: {len(corpus)} articles, {len(topics)} topics')
    return corpus
# Collect a small demo corpus and persist it as JSONL.
corpus = collect_news_corpus(['AI regulation', 'climate tech', 'cybersecurity'])
with open('corpus.jsonl', 'w') as f:
    for a in corpus:
        f.write(json.dumps(a) + '\n')

JavaScript Example
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
const fs = require('fs');
// Search each topic via Scavio, dedupe articles by URL, and write the
// result to corpus.jsonl (one JSON object per line, JSONL format).
// Throws if any search returns a non-2xx HTTP status.
async function collectCorpus(topics) {
  const corpus = [];
  const seen = new Set();
  for (const topic of topics) {
    const resp = await fetch('https://api.scavio.dev/api/v1/search', {
      method: 'POST',
      headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
      body: JSON.stringify({ query: `${topic} news 2026`, country_code: 'us', num_results: 10 })
    });
    // Surface HTTP failures instead of silently parsing an error body.
    if (!resp.ok) {
      throw new Error(`Scavio search failed for "${topic}": HTTP ${resp.status}`);
    }
    for (const r of (await resp.json()).organic_results || []) {
      if (!seen.has(r.link)) {
        seen.add(r.link);
        corpus.push({ title: r.title, url: r.link, snippet: r.snippet || '', topic });
      }
    }
  }
  console.log(`Corpus: ${corpus.length} articles`);
  // Trailing newline so the last record is a complete JSONL line.
  fs.writeFileSync('corpus.jsonl', corpus.map(a => JSON.stringify(a)).join('\n') + '\n');
}
collectCorpus(['AI regulation', 'climate tech']);

Expected Output
artificial intelligence regulation: +10 articles
climate technology startups: +10 articles
semiconductor supply chain: +9 articles
electric vehicle market: +10 articles
cybersecurity breaches 2026: +10 articles
Corpus: 49 articles across 5 topics
Exported 49 articles to news_corpus.jsonl
Topics: artificial intelligence regulation (10), climate technology startups (10), semiconductor supply chain (9)
Top sources: reuters.com, bloomberg.com, techcrunch.com, nytimes.com, wired.com
Cost: $0.025