Overview
ML teams building sentiment analysis, topic classification, or other NLP models need a steady supply of fresh labeled data. This workflow collects news articles on your target topics every night at midnight UTC, extracts structured fields (title, snippet, source, date), deduplicates them against your existing corpus, and appends clean records to a JSONL training file. The result is a continuous stream of real-world text data with no manual scraping. Collecting 50 articles across 10 topics costs about $0.05 per night.
Trigger
Cron: daily at midnight UTC (0 0 * * *)
Schedule
Daily at midnight UTC
Workflow Steps
Load Topic Configuration
Read the list of topics, categories, and search queries for your ML dataset from a config file.
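A minimal ml_topics.json might look like the following. The field names (topic, category, query) match what both implementations below read; query is optional and, when omitted, falls back to a generated query. The topics themselves are placeholders:

[
  {"topic": "electric vehicles", "category": "automotive", "query": "electric vehicle battery news"},
  {"topic": "quantum computing", "category": "technology"}
]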
Search News for Each Topic
Call Scavio's Google search for each topic. When no explicit query is configured, the generated default embeds the current year to bias results toward recent articles.
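Both implementations assume the search response carries an organic array whose entries expose at least title, snippet, and url; an actual Scavio response may include more fields. An illustrative (not verbatim) response:

{
  "organic": [
    {"title": "Solid-state batteries reach pilot production", "snippet": "A new cell design...", "url": "https://example.com/battery-news"}
  ]
}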
Extract Structured Fields
Parse each result into a structured record: title, snippet, URL, source domain, and inferred date.
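Running the extraction step on the illustrative result above, labeled with the electric vehicles topic, would yield a record like this (the hash and date values are examples):

{"title": "Solid-state batteries reach pilot production", "snippet": "A new cell design...", "url": "https://example.com/battery-news", "source_domain": "example.com", "topic": "electric vehicles", "category": "automotive", "collected_date": "2025-06-01", "url_hash": "3f2a..."}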
Deduplicate Against Corpus
Check each record against existing URLs in the corpus to avoid duplicates.
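Note that deduplication keys on an MD5 hash of the raw URL, so the same article reached through different tracking parameters counts as new. If that matters for your corpus, a small normalization step before hashing can help. The normalized_hash helper below is a hypothetical sketch, assuming query strings and fragments carry no article identity:

from urllib.parse import urlparse, urlunparse
import hashlib

def normalized_hash(url: str) -> str:
    # Strip the query string and fragment so utm_* tracking
    # parameters don't defeat deduplication, then hash the result.
    p = urlparse(url)
    clean = urlunparse((p.scheme, p.netloc, p.path.rstrip("/"), "", "", ""))
    return hashlib.md5(clean.encode()).hexdigest()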
Append to Training Dataset
Write deduplicated records to a JSONL file with topic labels for downstream ML training.
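Downstream, the JSONL file loads directly into standard tooling. A quick sanity check with pandas (assuming it is installed) reads the corpus and prints the label distribution:

import pandas as pd

df = pd.read_json("ml_corpus.jsonl", lines=True)
print(df.groupby("topic").size())  # articles collected per label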
Python Implementation
import requests, os, json, hashlib
from pathlib import Path
from datetime import date
from urllib.parse import urlparse

API_KEY = os.environ["SCAVIO_API_KEY"]
SH = {"x-api-key": API_KEY, "Content-Type": "application/json"}

TOPICS_FILE = Path("ml_topics.json")
CORPUS_FILE = Path("ml_corpus.jsonl")
SEEN_FILE = Path("ml_seen_urls.json")

def search_news(query: str) -> list:
    """Run a Google search through the Scavio API and return organic results."""
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers=SH,
        json={"query": query, "platform": "google"},
        timeout=15,
    )
    resp.raise_for_status()
    return resp.json().get("organic", [])

def extract_record(result: dict, topic: str, category: str) -> dict:
    """Map a raw search result to a labeled training record."""
    url = result.get("url", "")
    return {
        "title": result.get("title", ""),
        "snippet": result.get("snippet", ""),
        "url": url,
        "source_domain": urlparse(url).netloc if url else "",
        "topic": topic,
        "category": category,
        "collected_date": str(date.today()),
        "url_hash": hashlib.md5(url.encode()).hexdigest(),
    }

def run():
    topics = json.loads(TOPICS_FILE.read_text())
    # Load the set of URL hashes already in the corpus.
    seen = set()
    if SEEN_FILE.exists():
        seen = set(json.loads(SEEN_FILE.read_text()))
    new_records = []
    for topic_config in topics:
        topic = topic_config["topic"]
        category = topic_config.get("category", "general")
        # Fall back to a generated query biased toward the current year.
        query = topic_config.get("query", f"{topic} news {date.today().year}")
        results = search_news(query)
        for r in results:
            record = extract_record(r, topic, category)
            if record["url_hash"] not in seen:
                new_records.append(record)
                seen.add(record["url_hash"])
    # Append only new records; JSONL keeps one record per line.
    with open(CORPUS_FILE, "a") as f:
        for record in new_records:
            f.write(json.dumps(record) + "\n")
    SEEN_FILE.write_text(json.dumps(list(seen)))
    print(f"Collected {len(new_records)} new articles on {date.today()}")
    for r in new_records[:5]:
        print(f"  [{r['category']}] {r['title'][:60]}")
run()
JavaScript Implementation
const fs = await import('fs');
const crypto = await import('crypto');

const SH = {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'};

const topics = JSON.parse(fs.readFileSync('ml_topics.json', 'utf8'));

// Load URL hashes already in the corpus; start empty if the file is missing.
let seen = new Set();
try { seen = new Set(JSON.parse(fs.readFileSync('ml_seen_urls.json', 'utf8'))); } catch {}

async function searchNews(query) {
  // Run a Google search through the Scavio API and return organic results.
  const r = await fetch('https://api.scavio.dev/api/v1/search', {method: 'POST', headers: SH, body: JSON.stringify({query, platform: 'google'})});
  if (!r.ok) throw new Error(`Search failed: ${r.status}`);
  return (await r.json()).organic || [];
}

const newRecords = [];
const today = new Date().toISOString().split('T')[0];

for (const tc of topics) {
  // Fall back to a generated query biased toward the current year.
  const query = tc.query || `${tc.topic} news ${new Date().getFullYear()}`;
  const results = await searchNews(query);
  for (const r of results) {
    const urlHash = crypto.createHash('md5').update(r.url || '').digest('hex');
    if (!seen.has(urlHash)) {
      const domain = r.url ? new URL(r.url).hostname : '';
      newRecords.push({title: r.title || '', snippet: r.snippet || '', url: r.url || '', sourceDomain: domain, topic: tc.topic, category: tc.category || 'general', collectedDate: today, urlHash});
      seen.add(urlHash);
    }
  }
}

// Append only new records; JSONL keeps one record per line.
fs.appendFileSync('ml_corpus.jsonl', newRecords.map(r => JSON.stringify(r)).join('\n') + (newRecords.length ? '\n' : ''));
fs.writeFileSync('ml_seen_urls.json', JSON.stringify([...seen]));
console.log('Collected ' + newRecords.length + ' new articles');
newRecords.slice(0, 5).forEach(r => console.log('  [' + r.category + '] ' + r.title.slice(0, 60)));
Platforms Used
Web search with knowledge graph, People Also Ask (PAA), and AI overviews