Finding the right dataset for a research or ML project requires searching across data portals, academic repositories, and government databases. This tutorial builds a dataset discovery agent using Mobus MCP for structured data catalog access and Scavio search for discovering datasets across the open web. The agent searches, evaluates metadata, and catalogs relevant datasets. Cost: $0.005 per search query.
Prerequisites
- Python 3.9+ installed
- Claude Code installed
- requests library installed
- A Scavio API key from scavio.dev
Walkthrough
Step 1: Set up search for dataset discovery
Build a search function optimized for finding datasets. Target specific data portals and repositories with site-specific queries.
import requests, os
# Scavio API key must be exported in the environment before running
# (raises KeyError immediately if it is missing).
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
# Well-known dataset portals targeted by site-restricted searches below.
DATA_PORTALS = [
'data.gov', 'kaggle.com', 'huggingface.co/datasets',
'datasetsearch.research.google.com', 'zenodo.org',
'archive.ics.uci.edu', 'registry.opendata.aws'
]
def search_datasets(topic: str, portal: str = None) -> list:
    """Search Scavio for datasets about *topic*.

    Args:
        topic: Subject to search for (e.g. 'climate temperature').
        portal: Optional domain; when given, the query is restricted to that
            site via a `site:` operator.

    Returns:
        List of dicts with 'title', 'url', 'description', and 'portal'
        (the hostname extracted from the result URL).

    Raises:
        requests.HTTPError: If the API responds with an error status
            (e.g. bad key or exhausted quota).
    """
    query = f'site:{portal} {topic} dataset' if portal else f'{topic} dataset'
    resp = requests.post('https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': query, 'country_code': 'us', 'num_results': 10},
        timeout=30)  # don't hang indefinitely on a stalled connection
    resp.raise_for_status()  # surface auth/quota failures instead of silently returning []
    datasets = []
    for r in resp.json().get('organic_results', []):
        link = r['link']
        # For 'https://host/path' the hostname sits at index 2 of the split;
        # the old `'/' in link` check still crashed on short paths like 'a/b'.
        parts = link.split('/')
        datasets.append({
            'title': r['title'],
            'url': link,
            'description': r.get('snippet', ''),
            'portal': parts[2] if len(parts) > 2 else 'unknown',
        })
    return datasets
# Search across all portals
results = search_datasets('climate temperature')
print(f'Found {len(results)} datasets for climate temperature')
for r in results[:5]:
print(f' [{r["portal"]}] {r["title"][:50]}')Step 2: Build the dataset evaluator
Evaluate discovered datasets on metadata quality, format, size indicators, and license information extracted from search snippets.
import re
def evaluate_dataset(dataset: dict) -> dict:
    """Score a discovered dataset from its search-result metadata.

    Applies substring/regex heuristics to the combined title + snippet text:
    detected file formats, a size figure, license keywords, and the most
    recent year mentioned.

    Args:
        dataset: Dict with at least 'title', 'url', and 'description'.

    Returns:
        Dict with 'title' (truncated to 50 chars), 'url', 'formats', 'size',
        'license', and an integer 'score' in [0, 100].
    """
    text = (dataset['title'] + ' ' + dataset['description']).lower()
    # Format detection: simple substring checks against common data formats.
    formats = [fmt for fmt in ('csv', 'json', 'parquet', 'xlsx', 'geojson', 'netcdf', 'hdf5')
               if fmt in text]
    # Size indicator: first number followed by a storage or row-count unit.
    size_match = re.search(r'(\d+(?:\.\d+)?\s*(?:gb|mb|tb|rows|records|entries))', text)
    size = size_match.group(1) if size_match else 'unknown'
    # License keywords, checked most-specific first.
    licenses = [lic for lic in ('cc0', 'cc-by', 'mit', 'apache', 'public domain', 'open', 'creative commons')
                if lic in text]
    # Freshness: any 4-digit year 20xx. The old pattern 20(2[3-9]) silently
    # missed 2030 and later (and every pre-2023 year), zeroing freshness.
    years = [int(y) for y in re.findall(r'\b20\d{2}\b', text)]
    latest_year = max(years) if years else 0
    score = 0
    score += min(len(formats) * 15, 30)      # format variety, capped at 30
    score += 20 if size != 'unknown' else 0  # has size info
    score += 20 if licenses else 0           # has license info
    score += 30 if latest_year >= 2025 else 15 if latest_year >= 2023 else 0  # freshness
    return {
        'title': dataset['title'][:50],
        'url': dataset['url'],
        'formats': formats or ['unknown'],
        'size': size,
        'license': licenses[0] if licenses else 'check source',
        'score': score,
    }
evaluated = [evaluate_dataset(d) for d in results]
evaluated.sort(key=lambda x: -x['score'])
for d in evaluated[:5]:
print(f' [{d["score"]:3d}] {d["title"]} ({d["formats"][0]}, {d["size"]})')Step 3: Run multi-portal discovery pipeline
Search across multiple data portals for a given topic and compile a ranked catalog of datasets.
import time
def discover_datasets(topic: str, portals: list = None) -> list:
    """Search the open web plus selected data portals for *topic* and print
    a ranked catalog of the unique datasets found.

    Args:
        topic: Subject to search for.
        portals: Domains to search individually; defaults to the first four
            entries of DATA_PORTALS to limit query spend.

    Returns:
        Evaluated dataset dicts, sorted by descending score.
    """
    portals = portals or DATA_PORTALS[:4]  # limit portal fan-out to save credits
    collected = []
    seen_urls = set()

    def _absorb(batch):
        # Deduplicate by URL while preserving discovery order.
        for entry in batch:
            if entry['url'] not in seen_urls:
                seen_urls.add(entry['url'])
                collected.append(entry)

    _absorb(search_datasets(topic))        # general web pass first
    for portal in portals:                 # then one scoped pass per portal
        _absorb(search_datasets(topic, portal))
        time.sleep(0.3)                    # gentle pacing between API calls

    # Score every unique hit and rank best-first.
    ranked = sorted((evaluate_dataset(d) for d in collected),
                    key=lambda e: e['score'], reverse=True)
    total_queries = 1 + len(portals)
    print(f'Dataset Discovery Report: {topic}')
    print(f'Searched: {total_queries} queries (${total_queries * 0.005:.3f})')
    print(f'Found: {len(ranked)} unique datasets\n')
    for i, d in enumerate(ranked[:10], 1):
        print(f'{i:2}. [{d["score"]:3d}] {d["title"]}')
        print(f' Format: {d["formats"][0]} | Size: {d["size"]} | License: {d["license"]}')
        print(f' URL: {d["url"]}')
    return ranked
catalog = discover_datasets('global temperature anomaly')Python Example
import requests, os, time
# Reads the Scavio key from the environment (raises KeyError if unset).
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def find_datasets(topic):
datasets = []
for query in [f'{topic} dataset', f'site:kaggle.com {topic}', f'site:huggingface.co {topic} dataset']:
resp = requests.post('https://api.scavio.dev/api/v1/search',
headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
json={'query': query, 'country_code': 'us', 'num_results': 5})
for r in resp.json().get('organic_results', []):
datasets.append({'title': r['title'][:50], 'url': r['link']})
time.sleep(0.3)
seen = set()
unique = [d for d in datasets if d['url'] not in seen and not seen.add(d['url'])]
for d in unique[:5]:
print(f'{d["title"]}: {d["url"]}')
return unique
find_datasets('sentiment analysis')JavaScript Example
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
async function findDatasets(topic) {
const queries = [`${topic} dataset`, `site:kaggle.com ${topic}`, `site:huggingface.co ${topic} dataset`];
const datasets = [];
for (const q of queries) {
const resp = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST',
headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
body: JSON.stringify({ query: q, country_code: 'us', num_results: 5 })
});
for (const r of (await resp.json()).organic_results || []) {
datasets.push({ title: r.title.slice(0, 50), url: r.link });
}
}
const seen = new Set();
return datasets.filter(d => !seen.has(d.url) && seen.add(d.url)).slice(0, 10);
}
findDatasets('sentiment analysis').then(d => d.forEach(x => console.log(x.title)));Expected Output
Found 10 datasets for climate temperature
[kaggle.com] Climate Change: Earth Surface Temperature Data
[data.gov] Global Historical Climatology Network Daily
[huggingface.co] Global Temperature Anomaly Dataset 2026
Dataset Discovery Report: global temperature anomaly
Searched: 5 queries ($0.025)
Found: 18 unique datasets
1. [ 80] Global Temperature Anomaly Dataset 2026
Format: csv | Size: 2.3 gb | License: cc-by
2. [ 65] NOAA Global Temperature Time Series
Format: csv | Size: 450 mb | License: public domain