Aggregate data from multiple search platforms into a unified dataset by querying Google, Amazon, YouTube, Reddit, and Walmart through a single API, normalizing the response fields, deduplicating overlapping results, and merging them into a combined output. Multi-source aggregation is essential for competitive analysis, market research, and product comparison workflows where no single platform provides a complete picture. A unified pipeline eliminates the need to manage separate API keys and response parsers for each platform.
Prerequisites
- Python 3.8+ installed
- requests library installed
- A Scavio API key from scavio.dev
- A query or topic to research across platforms
Walkthrough
Step 1: Query all platforms
Search the same query across all supported platforms in one pass.
import os, requests, time
API_KEY = os.environ['SCAVIO_API_KEY']
PLATFORMS = ['google', 'amazon', 'youtube', 'reddit', 'walmart']
def search_all(query: str) -> dict:
all_results = {}
for platform in PLATFORMS:
resp = requests.post('https://api.scavio.dev/api/v1/search',
headers={'x-api-key': API_KEY},
json={'platform': platform, 'query': query}, timeout=15)
data = resp.json()
all_results[platform] = data.get('organic_results', [])
time.sleep(0.2)
return all_results
results = search_all('wireless earbuds')
for platform, items in results.items():
print(f'{platform}: {len(items)} results')Step 2: Normalize response fields
Map platform-specific fields to a common schema for unified processing.
def normalize_result(result: dict, platform: str) -> dict:
    """Map one platform-specific result onto the shared schema.

    Falls back across the field aliases different platforms use
    (link/url, snippet/description, source/channel) and defaults
    every field to '' when absent.
    """
    get = result.get
    return {
        'platform': platform,
        'title': get('title', ''),
        'url': get('link', get('url', '')),
        'description': get('snippet', get('description', '')),
        'price': get('price', ''),
        'rating': get('rating', ''),
        'thumbnail': get('thumbnail', ''),
        'source': get('source', get('channel', '')),
    }
def normalize_all(raw_results: dict) -> list:
    """Flatten {platform: [raw results]} into one list of normalized dicts."""
    return [
        normalize_result(item, platform)
        for platform, items in raw_results.items()
        for item in items
    ]
normalized = normalize_all(results)
print(f'Total normalized results: {len(normalized)}')
for r in normalized[:3]:
print(f" [{r['platform']}] {r['title'][:50]}")Step 3: Deduplicate results
Remove duplicate entries that appear across multiple platforms.
from urllib.parse import urlparse
def deduplicate(results: list) -> list:
    """Remove cross-platform duplicates by title and by domain.

    A result is dropped when the first 60 characters of its lowercased,
    stripped title were already seen. Per domain, only the result with
    the longest description is kept; a better result replaces the
    previous one in place.

    Args:
        results: normalized result dicts with 'title', 'url', 'description'.

    Returns:
        A new list with duplicates removed (input dicts are shared, not copied).
    """
    seen_titles = set()
    best_by_domain = {}
    deduped = []
    for r in results:
        title_key = r['title'].lower().strip()[:60]
        if title_key in seen_titles:
            continue
        seen_titles.add(title_key)
        domain = urlparse(r['url']).netloc if r['url'] else ''
        if domain and domain in best_by_domain:
            existing = best_by_domain[domain]
            if len(r['description']) > len(existing['description']):
                # Replace in place by identity: list.remove(existing) would
                # match by equality and could delete a different-but-equal
                # dict, and remove+append silently moved the winner to the end.
                idx = next(i for i, d in enumerate(deduped) if d is existing)
                deduped[idx] = r
                best_by_domain[domain] = r
            continue
        if domain:
            best_by_domain[domain] = r
        deduped.append(r)
    return deduped
deduped = deduplicate(normalized)
print(f'Before: {len(normalized)}, After: {len(deduped)} (removed {len(normalized) - len(deduped)} dupes)')Step 4: Merge and rank
Merge results from all platforms and rank by relevance signals.
def rank_results(results: list) -> list:
    """Score each result in place by field completeness, then sort.

    One point for each non-empty description/price/rating/url, plus a
    one-point bonus for Google results; the total is stored on the dict
    as 'relevance_score'. Returns a new list sorted highest-score first.
    """
    signal_fields = ('description', 'price', 'rating', 'url')
    for entry in results:
        score = sum(1 for field in signal_fields if entry[field])
        if entry['platform'] == 'google':
            score += 1  # Boost web results for general queries
        entry['relevance_score'] = score
    return sorted(results, key=lambda e: e['relevance_score'], reverse=True)
ranked = rank_results(deduped)
for r in ranked[:5]:
print(f" [{r['platform']}] score={r['relevance_score']} {r['title'][:45]}")Step 5: Export aggregated dataset
Save the merged, deduplicated, and ranked results as a single JSON dataset.
import json
def export_aggregated(results: list, output_path: str):
dataset = {
'query': 'wireless earbuds',
'platforms': PLATFORMS,
'total_results': len(results),
'platform_breakdown': {},
'results': results,
}
for p in PLATFORMS:
dataset['platform_breakdown'][p] = len([r for r in results if r['platform'] == p])
with open(output_path, 'w') as f:
json.dump(dataset, f, indent=2)
print(f'Exported {len(results)} results to {output_path}')
for p, count in dataset['platform_breakdown'].items():
print(f' {p}: {count}')
export_aggregated(ranked, 'aggregated_results.json')Python Example
import requests, os
H = {'x-api-key': os.environ['SCAVIO_API_KEY']}
def multi_search(query, platforms=None):
platforms = platforms or ['google', 'amazon', 'youtube']
all_results = []
for p in platforms:
data = requests.post('https://api.scavio.dev/api/v1/search', headers=H,
json={'platform': p, 'query': query}).json()
for r in data.get('organic_results', [])[:3]:
all_results.append({'platform': p, 'title': r.get('title', ''), 'url': r.get('link', '')})
return all_results
print(multi_search('wireless earbuds'))JavaScript Example
const H = {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'};
async function multiSearch(query, platforms = ['google', 'amazon', 'youtube']) {
const all = [];
for (const p of platforms) {
const r = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST', headers: H,
body: JSON.stringify({platform: p, query})
});
const results = (await r.json()).organic_results || [];
results.slice(0, 3).forEach(r => all.push({platform: p, title: r.title, url: r.link}));
}
return all;
}
multiSearch('wireless earbuds').then(console.log);Expected Output
A unified dataset combining normalized, deduplicated, and ranked results from Google, Amazon, YouTube, Reddit, and Walmart for comprehensive multi-source analysis.