Ecommerce data from search APIs can contain missing prices, stale listings, incorrect ratings, and duplicate products. Before feeding this data into pricing engines, inventory systems, or analytics dashboards, you need automated validation. This tutorial builds a data quality pipeline that checks product data from the Scavio API for completeness, price anomalies, freshness, and duplicates. Each product search costs $0.005, and the validation logic runs locally at zero additional cost.
Prerequisites
- Python 3.9+ installed
- requests library installed
- A Scavio API key from scavio.dev
- Basic understanding of product data structures
Walkthrough
Step 1: Fetch product data from the API
Search for products on Amazon via the Scavio API. The response includes title, price, rating, review count, and availability.
import requests, os
API_KEY = os.environ['SCAVIO_API_KEY']
def fetch_products(query: str, timeout: float = 30.0) -> list:
    """Search Amazon (US marketplace) through the Scavio API.

    Args:
        query: Search phrase to send to the API.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The list stored under the response's 'products' key, or an empty
        list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': API_KEY, 'Content-Type': 'application/json'},
        json={'platform': 'amazon', 'query': query, 'marketplace': 'US'},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of validating an error payload.
    resp.raise_for_status()
    return resp.json().get('products', [])
products = fetch_products('wireless mouse')
print(f'Fetched {len(products)} products')
Step 2: Check for missing required fields
Validate that each product has the essential fields: title, price, rating, and link. Flag incomplete records so they can be excluded or enriched.
# Fields every product record must carry to count as complete.
REQUIRED_FIELDS = ['title', 'price', 'rating', 'link']


def check_completeness(products: list) -> dict:
    """Split products into complete and incomplete records.

    A field counts as missing when it is absent or falsy (``None``, ``''``,
    ``0``), so a zero price or zero rating is reported as missing.

    Args:
        products: Product dicts to validate.

    Returns:
        A dict with:
            'complete': number of products with every required field,
            'incomplete': number of products missing at least one field,
            'details': one ``{'product': title, 'missing': [fields]}`` entry
                per incomplete product,
            'completeness_rate': percentage of complete products (0 for an
                empty input).
    """
    complete = []
    incomplete = []
    for p in products:
        missing = [f for f in REQUIRED_FIELDS if not p.get(f)]
        if missing:
            # Use `or` rather than a .get() default: the key may be present
            # with a None/empty value, and callers slice this label as a str.
            label = p.get('title') or 'unknown'
            incomplete.append({'product': label, 'missing': missing})
        else:
            complete.append(p)
    return {
        'complete': len(complete),
        'incomplete': len(incomplete),
        'details': incomplete,
        # max(..., 1) keeps an empty product list from dividing by zero.
        'completeness_rate': len(complete) / max(len(products), 1) * 100,
    }
report = check_completeness(products)
print(f'Completeness: {report["completeness_rate"]:.0f}% ({report["complete"]}/{report["complete"] + report["incomplete"]})')
for d in report['details']:
    print(f' Missing {d["missing"]} for: {d["product"][:50]}')
Step 3: Detect price anomalies
Flag products with prices that are suspiciously low or high compared to the median. This catches listing errors and third-party gouging.
import statistics
def detect_price_anomalies(products: list, threshold: float = 2.0) -> list:
    """Flag products whose price lies far from the median price.

    Prices are read from each product's 'price' value, with '$' and ','
    stripped before conversion to float. Products with a missing, falsy, or
    unparseable price are ignored.

    Args:
        products: Product dicts to inspect.
        threshold: Minimum distance from the median, measured in sample
            standard deviations, for a price to be reported.

    Returns:
        One dict per anomalous product with 'product' (title truncated to 50
        characters), 'price', 'median', and 'deviation' (rounded to one
        decimal). Empty when fewer than three prices could be parsed.
    """
    # Keep each parsed price paired with its own product so that skipping an
    # unparseable price can never shift prices onto the wrong product.
    parsed = []
    for p in products:
        raw = p.get('price')
        if not raw:
            continue
        try:
            value = float(str(raw).replace('$', '').replace(',', ''))
        except ValueError:
            continue
        parsed.append((p, value))

    # Too few usable data points to say anything meaningful about spread.
    if len(parsed) < 3:
        return []

    prices = [value for _, value in parsed]
    median = statistics.median(prices)
    stdev = statistics.stdev(prices)

    anomalies = []
    for p, price in parsed:
        # Floor the divisor so identical prices (stdev == 0) do not divide by zero.
        deviation = abs(price - median) / max(stdev, 0.01)
        if deviation > threshold:
            anomalies.append({
                'product': (p.get('title') or '')[:50],
                'price': price,
                'median': median,
                'deviation': round(deviation, 1),
            })
    return anomalies
anomalies = detect_price_anomalies(products)
for a in anomalies:
    print(f'ANOMALY: ${a["price"]} (median ${a["median"]}) - {a["product"]}')
Step 4: Check for duplicate listings
Detect duplicate or near-duplicate products by comparing normalized titles. This prevents double-counting in pricing analysis.
from difflib import SequenceMatcher
def find_duplicates(products: list, similarity_threshold: float = 0.85) -> list:
    """Find pairs of products whose titles are nearly identical.

    Titles are compared case-insensitively with surrounding whitespace
    removed. Products without a usable title are skipped: otherwise every
    untitled product would normalise to '' and match every other untitled
    product at 100% similarity.

    Args:
        products: Product dicts to compare.
        similarity_threshold: Minimum ``SequenceMatcher`` ratio (0.0-1.0) for
            two titles to be reported as duplicates.

    Returns:
        One dict per matching pair with 'product_a' and 'product_b' (original
        titles truncated to 50 characters) and 'similarity' as a percentage
        rounded to one decimal. Pairs appear in input order.
    """
    # (original title, normalised title) for every product that has a title.
    titled = []
    for p in products:
        raw = p.get('title') or ''
        normalized = raw.lower().strip()
        if normalized:
            titled.append((raw, normalized))

    dupes = []
    for i, (raw_a, norm_a) in enumerate(titled):
        # Compare each title only with later ones so a pair is reported once.
        for raw_b, norm_b in titled[i + 1:]:
            ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            if ratio >= similarity_threshold:
                dupes.append({
                    'product_a': raw_a[:50],
                    'product_b': raw_b[:50],
                    'similarity': round(ratio * 100, 1),
                })
    return dupes
dupes = find_duplicates(products)
for d in dupes:
    print(f'DUPLICATE ({d["similarity"]}%): {d["product_a"]} <-> {d["product_b"]}')
Step 5: Generate a quality scorecard
Combine all checks into a single quality score. Use this to decide whether to trust the data or re-fetch it.
def quality_scorecard(products: list) -> dict:
    """Run every check and fold the results into one quality score.

    The score starts at 100 and loses up to 30 points in proportion to the
    share of incomplete products, 5 points per price anomaly, and 3 points
    per duplicate pair. The reported score is floored at 0; the verdict is
    derived from the unfloored score.

    Args:
        products: Product dicts to score.

    Returns:
        A dict with 'total_products', 'quality_score', 'completeness_rate',
        'anomaly_count', 'duplicate_pairs', and 'verdict' ('PASS' at 70 or
        above, 'REVIEW' at 50 or above, otherwise 'FAIL').
    """
    completeness_report = check_completeness(products)
    anomaly_count = len(detect_price_anomalies(products))
    duplicate_count = len(find_duplicates(products))
    product_count = len(products)

    # Share of incomplete products; the floor of 1 avoids dividing by zero.
    incomplete_share = completeness_report['incomplete'] / max(product_count, 1)
    raw_score = 100 - incomplete_share * 30 - anomaly_count * 5 - duplicate_count * 3

    if raw_score >= 70:
        verdict = 'PASS'
    elif raw_score >= 50:
        verdict = 'REVIEW'
    else:
        verdict = 'FAIL'

    return {
        'total_products': product_count,
        'quality_score': max(round(raw_score, 1), 0),
        'completeness_rate': round(completeness_report['completeness_rate'], 1),
        'anomaly_count': anomaly_count,
        'duplicate_pairs': duplicate_count,
        'verdict': verdict,
    }
card = quality_scorecard(products)
for k, v in card.items():
    print(f'{k}: {v}')
Python Example
import os, requests, statistics
API_KEY = os.environ['SCAVIO_API_KEY']
def fetch_products(query: str, timeout: float = 30.0) -> list:
    """Search Amazon (US marketplace) through the Scavio API.

    Args:
        query: Search phrase to send to the API.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list stored under the response's 'products' key, or an empty
        list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': API_KEY, 'Content-Type': 'application/json'},
        json={'platform': 'amazon', 'query': query, 'marketplace': 'US'},
        timeout=timeout,
    )
    # Surface HTTP errors rather than treating an error body as "no products".
    resp.raise_for_status()
    return resp.json().get('products', [])
def validate(products: list) -> dict:
    """Summarise how many products carry a title, price, and rating.

    A field that is absent or falsy counts as missing.

    Args:
        products: Product dicts to check.

    Returns:
        A dict with 'total' (number of products), 'complete' (number with
        all three fields), and 'rate' (completeness as a whole-number
        percentage string such as '87%').
    """
    product_count = len(products)
    needed = ('title', 'price', 'rating')

    complete_count = 0
    for item in products:
        if all(item.get(name) for name in needed):
            complete_count += 1

    # Floor the divisor at 1 so an empty list reports 0% instead of failing.
    percent = complete_count / max(product_count, 1) * 100
    return {
        'total': product_count,
        'complete': complete_count,
        'rate': f'{percent:.0f}%',
    }
products = fetch_products('wireless mouse')
print(validate(products))
JavaScript Example
const API_KEY = process.env.SCAVIO_API_KEY;
/**
 * Search Amazon (US marketplace) through the Scavio API.
 *
 * @param {string} query - Search phrase to send to the API.
 * @returns {Promise<Array>} The response's `products` array, or an empty
 *   array when that field is missing.
 * @throws {Error} If the API answers with a non-2xx status.
 */
async function fetchProducts(query) {
  const resp = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: { 'x-api-key': API_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({ platform: 'amazon', query, marketplace: 'US' })
  });
  // fetch() only rejects on network failure, so HTTP error statuses must be
  // checked explicitly or an error payload is mistaken for "no products".
  if (!resp.ok) {
    throw new Error(`Scavio API request failed: ${resp.status} ${resp.statusText}`);
  }
  return (await resp.json()).products || [];
}
/**
 * Fetch products for a sample query and print how many of them carry a
 * title, price, and rating.
 */
async function main() {
  const products = await fetchProducts('wireless mouse');
  const complete = products.filter(p => p.title && p.price && p.rating).length;
  // Floor the divisor at 1 so an empty result prints 0% instead of NaN%.
  const rate = (complete / Math.max(products.length, 1) * 100).toFixed(0);
  console.log(`${complete}/${products.length} complete (${rate}%)`);
}
main().catch(console.error);
Expected Output
Fetched 15 products
Completeness: 87% (13/15)
Missing ['price'] for: Generic USB Mouse Adapter Cable...
Missing ['rating'] for: Bulk Pack Wireless Mouse 10-Unit...
ANOMALY: $149.99 (median $24.99) - Premium Ergonomic Wireless Mouse Gold
DUPLICATE (91.2%): Logitech M720 Triathlon Multi-Device <-> Logitech M720 Triathlon Wireless
total_products: 15
quality_score: 88.0
completeness_rate: 86.7
anomaly_count: 1
duplicate_pairs: 1
verdict: PASS