Detecting content theft with a SERP API works by searching for distinctive phrases from your own content and checking whether other domains have published them without permission. This tutorial shows you how to extract unique sentences from your articles, query a search API for exact-match results, and flag domains that are republishing your work so you can file takedown requests or adjust your SEO strategy.
Prerequisites
- Scavio API key (free 250 credits/mo at scavio.dev)
- Python 3.9+ or Node.js 18+
- A list of URLs or text content you want to monitor
Walkthrough
Step 1: Extract unique phrases from your content
Select 3-5 distinctive sentences from each article you want to monitor. Avoid generic phrases that would appear on many sites. Pick sentences with specific data points, unique phrasing, or branded terminology that would only appear elsewhere if copied.
# Fingerprint registry: maps each monitored article URL to a handful of
# distinctive sentences. Each sentence is pre-wrapped in double quotes so
# the search API performs an exact-match lookup.
content_fingerprints = {
'https://yourblog.com/api-comparison-2026': [
'"The effective cost per query drops 38% when you factor in structured data extraction"',
'"Batch endpoints amortize cold-start latency across 50-query chunks"',
'"Three providers silently inject affiliate links into knowledge panel results"',
],
'https://yourblog.com/serp-monitoring-guide': [
'"Position tracking without render-side JS execution misses 23% of dynamic SERP features"',
'"Weekly crawl cadence catches ranking drops 4.2 days late on average"',
],
}
Step 2: Search for each fingerprint via the API
Query the Scavio search API with each fingerprint phrase wrapped in double quotes for exact match. Any result that is not your own domain is a potential content thief.
# Query the SERP API once per fingerprint phrase and collect every organic
# result hosted somewhere other than our own domain.
import requests
API_KEY = 'your_scavio_api_key'  # replace with your real Scavio API key
matches = []  # one record per suspicious search hit
for source_url, phrases in content_fingerprints.items():
for phrase in phrases:
# Phrase is already wrapped in double quotes, so this is an exact-match search.
resp = requests.post(
'https://api.scavio.dev/api/v1/search',
headers={'x-api-key': API_KEY},
json={'query': phrase, 'num': 10}
)
# NOTE(review): no timeout or HTTP-status check here — the full example below
# should add both before parsing the body.
data = resp.json()
for item in data.get('organic', []):
# NOTE(review): substring test — a hostname comparison would be stricter
# (this also skips unrelated domains that merely contain 'yourblog.com').
if 'yourblog.com' not in item.get('url', ''):
matches.append({
'source': source_url,
'phrase': phrase,
'thief_url': item['url'],
'thief_title': item.get('title', ''),
})
Step 3: Deduplicate and score matches
Group matches by domain and count how many fingerprint phrases each domain matched. A domain matching multiple unique phrases is almost certainly scraping your content rather than coincidentally using similar language.
# Aggregate hits by offending domain: several distinct phrase matches from
# one domain indicate scraping rather than coincidental phrasing.
from collections import Counter
from urllib.parse import urlparse
domain_counts = Counter()  # domain -> number of fingerprint phrases matched
for m in matches:
domain = urlparse(m['thief_url']).netloc  # netloc = hostname (plus port, if any)
domain_counts[domain] += 1
print('Domains with most matches:')
for domain, count in domain_counts.most_common(10):
print(f' {domain}: {count} phrase matches')
Step 4: Generate a theft report
Export the results into a structured report with the source URL, matched phrase, offending URL, and match count. This report is what you need for DMCA takedown requests or to send to your legal team.
import json
report = {
'scan_date': '2026-05-12',
'total_matches': len(matches),
'unique_domains': len(domain_counts),
'top_offenders': [
{'domain': d, 'phrase_matches': c}
for d, c in domain_counts.most_common(10)
],
'all_matches': matches,
}
with open('content_theft_report.json', 'w') as f:
json.dump(report, f, indent=2)
print(f'Report saved: {len(matches)} matches across {len(domain_counts)} domains')
Python Example
"""Scan search results for stolen content and write a JSON theft report.

For each fingerprint phrase, queries the Scavio SERP API (exact match) and
records every result whose hostname is not our own domain, then aggregates
the hits per offending domain and saves a structured report.
"""
import requests
import json
from collections import Counter
from datetime import date
from urllib.parse import urlparse

API_KEY = 'your_scavio_api_key'
YOUR_DOMAIN = 'yourblog.com'

# Each monitored article URL maps to distinctive sentences, pre-wrapped in
# double quotes so the API performs an exact-match search.
content_fingerprints = {
    'https://yourblog.com/api-comparison-2026': [
        '"The effective cost per query drops 38% when you factor in structured data extraction"',
        '"Batch endpoints amortize cold-start latency across 50-query chunks"',
    ],
    'https://yourblog.com/serp-monitoring-guide': [
        '"Position tracking without render-side JS execution misses 23% of dynamic SERP features"',
    ],
}


def _is_own_domain(url):
    """Return True when *url* is hosted on YOUR_DOMAIN or one of its subdomains.

    Compares the parsed hostname rather than doing a raw substring test, so
    'notyourblog.com' is correctly flagged as foreign and a copy hosted on
    'yourblog.com.evil.tld' is not mistaken for our own content.
    """
    host = urlparse(url).netloc.lower()
    return host == YOUR_DOMAIN or host.endswith('.' + YOUR_DOMAIN)


matches = []  # one record per suspicious search hit
for source_url, phrases in content_fingerprints.items():
    for phrase in phrases:
        resp = requests.post(
            'https://api.scavio.dev/api/v1/search',
            headers={'x-api-key': API_KEY},
            json={'query': phrase, 'num': 10},
            timeout=30,  # fail fast instead of hanging on a dead endpoint
        )
        resp.raise_for_status()  # surface HTTP errors instead of parsing an error body
        data = resp.json()
        for item in data.get('organic', []):
            url = item.get('url')
            if not url:
                continue  # defensive: some SERP entries may lack a URL
            if not _is_own_domain(url):
                matches.append({
                    'source': source_url,
                    'phrase': phrase,
                    'thief_url': url,
                    'thief_title': item.get('title', ''),
                })

# Aggregate by offending domain: several phrase hits from one domain indicate
# scraping rather than coincidental phrasing.
domain_counts = Counter(urlparse(m['thief_url']).netloc for m in matches)
report = {
    'scan_date': date.today().isoformat(),  # actual scan date, not a hard-coded one
    'total_matches': len(matches),
    'unique_domains': len(domain_counts),
    'top_offenders': [
        {'domain': d, 'phrase_matches': c}
        for d, c in domain_counts.most_common(10)
    ],
    'all_matches': matches,
}
with open('content_theft_report.json', 'w') as f:
    json.dump(report, f, indent=2)
print(f'Scan complete: {len(matches)} matches across {len(domain_counts)} domains')
JavaScript Example
const fs = require('fs');
const API_KEY = 'your_scavio_api_key';
const YOUR_DOMAIN = 'yourblog.com';
const contentFingerprints = {
'https://yourblog.com/api-comparison-2026': [
'"The effective cost per query drops 38% when you factor in structured data extraction"',
'"Batch endpoints amortize cold-start latency across 50-query chunks"',
],
'https://yourblog.com/serp-monitoring-guide': [
'"Position tracking without render-side JS execution misses 23% of dynamic SERP features"',
],
};
async function main() {
const matches = [];
for (const [sourceUrl, phrases] of Object.entries(contentFingerprints)) {
for (const phrase of phrases) {
const resp = await fetch('https://api.scavio.dev/api/v1/search', {
method: 'POST',
headers: { 'x-api-key': API_KEY, 'Content-Type': 'application/json' },
body: JSON.stringify({ query: phrase, num: 10 }),
});
const data = await resp.json();
for (const item of data.organic || []) {
if (!item.url?.includes(YOUR_DOMAIN)) {
matches.push({
source: sourceUrl,
phrase,
thief_url: item.url,
thief_title: item.title || '',
});
}
}
}
}
const domainCounts = {};
for (const m of matches) {
const domain = new URL(m.thief_url).hostname;
domainCounts[domain] = (domainCounts[domain] || 0) + 1;
}
const sorted = Object.entries(domainCounts).sort((a, b) => b[1] - a[1]);
const report = {
scan_date: '2026-05-12',
total_matches: matches.length,
unique_domains: sorted.length,
top_offenders: sorted.slice(0, 10).map(([domain, count]) => ({ domain, phrase_matches: count })),
all_matches: matches,
};
fs.writeFileSync('content_theft_report.json', JSON.stringify(report, null, 2));
console.log(`Scan complete: ${matches.length} matches across ${sorted.length} domains`);
}
main();
Expected Output
Scan complete: 7 matches across 4 domains