Enriching leads from multiple sources creates duplicate records: the same company appears as 'acme.com', 'www.acme.com', and 'Acme Inc' across different providers. This deduplication pipeline normalizes domains and company names (stripping legal suffixes such as 'Inc' and 'LLC'), groups records that share a normalized key, merges them, and assigns confidence scores. SERP enrichment via Scavio at $0.005/query fills gaps after merging.
Prerequisites
- Python 3.8+
- requests library
- A Scavio API key from scavio.dev
- Lead data from multiple sources
Walkthrough
Step 1: Normalize company identifiers
Standardize domains and company names for matching.
import os, requests, json, re
from collections import defaultdict
API_KEY = os.environ['SCAVIO_API_KEY']
SH = {'x-api-key': API_KEY, 'Content-Type': 'application/json'}
def normalize_domain(domain):
    """Reduce a URL or hostname to a bare lowercase domain for matching.

    Removes the http(s) scheme, a leading ``www.``, and everything after
    the host: path, query string, fragment, explicit port, and a trailing
    root dot.

    Args:
        domain: URL or hostname string; may be empty or None.

    Returns:
        The normalized domain, or '' when *domain* is falsy.
    """
    if not domain:
        return ''
    domain = domain.lower().strip()
    domain = re.sub(r'^https?://', '', domain)
    # Cut at the first path, query, or fragment delimiter so that
    # 'acme.com?ref=x' and 'acme.com#top' collapse like 'acme.com/about'.
    domain = re.split(r'[/?#]', domain, maxsplit=1)[0]
    # Drop an explicit port ('acme.com:8080') and a trailing root dot
    # ('acme.com.'); neither should produce a separate dedup key.
    domain = domain.split(':')[0].rstrip('.')
    domain = re.sub(r'^www\.', '', domain)
    return domain
def normalize_name(name):
    """Lowercase a company name and strip trailing legal suffixes.

    Only whole trailing words are removed ('inc', 'llc', 'ltd', 'corp',
    'co', each optionally followed by periods), so text inside the name
    such as the 'co' in 'Coca Cola Company' is left intact. Stacked
    suffixes ('Acme Co Inc') are all removed.

    Args:
        name: Company name string; may be empty or None.

    Returns:
        The normalized name, or '' when *name* is falsy.
    """
    if not name:
        return ''
    suffixes = ('inc', 'llc', 'ltd', 'corp', 'co')
    name = name.lower().strip()
    while True:
        head, sep, tail = name.rpartition(' ')
        # Stop once the last word is not a legal suffix, or the name is a
        # single word (never strip a name down to nothing).
        if not sep or tail.rstrip('.') not in suffixes:
            return name
        # Also drop the separator that preceded the suffix ('Acme, Inc.').
        name = head.rstrip(' ,')
# Test normalization: print each raw value next to its normalized form.
test_domains = [
    'https://www.acme.com/about',
    'acme.com',
    'WWW.ACME.COM',
]
test_names = [
    'Acme Inc.',
    'ACME LLC',
    'acme',
    'Acme Co.',
]
for raw_domain in test_domains:
    print(f' {raw_domain:30} -> {normalize_domain(raw_domain)}')
for n in test_names: print(f' {n:30} -> {normalize_name(n)}')
Step 2: Merge records from multiple sources
Group records by normalized domain and merge fields.
def merge_records(records):
    """Collapse records that share a normalized key into one record each.

    The key is the normalized domain, or the normalized company name when
    the record has no usable domain. Records with neither are dropped.

    Args:
        records: Iterable of lead dicts (keys such as 'domain', 'company',
            'source', 'email', ...).

    Returns:
        A list of merged dicts. Each has 'domain' (the grouping key),
        'sources' (group size), 'source_names', and, for every mergeable
        field, the first truthy value seen in the group.
    """
    mergeable = ('company', 'email', 'phone', 'industry', 'description', 'employee_count')

    buckets = defaultdict(list)
    for rec in records:
        ident = normalize_domain(rec.get('domain', ''))
        if not ident:
            ident = normalize_name(rec.get('company', ''))
        if not ident:
            continue
        buckets[ident].append(rec)

    result = []
    for ident, members in buckets.items():
        combined = {
            'domain': ident,
            'sources': len(members),
            'source_names': [member.get('source', 'unknown') for member in members],
        }
        # First truthy value wins, so earlier records take priority.
        for member in members:
            for name in mergeable:
                value = member.get(name)
                if value and not combined.get(name):
                    combined[name] = value
        result.append(combined)
    return result
# Simulate multi-source data: three providers describe Acme, two describe Beta Labs.
records = [
    dict(company='Acme Inc.', domain='acme.com', email='info@acme.com', source='apollo'),
    dict(company='Acme', domain='www.acme.com', phone='555-0100', source='clearbit'),
    dict(company='ACME LLC', domain='acme.com', industry='Software', source='linkedin'),
    dict(company='Beta Labs', domain='betalabs.io', email='hi@betalabs.io', source='apollo'),
    dict(company='Beta Labs Inc', domain='betalabs.io', description='Dev tools startup', source='crunchbase'),
]
merged = merge_records(records)
for m in merged:
print(f'{m["domain"]}: {m["sources"]} sources ({" + ".join(m["source_names"])})')
for field in ['company', 'email', 'phone', 'industry']:
if m.get(field): print(f' {field}: {m[field]}')
Step 3: Fill gaps with SERP enrichment
Use search API to fill missing fields after merge.
def serp_fill(record):
    """Fill a record's missing description/industry from one SERP query.

    Searches for the company name (or the domain when no company name is
    set) and updates *record* in place. The description comes from the
    top result's snippet; the industry is the first known keyword found
    in the highest-ranked snippet that contains one.

    Args:
        record: Merged lead dict. Must contain 'domain' when 'company' is
            missing or empty.

    Returns:
        Tuple ``(record, cost)``: cost is 0 when nothing was missing (no
        request is sent) and 0.005 when a query was issued.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    missing = [f for f in ('description', 'industry') if not record.get(f)]
    if not missing:
        return record, 0

    # `or` rather than a .get() default, so an empty company string also
    # falls back to the domain instead of searching for ' company'.
    company = record.get('company') or record['domain']
    response = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers=SH,
        json={'query': f'{company} company', 'country_code': 'us'},
        timeout=30,  # requests never times out by default
    )
    data = response.json()
    # Tolerate a missing or null result list.
    organic = (data.get('organic_results') or [])[:3]

    if 'description' in missing and organic:
        record['description'] = (organic[0].get('snippet') or '')[:200]

    if 'industry' in missing:
        keywords = ('software', 'saas', 'fintech', 'healthcare', 'ecommerce', 'marketing')
        for result in organic:
            snippet = (result.get('snippet') or '').lower()
            match = next((kw for kw in keywords if kw in snippet), None)
            if match:
                record['industry'] = match.capitalize()
                # Stop at the highest-ranked match; without leaving the
                # outer loop a later result would overwrite it.
                break

    return record, 0.005
# Enrich every merged record and accumulate the per-query cost.
total_cost = 0
for lead in merged:
    lead, spent = serp_fill(lead)
    total_cost += spent
    # Lists the fields that are populated after enrichment.
    present = [name for name in ('description', 'industry') if lead.get(name)]
    print(f'{lead["domain"]}: filled {present}')
print(f'SERP enrichment cost: ${total_cost:.3f}')
Step 4: Score confidence and export
Assign confidence scores based on source count and field completeness.
def confidence_score(record):
    """Score a merged record from 0 to 99 by source count and completeness.

    Each source is worth 15 points, capped at 45 (three or more sources).
    Each populated field among company, email, phone, industry,
    description and employee_count is worth 9 points (54 for all six).

    Args:
        record: Merged lead dict; 'sources' defaults to 1 when absent.

    Returns:
        The integer confidence score, capped at 99.
    """
    tracked = ('company', 'email', 'phone', 'industry', 'description', 'employee_count')
    source_points = min(record.get('sources', 1) * 15, 45)
    field_points = 9 * sum(bool(record.get(name)) for name in tracked)
    return min(source_points + field_points, 99)
def export_deduped(records, filename='deduped_leads.json'):
    """Score, sort, write, and summarize the deduplicated records.

    Adds a 'confidence' key to every record, sorts *records* in place by
    descending confidence, writes them to *filename* as indented JSON,
    and prints a distribution summary followed by one line per record.

    Args:
        records: List of merged lead dicts; mutated in place.
        filename: Path of the JSON file to write.
    """
    for rec in records:
        rec['confidence'] = confidence_score(rec)
    records.sort(key=lambda rec: rec['confidence'], reverse=True)

    with open(filename, 'w') as out:
        json.dump(records, out, indent=2)

    print(f'\nExported {len(records)} deduplicated records to {filename}')
    print('Confidence distribution:')

    scores = [rec['confidence'] for rec in records]
    high = sum(1 for score in scores if score >= 70)
    med = sum(1 for score in scores if 40 <= score < 70)
    low = sum(1 for score in scores if score < 40)
    print(f' High (70+): {high} | Medium (40-69): {med} | Low (<40): {low}')

    for rec in records:
        company = rec.get("company", "N/A")
        print(f' [{rec["confidence"]:2}] {rec["domain"]:20} | {rec["sources"]} sources | {company}')
export_deduped(merged)
Python Example
import os, requests, re
from collections import defaultdict
SH = {'x-api-key': os.environ['SCAVIO_API_KEY'], 'Content-Type': 'application/json'}
def dedup(records):
    """Group records by normalized domain and report each group.

    Prints one line per domain with its group size. For domains that
    appear more than once, runs a SERP query and prints the first 60
    characters of the top result's snippet. Records without a domain are
    skipped.

    Args:
        records: Iterable of dicts with an optional 'domain' key.

    Raises:
        requests.RequestException: If a SERP request fails or times out.
        ValueError: If a SERP response body is not valid JSON.
    """
    groups = defaultdict(list)
    for r in records:
        raw = (r.get('domain') or '').lower().strip()
        # Scheme and 'www.' are stripped independently; when the scheme was
        # only removed together with 'www.', 'https://acme.com' produced
        # the key 'https:'.
        key = re.sub(r'^(https?://)?(www\.)?', '', raw).split('/')[0]
        if key:
            groups[key].append(r)

    for domain, group in groups.items():
        print(f'{domain}: {len(group)} records merged')
        if len(group) > 1:
            response = requests.post(
                'https://api.scavio.dev/api/v1/search',
                headers=SH,
                json={'query': f'{domain} company', 'country_code': 'us'},
                timeout=30,  # requests never times out by default
            )
            data = response.json()
            desc = (data.get('organic_results') or [{}])[0].get('snippet', '')[:60]
            print(f' SERP: {desc}')
dedup([{'domain': 'acme.com', 'source': 'a'}, {'domain': 'www.acme.com', 'source': 'b'}])
JavaScript Example
const SH = { 'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json' };
/**
 * Reduce a URL or hostname to a bare lowercase domain.
 * Strips an http(s) scheme, a leading "www.", and anything after the first "/".
 * Falsy input yields an empty string.
 */
function normDomain(d) {
  const lowered = (d || '').toLowerCase();
  const withoutScheme = lowered.replace(/^https?:\/\//, '');
  const withoutWww = withoutScheme.replace(/^www\./, '');
  const [host] = withoutWww.split('/');
  return host;
}
/**
 * Group records by normalized domain and log the size of each group.
 * Resolves to undefined; the only output is the console log.
 */
async function dedup(records) {
  const byDomain = {};
  for (const record of records) {
    const domain = normDomain(record.domain);
    byDomain[domain] = byDomain[domain] || [];
    byDomain[domain].push(record);
  }
  Object.keys(byDomain).forEach((domain) => {
    console.log(`${domain}: ${byDomain[domain].length} records merged`);
  });
}
dedup([{domain: 'acme.com'}, {domain: 'www.acme.com'}]);
Expected Output
https://www.acme.com/about -> acme.com
acme.com -> acme.com
WWW.ACME.COM -> acme.com
Acme Inc. -> acme
ACME LLC -> acme
acme.com: 3 sources (apollo + clearbit + linkedin)
company: Acme Inc.
email: info@acme.com
phone: 555-0100
industry: Software
Exported 2 deduplicated records to deduped_leads.json
Confidence distribution:
High (70+): 1 | Medium (40-69): 1 | Low (<40): 0
[81] acme.com | 3 sources | Acme Inc.
[57] betalabs.io | 2 sources | Beta Labs