B2B directories like Clutch, G2, and Capterra contain valuable company data, but scraping them directly breaks Terms of Service and triggers anti-bot defenses. A better approach: use a search API to find directory listings and extract structured data from the snippets and rich results. This tutorial builds an n8n workflow that searches for companies in a niche, extracts directory data, and pushes clean records to a Google Sheet. Each search costs $0.005 via the Scavio API.
Prerequisites
- n8n installed (self-hosted or n8n.cloud)
- A Scavio API key from scavio.dev
- A Google Sheets account for output
- Basic n8n workflow knowledge
Walkthrough
Step 1: Set up the HTTP Request node for Scavio
Configure an HTTP Request node in n8n that calls the Scavio search endpoint. This is the core of the pipeline and will be reused for each directory search.
// n8n HTTP Request Node Configuration
// Method: POST
// URL: https://api.scavio.dev/api/v1/search
// Headers:
// x-api-key: {{ $env.SCAVIO_API_KEY }}
// Content-Type: application/json
// Body (JSON):
{
"query": "site:clutch.co {{ $json.niche }} companies",
"country_code": "us",
"num_results": 10
}
// Test with Python equivalent:
import os, requests
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def search_directory(niche: str, directory: str = 'clutch.co') -> list:
    """Search one B2B directory for a niche via the Scavio search API.

    Args:
        niche: Free-text niche, e.g. 'web development agency'.
        directory: Directory domain used in the ``site:`` query operator.

    Returns:
        The 'organic_results' list from the API response ([] when absent).

    Raises:
        requests.HTTPError: on non-2xx responses (previously these fell
            through and could silently yield an empty result list).
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': f'site:{directory} {niche} companies',
              'country_code': 'us', 'num_results': 10},
        # Bug fix: the original call had no timeout and could hang forever.
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json().get('organic_results', [])
# Demo: one Scavio search against Clutch (the default directory) for a
# sample niche, then preview the first three raw hits.
results = search_directory('web development agency')
print(f'Found {len(results)} directory listings')
# The loop body (next line in the article) prints title -> link per hit.
for r in results[:3]:
    print(f" {r['title']} -> {r['link']}")

Step 2: Build the data extraction logic
Parse search results to extract company names, ratings, and profile URLs from directory listings. Use a Code node in n8n or the Python equivalent.
import re
def extract_directory_data(results: list, directory: str) -> list:
    """Extract structured company data from directory search results.

    Args:
        results: Organic search result dicts (keys: 'title', 'snippet', 'link').
        directory: Domain of the directory that was searched (e.g. 'clutch.co').

    Returns:
        List of dicts with keys: name, rating, reviews, profile_url,
        directory, and a snippet preview (first 200 characters).
        rating/reviews are None when not present in the snippet.
    """
    companies = []
    for r in results:
        title = r.get('title', '')
        snippet = r.get('snippet', '')
        link = r.get('link', '')
        # Skip non-company pages (category pages, blog posts).
        if any(skip in link for skip in ['/blog/', '/resources/', '/press/']):
            continue
        # Rating out of 5. Generalized vs. the original: also matches integer
        # ratings like "5 stars" (old regex required a decimal point). The
        # leading digit is bounded to 0-5 so a count like "1,234 ratings"
        # cannot be misread as a rating value.
        rating_match = re.search(
            r'\b([0-5](?:\.\d+)?)\s*(?:/5|stars?\b|ratings?\b)', snippet, re.I)
        rating = float(rating_match.group(1)) if rating_match else None
        # Review count. Generalized vs. the original: accepts thousands
        # separators such as "1,234 reviews".
        review_match = re.search(r'(\d[\d,]*)\s*(?:reviews?|ratings?)', snippet, re.I)
        reviews = int(review_match.group(1).replace(',', '')) if review_match else None
        # Company name: keep the text before " - " / " | " separators and
        # strip trailing boilerplate words ("Reviews", "Profile", "Company").
        name = title.split(' - ')[0].split(' | ')[0].strip()
        name = re.sub(r'\s*(?:Reviews?|Profile|Company).*$', '', name, flags=re.I)
        companies.append({
            'name': name,
            'rating': rating,
            'reviews': reviews,
            'profile_url': link,
            'directory': directory,
            'snippet': snippet[:200],
        })
    return companies
# Demo: parse one Clutch search into structured company records.
results = search_directory('web development agency', 'clutch.co')
companies = extract_directory_data(results, 'clutch.co')
# Preview the first three extracted records (name, rating, review count).
for c in companies[:3]:
    print(f"{c['name']} | Rating: {c['rating']} | Reviews: {c['reviews']}")
    print(f" {c['profile_url']}")

Step 3: Search multiple directories for the same niche
Expand the pipeline to search across Clutch, G2, and Capterra. Deduplicate companies that appear on multiple directories.
import time
DIRECTORIES = ['clutch.co', 'g2.com', 'capterra.com']
def multi_directory_search(niche: str) -> list:
    """Search every directory in DIRECTORIES for one niche and merge results.

    Args:
        niche: Free-text niche passed to each per-directory search.

    Returns:
        Deduplicated company dicts. A company found on several directories
        carries a 'directories' list with every domain that lists it.
    """
    all_companies = []
    for directory in DIRECTORIES:
        results = search_directory(niche, directory)
        all_companies.extend(extract_directory_data(results, directory))
        time.sleep(0.3)  # small pause between consecutive paid API calls
    return _merge_duplicates(all_companies)


def _merge_duplicates(all_companies: list) -> list:
    """Deduplicate companies by normalized name.

    On a duplicate, the record with the larger review count wins (None
    counts as 0). Bug fix vs. the original: the merged 'directories' list
    is now attached to whichever record is KEPT — previously, when the new
    record won, the list was written onto the discarded record and lost.
    """
    seen = {}
    for c in all_companies:
        key = c['name'].lower().strip()
        if key not in seen:
            seen[key] = c
            continue
        existing = seen[key]
        # Union of every directory that lists this company.
        dirs = existing.get('directories', [existing['directory']])
        if c['directory'] not in dirs:
            dirs.append(c['directory'])
        # Keep the better-reviewed record, then attach the merged list to it.
        if (c.get('reviews') or 0) > (existing.get('reviews') or 0):
            seen[key] = c
        seen[key]['directories'] = dirs
    return list(seen.values())
# Demo: aggregate one niche across every configured directory.
companies = multi_directory_search('marketing automation software')
print(f'Found {len(companies)} unique companies across {len(DIRECTORIES)} directories')
# One paid search per directory ($0.005 each per the pricing stated above).
print(f'Cost: {len(DIRECTORIES)} searches = ${len(DIRECTORIES) * 0.005:.3f}')
# Preview the first five merged records with their source directories.
for c in companies[:5]:
    dirs = c.get('directories', [c['directory']])
    print(f" {c['name']} (on {', '.join(dirs)})")

Step 4: Export to Google Sheets via n8n
Configure the n8n Google Sheets node to append extracted data. In Python, simulate the same output format for testing.
import csv, json
def export_to_csv(companies: list, filename: str = 'directory_companies.csv'):
    """Export companies to CSV (n8n equivalent: Google Sheets Append node).

    Args:
        companies: Company dicts; keys outside `fieldnames` are ignored
            (extrasaction='ignore').
        filename: Output path for the CSV file.
    """
    fieldnames = ['name', 'rating', 'reviews', 'directory', 'profile_url', 'snippet']
    # Explicit UTF-8 so non-ASCII company names survive on every platform.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(companies)
    # Bug fix: the original printed the literal "(unknown)" instead of the path.
    print(f'Exported {len(companies)} companies to {filename}')
def export_to_json(companies: list, filename: str = 'directory_companies.json'):
    """Export as JSON (n8n equivalent: write to webhook or database).

    Args:
        companies: Company dicts to serialize.
        filename: Output path for the JSON file.
    """
    # Explicit UTF-8 + ensure_ascii=False keeps non-ASCII names readable.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(companies, f, indent=2, ensure_ascii=False)
    # Bug fix: the original printed the literal "(unknown)" instead of the path.
    print(f'Exported {len(companies)} companies to {filename}')
# n8n workflow summary:
# 1. Schedule Trigger (daily/weekly)
# 2. Set Node: define niches to search
# 3. Loop: for each niche x directory
# 4. HTTP Request: Scavio search API (one paid call per niche x directory)
# 5. Code Node: extract_directory_data()
# 6. Google Sheets: append rows
# Demo run: aggregate one niche and write the merged records to CSV.
companies = multi_directory_search('CRM software')
export_to_csv(companies)
print(f'\nn8n workflow cost per run: ${len(DIRECTORIES) * 0.005:.3f} per niche')

Python Example
import os, requests, csv, time, re
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
H = {'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'}
def search_dir(niche, site):
    """Search one directory via Scavio and return lightweight company dicts.

    Fixes vs. the original: a request timeout (the call could previously
    hang forever), an HTTP status check, and tolerant access to result
    fields that may be absent ('title', 'snippet', 'link').

    Args:
        niche: Free-text niche to search for.
        site: Directory domain used in the site: operator.

    Returns:
        List of dicts with keys: name, rating, url, directory.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search', headers=H,
        json={'query': f'site:{site} {niche} companies',
              'country_code': 'us', 'num_results': 10},
        timeout=30)
    resp.raise_for_status()
    companies = []
    for r in resp.json().get('organic_results', []):
        title = r.get('title', '')
        name = title.split(' - ')[0].split(' | ')[0].strip()
        rating_m = re.search(r'(\d+\.\d+)', r.get('snippet', ''))
        companies.append({'name': name,
                          'rating': float(rating_m.group(1)) if rating_m else None,
                          'url': r.get('link', ''), 'directory': site})
    return companies
# Demo: query two directories for one niche and pool the listings.
all_companies = []
for site in ['clutch.co', 'g2.com']:
    all_companies.extend(search_dir('web development', site))
    time.sleep(0.3)  # brief pause between consecutive paid API calls
print(f'Found {len(all_companies)} listings')
# Preview the first five pooled listings.
for c in all_companies[:5]:
    print(f" {c['name']} ({c['directory']})")

JavaScript Example
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
// Search one directory via the Scavio API and return {name, url, directory}
// records. Fixes vs. the original: non-2xx responses now throw instead of
// silently mapping an error payload, and a missing r.title no longer crashes
// with "Cannot read properties of undefined (reading 'split')".
async function searchDirectory(niche, site) {
  const resp = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({ query: `site:${site} ${niche} companies`, country_code: 'us', num_results: 10 })
  });
  if (!resp.ok) {
    throw new Error(`Scavio API error: HTTP ${resp.status}`);
  }
  const data = await resp.json();
  return (data.organic_results || []).map(r => ({
    name: (r.title || '').split(' - ')[0].trim(),
    url: r.link, directory: site
  }));
}
// Demo driver: query two directories for one niche, pool the listings,
// and print a count plus a five-row preview.
async function main() {
  const sites = ['clutch.co', 'g2.com'];
  let all = [];
  for (const site of sites) {
    const listings = await searchDirectory('web development', site);
    all = all.concat(listings);
  }
  console.log(`Found ${all.length} listings`);
  all.slice(0, 5).forEach(c => console.log(`  ${c.name} (${c.directory})`));
}
main();

Expected Output
Found 18 unique companies across 3 directories
Cost: 3 searches = $0.015
HubSpot (on g2.com, capterra.com)
Salesforce (on clutch.co, g2.com, capterra.com)
ActiveCampaign (on g2.com, capterra.com)
Marketo (on g2.com)
Mailchimp (on g2.com, capterra.com)
Exported 18 companies to directory_companies.csv
n8n workflow cost per run: $0.015 per niche