Extracting structured data from websites typically requires writing custom scrapers for each site's HTML layout. Scavio's extract endpoint takes a URL and returns structured content without any parsing code. This tutorial shows how to extract data from product pages, articles, and company websites using a single API call.
Prerequisites
- Python 3.8+ or Node.js 18+
- requests library (Python) or built-in fetch (JS)
- A Scavio API key from scavio.dev
Walkthrough
Step 1: Extract content from a URL
Send a URL to the extract endpoint and receive structured content.
import os

import requests

# API key comes from the environment so it never appears in source control.
H = {'x-api-key': os.environ['SCAVIO_API_KEY']}

def extract(url: str) -> dict:
    """Extract structured content from *url* via the Scavio extract endpoint.

    Args:
        url: The page to extract.

    Returns:
        The parsed JSON response from the API.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    resp = requests.post(
        'https://api.scavio.dev/api/v1/extract',
        headers=H,
        json={'url': url},
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of silently returning an error body;
    # the batch code later in this tutorial relies on an exception being raised.
    resp.raise_for_status()
    return resp.json()

data = extract('https://example.com/product-page')
print(data)
Step 2: Extract multiple URLs in batch
Process a list of URLs and aggregate the extracted data.
import time

def extract_batch(urls: list, delay: float = 0.5) -> list:
    """Extract every URL in *urls*, collecting successes and failures.

    Args:
        urls: Pages to extract.
        delay: Seconds to pause between consecutive requests (simple
            client-side rate limiting).

    Returns:
        One dict per URL, in input order: {'url', 'status': 'ok', 'data'}
        on success or {'url', 'status': 'error', 'error'} on failure.
    """
    results = []
    for i, url in enumerate(urls):
        try:
            data = extract(url)
            results.append({'url': url, 'status': 'ok', 'data': data})
        except Exception as e:
            # Record the failure but keep going: one bad URL should not
            # abort the whole batch.
            results.append({'url': url, 'status': 'error', 'error': str(e)})
        # Sleep only *between* requests — the original slept after the last
        # URL too, adding a pointless delay to every batch.
        if i < len(urls) - 1:
            time.sleep(delay)
    return results

urls = ['https://example.com/page1', 'https://example.com/page2']
extracted = extract_batch(urls)
Step 3: Combine search + extract for enrichment
Search for companies, then extract structured data from their websites.
def search_and_extract(query: str) -> list:
    """Search via the Scavio search endpoint, then extract the top results.

    Args:
        query: Search query string (sent to the 'google' platform).

    Returns:
        Up to three dicts with 'title', 'url', and 'extracted' content;
        results whose extraction fails are skipped (best-effort enrichment).

    Raises:
        requests.HTTPError: If the search request itself fails.
    """
    # Search for relevant pages.
    search_resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers=H,
        json={'platform': 'google', 'query': query},
        timeout=10,
    )
    # Surface search API failures immediately rather than parsing an error body.
    search_resp.raise_for_status()
    results = search_resp.json().get('organic', [])[:3]
    # Extract structured data from each result.
    enriched = []
    for r in results:
        try:
            enriched.append({
                'title': r['title'],
                'url': r['link'],
                'extracted': extract(r['link']),
            })
        except Exception:
            # Best-effort: skip pages that fail to extract. A bare `except:`
            # here would also swallow KeyboardInterrupt/SystemExit.
            continue
    return enriched
data = search_and_extract('best CRM software pricing')
Step 4: Save extracted data
Export the extracted data for downstream processing.
import json

def save_extracted(data: list, filepath: str):
    """Write *data* to *filepath* as pretty-printed JSON.

    Args:
        data: Extracted records to serialize.
        filepath: Destination path (overwritten if it already exists).
    """
    # Explicit UTF-8 and ensure_ascii=False keep non-ASCII extracted text
    # readable in the output file instead of \uXXXX-escaped, and avoid
    # locale-dependent encoding failures on Windows.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f'Saved {len(data)} extracted records to {filepath}')
save_extracted(extracted, 'extracted_data.json')
Python Example
import os

import requests

H = {'x-api-key': os.environ['SCAVIO_API_KEY']}

def extract(url: str) -> dict:
    """Extract structured content from *url* via the Scavio extract endpoint."""
    resp = requests.post(
        'https://api.scavio.dev/api/v1/extract',
        headers=H,
        json={'url': url},
        timeout=30,
    )
    # Raise on HTTP errors instead of silently parsing an error body.
    resp.raise_for_status()
    return resp.json()

# Extract structured data from any URL:
data = extract('https://example.com/pricing')
JavaScript Example
async function extract(url) {
const resp = await fetch('https://api.scavio.dev/api/v1/extract', {
method: 'POST', headers: {'x-api-key': process.env.SCAVIO_API_KEY, 'Content-Type': 'application/json'},
body: JSON.stringify({url})
});
return resp.json();
}
Expected Output
Structured data extracted from any URL via a single API call, with no custom parsing code needed.