AI agents that fetch web pages directly hit Cloudflare bot protection on over 20% of websites. The agent gets an HTML challenge page instead of content, and if it does not detect this, it processes garbage data. This tutorial adds Cloudflare block detection to your agent and automatically falls back to search API snippets when direct fetching fails. Cost: $0.005 per fallback search.
Prerequisites
- Python 3.9+ installed
- requests library installed
- A Scavio API key for fallback searches
Walkthrough
Step 1: Build the Cloudflare detection function
Detect Cloudflare challenge pages by checking HTTP status codes, response headers, and page content patterns.
import requests
CLOUDFLARE_SIGNATURES = [
'cf-browser-verification',
'cloudflare-nginx',
'Checking your browser',
'Enable JavaScript and cookies to continue',
'cf-chl-bypass',
'Just a moment...',
'_cf_chl_opt',
'ray ID:',
]
def is_cloudflare_blocked(response: requests.Response) -> dict:
    """Detect whether a response is a Cloudflare challenge or error page.

    Signals are checked in order: status code together with the ``Server``
    header, Cloudflare-specific headers, known challenge strings in the first
    5000 characters of the body, and finally a very small 200 page that
    mentions Cloudflare.

    Args:
        response: The response returned by ``requests`` for the fetched URL.

    Returns:
        ``{'blocked': False}`` when no signal fires. Otherwise a dict with
        ``'blocked': True`` and a ``'type'`` naming the signal that fired
        (``cf_status``, ``cf_mitigated``, ``cf_ray_error``, ``cf_challenge``
        or ``cf_tiny_page``), plus ``'code'`` or ``'signature'`` where that
        detail is available.
    """
    status = response.status_code
    headers = response.headers

    # Challenge / rate-limit statuses served by Cloudflare itself.
    if status in (403, 503, 429):
        if 'cloudflare' in headers.get('server', '').lower():
            return {'blocked': True, 'type': 'cf_status', 'code': status}

    # Cloudflare marks the challenge pages it generates with this header.
    if headers.get('cf-mitigated', '').lower() == 'challenge':
        return {'blocked': True, 'type': 'cf_mitigated', 'code': status}

    # cf-ray is attached to every response proxied through Cloudflare,
    # including ordinary origin errors such as a 404. It therefore only counts
    # as a block together with a status Cloudflare answers with on its own:
    # 403/429/503, or the Cloudflare-specific 520-530 range.
    if 'cf-ray' in headers and (status in (403, 429, 503) or 520 <= status <= 530):
        return {'blocked': True, 'type': 'cf_ray_error'}

    # Response.text decodes the body on every access, so read it once.
    text = response.text

    # Challenge markup sits near the top of the page; 5000 chars is enough.
    body = text[:5000].lower()
    for sig in CLOUDFLARE_SIGNATURES:
        if sig.lower() in body:
            return {'blocked': True, 'type': 'cf_challenge', 'signature': sig}

    # A tiny 200 page that mentions Cloudflare is most likely a stub or
    # interstitial rather than real content.
    if status == 200 and len(text) < 500 and 'cloudflare' in text.lower():
        return {'blocked': True, 'type': 'cf_tiny_page'}

    return {'blocked': False}
# Test against a few sites
test_urls = ['https://www.example.com', 'https://httpbin.org/status/200']
for url in test_urls:
try:
resp = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
result = is_cloudflare_blocked(resp)
print(f'{url}: blocked={result["blocked"]}')
except Exception as e:
print(f'{url}: error ({e})')
Step 2: Build the fetch-with-fallback function
Try to fetch a page directly. If Cloudflare blocks it, fall back to search API snippets to get the content.
import os
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def fetch_with_fallback(url: str) -> dict:
    """Fetch a URL directly, falling back to search snippets when that fails.

    The direct fetch is used only when it returns HTTP 200 and
    ``is_cloudflare_blocked`` does not flag it. In every other case (block,
    non-200 status, network error) a ``site:`` search for the URL is sent to
    the search API and the result titles and snippets are returned instead.

    Args:
        url: Absolute URL of the page to retrieve.

    Returns:
        A dict with ``'content'`` (page text truncated to 5000 characters, or
        joined search snippets), ``'source'`` (``'direct'`` or
        ``'search_fallback'``), ``'url'`` and ``'cost'`` (0 for a direct
        fetch, 0.005 for a fallback search).

    Raises:
        requests.exceptions.RequestException: If the fallback search request
            itself fails; only direct-fetch errors are handled here.
    """
    from urllib.parse import urlparse

    # Try direct fetch
    try:
        resp = requests.get(
            url,
            timeout=10,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; MyAgent/1.0)'},
        )
        cf_check = is_cloudflare_blocked(resp)
        if not cf_check['blocked'] and resp.status_code == 200:
            return {
                'content': resp.text[:5000],
                'source': 'direct',
                'url': url,
                'cost': 0,
            }
        if cf_check['blocked']:
            print(f'Cloudflare blocked: {cf_check.get("type", "unknown")}')
        else:
            # A plain non-200 response is not a Cloudflare block; say so.
            print(f'Direct fetch returned HTTP {resp.status_code}')
    except requests.exceptions.RequestException as e:
        print(f'Direct fetch failed: {e}')

    # Fallback: search for the URL to get snippets. urlparse keeps ports,
    # query strings and scheme-less input from breaking the query.
    parsed = urlparse(url)
    domain = parsed.hostname or ''
    path_hint = parsed.path.rstrip('/').rsplit('/', 1)[-1].replace('-', ' ')
    if domain:
        query = f'site:{domain} {path_hint}'.strip()
    else:
        # No recognisable host (e.g. missing scheme): search for the raw text.
        query = url

    # requests never times out by default, so bound the fallback call too.
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': query, 'country_code': 'us', 'num_results': 3},
        timeout=10,
    )
    results = resp.json().get('organic_results', [])
    content = '\n\n'.join(
        f'{r.get("title", "")}\n{r.get("snippet", "")}' for r in results
    )
    return {
        'content': content or 'No content retrieved',
        'source': 'search_fallback',
        'url': url,
        'cost': 0.005,
    }
# Test
result = fetch_with_fallback('https://www.example.com')
print(f'Source: {result["source"]}, Cost: ${result["cost"]}')
print(f'Content preview: {result["content"][:100]}...')
Step 3: Integrate into an agent workflow with block rate tracking
Add block detection to your agent's web browsing pipeline and track what percentage of sites block your agent.
from collections import defaultdict
block_stats = defaultdict(int)
def agent_browse(urls: list) -> list:
    """Fetch every URL in *urls* and record where each page came from.

    Each URL goes through ``fetch_with_fallback``. The module-level
    ``block_stats`` counter is incremented under ``'total'`` and under the
    page's ``'source'`` value.

    Returns:
        The fetch results, in the same order as *urls*.
    """
    fetched = []
    for target in urls:
        page = fetch_with_fallback(target)
        fetched.append(page)
        # Tally both the overall count and the per-source count.
        for counter in ('total', page['source']):
            block_stats[counter] += 1
    return fetched
def block_report():
    """Print a summary of direct fetches versus search fallbacks.

    Reads the module-level ``block_stats`` counter. Prints a short notice and
    returns early when nothing has been fetched yet, which also avoids
    dividing by zero in the percentage lines.
    """
    total = block_stats['total']
    if total == 0:
        print('No pages fetched yet.')
        return

    direct = block_stats.get('direct', 0)
    fallback = block_stats.get('search_fallback', 0)
    report_lines = [
        'Agent Browse Report:',
        f' Total pages: {total}',
        f' Direct fetch: {direct} ({direct/total*100:.0f}%)',
        f' CF blocked (fallback): {fallback} ({fallback/total*100:.0f}%)',
        # Each fallback search is billed at $0.005.
        f' Fallback cost: ${fallback * 0.005:.3f}',
    ]
    print('\n'.join(report_lines))
# Simulate agent browsing
urls = [
'https://www.example.com',
'https://httpbin.org/html',
]
pages = agent_browse(urls)
block_report()
Python Example
import requests, os
SCAVIO_KEY = os.environ['SCAVIO_API_KEY']
def fetch(url):
    """Return content for *url*, using search snippets if the direct fetch fails.

    The direct response is used only when it is HTTP 200 and the first 2000
    characters do not mention Cloudflare. Otherwise a ``site:`` search for
    the URL's host is sent to the search API.

    Returns:
        A dict with ``'content'``, ``'source'`` (``'direct'`` or
        ``'fallback'``) and ``'cost'`` (0 or 0.005).
    """
    from urllib.parse import urlparse

    try:
        resp = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        page = resp.text  # decoded on each access, so read it once
        if resp.status_code == 200 and 'cloudflare' not in page[:2000].lower():
            return {'content': page[:3000], 'source': 'direct', 'cost': 0}
    except requests.exceptions.RequestException:
        # Deliberate best-effort: a network failure just means we use the
        # search fallback below. Other exception types are real bugs and
        # are allowed to propagate.
        pass

    # Fallback to search
    domain = urlparse(url).hostname or ''
    resp = requests.post(
        'https://api.scavio.dev/api/v1/search',
        headers={'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json'},
        json={'query': f'site:{domain}', 'country_code': 'us', 'num_results': 3},
        timeout=10,  # requests never times out unless told to
    )
    content = '\n'.join(
        r.get('snippet', '') for r in resp.json().get('organic_results', [])
    )
    return {'content': content, 'source': 'fallback', 'cost': 0.005}
r = fetch('https://www.example.com')
print(f'{r["source"]}: ${r["cost"]} - {r["content"][:80]}')
JavaScript Example
const SCAVIO_KEY = process.env.SCAVIO_API_KEY;
/**
 * Fetch a page directly; if that fails or looks like a Cloudflare page,
 * return search snippets for the page's host instead.
 *
 * @param {string} url Absolute URL of the page to retrieve.
 * @returns {Promise<{content: string, source: string, cost: number}>}
 *   `source` is 'direct' (cost 0) or 'fallback' (cost 0.005).
 */
async function fetchPage(url) {
  try {
    const direct = await fetch(url, { headers: { 'User-Agent': 'Mozilla/5.0' } });
    const html = await direct.text();
    // Crude check: any mention of Cloudflare near the top counts as a block.
    const mentionsCloudflare = html.slice(0, 2000).toLowerCase().includes('cloudflare');
    if (direct.ok && !mentionsCloudflare) {
      return { content: html.slice(0, 3000), source: 'direct', cost: 0 };
    }
  } catch {
    // Direct fetch errors are ignored on purpose; the search fallback runs next.
  }

  const { hostname } = new URL(url);
  const payload = { query: `site:${hostname}`, country_code: 'us', num_results: 3 };
  const search = await fetch('https://api.scavio.dev/api/v1/search', {
    method: 'POST',
    headers: { 'x-api-key': SCAVIO_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });
  const data = await search.json();
  const snippets = data.organic_results?.map((hit) => hit.snippet).join('\n');
  return { content: snippets || '', source: 'fallback', cost: 0.005 };
}
fetchPage('https://www.example.com').then(r => console.log(`${r.source}: ${r.content.slice(0, 80)}`));
Expected Output
https://www.example.com: blocked=False
https://httpbin.org/status/200: blocked=False
Cloudflare blocked: cf_challenge
Source: search_fallback, Cost: $0.005
Content preview: Example Domain - This domain is for use in illustrative examples...
Agent Browse Report:
Total pages: 2
Direct fetch: 1 (50%)
CF blocked (fallback): 1 (50%)
Fallback cost: $0.005