Reddit 的官方 API 需要 OAuth 注册,具有严格的速率限制,并且最近引入了付费等级,这使得大规模数据收集变得昂贵。Scavio API 提供了一种替代方案:通过单个经过身份验证的端点查询 Reddit 帖子和评论,该端点返回包含 subreddit、作者、分数、时间戳和完整评论树的结构化 JSON。本教程展示了如何搜索 Reddit 帖子、获取带有评论的单个讨论串以及构建数据收集管道,而无需接触官方 Reddit API。
前置条件
- Python 3.9 或更高版本(示例代码使用了 `list[dict]` 这类内置泛型类型注解,在 Python 3.8 上定义函数时会抛出 TypeError)
- 已安装 requests 库(`pip install requests`)
- Scavio API 密钥
- 您想要从中收集数据的主题或子版块
操作指南
步骤 1: 按关键字搜索 Reddit 帖子
使用 Scavio Reddit 搜索端点查找与关键字匹配的帖子。结果包括帖子标题、reddit 子版块、作者、分数和时间戳。
import os
import requests
API_KEY = os.environ["SCAVIO_API_KEY"]
def search_reddit(query: str, sort: str = "relevance") -> list[dict]:
r = requests.post(
"https://api.scavio.dev/api/v1/reddit/search",
headers={"Authorization": f"Bearer {API_KEY}"},
json={"query": query, "sort": sort},
timeout=30
)
r.raise_for_status()
return r.json()["data"]["posts"]步骤 2: 获取带有评论的完整帖子
给定 Reddit 帖子 URL,在一次调用中获取完整帖子及其全部嵌套评论。
def fetch_thread(url: str) -> dict:
r = requests.post(
"https://api.scavio.dev/api/v1/reddit/post",
headers={"Authorization": f"Bearer {API_KEY}"},
json={"url": url},
timeout=30
)
r.raise_for_status()
return r.json()["data"]步骤 3: 从结果中提取结构化数据
将帖子和评论解析为扁平的、可供分析的格式,并提取相关字段。
def extract_post_data(post: dict) -> dict:
    """Project a post record onto a fixed set of fields.

    Args:
        post: Mapping describing one post (e.g. an item returned by the
            search call).

    Returns:
        A new dict with exactly the keys ``id``, ``title``, ``subreddit``,
        ``author``, ``score``, ``timestamp`` and ``url``, in that order.
        Keys absent from ``post`` map to ``None``; any other keys in
        ``post`` are dropped.
    """
    fields = ("id", "title", "subreddit", "author", "score", "timestamp", "url")
    # .get() keeps the output shape stable even when a field is missing.
    return {field: post.get(field) for field in fields}
def extract_comments(thread: dict) -> list[dict]:
return [{
"author": c.get("author"),
"body": c.get("body"),
"score": c.get("score"),
"depth": c.get("depth"),
} for c in thread.get("comments", [])]步骤 4: 导出为 JSON
将收集的帖子和评论保存为结构化 JSON 数据集。
import json
def collect_dataset(query: str, max_threads: int = 5) -> dict:
    """Search for posts and bundle each one with its comments.

    Runs ``search_reddit`` with ``sort="new"``, then issues one
    ``fetch_thread`` call for each of the first ``max_threads`` results.

    Args:
        query: Search text passed to ``search_reddit``.
        max_threads: Upper bound on how many search results are fetched.

    Returns:
        ``{"query": query, "threads": [...]}`` where each thread entry has a
        ``post`` dict (from ``extract_post_data``) and a ``comments`` list
        (from ``extract_comments``).

    Raises:
        KeyError: If a search result has no ``url`` key.
    """
    posts = search_reddit(query, sort="new")
    threads = []
    for post in posts[:max_threads]:
        # The search result supplies the post metadata; the per-thread call
        # is only needed for the comments.
        thread = fetch_thread(post["url"])
        threads.append({
            "post": extract_post_data(post),
            "comments": extract_comments(thread),
        })
    return {"query": query, "threads": threads}
with open("reddit_data.json", "w") as f:
json.dump(collect_dataset("python web frameworks"), f, indent=2)Python 示例
import os
import json
import requests
API_KEY = os.environ["SCAVIO_API_KEY"]
def search_reddit(query: str, sort: str = "new") -> list[dict]:
    """Search Reddit posts through the Scavio search endpoint.

    Authenticates with the module-level ``API_KEY`` as a bearer token.

    Args:
        query: Text sent as the ``query`` field of the JSON request body.
        sort: Value sent as the ``sort`` field. Defaults to ``"new"``, the
            value this example previously hard-coded.

    Returns:
        The list stored under ``data.posts`` in the JSON response.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
    """
    response = requests.post(
        "https://api.scavio.dev/api/v1/reddit/search",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"query": query, "sort": sort},
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of indexing into an error payload.
    response.raise_for_status()
    return response.json()["data"]["posts"]
def fetch_thread(url: str) -> dict:
    """Fetch a single Reddit post via the Scavio post endpoint.

    Authenticates with the module-level ``API_KEY`` as a bearer token.

    Args:
        url: Reddit post URL, sent as the ``url`` field of the request body.

    Returns:
        The object stored under ``data`` in the JSON response.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
    """
    response = requests.post(
        "https://api.scavio.dev/api/v1/reddit/post",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"url": url},
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of indexing into an error payload.
    response.raise_for_status()
    return response.json()["data"]
if __name__ == "__main__":
posts = search_reddit("best python frameworks 2026")
print(f"Found {len(posts)} posts")
for p in posts[:3]:
print(f" r/{p['subreddit']}: {p['title']}")
thread = fetch_thread(p["url"])
comments = thread.get("comments", [])
print(f" {len(comments)} comments")JavaScript 示例
const API_KEY = process.env.SCAVIO_API_KEY;
/**
 * Search Reddit posts through the Scavio search endpoint.
 *
 * Sends `{ query, sort: "new" }` as JSON, authenticated with API_KEY as a
 * bearer token.
 *
 * @param {string} query - Text sent as the `query` field of the request body.
 * @returns {Promise<Array<Object>>} The array stored under `data.posts`.
 * @throws {Error} If the API responds with a non-2xx status.
 */
async function searchReddit(query) {
  const response = await fetch("https://api.scavio.dev/api/v1/reddit/search", {
    method: "POST",
    headers: { Authorization: `Bearer ${API_KEY}`, "Content-Type": "application/json" },
    body: JSON.stringify({ query, sort: "new" })
  });
  // fetch() resolves even on 4xx/5xx, so surface HTTP errors explicitly
  // rather than failing later on a missing `data` field.
  if (!response.ok) {
    throw new Error(`Scavio search request failed: ${response.status} ${response.statusText}`);
  }
  const payload = await response.json();
  return payload.data.posts;
}
/**
 * Fetch a single Reddit post via the Scavio post endpoint.
 *
 * Sends `{ url }` as JSON, authenticated with API_KEY as a bearer token.
 *
 * @param {string} url - Reddit post URL, sent as the `url` field.
 * @returns {Promise<Object>} The object stored under `data` in the response.
 * @throws {Error} If the API responds with a non-2xx status.
 */
async function fetchThread(url) {
  const response = await fetch("https://api.scavio.dev/api/v1/reddit/post", {
    method: "POST",
    headers: { Authorization: `Bearer ${API_KEY}`, "Content-Type": "application/json" },
    body: JSON.stringify({ url })
  });
  // fetch() resolves even on 4xx/5xx, so surface HTTP errors explicitly
  // rather than failing later on a missing `data` field.
  if (!response.ok) {
    throw new Error(`Scavio post request failed: ${response.status} ${response.statusText}`);
  }
  const payload = await response.json();
  return payload.data;
}
/**
 * Demo entry point: run one search, then print the subreddit, title and
 * comment count for the first three results, fetching threads one at a time.
 */
async function main() {
  const results = await searchReddit("best python frameworks 2026");
  const firstThree = results.slice(0, 3);
  for (const post of firstThree) {
    console.log(`r/${post.subreddit}: ${post.title}`);
    const thread = await fetchThread(post.url);
    // A missing or empty comments array is reported as 0.
    const commentCount = thread.comments?.length || 0;
    console.log(` ${commentCount} comments`);
  }
}
main().catch(console.error);

预期输出
Found 14 posts
r/Python: FastAPI vs Django for new projects in 2026
47 comments
r/webdev: What Python framework are you using this year?
23 comments
r/learnpython: Best framework to learn as a beginner?
18 comments