Doiiars

1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
Showing 67 changed files with 3724 additions and 720 deletions

Too many changes to show.

To preserve performance only 67 of 67+ files are displayed.

# ====================== 数据库配置 ======================
# 数据库主机,例如localhost 或 127.0.0.1
DB_HOST=your_db_host
# 数据库端口号,MySQL默认为3306,PostgreSQL默认为5432
DB_PORT=3306
# 数据库用户名
DB_USER=your_db_user
# 数据库密码
DB_PASSWORD=your_db_password
# 数据库名称
DB_NAME=your_db_name
# 数据库字符集(仅MySQL连接使用),推荐utf8mb4,兼容emoji
DB_CHARSET=utf8mb4
# 数据库类型mysql或postgresql
DB_DIALECT=postgresql
# ======================= LLM 相关 =======================
# Insight Agent(推荐Kimi,https://platform.moonshot.cn/)API密钥,用于主LLM
INSIGHT_ENGINE_API_KEY=
# Insight Agent LLM接口BaseUrl,可自定义厂商API
INSIGHT_ENGINE_BASE_URL=
# Insight Agent LLM模型名称,如kimi-k2-0711-preview
INSIGHT_ENGINE_MODEL_NAME=
# Media Agent(推荐Gemini,可用中转厂商 https://www.chataiapi.com/)API密钥
MEDIA_ENGINE_API_KEY=
# Media Agent LLM接口BaseUrl
MEDIA_ENGINE_BASE_URL=
# Media Agent LLM模型名称,如gemini-2.5-pro
MEDIA_ENGINE_MODEL_NAME=
# MindSpider API密钥(推荐DeepSeek)
MINDSPIDER_API_KEY=
# MindSpider LLM接口BaseUrl
MINDSPIDER_BASE_URL=
# MindSpider LLM模型名称,如deepseek-chat
MINDSPIDER_MODEL_NAME=
# Query Agent(推荐DeepSeek,https://www.deepseek.com/)API密钥
QUERY_ENGINE_API_KEY=
# Query Agent LLM接口BaseUrl
QUERY_ENGINE_BASE_URL=
# Query Agent LLM模型,如deepseek-reasoner
QUERY_ENGINE_MODEL_NAME=
# Report Agent(推荐Gemini,可用中转厂商 https://www.chataiapi.com/)API密钥
REPORT_ENGINE_API_KEY=
# Report Agent LLM接口BaseUrl
REPORT_ENGINE_BASE_URL=
# Report Agent LLM模型,如gemini-2.5-pro
REPORT_ENGINE_MODEL_NAME=
# Forum Host LLM API密钥,Qwen3最新模型,推荐 https://cloud.siliconflow.cn/
FORUM_HOST_API_KEY=
# Forum Host LLM BaseUrl
FORUM_HOST_BASE_URL=
# Forum Host LLM模型名,如Qwen/Qwen3-235B-A22B-Instruct-2507
FORUM_HOST_MODEL_NAME=
# SQL Keyword Optimizer LLM密钥,小参数Qwen3模型 https://cloud.siliconflow.cn/
KEYWORD_OPTIMIZER_API_KEY=
# Keyword Optimizer BaseUrl
KEYWORD_OPTIMIZER_BASE_URL=
# Keyword Optimizer LLM模型名称,如deepseek-chat
KEYWORD_OPTIMIZER_MODEL_NAME=
# ================== 网络工具配置 ====================
# Tavily API密钥,用于Tavily网络搜索。注册地址:https://www.tavily.com/
TAVILY_API_KEY=
# Bocha Web Search API密钥,用于Bocha搜索。注册地址:https://open.bochaai.com/
BOCHA_WEB_SEARCH_API_KEY=
\ No newline at end of file
... ...
... ... @@ -7,11 +7,12 @@ BroadTopicExtraction模块 - 数据库管理器
import sys
import json
from datetime import datetime, date
from datetime import datetime, date, timedelta
from pathlib import Path
from typing import List, Dict, Optional
import pymysql
from pymysql.cursors import DictCursor
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from loguru import logger
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
... ... @@ -22,37 +23,44 @@ try:
except ImportError:
raise ImportError("无法导入config.py配置文件")
from config import settings
class DatabaseManager:
"""数据库管理器"""
def __init__(self):
"""初始化数据库管理器"""
self.connection = None
self.engine: Engine = None
self.connect()
def connect(self):
    """Create the SQLAlchemy engine for the configured database.

    The connection URL is assembled from ``settings``. ``DB_DIALECT``
    selects the driver: ``postgresql``/``postgres`` uses ``psycopg``; any
    other value (or an unset dialect) falls back to MySQL via ``pymysql``
    and appends ``DB_CHARSET`` to the URL.

    Raises:
        ModuleNotFoundError: the selected database driver is not installed
            (logged with an install hint, then re-raised).
        Exception: any other failure while creating the engine (logged,
            then re-raised).
    """
    # Local import so the block does not depend on a new file-level import.
    from urllib.parse import quote_plus

    try:
        dialect = (settings.DB_DIALECT or "mysql").lower()
        # Credentials must be percent-encoded: characters such as '@', ':',
        # '/' or '%' in a raw password would otherwise be parsed as URL
        # structure and yield a wrong host or an unparsable URL.
        user = quote_plus(str(settings.DB_USER))
        password = quote_plus(str(settings.DB_PASSWORD))
        location = f"{user}:{password}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
        if dialect in ("postgresql", "postgres"):
            url = f"postgresql+psycopg://{location}"
        else:
            url = f"mysql+pymysql://{location}?charset={settings.DB_CHARSET}"
        self.engine = create_engine(url, future=True)
        # NOTE(review): create_engine is documented as lazy, so this message
        # may not prove the server is reachable -- confirm whether an
        # explicit connectivity check is wanted here.
        logger.info(f"成功连接到数据库: {settings.DB_NAME}")
    except ModuleNotFoundError as e:
        missing: str = str(e)
        if "psycopg" in missing:
            logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
        elif "pymysql" in missing:
            logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
        else:
            logger.error(f"数据库连接失败(缺少驱动): {e}")
        raise
    except Exception as e:
        logger.error(f"数据库连接失败: {e}")
        raise
def close(self):
"""关闭数据库连接"""
if self.connection:
self.connection.close()
print("数据库连接已关闭")
if self.engine:
self.engine.dispose()
logger.info("数据库连接已关闭")
def __enter__(self):
return self
... ... @@ -79,48 +87,49 @@ class DatabaseManager:
current_timestamp = int(datetime.now().timestamp())
try:
cursor = self.connection.cursor()
# 先删除当天所有的新闻记录(覆盖模式)
delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
deleted_count = cursor.execute(delete_query, (crawl_date,))
if deleted_count > 0:
print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")
# 批量插入新记录
saved_count = 0
# 先独立事务执行删除,防止后续插入失败导致无法清理
with self.engine.begin() as conn:
deleted = conn.execute(text("DELETE FROM daily_news WHERE crawl_date = :d"), {"d": crawl_date}).rowcount
if deleted and deleted > 0:
logger.info(f"覆盖模式:删除了当天已有的 {deleted} 条新闻记录")
# 逐条插入,单条失败不影响后续(每条独立事务)
for news_item in news_data:
try:
# 简化的新闻ID生成
news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
# 插入新记录
insert_query = """
INSERT INTO daily_news (
news_id, source_platform, title, url, crawl_date,
rank_position, add_ts
) VALUES (%s, %s, %s, %s, %s, %s, %s)
"""
cursor.execute(insert_query, (
news_id,
news_item.get('source', 'unknown'),
news_item.get('title', ''),
news_item.get('url', ''),
crawl_date,
news_item.get('rank', None),
current_timestamp
))
title_val = (news_item.get("title", "") or "")
if len(title_val) > 500:
title_val = title_val[:500]
with self.engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO daily_news (
news_id, source_platform, title, url, crawl_date,
rank_position, add_ts, last_modify_ts
) VALUES (:news_id, :source_platform, :title, :url, :crawl_date, :rank_position, :add_ts, :last_modify_ts)
"""
),
{
"news_id": news_id,
"source_platform": news_item.get("source", "unknown"),
"title": title_val,
"url": news_item.get("url", ""),
"crawl_date": crawl_date,
"rank_position": news_item.get("rank", None),
"add_ts": current_timestamp,
"last_modify_ts": current_timestamp,
},
)
saved_count += 1
except Exception as e:
print(f"保存单条新闻失败: {e}")
logger.warning(f"保存单条新闻失败: {e}")
continue
print(f"成功保存 {saved_count} 条新闻记录")
logger.info(f"成功保存 {saved_count} 条新闻记录")
return saved_count
except Exception as e:
print(f"保存新闻数据失败: {e}")
logger.exception(f"保存新闻数据失败: {e}")
return 0
def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
... ... @@ -136,15 +145,13 @@ class DatabaseManager:
if not crawl_date:
crawl_date = date.today()
query = """
SELECT * FROM daily_news
WHERE crawl_date = %s
ORDER BY rank_position ASC
"""
cursor = self.connection.cursor()
cursor.execute(query, (crawl_date,))
return cursor.fetchall()
query = (
"SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC"
)
with self.engine.connect() as conn:
result = conn.execute(text(query), {"d": crawl_date})
rows = result.mappings().all()
return rows
# ==================== 话题数据操作 ====================
... ... @@ -166,37 +173,31 @@ class DatabaseManager:
current_timestamp = int(datetime.now().timestamp())
try:
cursor = self.connection.cursor()
# 检查今天是否已有记录
check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
cursor.execute(check_query, (extract_date,))
existing = cursor.fetchone()
keywords_json = json.dumps(keywords, ensure_ascii=False)
if existing:
# 更新现有记录
update_query = """
UPDATE daily_topics
SET keywords = %s, summary = %s, add_ts = %s
WHERE extract_date = %s
"""
cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
print(f"更新了 {extract_date} 的话题分析")
else:
# 插入新记录
insert_query = """
INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
VALUES (%s, %s, %s, %s)
"""
cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
print(f"保存了 {extract_date} 的话题分析")
with self.engine.begin() as conn:
check = conn.execute(
text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"),
{"d": extract_date, "tid": "summary"},
).first()
if check:
conn.execute(
text(
"UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid"
),
{"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"},
)
logger.info(f"更新了 {extract_date} 的话题分析")
else:
conn.execute(
text(
"INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)"
),
{"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp},
)
logger.info(f"保存了 {extract_date} 的话题分析")
return True
except Exception as e:
print(f"保存话题分析失败: {e}")
logger.exception(f"保存话题分析失败: {e}")
return False
def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
... ... @@ -213,20 +214,15 @@ class DatabaseManager:
extract_date = date.today()
try:
cursor = self.connection.cursor()
query = "SELECT * FROM daily_topics WHERE extract_date = %s"
cursor.execute(query, (extract_date,))
result = cursor.fetchone()
if result:
# 解析关键词JSON
result['keywords'] = json.loads(result['keywords'])
return result
else:
with self.engine.connect() as conn:
result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first()
if result:
result = dict(result) # 转为可变dict以支持item赋值
result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else []
return result
return None
except Exception as e:
print(f"获取话题分析失败: {e}")
logger.exception(f"获取话题分析失败: {e}")
return None
def get_recent_topics(self, days: int = 7) -> List[Dict]:
... ... @@ -240,23 +236,23 @@ class DatabaseManager:
话题分析列表
"""
try:
cursor = self.connection.cursor()
query = """
SELECT * FROM daily_topics
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
ORDER BY extract_date DESC
"""
cursor.execute(query, (days,))
results = cursor.fetchall()
# 解析每个结果的关键词JSON
for result in results:
result['keywords'] = json.loads(result['keywords'])
return results
start_date = date.today() - timedelta(days=days)
with self.engine.connect() as conn:
results = conn.execute(
text(
"""
SELECT * FROM daily_topics
WHERE extract_date >= :start_date
ORDER BY extract_date DESC
"""
),
{"start_date": start_date},
).mappings().all()
for r in results:
r["keywords"] = json.loads(r["keywords"]) if r.get("keywords") else []
return results
except Exception as e:
print(f"获取最近话题分析失败: {e}")
logger.exception(f"获取最近话题分析失败: {e}")
return []
# ==================== 统计查询 ====================
... ... @@ -264,56 +260,48 @@ class DatabaseManager:
def get_summary_stats(self, days: int = 7) -> Dict:
    """Return per-day news and topic statistics for the recent window.

    Args:
        days: Size of the look-back window; rows dated on or after
            ``today - days`` are included.

    Returns:
        A dict with two keys:
        ``news_stats`` -- list of dicts with ``crawl_date``, ``news_count``
        and ``platforms_count``, newest date first;
        ``topics_stats`` -- list of dicts with ``extract_date``,
        ``keywords`` and ``summary_length``, newest date first.
        Both lists are empty if the query fails (the error is logged).
    """
    try:
        # The cutoff is computed in Python so the SQL needs no
        # dialect-specific date arithmetic.
        start_date = date.today() - timedelta(days=days)
        params = {"start_date": start_date}
        with self.engine.connect() as conn:
            news_rows = conn.execute(
                text(
                    """
                    SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms_count
                    FROM daily_news
                    WHERE crawl_date >= :start_date
                    GROUP BY crawl_date
                    ORDER BY crawl_date DESC
                    """
                ),
                params,
            ).mappings().all()
            topic_rows = conn.execute(
                text(
                    """
                    SELECT extract_date, keywords, CHAR_LENGTH(topic_description) as summary_length
                    FROM daily_topics
                    WHERE extract_date >= :start_date
                    ORDER BY extract_date DESC
                    """
                ),
                params,
            ).mappings().all()
        # Return plain dicts keyed by column name, matching the other read
        # methods of this class, rather than positional Row objects.
        return {
            "news_stats": [dict(row) for row in news_rows],
            "topics_stats": [dict(row) for row in topic_rows],
        }
    except Exception as e:
        logger.exception(f"获取统计摘要失败: {e}")
        return {"news_stats": [], "topics_stats": []}
if __name__ == "__main__":
# 测试数据库管理器
with DatabaseManager() as db:
# 测试获取新闻
news = db.get_daily_news()
print(f"今日新闻数量: {len(news)}")
logger.info(f"今日新闻数量: {len(news)}")
# 测试获取话题
topics = db.get_daily_topics()
if topics:
print(f"今日话题关键词: {topics['keywords']}")
logger.info(f"今日话题关键词: {topics['keywords']}")
else:
print("今日暂无话题分析")
logger.info("今日暂无话题分析")
print("简化数据库管理器测试完成!")
logger.info("简化数据库管理器测试完成!")
... ...
... ... @@ -11,6 +11,7 @@ import argparse
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Optional
from loguru import logger
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
... ... @@ -21,8 +22,8 @@ try:
from BroadTopicExtraction.topic_extractor import TopicExtractor
from BroadTopicExtraction.database_manager import DatabaseManager
except ImportError as e:
print(f"导入模块失败: {e}")
print("请确保在项目根目录运行,并且已安装所有依赖")
logger.exception(f"导入模块失败: {e}")
logger.error("请确保在项目根目录运行,并且已安装所有依赖")
sys.exit(1)
class BroadTopicExtraction:
... ... @@ -34,7 +35,7 @@ class BroadTopicExtraction:
self.topic_extractor = TopicExtractor()
self.db_manager = DatabaseManager()
print("BroadTopicExtraction 初始化完成")
logger.info("BroadTopicExtraction 初始化完成")
def close(self):
"""关闭资源"""
... ... @@ -68,21 +69,22 @@ class BroadTopicExtraction:
Returns:
包含完整提取结果的字典
"""
print("\n" + "=" * 80)
print("MindSpider AI爬虫 - 每日话题提取")
print("=" * 80)
print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"目标日期: {date.today()}")
extraction_result_message = ""
extraction_result_message += "\nMindSpider AI爬虫 - 每日话题提取\n"
extraction_result_message += f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
extraction_result_message += f"目标日期: {date.today()}\n"
if news_sources:
print(f"指定平台: {len(news_sources)} 个")
extraction_result_message += f"指定平台: {len(news_sources)} 个\n"
for source in news_sources:
source_name = SOURCE_NAMES.get(source, source)
print(f" - {source_name}")
extraction_result_message += f" - {source_name}\n"
else:
print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
extraction_result_message += f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台\n"
print(f"关键词数: 最多 {max_keywords} 个")
extraction_result_message += f"关键词数: 最多 {max_keywords} 个\n"
logger.info(extraction_result_message)
extraction_result = {
'success': False,
... ... @@ -96,7 +98,7 @@ class BroadTopicExtraction:
try:
# 步骤1: 收集新闻
print("\n【步骤1】收集热点新闻...")
logger.info("【步骤1】收集热点新闻...")
news_result = await self.news_collector.collect_and_save_news(
sources=news_sources
)
... ... @@ -112,7 +114,7 @@ class BroadTopicExtraction:
raise Exception("新闻收集失败或没有获取到新闻")
# 步骤2: 提取关键词和生成总结
print("\n【步骤2】提取关键词和生成总结...")
logger.info("【步骤2】提取关键词和生成总结...")
keywords, summary = self.topic_extractor.extract_keywords_and_summary(
news_result['news_list'],
max_keywords=max_keywords
... ... @@ -126,10 +128,10 @@ class BroadTopicExtraction:
}
if not keywords:
print("警告: 没有提取到有效关键词")
logger.warning("警告: 没有提取到有效关键词")
# 步骤3: 保存到数据库
print("\n【步骤3】保存分析结果到数据库...")
logger.info("【步骤3】保存分析结果到数据库...")
save_success = self.db_manager.save_daily_topics(
keywords, summary, date.today()
)
... ... @@ -141,56 +143,47 @@ class BroadTopicExtraction:
extraction_result['success'] = True
extraction_result['end_time'] = datetime.now().isoformat()
print("\n" + "=" * 80)
print("每日话题提取流程完成!")
print("=" * 80)
logger.info("每日话题提取流程完成!")
return extraction_result
except Exception as e:
print(f"\n话题提取流程失败: {e}")
logger.exception(f"话题提取流程失败: {e}")
extraction_result['error'] = str(e)
extraction_result['end_time'] = datetime.now().isoformat()
return extraction_result
def print_extraction_results(self, extraction_result: Dict):
"""打印提取结果"""
print("\n" + "=" * 80)
print("话题提取结果报告")
print("=" * 80)
if not extraction_result['success']:
print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
return
extraction_result_message = ""
# 新闻收集结果
news_data = extraction_result.get('news_collection', {})
print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
print(f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
extraction_result_message += f"\n📰 新闻收集: {news_data.get('total_news', 0)} 条新闻\n"
extraction_result_message += f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}\n"
# 话题提取结果
topic_data = extraction_result.get('topic_extraction', {})
keywords = topic_data.get('keywords', [])
summary = topic_data.get('summary', '')
print(f"\n🔑 提取关键词: {len(keywords)} 个")
extraction_result_message += f"\n🔑 提取关键词: {len(keywords)} 个\n"
if keywords:
# 每行显示5个关键词
for i in range(0, len(keywords), 5):
keyword_group = keywords[i:i+5]
print(f" {', '.join(keyword_group)}")
extraction_result_message += f" {', '.join(keyword_group)}\n"
print(f"\n📝 新闻总结:")
print(f" {summary}")
extraction_result_message += f"\n📝 新闻总结:\n {summary}\n"
# 数据库保存结果
db_data = extraction_result.get('database_save', {})
if db_data.get('success'):
print(f"\n💾 数据库保存: 成功")
extraction_result_message += f"\n💾 数据库保存: 成功\n"
else:
print(f"\n💾 数据库保存: 失败")
extraction_result_message += f"\n💾 数据库保存: 失败\n"
print("\n" + "=" * 80)
logger.info(extraction_result_message)
def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
"""
... ... @@ -207,7 +200,7 @@ class BroadTopicExtraction:
topics_data = self.db_manager.get_daily_topics(extract_date)
if not topics_data:
print(f"没有找到 {extract_date or date.today()} 的话题数据")
logger.info(f"没有找到 {extract_date or date.today()} 的话题数据")
return []
keywords = topics_data['keywords']
... ... @@ -215,11 +208,11 @@ class BroadTopicExtraction:
# 生成搜索关键词
search_keywords = self.topic_extractor.get_search_keywords(keywords)
print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
logger.info(f"准备了 {len(search_keywords)} 个关键词用于爬取")
return search_keywords
except Exception as e:
print(f"获取爬取关键词失败: {e}")
logger.error(f"获取爬取关键词失败: {e}")
return []
def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
... ... @@ -227,7 +220,7 @@ class BroadTopicExtraction:
try:
return self.db_manager.get_daily_topics(target_date)
except Exception as e:
print(f"获取每日分析失败: {e}")
logger.error(f"获取每日分析失败: {e}")
return None
def get_recent_analysis(self, days: int = 7) -> List[Dict]:
... ... @@ -235,7 +228,7 @@ class BroadTopicExtraction:
try:
return self.db_manager.get_recent_topics(days)
except Exception as e:
print(f"获取最近分析失败: {e}")
logger.error(f"获取最近分析失败: {e}")
return []
# ==================== 命令行工具 ====================
... ... @@ -260,17 +253,17 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
news_data = result.get('news_collection', {})
topic_data = result.get('topic_extraction', {})
print(f"✅ 话题提取成功完成!")
print(f" 收集新闻: {news_data.get('total_news', 0)} 条")
print(f" 提取关键词: {len(topic_data.get('keywords', []))} 个")
print(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
logger.info(f"✅ 话题提取成功完成!")
logger.info(f" 收集新闻: {news_data.get('total_news', 0)} 条")
logger.info(f" 提取关键词: {len(topic_data.get('keywords', []))} 个")
logger.info(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
# 获取爬取关键词
crawling_keywords = extractor.get_keywords_for_crawling()
if crawling_keywords:
print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
print(f" {', '.join(crawling_keywords)}")
logger.info(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
logger.info(f" {', '.join(crawling_keywords)}")
# 保存关键词到文件
keywords_file = project_root / "data" / "daily_keywords.txt"
... ... @@ -279,16 +272,16 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
with open(keywords_file, 'w', encoding='utf-8') as f:
f.write('\n'.join(crawling_keywords))
print(f" 关键词已保存到: {keywords_file}")
logger.info(f" 关键词已保存到: {keywords_file}")
return True
else:
print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
logger.error(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
return False
except Exception as e:
print(f"❌ 执行过程中发生错误: {e}")
logger.error(f"❌ 执行过程中发生错误: {e}")
return False
def main():
... ... @@ -304,14 +297,14 @@ def main():
# 显示支持的新闻源
if args.list_sources:
print("支持的新闻源平台:")
logger.info("支持的新闻源平台:")
for source, name in SOURCE_NAMES.items():
print(f" {source:<25} {name}")
logger.info(f" {source:<25} {name}")
return
# 验证参数
if args.keywords < 1 or args.keywords > 200:
print("关键词数量应在1-200之间")
logger.error("关键词数量应在1-200之间")
sys.exit(1)
# 运行提取
... ... @@ -325,7 +318,7 @@ def main():
sys.exit(0 if success else 1)
except KeyboardInterrupt:
print("\n用户中断操作")
logger.info("用户中断操作")
sys.exit(1)
if __name__ == "__main__":
... ...
... ... @@ -18,19 +18,20 @@ sys.path.append(str(project_root))
try:
import config
from config import settings
except ImportError:
raise ImportError("无法导入config.py配置文件")
raise ImportError("无法导入settings.py配置文件")
class TopicExtractor:
"""话题提取器"""
def __init__(self):
"""初始化话题提取器"""
self.client = OpenAI(
api_key=config.DEEPSEEK_API_KEY,
base_url="https://api.deepseek.com"
api_key=settings.MINDSPIDER_API_KEY,
base_url=settings.MINDSPIDER_BASE_URL
)
self.model = "deepseek-chat"
self.model = settings.MINDSPIDER_MODEL_NAME
def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
"""
... ...
---
name: MediaCrawler Bug反馈
about: 创建一个问题Bug以帮助MediaCrawler开源项目改进
title: '[BUG] '
labels: bug
assignees: ''
---
## 🔍 问题检查清单
<!-- 请在提交issue前确认以下事项 -->
- [ ] 我已经仔细阅读了项目使用过程中的[常见问题汇总](https://nanmicoder.github.io/MediaCrawler/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98.html)
- [ ] 我已经搜索并查看了[已关闭的issues](https://github.com/NanmiCoder/MediaCrawler/issues?q=is%3Aissue+is%3Aclosed)
- [ ] 我确认这不是由于滑块验证码、Cookie过期、Cookie提取错误、平台风控等常见原因导致的问题
## 🐛 问题描述
<!-- 请详细描述你遇到的问题 -->
## 📝 复现步骤
1.
2.
3.
## 💻 运行环境
- 操作系统:
- Python版本:
- 是否使用IP代理:
- 是否使用VPN翻墙软件:
- 目标平台(抖音/小红书/微博等):
## 📋 错误日志
<!-- 请提供完整的错误日志信息 -->
```shell
在此粘贴错误日志
```
## 📷 错误截图
<!-- 请提供错误截图 -->
... ...
---
name: MediaCrawler使用问题咨询
about: 提交使用过程中遇到的问题
title: '[问题] '
labels: question
assignees: ''
---
## ⚠️ 提交前确认
<!-- 请确认以下事项 -->
- [ ] 我已经仔细阅读了项目使用过程中的[常见问题汇总](https://nanmicoder.github.io/MediaCrawler/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98.html)
- [ ] 我已经搜索并查看了[已关闭的issues](https://github.com/NanmiCoder/MediaCrawler/issues?q=is%3Aissue+is%3Aclosed)
- [ ] 我确认这不是由于滑块验证码、Cookie过期、Cookie提取错误、平台风控等常见原因导致的问题
## ❓ 问题描述
<!-- 清晰简洁地描述你遇到的问题 -->
## 🔍 使用场景
<!-- 描述你在使用哪个功能时遇到的问题 -->
- 目标平台: (如:小红书/抖音/微博等)
- 使用功能: (如:关键词搜索/用户主页爬取等)
## 💻 环境信息
- 操作系统:
- Python版本:
- 是否使用IP代理:
- 是否使用VPN翻墙软件:
- 目标平台(抖音/小红书/微博等):
## 📋 错误日志
```shell
在此粘贴完整的错误日志
```
## 📷 错误截图
<!-- 请提供错误截图 -->
... ...
# 构建 VitePress 站点并将其部署到 GitHub Pages 的示例工作流程
#
name: Deploy VitePress site to Pages
on:
# 在针对 `main` 分支的推送上运行。如果你
# 使用 `master` 分支作为默认分支,请将其更改为 `master`
push:
branches: [main]
# 允许你从 Actions 选项卡手动运行此工作流程
workflow_dispatch:
# 设置 GITHUB_TOKEN 的权限,以允许部署到 GitHub Pages
permissions:
contents: read
pages: write
id-token: write
# 只允许同时进行一次部署,跳过正在运行和最新队列之间的运行队列
# 但是,不要取消正在进行的运行,因为我们希望允许这些生产部署完成
concurrency:
group: pages
cancel-in-progress: false
jobs:
# 构建工作
build:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0 # 如果未启用 lastUpdated,则不需要
# - uses: pnpm/action-setup@v3 # 如果使用 pnpm,请取消注释
# - uses: oven-sh/setup-bun@v1 # 如果使用 Bun,请取消注释
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
cache: npm # 或 pnpm / yarn
- name: Setup Pages
uses: actions/configure-pages@v4
- name: Install dependencies
run: npm ci # 或 pnpm install / yarn install / bun install
- name: Build with VitePress
run: npm run docs:build # 或 pnpm docs:build / yarn docs:build / bun run docs:build
- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with:
path: docs/.vitepress/dist
# 部署工作
deploy:
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
needs: build
runs-on: ubuntu-latest
name: Deploy
steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4
\ No newline at end of file
... ...
... ... @@ -173,4 +173,9 @@ docs/.vitepress/cache
# other gitignore
.venv
.refer
\ No newline at end of file
.refer
agent_zone
debug_tools
database/*.db
\ No newline at end of file
... ...
NON-COMMERCIAL LEARNING LICENSE 1.1
Copyright (c) [2024] [relakkes@gmail.com]
WHEREAS:
1. The copyright owner owns and controls the copyright of this software and related documentation files (hereinafter referred to as the "Software");
2. The user wishes to use the Software for learning purposes;
3. The copyright owner is willing to authorize the user to use the Software under the conditions stated in this license;
NOW, THEREFORE, the parties, in compliance with relevant laws and regulations, agree to the following terms:
SCOPE OF AUTHORIZATION:
1. The copyright owner hereby grants any natural person or legal entity (hereinafter referred to as the "User") accepting this license a free, non-exclusive, non-transferable right to use, copy, modify, and merge the Software for non-commercial learning purposes, subject to the following conditions.
CONDITIONS:
1. The User must include the above copyright notice and this license statement in all reasonably prominent locations of the Software and its copies.
2. The Software is limited to learning and research purposes only, and may not be used for large-scale crawling or activities that disrupt platform operations.
3. Without the written consent of the copyright owner, the Software may not be used for any commercial purposes or to cause improper influence on third parties.
DISCLAIMER:
1. The Software is provided "AS IS," without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement.
2. In no event shall the copyright owner be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this Software, even if advised of the possibility of such damage.
APPLICABLE LAW:
1. The interpretation and enforcement of this license shall comply with local laws and regulations.
2. Any disputes arising from or related to this license shall be resolved through friendly negotiation between the parties; if negotiation fails, either party may submit the dispute to the people's court where the copyright owner is located for resolution.
This license constitutes the entire agreement between the parties regarding the Software, superseding and merging all prior discussions, communications, and agreements, whether oral or written.
非商业学习使用许可证 1.1
版权所有 (c) [2024] [relakkes@gmail.com]
鉴于:
1. 版权所有者拥有和控制本软件和相关文档文件(以下简称“软件”)的版权;
2. 使用者希望使用该软件进行学习;
3. 版权所有者愿意在本许可证所述的条件下授权使用者使用该软件;
现因此,双方遵循相关法律法规,同意如下条款:
授权范围:
1. 版权所有者特此免费授予接受本许可证的任何自然人或法人(以下简称“使用者”)非独占的、不可转让的权利,在非商业学习目的下使用、复制、修改、合并本软件,前提是遵守以下条件。
条件:
1. 使用者必须在软件及其副本的所有合理显著位置包含上述版权声明和本许可证声明。
2. 本软件仅限用于学习和研究目的,不得用于大规模爬虫或对平台造成运营干扰的行为。
3. 未经版权所有者书面同意,不得将本软件用于任何商业用途或对第三方造成不当影响。
免责声明:
1. 本软件按“现状”提供,不提供任何形式的明示或暗示保证,包括但不限于对适销性、特定用途的适用性和非侵权的保证。
2. 在任何情况下,版权所有者均不对因使用本软件而产生的,或在任何方式上与本软件有关的任何直接、间接、偶然、特殊、示例性或后果性损害负责(包括但不限于采购替代品或服务;使用、数据或利润的损失;或业务中断),无论这些损害是如何引起的,以及无论是通过合同、严格责任还是侵权行为(包括疏忽或其他方式)产生的,即使已被告知此类损害的可能性。
适用法律:
1. 本许可证的解释和执行应遵循当地法律法规。
2. 因本许可证引起的或与之相关的任何争议,双方应友好协商解决;协商不成时,任何一方可将争议提交至版权所有者所在地的人民法院诉讼解决。
本许可证构成双方之间关于本软件的完整协议,取代并合并以前的讨论、交流和协议,无论是口头还是书面的。
... ...
# 🔥 MediaCrawler - 自媒体平台爬虫 🕷️
<div align="center" markdown="1">
<sup>Special thanks to:</sup>
<br>
<br>
<a href="https://go.warp.dev/MediaCrawler">
<img alt="Warp sponsorship" width="400" src="https://github.com/warpdotdev/brand-assets/blob/main/Github/Sponsor/Warp-Github-LG-02.png?raw=true">
</a>
### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
</div>
<hr>
<div align="center">
<a href="https://trendshift.io/repositories/8291" target="_blank">
<img src="https://trendshift.io/api/badge/repositories/8291" alt="NanmiCoder%2FMediaCrawler | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
</a>
[![GitHub Stars](https://img.shields.io/github/stars/NanmiCoder/MediaCrawler?style=social)](https://github.com/NanmiCoder/MediaCrawler/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/NanmiCoder/MediaCrawler?style=social)](https://github.com/NanmiCoder/MediaCrawler/network/members)
[![GitHub Issues](https://img.shields.io/github/issues/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/issues)
[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/pulls)
[![License](https://img.shields.io/github/license/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/blob/main/LICENSE)
[![中文](https://img.shields.io/badge/🇨🇳_中文-当前-blue)](README.md)
[![English](https://img.shields.io/badge/🇺🇸_English-Available-green)](README_en.md)
[![Español](https://img.shields.io/badge/🇪🇸_Español-Available-green)](README_es.md)
</div>
> **免责声明:**
>
> 大家请以学习为目的使用本仓库⚠️⚠️⚠️⚠️,[爬虫违法违规的案件](https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China) <br>
>
>本仓库的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。
>
> 点击查看更为详细的免责声明。[点击跳转](#disclaimer)
## 📖 项目简介
一个功能强大的**多平台自媒体数据采集工具**,支持小红书、抖音、快手、B站、微博、贴吧、知乎等主流平台的公开信息抓取。
### 🔧 技术原理
- **核心技术**:基于 [Playwright](https://playwright.dev/) 浏览器自动化框架登录保存登录态
- **无需JS逆向**:利用保留登录态的浏览器上下文环境,通过 JS 表达式获取签名参数
- **优势特点**:无需逆向复杂的加密算法,大幅降低技术门槛
## ✨ 功能特性
| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
| ------ | ---------- | -------------- | -------- | -------------- | ---------- | -------- | -------------- |
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 贴吧 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 知乎 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
### 🚀 MediaCrawlerPro 重磅发布!
> 专注于学习成熟项目的架构设计,不仅仅是爬虫技术,Pro 版本的代码设计思路同样值得深入学习!
[MediaCrawlerPro](https://github.com/MediaCrawlerPro) 相较于开源版本的核心优势:
#### 🎯 核心功能升级
- **断点续爬功能**(重点特性)
- **多账号 + IP代理池支持**(重点特性)
- **去除 Playwright 依赖**,使用更简单
- **完整 Linux 环境支持**
#### 🏗️ 架构设计优化
- **代码重构优化**,更易读易维护(解耦 JS 签名逻辑)
- **企业级代码质量**,适合构建大型爬虫项目
- **完美架构设计**,高扩展性,源码学习价值更大
#### 🎁 额外功能
- **自媒体视频下载器桌面端**(适合学习全栈开发)
- **多平台首页信息流推荐**(HomeFeed)
- [ ] **基于自媒体平台的AI Agent正在开发中 🚀🚀**
点击查看:[MediaCrawlerPro 项目主页](https://github.com/MediaCrawlerPro) 更多介绍
## 🚀 快速开始
> 💡 **开源不易,如果这个项目对您有帮助,请给个 ⭐ Star 支持一下!**
## 📋 前置依赖
### 🚀 uv 安装(推荐)
在进行下一步操作之前,请确保电脑上已经安装了 uv:
- **安装地址**:[uv 官方安装指南](https://docs.astral.sh/uv/getting-started/installation)
- **验证安装**:终端输入命令 `uv --version`,如果正常显示版本号,证明已经安装成功
- **推荐理由**:uv 是目前最强的 Python 包管理工具,速度快、依赖解析准确
### 🟢 Node.js 安装
项目依赖 Node.js,请前往官网下载安装:
- **下载地址**:https://nodejs.org/en/download/
- **版本要求**:>= 16.0.0
### 📦 Python 包安装
```shell
# 进入项目目录
cd MediaCrawler
# 使用 uv sync 命令来保证 python 版本和相关依赖包的一致性
uv sync
```
### 🌐 浏览器驱动安装
```shell
# 安装浏览器驱动
uv run playwright install
```
> **💡 提示**:MediaCrawler 目前已经支持使用 playwright 连接你本地的 Chrome 浏览器了,一些因为 Webdriver 导致的问题迎刃而解了。
>
> 目前开放了 `xhs` 和 `dy` 这两个使用 CDP 的方式连接本地浏览器,如有需要,查看 `config/base_config.py` 中的配置项。
## 🚀 运行爬虫程序
```shell
# 项目默认是没有开启评论爬取模式,如需评论请在 config/base_config.py 中的 ENABLE_GET_COMMENTS 变量修改
# 一些其他支持项,也可以在 config/base_config.py 查看功能,写的有中文注释
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
uv run main.py --platform xhs --lt qrcode --type search
# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
uv run main.py --platform xhs --lt qrcode --type detail
# 打开对应APP扫二维码登录
# 其他平台爬虫使用示例,执行下面的命令查看
uv run main.py --help
```
<details>
<summary>🔗 <strong>使用 Python 原生 venv 管理环境(不推荐)</strong></summary>
#### 创建并激活 Python 虚拟环境
> 如果是爬取抖音和知乎,需要提前安装 nodejs 环境,版本大于等于:`16` 即可
```shell
# 进入项目根目录
cd MediaCrawler
# 创建虚拟环境
# 我的 python 版本是:3.9.6,requirements.txt 中的库是基于这个版本的
# 如果是其他 python 版本,可能 requirements.txt 中的库不兼容,需自行解决
python -m venv venv
# macOS & Linux 激活虚拟环境
source venv/bin/activate
# Windows 激活虚拟环境
venv\Scripts\activate
```
#### 安装依赖库
```shell
pip install -r requirements.txt
```
#### 安装 playwright 浏览器驱动
```shell
playwright install
```
#### 运行爬虫程序(原生环境)
```shell
# 项目默认是没有开启评论爬取模式,如需评论请在 config/base_config.py 中的 ENABLE_GET_COMMENTS 变量修改
# 一些其他支持项,也可以在 config/base_config.py 查看功能,写的有中文注释
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
python main.py --platform xhs --lt qrcode --type search
# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
python main.py --platform xhs --lt qrcode --type detail
# 打开对应APP扫二维码登录
# 其他平台爬虫使用示例,执行下面的命令查看
python main.py --help
```
</details>
## 💾 数据保存
支持多种数据存储方式:
- **CSV 文件**:支持保存到 CSV 中(`data/` 目录下)
- **JSON 文件**:支持保存到 JSON 中(`data/` 目录下)
- **数据库存储**
  - 使用参数 `--init_db` 进行数据库初始化(使用 `--init_db` 时不需要携带其他可选参数)
  - **SQLite 数据库**:轻量级数据库,无需服务器,适合个人使用(推荐)
    1. 初始化:`--init_db sqlite`
    2. 数据存储:`--save_data_option sqlite`
  - **MySQL 数据库**:支持关系型数据库 MySQL 中保存(需要提前创建数据库)
    1. 初始化:`--init_db mysql`
    2. 数据存储:`--save_data_option db`(db 参数为兼容历史更新保留)
### 使用示例:
```shell
# 初始化 SQLite 数据库(使用'--init_db'时不需要携带其他optional)
uv run main.py --init_db sqlite
# 使用 SQLite 存储数据(推荐个人用户使用)
uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
```
```shell
# 初始化 MySQL 数据库
uv run main.py --init_db mysql
# 使用 MySQL 存储数据(为适配历史更新,db参数进行沿用)
uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
```
[🚀 MediaCrawlerPro 重磅发布 🚀!更多的功能,更好的架构设计!](https://github.com/MediaCrawlerPro)
### 💬 交流群组
- **微信交流群**:[点击加入](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
### 📚 其他
- **常见问题**:[MediaCrawler 完整文档](https://nanmicoder.github.io/MediaCrawler/)
- **爬虫入门教程**:[CrawlerTutorial 免费教程](https://github.com/NanmiCoder/CrawlerTutorial)
- **新闻爬虫开源项目**:[NewsCrawlerCollection](https://github.com/NanmiCoder/NewsCrawlerCollection)
---
### 💰 赞助商展示
<a href="https://h.wandouip.com">
<img src="docs/static/images/img_8.jpg">
<br>
豌豆HTTP自营千万级IP资源池,IP纯净度≥99.8%,每日保持IP高频更新,快速响应,稳定连接,满足多种业务场景,支持按需定制,注册免费提取10000ip。
</a>
---
<p align="center">
<a href="https://tikhub.io/?utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad">
<img style="border-radius:20px" width="500" alt="TikHub IO_Banner zh" src="docs/static/images/tikhub_banner_zh.png">
</a>
</p>
[TikHub](https://tikhub.io/?utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad) 提供超过 **700 个端点**,可用于从 **14+ 个社交媒体平台** 获取与分析数据 —— 包括视频、用户、评论、商店、商品与趋势等,一站式完成所有数据访问与分析。
通过每日签到,可以获取免费额度。可以使用我的注册链接:[https://user.tikhub.io/users/signup?referral_code=cfzyejV9](https://user.tikhub.io/users/signup?referral_code=cfzyejV9&utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad) 或使用邀请码:`cfzyejV9`,注册并充值即可获得 **$2 免费额度**
[TikHub](https://tikhub.io/?utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad) 提供以下服务:
- 🚀 丰富的社交媒体数据接口(TikTok、Douyin、XHS、YouTube、Instagram等)
- 💎 每日签到免费领取额度
- ⚡ 高成功率与高并发支持
- 🌐 官网:[https://tikhub.io/](https://tikhub.io/?utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad)
- 💻 GitHub地址:[https://github.com/TikHubIO/](https://github.com/TikHubIO/)
---
<p align="center">
<a href="https://app.nstbrowser.io/account/register?utm_source=official&utm_term=mediacrawler">
<img style="border-radius:20px" alt="NstBrowser Banner " src="docs/static/images/nstbrowser.jpg">
</a>
</p>
Nstbrowser 指纹浏览器 — 多账号运营&自动化管理的最佳解决方案
<br>
多账号安全管理与会话隔离;指纹定制结合反检测浏览器环境,兼顾真实度与稳定性;覆盖店铺管理、电商监控、社媒营销、广告验证、Web3、投放监控与联盟营销等业务线;提供生产级并发与定制化企业服务;提供可一键部署的云端浏览器方案,配套全球高质量 IP 池,为您构建长期行业竞争力
<br>
[点击此处即刻开始免费使用](https://app.nstbrowser.io/account/register?utm_source=official&utm_term=mediacrawler)
<br>
使用 NSTBROWSER 可获得 10% 充值赠礼
### 🤝 成为赞助者
成为赞助者,可以将您的产品展示在这里,每天获得大量曝光!
**联系方式**:
- 微信:`relakkes`
- 邮箱:`relakkes@gmail.com`
---
## ⭐ Star 趋势图
如果这个项目对您有帮助,请给个 ⭐ Star 支持一下,让更多的人看到 MediaCrawler!
[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
## 📚 参考
- **小红书客户端**:[ReaJason 的 xhs 仓库](https://github.com/ReaJason/xhs)
- **短信转发**:[SmsForwarder 参考仓库](https://github.com/pppscn/SmsForwarder)
- **内网穿透工具**:[ngrok 官方文档](https://ngrok.com/docs/)
# 免责声明
<div id="disclaimer">
## 1. 项目目的与性质
本项目(以下简称“本项目”)是作为一个技术研究与学习工具而创建的,旨在探索和学习网络数据采集技术。本项目专注于自媒体平台的数据爬取技术研究,旨在提供给学习者和研究者作为技术交流之用。
## 2. 法律合规性声明
本项目开发者(以下简称“开发者”)郑重提醒用户在下载、安装和使用本项目时,严格遵守中华人民共和国相关法律法规,包括但不限于《中华人民共和国网络安全法》、《中华人民共和国反间谍法》等所有适用的国家法律和政策。用户应自行承担一切因使用本项目而可能引起的法律责任。
## 3. 使用目的限制
本项目严禁用于任何非法目的或非学习、非研究的商业行为。本项目不得用于任何形式的非法侵入他人计算机系统,不得用于任何侵犯他人知识产权或其他合法权益的行为。用户应保证其使用本项目的目的纯属个人学习和技术研究,不得用于任何形式的非法活动。
## 4. 免责声明
开发者已尽最大努力确保本项目的正当性及安全性,但不对用户使用本项目可能引起的任何形式的直接或间接损失承担责任。包括但不限于由于使用本项目而导致的任何数据丢失、设备损坏、法律诉讼等。
## 5. 知识产权声明
本项目的知识产权归开发者所有。本项目受到著作权法和国际著作权条约以及其他知识产权法律和条约的保护。用户在遵守本声明及相关法律法规的前提下,可以下载和使用本项目。
## 6. 最终解释权
关于本项目的最终解释权归开发者所有。开发者保留随时更改或更新本免责声明的权利,恕不另行通知。
</div>
... ...
<div align="center" markdown="1">
<sup>Special thanks to:</sup>
<br>
<br>
<a href="https://go.warp.dev/MediaCrawler">
<img alt="Warp sponsorship" width="400" src="https://github.com/warpdotdev/brand-assets/blob/main/Github/Sponsor/Warp-Github-LG-02.png?raw=true">
</a>
### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
</div>
<hr>
# 🔥 MediaCrawler - Social Media Platform Crawler 🕷️
<div align="center">
<a href="https://trendshift.io/repositories/8291" target="_blank">
<img src="https://trendshift.io/api/badge/repositories/8291" alt="NanmiCoder%2FMediaCrawler | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
</a>
[![GitHub Stars](https://img.shields.io/github/stars/NanmiCoder/MediaCrawler?style=social)](https://github.com/NanmiCoder/MediaCrawler/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/NanmiCoder/MediaCrawler?style=social)](https://github.com/NanmiCoder/MediaCrawler/network/members)
[![GitHub Issues](https://img.shields.io/github/issues/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/issues)
[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/pulls)
[![License](https://img.shields.io/github/license/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/blob/main/LICENSE)
[![中文](https://img.shields.io/badge/🇨🇳_中文-Available-blue)](README.md)
[![English](https://img.shields.io/badge/🇺🇸_English-Current-green)](README_en.md)
[![Español](https://img.shields.io/badge/🇪🇸_Español-Available-green)](README_es.md)
</div>
> **Disclaimer:**
>
> Please use this repository for learning purposes only ⚠️⚠️⚠️⚠️, [Web scraping illegal cases](https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China) <br>
>
> All content in this repository is for learning and reference purposes only, and commercial use is prohibited. No person or organization may use the content of this repository for illegal purposes or infringe upon the legitimate rights and interests of others. The web scraping technology involved in this repository is only for learning and research, and may not be used for large-scale crawling of other platforms or other illegal activities. This repository assumes no legal responsibility for any legal liability arising from the use of the content of this repository. By using the content of this repository, you agree to all terms and conditions of this disclaimer.
>
> Click to view a more detailed disclaimer. [Click to jump](#disclaimer)
## 📖 Project Introduction
A powerful **multi-platform social media data collection tool** that supports crawling public information from mainstream platforms including Xiaohongshu, Douyin, Kuaishou, Bilibili, Weibo, Tieba, Zhihu, and more.
### 🔧 Technical Principles
- **Core Technology**: Based on [Playwright](https://playwright.dev/) browser automation framework for login and maintaining login state
- **No JS Reverse Engineering Required**: Uses browser context environment with preserved login state to obtain signature parameters through JS expressions
- **Advantages**: No need to reverse complex encryption algorithms, significantly lowering the technical barrier
## ✨ Features
| Platform | Keyword Search | Specific Post ID Crawling | Secondary Comments | Specific Creator Homepage | Login State Cache | IP Proxy Pool | Generate Comment Word Cloud |
| ------ | ---------- | -------------- | -------- | -------------- | ---------- | -------- | -------------- |
| Xiaohongshu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Douyin | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Weibo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Tieba | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
<details id="pro-version">
<summary>🔗 <strong>🚀 MediaCrawlerPro Major Release! More features, better architectural design!</strong></summary>
### 🚀 MediaCrawlerPro Major Release!
> Focus on learning mature project architectural design, not just crawling technology. The code design philosophy of the Pro version is equally worth in-depth study!
[MediaCrawlerPro](https://github.com/MediaCrawlerPro) core advantages over the open-source version:
#### 🎯 Core Feature Upgrades
- **Resume crawling functionality** (Key feature)
- **Multi-account + IP proxy pool support** (Key feature)
- **Remove Playwright dependency**, easier to use
- **Complete Linux environment support**
#### 🏗️ Architectural Design Optimization
- **Code refactoring optimization**, more readable and maintainable (decoupled JS signature logic)
- **Enterprise-level code quality**, suitable for building large-scale crawler projects
- **Perfect architectural design**, high scalability, greater source code learning value
#### 🎁 Additional Features
- **Social media video downloader desktop app** (suitable for learning full-stack development)
- **Multi-platform homepage feed recommendations** (HomeFeed)
- [ ] **AI Agent based on social media platforms is under development 🚀🚀**
Click to view: [MediaCrawlerPro Project Homepage](https://github.com/MediaCrawlerPro) for more information
</details>
## 🚀 Quick Start
> 💡 **Open source is not easy, if this project helps you, please give a ⭐ Star to support!**
## 📋 Prerequisites
### 🚀 uv Installation (Recommended)
Before proceeding with the next steps, please ensure that uv is installed on your computer:
- **Installation Guide**: [uv Official Installation Guide](https://docs.astral.sh/uv/getting-started/installation)
- **Verify Installation**: Enter the command `uv --version` in the terminal. If the version number is displayed normally, the installation was successful
- **Recommendation Reason**: uv is currently the most powerful Python package management tool, with fast speed and accurate dependency resolution
### 🟢 Node.js Installation
The project depends on Node.js, please download and install from the official website:
- **Download Link**: https://nodejs.org/en/download/
- **Version Requirement**: >= 16.0.0
### 📦 Python Package Installation
```shell
# Enter project directory
cd MediaCrawler
# Use uv sync command to ensure consistency of python version and related dependency packages
uv sync
```
### 🌐 Browser Driver Installation
```shell
# Install browser driver
uv run playwright install
```
> **💡 Tip**: MediaCrawler now supports using playwright to connect to your local Chrome browser, solving some issues caused by Webdriver.
>
> Currently, `xhs` and `dy` are available using CDP mode to connect to local browsers. If needed, check the configuration items in `config/base_config.py`.
## 🚀 Run Crawler Program
```shell
# The project does not enable comment crawling mode by default. If you need comments, please modify the ENABLE_GET_COMMENTS variable in config/base_config.py
# Other supported options can also be viewed in config/base_config.py with Chinese comments
# Read keywords from configuration file to search related posts and crawl post information and comments
uv run main.py --platform xhs --lt qrcode --type search
# Read the list of specified post IDs from the configuration file and fetch those posts' information and comments
uv run main.py --platform xhs --lt qrcode --type detail
# Open corresponding APP to scan QR code for login
# For other platform crawler usage examples, execute the following command to view
uv run main.py --help
```
<details>
<summary>🔗 <strong>Using Python native venv environment management (Not recommended)</strong></summary>
#### Create and activate Python virtual environment
> If crawling Douyin and Zhihu, you need to install nodejs environment in advance, version greater than or equal to: `16`
```shell
# Enter project root directory
cd MediaCrawler
# Create virtual environment
# My python version is: 3.9.6, the libraries in requirements.txt are based on this version
# If using other python versions, the libraries in requirements.txt may not be compatible, please resolve on your own
python -m venv venv
# macOS & Linux activate virtual environment
source venv/bin/activate
# Windows activate virtual environment
venv\Scripts\activate
```
#### Install dependency libraries
```shell
pip install -r requirements.txt
```
#### Install playwright browser driver
```shell
playwright install
```
#### Run crawler program (native environment)
```shell
# The project does not enable comment crawling mode by default. If you need comments, please modify the ENABLE_GET_COMMENTS variable in config/base_config.py
# Other supported options can also be viewed in config/base_config.py with Chinese comments
# Read keywords from configuration file to search related posts and crawl post information and comments
python main.py --platform xhs --lt qrcode --type search
# Read the list of specified post IDs from the configuration file and fetch those posts' information and comments
python main.py --platform xhs --lt qrcode --type detail
# Open corresponding APP to scan QR code for login
# For other platform crawler usage examples, execute the following command to view
python main.py --help
```
</details>
## 💾 Data Storage
Supports multiple data storage methods:
- **CSV Files**: Supports saving to CSV (under `data/` directory)
- **JSON Files**: Supports saving to JSON (under `data/` directory)
- **Database Storage**
  - Use the `--init_db` parameter for database initialization (when using `--init_db`, no other optional arguments are needed)
  - **SQLite Database**: Lightweight database, no server required, suitable for personal use (recommended)
    1. Initialization: `--init_db sqlite`
    2. Data Storage: `--save_data_option sqlite`
  - **MySQL Database**: Supports saving to relational database MySQL (database needs to be created in advance)
    1. Initialization: `--init_db mysql`
    2. Data Storage: `--save_data_option db` (the db parameter is retained for compatibility with historical updates)
### Usage Examples:
```shell
# Initialize SQLite database (when using '--init_db', no other optional arguments are needed)
uv run main.py --init_db sqlite
# Use SQLite to store data (recommended for personal users)
uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
```
```shell
# Initialize MySQL database
uv run main.py --init_db mysql
# Use MySQL to store data (the db parameter is retained for compatibility with historical updates)
uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
```
---
[🚀 MediaCrawlerPro Major Release 🚀! More features, better architectural design!](https://github.com/MediaCrawlerPro)
## 🤝 Community & Support
### 💬 Discussion Groups
- **WeChat Discussion Group**: [Click to join](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
### 📚 Documentation & Tutorials
- **Online Documentation**: [MediaCrawler Complete Documentation](https://nanmicoder.github.io/MediaCrawler/)
- **Crawler Tutorial**: [CrawlerTutorial Free Tutorial](https://github.com/NanmiCoder/CrawlerTutorial)
# Other common questions can be viewed in the online documentation
>
> The online documentation includes usage methods, common questions, joining project discussion groups, etc.
> [MediaCrawler Online Documentation](https://nanmicoder.github.io/MediaCrawler/)
>
# Author's Knowledge Services
> If you want to quickly get started and learn the usage of this project, source code architectural design, learn programming technology, or want to understand the source code design of MediaCrawlerPro, you can check out my paid knowledge column.
[Author's Paid Knowledge Column Introduction](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
---
## ⭐ Star Trend Chart
If this project helps you, please give a ⭐ Star to support and let more people see MediaCrawler!
[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
### 💰 Sponsor Display
<a href="https://www.swiftproxy.net/?ref=nanmi">
<img src="docs/static/images/img_5.png">
<br>
**Swiftproxy** - 90M+ global high-quality pure residential IPs, register to get free 500MB test traffic, dynamic traffic never expires!
> Exclusive discount code: **GHB5** Get 10% off instantly!
</a>
### 🤝 Become a Sponsor
Become a sponsor and showcase your product here, getting massive exposure daily!
**Contact Information**:
- WeChat: `relakkes`
- Email: `relakkes@gmail.com`
## 📚 References
- **Xiaohongshu Client**: [ReaJason's xhs repository](https://github.com/ReaJason/xhs)
- **SMS Forwarding**: [SmsForwarder reference repository](https://github.com/pppscn/SmsForwarder)
- **Intranet Penetration Tool**: [ngrok official documentation](https://ngrok.com/docs/)
# Disclaimer
<div id="disclaimer">
## 1. Project Purpose and Nature
This project (hereinafter referred to as "this project") was created as a technical research and learning tool, aimed at exploring and learning network data collection technologies. This project focuses on research into data crawling technologies for social media platforms and is intended for technical exchange among learners and researchers.
## 2. Legal Compliance Statement
The project developer (hereinafter referred to as "developer") solemnly reminds users to strictly comply with relevant laws and regulations of the People's Republic of China when downloading, installing and using this project, including but not limited to the "Cybersecurity Law of the People's Republic of China", "Counter-Espionage Law of the People's Republic of China" and all applicable national laws and policies. Users shall bear all legal responsibilities that may arise from using this project.
## 3. Usage Purpose Restrictions
This project is strictly prohibited from being used for any illegal purposes or non-learning, non-research commercial activities. This project may not be used for any form of illegal intrusion into other people's computer systems, nor may it be used for any activities that infringe upon others' intellectual property rights or other legitimate rights and interests. Users should ensure that their use of this project is purely for personal learning and technical research, and may not be used for any form of illegal activities.
## 4. Disclaimer
The developer has made every effort to ensure the legitimacy and security of this project, but assumes no responsibility for any form of direct or indirect losses that may arise from users' use of this project. Including but not limited to any data loss, equipment damage, legal litigation, etc. caused by using this project.
## 5. Intellectual Property Statement
The intellectual property rights of this project belong to the developer. This project is protected by copyright law and international copyright treaties as well as other intellectual property laws and treaties. Users may download and use this project under the premise of complying with this statement and relevant laws and regulations.
## 6. Final Interpretation Rights
The developer has the final interpretation rights regarding this project. The developer reserves the right to change or update this disclaimer at any time without further notice.
</div>
## 🙏 Acknowledgments
### JetBrains Open Source License Support
Thanks to JetBrains for providing free open source license support for this project!
<a href="https://www.jetbrains.com/?from=MediaCrawler">
<img src="https://www.jetbrains.com/company/brand/img/jetbrains_logo.png" width="100" alt="JetBrains" />
</a>
... ...
<div align="center" markdown="1">
<sup>Special thanks to:</sup>
<br>
<br>
<a href="https://go.warp.dev/MediaCrawler">
<img alt="Warp sponsorship" width="400" src="https://github.com/warpdotdev/brand-assets/blob/main/Github/Sponsor/Warp-Github-LG-02.png?raw=true">
</a>
### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
</div>
<hr>
# 🔥 MediaCrawler - Rastreador de Plataformas de Redes Sociales 🕷️
<div align="center">
<a href="https://trendshift.io/repositories/8291" target="_blank">
<img src="https://trendshift.io/api/badge/repositories/8291" alt="NanmiCoder%2FMediaCrawler | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
</a>
[![GitHub Stars](https://img.shields.io/github/stars/NanmiCoder/MediaCrawler?style=social)](https://github.com/NanmiCoder/MediaCrawler/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/NanmiCoder/MediaCrawler?style=social)](https://github.com/NanmiCoder/MediaCrawler/network/members)
[![GitHub Issues](https://img.shields.io/github/issues/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/issues)
[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/pulls)
[![License](https://img.shields.io/github/license/NanmiCoder/MediaCrawler)](https://github.com/NanmiCoder/MediaCrawler/blob/main/LICENSE)
[![中文](https://img.shields.io/badge/🇨🇳_中文-Available-blue)](README.md)
[![English](https://img.shields.io/badge/🇺🇸_English-Available-green)](README_en.md)
[![Español](https://img.shields.io/badge/🇪🇸_Español-Current-green)](README_es.md)
</div>
> **Descargo de responsabilidad:**
>
> Por favor, utilice este repositorio únicamente con fines de aprendizaje ⚠️⚠️⚠️⚠️, [Casos ilegales de web scraping](https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China) <br>
>
> Todo el contenido de este repositorio es únicamente para fines de aprendizaje y referencia, y está prohibido el uso comercial. Ninguna persona u organización puede usar el contenido de este repositorio para propósitos ilegales o infringir los derechos e intereses legítimos de otros. La tecnología de web scraping involucrada en este repositorio es solo para aprendizaje e investigación, y no puede ser utilizada para rastreo a gran escala de otras plataformas u otras actividades ilegales. Este repositorio no asume ninguna responsabilidad legal por cualquier responsabilidad legal que surja del uso del contenido de este repositorio. Al usar el contenido de este repositorio, usted acepta todos los términos y condiciones de este descargo de responsabilidad.
>
> Haga clic para ver un descargo de responsabilidad más detallado. [Haga clic para saltar](#disclaimer)
## 📖 Introducción del Proyecto
Una poderosa **herramienta de recolección de datos de redes sociales multiplataforma** que soporta el rastreo de información pública de plataformas principales incluyendo Xiaohongshu, Douyin, Kuaishou, Bilibili, Weibo, Tieba, Zhihu, y más.
### 🔧 Principios Técnicos
- **Tecnología Central**: Basado en el framework de automatización de navegador [Playwright](https://playwright.dev/) para login y mantenimiento del estado de login
- **No Requiere Ingeniería Inversa de JS**: Utiliza el entorno de contexto del navegador con estado de login preservado para obtener parámetros de firma a través de expresiones JS
- **Ventajas**: No necesita hacer ingeniería inversa de algoritmos de encriptación complejos, reduciendo significativamente la barrera técnica
## ✨ Características
| Plataforma | Búsqueda por Palabras Clave | Rastreo de ID de Publicación Específica | Comentarios Secundarios | Página de Inicio de Creador Específico | Caché de Estado de Login | Pool de Proxy IP | Generar Nube de Palabras de Comentarios |
| ------ | ---------- | -------------- | -------- | -------------- | ---------- | -------- | -------------- |
| Xiaohongshu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Douyin | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Weibo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Tieba | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
<details id="pro-version">
<summary>🔗 <strong>🚀 ¡Lanzamiento Mayor de MediaCrawlerPro! ¡Más características, mejor diseño arquitectónico!</strong></summary>
### 🚀 ¡Lanzamiento Mayor de MediaCrawlerPro!
> Enfócate en aprender el diseño arquitectónico de proyectos maduros, no solo tecnología de rastreo. ¡La filosofía de diseño de código de la versión Pro también vale la pena estudiar en profundidad!
[MediaCrawlerPro](https://github.com/MediaCrawlerPro) ventajas principales sobre la versión de código abierto:
#### 🎯 Actualizaciones de Características Principales
- **Funcionalidad de reanudación de rastreo** (Característica clave)
- **Soporte de múltiples cuentas + pool de proxy IP** (Característica clave)
- **Eliminar dependencia de Playwright**, más fácil de usar
- **Soporte completo de entorno Linux**
#### 🏗️ Optimización de Diseño Arquitectónico
- **Optimización de refactorización de código**, más legible y mantenible (lógica de firma JS desacoplada)
- **Calidad de código de nivel empresarial**, adecuado para construir proyectos de rastreo a gran escala
- **Diseño arquitectónico perfecto**, alta escalabilidad, mayor valor de aprendizaje del código fuente
#### 🎁 Características Adicionales
- **Aplicación de escritorio descargadora de videos de redes sociales** (adecuada para aprender desarrollo full-stack)
- **Recomendaciones de feed de página de inicio multiplataforma** (HomeFeed)
- [ ] **Agente AI basado en plataformas de redes sociales está en desarrollo 🚀🚀**
Haga clic para ver: [Página de Inicio del Proyecto MediaCrawlerPro](https://github.com/MediaCrawlerPro) para más información
</details>
## 🚀 Inicio Rápido
> 💡 **¡El código abierto no es fácil, si este proyecto te ayuda, por favor da una ⭐ Estrella para apoyar!**
## 📋 Prerrequisitos
### 🚀 Instalación de uv (Recomendado)
Antes de proceder con los siguientes pasos, por favor asegúrese de que uv esté instalado en su computadora:
- **Guía de Instalación**: [Guía Oficial de Instalación de uv](https://docs.astral.sh/uv/getting-started/installation)
- **Verificar Instalación**: Ingrese el comando `uv --version` en la terminal. Si el número de versión se muestra normalmente, la instalación fue exitosa
- **Razón de Recomendación**: uv es actualmente la herramienta de gestión de paquetes Python más poderosa, con velocidad rápida y resolución de dependencias precisa
### 🟢 Instalación de Node.js
El proyecto depende de Node.js, por favor descargue e instale desde el sitio web oficial:
- **Enlace de Descarga**: https://nodejs.org/en/download/
- **Requisito de Versión**: >= 16.0.0
### 📦 Instalación de Paquetes Python
```shell
# Entrar al directorio del proyecto
cd MediaCrawler
# Usar el comando uv sync para asegurar la consistencia de la versión de python y paquetes de dependencias relacionados
uv sync
```
### 🌐 Instalación de Controlador de Navegador
```shell
# Instalar controlador de navegador
uv run playwright install
```
> **💡 Consejo**: MediaCrawler ahora soporta usar playwright para conectarse a su navegador Chrome local, resolviendo algunos problemas causados por Webdriver.
>
> Actualmente, `xhs` y `dy` están disponibles usando el modo CDP para conectarse a navegadores locales. Si es necesario, verifique los elementos de configuración en `config/base_config.py`.
## 🚀 Ejecutar Programa Rastreador
```shell
# El proyecto no habilita el modo de rastreo de comentarios por defecto. Si necesita comentarios, por favor modifique la variable ENABLE_GET_COMMENTS en config/base_config.py
# Otras opciones soportadas también pueden verse en config/base_config.py con comentarios en chino
# Leer palabras clave del archivo de configuración para buscar publicaciones relacionadas y rastrear información de publicaciones y comentarios
uv run main.py --platform xhs --lt qrcode --type search
# Leer la lista de ID de publicaciones específicas del archivo de configuración para obtener la información y los comentarios de esas publicaciones
uv run main.py --platform xhs --lt qrcode --type detail
# Abrir la APP correspondiente para escanear código QR para login
# Para ejemplos de uso de rastreador de otras plataformas, ejecute el siguiente comando para ver
uv run main.py --help
```
<details>
<summary>🔗 <strong>Usando gestión de entorno venv nativo de Python (No recomendado)</strong></summary>
#### Crear y activar entorno virtual de Python
> Si rastrea Douyin y Zhihu, necesita instalar el entorno nodejs con anticipación, versión mayor o igual a: `16`
```shell
# Entrar al directorio raíz del proyecto
cd MediaCrawler
# Crear entorno virtual
# Mi versión de python es: 3.9.6, las librerías en requirements.txt están basadas en esta versión
# Si usa otras versiones de python, las librerías en requirements.txt pueden no ser compatibles, por favor resuelva por su cuenta
python -m venv venv
# macOS & Linux activar entorno virtual
source venv/bin/activate
# Windows activar entorno virtual
venv\Scripts\activate
```
#### Instalar librerías de dependencias
```shell
pip install -r requirements.txt
```
#### Instalar controlador de navegador playwright
```shell
playwright install
```
#### Ejecutar programa rastreador (entorno nativo)
```shell
# El proyecto no habilita el modo de rastreo de comentarios por defecto. Si necesita comentarios, por favor modifique la variable ENABLE_GET_COMMENTS en config/base_config.py
# Otras opciones soportadas también pueden verse en config/base_config.py con comentarios en chino
# Leer palabras clave del archivo de configuración para buscar publicaciones relacionadas y rastrear información de publicaciones y comentarios
python main.py --platform xhs --lt qrcode --type search
# Leer la lista de ID de publicaciones específicas del archivo de configuración para obtener la información y los comentarios de esas publicaciones
python main.py --platform xhs --lt qrcode --type detail
# Abrir la APP correspondiente para escanear código QR para login
# Para ejemplos de uso de rastreador de otras plataformas, ejecute el siguiente comando para ver
python main.py --help
```
</details>
## 💾 Almacenamiento de Datos
Soporta múltiples métodos de almacenamiento de datos:
- **Archivos CSV**: Soporta guardar en CSV (bajo el directorio `data/`)
- **Archivos JSON**: Soporta guardar en JSON (bajo el directorio `data/`)
- **Almacenamiento en Base de Datos**
  - Use el parámetro `--init_db` para la inicialización de la base de datos (cuando use `--init_db`, no se necesitan otros argumentos opcionales)
  - **Base de Datos SQLite**: Base de datos ligera, no requiere servidor, adecuada para uso personal (recomendado)
    1. Inicialización: `--init_db sqlite`
    2. Almacenamiento de Datos: `--save_data_option sqlite`
  - **Base de Datos MySQL**: Soporta guardar en la base de datos relacional MySQL (la base de datos debe crearse con anticipación)
    1. Inicialización: `--init_db mysql`
    2. Almacenamiento de Datos: `--save_data_option db` (el parámetro db se mantiene por compatibilidad con actualizaciones históricas)
### Ejemplos de Uso:
```shell
# Inicializar la base de datos SQLite (cuando use '--init_db', no se necesitan otros argumentos opcionales)
uv run main.py --init_db sqlite
# Usar SQLite para almacenar datos (recomendado para usuarios personales)
uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
```
```shell
# Inicializar la base de datos MySQL
uv run main.py --init_db mysql
# Usar MySQL para almacenar datos (el parámetro db se mantiene por compatibilidad con actualizaciones históricas)
uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
```
---
[🚀 ¡Lanzamiento Mayor de MediaCrawlerPro 🚀! ¡Más características, mejor diseño arquitectónico!](https://github.com/MediaCrawlerPro)
## 🤝 Comunidad y Soporte
### 💬 Grupos de Discusión
- **Grupo de Discusión WeChat**: [Haga clic para unirse](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
### 📚 Documentación y Tutoriales
- **Documentación en Línea**: [Documentación Completa de MediaCrawler](https://nanmicoder.github.io/MediaCrawler/)
- **Tutorial de Rastreador**: [Tutorial Gratuito CrawlerTutorial](https://github.com/NanmiCoder/CrawlerTutorial)
# Otras preguntas comunes pueden verse en la documentación en línea
>
> La documentación en línea incluye métodos de uso, preguntas comunes, unirse a grupos de discusión del proyecto, etc.
> [Documentación en Línea de MediaCrawler](https://nanmicoder.github.io/MediaCrawler/)
>
# Servicios de Conocimiento del Autor
> Si quiere comenzar rápidamente y aprender el uso de este proyecto, diseño arquitectónico del código fuente, aprender tecnología de programación, o quiere entender el diseño del código fuente de MediaCrawlerPro, puede revisar mi columna de conocimiento pagado.
[Introducción de la Columna de Conocimiento Pagado del Autor](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
---
## ⭐ Gráfico de Tendencia de Estrellas
¡Si este proyecto te ayuda, por favor da una ⭐ Estrella para apoyar y que más personas vean MediaCrawler!
[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
### 💰 Exhibición de Patrocinadores
<a href="https://www.swiftproxy.net/?ref=nanmi">
<img src="docs/static/images/img_5.png">
<br>
**Swiftproxy** - ¡90M+ IPs residenciales puras de alta calidad globales, regístrese para obtener 500MB de tráfico de prueba gratuito, el tráfico dinámico nunca expira!
> Código de descuento exclusivo: **GHB5** ¡Obtenga 10% de descuento instantáneamente!
</a>
### 🤝 Conviértase en Patrocinador
¡Conviértase en patrocinador y muestre su producto aquí, obteniendo exposición masiva diariamente!
**Información de Contacto**:
- WeChat: `relakkes`
- Email: `relakkes@gmail.com`
## 📚 Referencias
- **Cliente Xiaohongshu**: [Repositorio xhs de ReaJason](https://github.com/ReaJason/xhs)
- **Reenvío de SMS**: [Repositorio de referencia SmsForwarder](https://github.com/pppscn/SmsForwarder)
- **Herramienta de Penetración de Intranet**: [Documentación oficial de ngrok](https://ngrok.com/docs/)
# Descargo de Responsabilidad
<div id="disclaimer">
## 1. Propósito y Naturaleza del Proyecto
Este proyecto (en adelante denominado "este proyecto") fue creado como una herramienta de investigación técnica y aprendizaje, con el objetivo de explorar y aprender tecnologías de recolección de datos de red. Este proyecto se enfoca en la investigación de tecnologías de rastreo de datos para plataformas de redes sociales y está destinado a facilitar el intercambio técnico entre estudiantes e investigadores.
## 2. Declaración de Cumplimiento Legal
El desarrollador del proyecto (en adelante denominado "desarrollador") recuerda solemnemente a los usuarios que cumplan estrictamente con las leyes y regulaciones relevantes de la República Popular China al descargar, instalar y usar este proyecto, incluyendo pero no limitado a la "Ley de Ciberseguridad de la República Popular China", "Ley de Contraespionaje de la República Popular China" y todas las leyes y políticas nacionales aplicables. Los usuarios deberán asumir todas las responsabilidades legales que puedan surgir del uso de este proyecto.
## 3. Restricciones de Propósito de Uso
Queda estrictamente prohibido utilizar este proyecto para cualquier propósito ilegal o para actividades comerciales ajenas al aprendizaje o la investigación. Este proyecto no puede ser utilizado para ninguna forma de intrusión ilegal en sistemas informáticos de otras personas, ni para ninguna actividad que infrinja los derechos de propiedad intelectual de otros u otros derechos e intereses legítimos. Los usuarios deben asegurarse de que su uso de este proyecto sea exclusivamente para aprendizaje personal e investigación técnica, y de que no se utilice para ninguna forma de actividad ilegal.
## 4. Descargo de Responsabilidad
El desarrollador ha hecho todos los esfuerzos para asegurar la legitimidad y seguridad de este proyecto, pero no asume responsabilidad por ninguna forma de pérdidas directas o indirectas que puedan surgir del uso de este proyecto por parte de los usuarios. Incluyendo pero no limitado a cualquier pérdida de datos, daño de equipos, litigios legales, etc. causados por el uso de este proyecto.
## 5. Declaración de Propiedad Intelectual
Los derechos de propiedad intelectual de este proyecto pertenecen al desarrollador. Este proyecto está protegido por la ley de derechos de autor y tratados internacionales de derechos de autor, así como otras leyes y tratados de propiedad intelectual. Los usuarios pueden descargar y usar este proyecto bajo la premisa de cumplir con esta declaración y las leyes y regulaciones relevantes.
## 6. Derechos de Interpretación Final
El desarrollador tiene los derechos de interpretación final con respecto a este proyecto. El desarrollador se reserva el derecho de cambiar o actualizar este descargo de responsabilidad en cualquier momento sin previo aviso.
</div>
## 🙏 Agradecimientos
### Soporte de Licencia de Código Abierto de JetBrains
¡Gracias a JetBrains por proporcionar soporte de licencia de código abierto gratuito para este proyecto!
<a href="https://www.jetbrains.com/?from=MediaCrawler">
<img src="https://www.jetbrains.com/company/brand/img/jetbrains_logo.png" width="100" alt="JetBrains" />
</a>
... ...
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2024/4/6 14:21
# @Desc : 异步Aiomysql的增删改查封装
from typing import Any, Dict, List, Union
import aiomysql
class AsyncMysqlDB:
    """Thin async CRUD helper built on top of an aiomysql connection pool.

    Each method borrows one connection from the pool, runs a single
    statement on it and releases the connection again.

    NOTE(review): no method here calls ``commit()``; presumably the pool is
    created with autocommit enabled -- confirm against the pool setup.
    """

    # The annotation is quoted so the class can be defined without
    # evaluating ``aiomysql.Pool`` at definition time.
    def __init__(self, pool: "aiomysql.Pool") -> None:
        self.__pool = pool

    async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
        """Run a SELECT statement and return every matching row.

        :param sql: SQL text using ``%s`` placeholders
        :param args: values bound to the placeholders
        :return: list of rows as dictionaries; an empty list when nothing matches
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, args)
                data = await cur.fetchall()
                return data or []

    async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
        """Run a SELECT statement and return only the first matching row.

        :param sql: SQL text using ``%s`` placeholders
        :param args: values bound to the placeholders
        :return: whatever ``fetchone()`` yields for the statement
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, args)
                data = await cur.fetchone()
                return data

    async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
        """Insert one record into a table.

        :param table_name: target table (inserted into the SQL text verbatim)
        :param item: mapping of column name to value for the new row
        :return: the cursor's ``lastrowid`` after the insert
        """
        values = list(item.values())
        # Column names are back-quoted; values always travel as bound parameters.
        fieldstr = ','.join(f'`{field}`' for field in item)
        valstr = ','.join(['%s'] * len(item))
        sql = "INSERT INTO %s (%s) VALUES(%s)" % (table_name, fieldstr, valstr)
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, values)
                lastrowid = cur.lastrowid
                return lastrowid

    async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
                           value_where: Union[str, int, float]) -> int:
        """Update the rows of a table that match ``field_where = value_where``.

        :param table_name: target table (inserted into the SQL text verbatim)
        :param updates: mapping of column name to new value
        :param field_where: column used in the WHERE condition
        :param value_where: value the WHERE column must equal
        :return: the value returned by ``cursor.execute`` (affected row count);
            ``0`` when ``updates`` is empty, in which case nothing is executed
        """
        if not updates:
            # "UPDATE t SET  WHERE ..." is not valid SQL; there is nothing to do.
            return 0

        assignments = ','.join('`%s`=%%s' % column for column in updates)
        values = list(updates.values())
        # The WHERE value is bound like every other value instead of being
        # pasted into the SQL text: this removes the injection risk and keeps
        # a literal '%' in the value from corrupting the driver's formatting.
        values.append(value_where)
        sql = 'UPDATE %s SET %s WHERE %s=%%s' % (table_name, assignments, field_where)
        async with self.__pool.acquire() as conn:
            async with conn.cursor() as cur:
                rows = await cur.execute(sql, values)
                return rows

    async def execute(self, sql: str, *args: Union[str, int]) -> int:
        """Run a write statement (INSERT/UPDATE/DELETE and the like).

        :param sql: SQL text using ``%s`` placeholders
        :param args: values bound to the placeholders
        :return: the value returned by ``cursor.execute`` (affected row count)
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor() as cur:
                rows = await cur.execute(sql, args)
                return rows
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2024/4/6 14:21
# @Desc : 异步SQLite的增删改查封装
from typing import Any, Dict, List, Union
import aiosqlite
class AsyncSqliteDB:
    """Async CRUD helper for a SQLite database file, built on aiosqlite.

    Every call opens its own connection to ``db_path`` and closes it when
    the statement has finished.
    """

    def __init__(self, db_path: str) -> None:
        self.__db_path = db_path

    async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
        """Run a SELECT statement and return all rows as dictionaries.

        :param sql: SQL text using ``?`` placeholders
        :param args: values bound to the placeholders
        :return: list of row dictionaries, empty when nothing matches
        """
        async with aiosqlite.connect(self.__db_path) as conn:
            # Row objects expose column names, which dict() relies on below.
            conn.row_factory = aiosqlite.Row
            async with conn.execute(sql, args) as cursor:
                fetched = await cursor.fetchall()
                if not fetched:
                    return []
                return [dict(record) for record in fetched]

    async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
        """Run a SELECT statement and return the first row only.

        :param sql: SQL text using ``?`` placeholders
        :param args: values bound to the placeholders
        :return: the first row as a dictionary, or ``None`` when there is none
        """
        async with aiosqlite.connect(self.__db_path) as conn:
            conn.row_factory = aiosqlite.Row
            async with conn.execute(sql, args) as cursor:
                record = await cursor.fetchone()
                if record:
                    return dict(record)
                return None

    async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
        """Insert one record and commit it.

        :param table_name: target table
        :param item: mapping of column name to value for the new row
        :return: the cursor's ``lastrowid`` after the insert
        """
        column_list = ','.join(item.keys())
        placeholders = ','.join('?' for _ in item)
        params = list(item.values())
        sql = f"INSERT INTO {table_name} ({column_list}) VALUES({placeholders})"
        async with aiosqlite.connect(self.__db_path) as conn:
            async with conn.execute(sql, params) as cursor:
                await conn.commit()
                return cursor.lastrowid

    async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
                           value_where: Union[str, int, float]) -> int:
        """Update the rows matching ``field_where = value_where`` and commit.

        :param table_name: target table
        :param updates: mapping of column name to new value
        :param field_where: column used in the WHERE condition
        :param value_where: value the WHERE column must equal
        :return: the cursor's ``rowcount`` after the update
        """
        upsets_str = ','.join(f'{column}=?' for column in updates)
        # Bound values: first the new column values, then the WHERE value.
        params = [*updates.values(), value_where]
        sql = f'UPDATE {table_name} SET {upsets_str} WHERE {field_where}=?'
        async with aiosqlite.connect(self.__db_path) as conn:
            async with conn.execute(sql, params) as cursor:
                await conn.commit()
                return cursor.rowcount

    async def execute(self, sql: str, *args: Union[str, int]) -> int:
        """Run a write statement and commit it.

        :param sql: SQL text using ``?`` placeholders
        :param args: values bound to the placeholders
        :return: the cursor's ``rowcount`` after execution
        """
        async with aiosqlite.connect(self.__db_path) as conn:
            async with conn.execute(sql, args) as cursor:
                await conn.commit()
                return cursor.rowcount

    async def executescript(self, sql_script: str) -> None:
        """Run a multi-statement SQL script (used to create the table schema).

        :param sql_script: the SQL script text
        :return: None
        """
        async with aiosqlite.connect(self.__db_path) as conn:
            await conn.executescript(sql_script)
            await conn.commit()
\ No newline at end of file
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import argparse
from __future__ import annotations
import sys
from enum import Enum
from types import SimpleNamespace
from typing import Iterable, Optional, Sequence, Type, TypeVar
import typer
from typing_extensions import Annotated
import config
from tools.utils import str2bool
async def parse_cmd():
    """Parse the command line with argparse and write the result into ``config``.

    Every option defaults to the current value of the matching ``config``
    attribute, so options that are not passed leave the configuration
    unchanged. The parsed values are stored back on the ``config`` module;
    nothing is returned.
    """
    # Read command-line arguments.
    parser = argparse.ArgumentParser(description='Media crawler program. / 媒体爬虫程序')
    parser.add_argument('--platform', type=str,
                        help='Media platform select / 选择媒体平台 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)',
                        choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
    parser.add_argument('--lt', type=str,
                        help='Login type / 登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)',
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str,
                        help='Crawler type / 爬取类型 (search=搜索 | detail=详情 | creator=创作者)',
                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
    parser.add_argument('--start', type=int,
                        help='Number of start page / 起始页码', default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='Please input keywords / 请输入关键词', default=config.KEYWORDS)
    parser.add_argument('--get_comment', type=str2bool,
                        help='''Whether to crawl level one comment / 是否爬取一级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
    # The help text used to open with four quotes, which put a stray leading
    # apostrophe in front of "Whether" in the --help output.
    parser.add_argument('--get_sub_comment', type=str2bool,
                        help='''Whether to crawl level two comment / 是否爬取二级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
    parser.add_argument('--save_data_option', type=str,
                        help='Where to save the data / 数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)',
                        choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
    parser.add_argument('--cookies', type=str,
                        help='Cookies used for cookie login type / Cookie登录方式使用的Cookie值', default=config.COOKIES)

    args = parser.parse_args()

    # Override the module-level configuration with the parsed values.
    config.PLATFORM = args.platform
    config.LOGIN_TYPE = args.lt
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords
    config.ENABLE_GET_COMMENTS = args.get_comment
    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
    config.SAVE_DATA_OPTION = args.save_data_option
    config.COOKIES = args.cookies
EnumT = TypeVar("EnumT", bound=Enum)
class PlatformEnum(str, Enum):
    """Media platforms accepted by the ``--platform`` option."""

    XHS = "xhs"
    DOUYIN = "dy"
    KUAISHOU = "ks"
    BILIBILI = "bili"
    WEIBO = "wb"
    TIEBA = "tieba"
    ZHIHU = "zhihu"
class LoginTypeEnum(str, Enum):
    """Login methods accepted by the ``--lt`` option."""

    QRCODE = "qrcode"
    PHONE = "phone"
    COOKIE = "cookie"
class CrawlerTypeEnum(str, Enum):
    """Crawl modes accepted by the ``--type`` option."""

    SEARCH = "search"
    DETAIL = "detail"
    CREATOR = "creator"
class SaveDataOptionEnum(str, Enum):
    """Storage back ends accepted by the ``--save_data_option`` option."""

    CSV = "csv"
    DB = "db"  # described as the MySQL database in the option's help text
    JSON = "json"
    SQLITE = "sqlite"
    POSTGRESQL = "postgresql"
class InitDbOptionEnum(str, Enum):
    """Database kinds accepted by the ``--init_db`` option."""

    SQLITE = "sqlite"
    MYSQL = "mysql"
    POSTGRESQL = "postgresql"
def _to_bool(value: bool | str) -> bool:
if isinstance(value, bool):
return value
return str2bool(value)
def _coerce_enum(
enum_cls: Type[EnumT],
value: EnumT | str,
default: EnumT,
) -> EnumT:
"""Safely convert a raw config value to an enum member."""
if isinstance(value, enum_cls):
return value
try:
return enum_cls(value)
except ValueError:
typer.secho(
f"⚠️ 配置值 '{value}' 不在 {enum_cls.__name__} 支持的范围内,已回退到默认值 '{default.value}'.",
fg=typer.colors.YELLOW,
)
return default
def _normalize_argv(argv: Optional[Sequence[str]]) -> Iterable[str]:
if argv is None:
return list(sys.argv[1:])
return list(argv)
def _inject_init_db_default(args: Sequence[str]) -> list[str]:
"""Ensure bare --init_db defaults to sqlite for backward compatibility."""
normalized: list[str] = []
i = 0
while i < len(args):
arg = args[i]
normalized.append(arg)
if arg == "--init_db":
next_arg = args[i + 1] if i + 1 < len(args) else None
if not next_arg or next_arg.startswith("-"):
normalized.append(InitDbOptionEnum.SQLITE.value)
i += 1
return normalized
async def parse_cmd(argv: Optional[Sequence[str]] = None):
    """Parse command-line arguments with Typer and apply them to ``config``.

    Args:
        argv: Arguments to parse. When ``None``, ``sys.argv[1:]`` is used.
            A bare ``--init_db`` is given the value ``sqlite`` before parsing.

    Returns:
        The ``SimpleNamespace`` built by the inner ``main`` callback, holding
        the effective values of all options (including ``init_db``).

    Raises:
        SystemExit: When the Typer command returns an integer instead of the
            namespace, or when ``typer.Exit`` is raised during parsing.
    """

    app = typer.Typer(add_completion=False)

    # Option defaults are computed here, when ``main`` is defined, from the
    # values currently stored in ``config``.
    @app.callback(invoke_without_command=True)
    def main(
        platform: Annotated[
            PlatformEnum,
            typer.Option(
                "--platform",
                help="媒体平台选择 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)",
                rich_help_panel="基础配置",
            ),
        ] = _coerce_enum(PlatformEnum, config.PLATFORM, PlatformEnum.XHS),
        lt: Annotated[
            LoginTypeEnum,
            typer.Option(
                "--lt",
                help="登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)",
                rich_help_panel="账号配置",
            ),
        ] = _coerce_enum(LoginTypeEnum, config.LOGIN_TYPE, LoginTypeEnum.QRCODE),
        crawler_type: Annotated[
            CrawlerTypeEnum,
            typer.Option(
                "--type",
                help="爬取类型 (search=搜索 | detail=详情 | creator=创作者)",
                rich_help_panel="基础配置",
            ),
        ] = _coerce_enum(CrawlerTypeEnum, config.CRAWLER_TYPE, CrawlerTypeEnum.SEARCH),
        start: Annotated[
            int,
            typer.Option(
                "--start",
                help="起始页码",
                rich_help_panel="基础配置",
            ),
        ] = config.START_PAGE,
        keywords: Annotated[
            str,
            typer.Option(
                "--keywords",
                help="请输入关键词,多个关键词用逗号分隔",
                rich_help_panel="基础配置",
            ),
        ] = config.KEYWORDS,
        # The two comment switches are taken as strings and converted with
        # _to_bool below.
        get_comment: Annotated[
            str,
            typer.Option(
                "--get_comment",
                help="是否爬取一级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
                rich_help_panel="评论配置",
                show_default=True,
            ),
        ] = str(config.ENABLE_GET_COMMENTS),
        get_sub_comment: Annotated[
            str,
            typer.Option(
                "--get_sub_comment",
                help="是否爬取二级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
                rich_help_panel="评论配置",
                show_default=True,
            ),
        ] = str(config.ENABLE_GET_SUB_COMMENTS),
        save_data_option: Annotated[
            SaveDataOptionEnum,
            typer.Option(
                "--save_data_option",
                help="数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库 | postgresql=PostgreSQL数据库)",
                rich_help_panel="存储配置",
            ),
        ] = _coerce_enum(
            SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON
        ),
        init_db: Annotated[
            Optional[InitDbOptionEnum],
            typer.Option(
                "--init_db",
                help="初始化数据库表结构 (sqlite | mysql | postgresql)",
                rich_help_panel="存储配置",
            ),
        ] = None,
        cookies: Annotated[
            str,
            typer.Option(
                "--cookies",
                help="Cookie 登录方式使用的 Cookie 值",
                rich_help_panel="账号配置",
            ),
        ] = config.COOKIES,
    ) -> SimpleNamespace:
        # NOTE(review): docstring kept verbatim -- Typer presumably surfaces
        # the callback docstring in the --help output; confirm before editing.
        """MediaCrawler 命令行入口"""

        enable_comment = _to_bool(get_comment)
        enable_sub_comment = _to_bool(get_sub_comment)
        init_db_value = init_db.value if init_db else None

        # override global config
        config.PLATFORM = platform.value
        config.LOGIN_TYPE = lt.value
        config.CRAWLER_TYPE = crawler_type.value
        config.START_PAGE = start
        config.KEYWORDS = keywords
        config.ENABLE_GET_COMMENTS = enable_comment
        config.ENABLE_GET_SUB_COMMENTS = enable_sub_comment
        config.SAVE_DATA_OPTION = save_data_option.value
        config.COOKIES = cookies

        # Hand the effective values back to the caller; init_db is the only
        # field that is not stored on config.
        return SimpleNamespace(
            platform=config.PLATFORM,
            lt=config.LOGIN_TYPE,
            type=config.CRAWLER_TYPE,
            start=config.START_PAGE,
            keywords=config.KEYWORDS,
            get_comment=config.ENABLE_GET_COMMENTS,
            get_sub_comment=config.ENABLE_GET_SUB_COMMENTS,
            save_data_option=config.SAVE_DATA_OPTION,
            init_db=init_db_value,
            cookies=config.COOKIES,
        )

    command = typer.main.get_command(app)

    cli_args = _normalize_argv(argv)
    cli_args = _inject_init_db_default(cli_args)

    try:
        # standalone_mode=False makes the command hand its result back to us.
        result = command.main(args=cli_args, standalone_mode=False)
        if isinstance(result, int):  # help/options handled by Typer; propagate exit code
            raise SystemExit(result)
        return result
    except typer.Exit as exc:  # pragma: no cover - CLI exit paths
        raise SystemExit(exc.exit_code) from exc
... ...
... ... @@ -10,5 +10,4 @@
from .base_config import *
from .db_config import *
from .tieba_config import *
\ No newline at end of file
from .db_config import *
\ No newline at end of file
... ...
... ... @@ -9,11 +9,12 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 基础配置
PLATFORM = "xhs" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "黑神话钟馗,九三阅兵,种地吧,董璇,非亲生,医美风险,游戏科学,阅兵准备,热巴,醉驾判无罪" # 关键词搜索配置,以英文逗号分隔
PLATFORM = "bili" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "电影鬼灭之刃,亲属想侵吞3姐妹亡父赔偿款,网警斩断侵害未成年人网络黑色产业链,2007年后出生的人不能在马尔代夫吸烟,沈月,是公主也是自己的骑士,以军虐囚视频,唐朝诡事录,广州地铁回应APP乘车码频繁弹窗广告,全红婵的减肥计划精确到克" # 关键词搜索配置,以英文逗号分隔
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
# 是否开启 IP 代理
ENABLE_IP_PROXY = False
... ... @@ -36,7 +37,7 @@ SAVE_LOGIN_STATE = True
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
ENABLE_CDP_MODE = False
ENABLE_CDP_MODE = True
# CDP调试端口,用于与浏览器通信
# 如果端口被占用,系统会自动尝试下一个可用端口
... ... @@ -59,8 +60,8 @@ BROWSER_LAUNCH_TIMEOUT = 30
# 设置为False可以保持浏览器运行,便于调试
AUTO_CLOSE_BROWSER = True
# 数据保存类型选项配置,支持四种类型:csv、db、json、sqlite, 最好保存到DB,有排重的功能。
SAVE_DATA_OPTION = "db" # csv or db or json or sqlite
# 数据保存类型选项配置,支持五种类型:csv、db、json、sqlite、postgresql, 最好保存到DB,有排重的功能。
SAVE_DATA_OPTION = "postgresql" # csv or db or json or sqlite or postgresql
# 用户浏览器缓存的浏览器文件配置
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
... ... @@ -69,7 +70,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
START_PAGE = 1
# 爬取视频/帖子的数量控制
CRAWLER_MAX_NOTES_COUNT = 10
CRAWLER_MAX_NOTES_COUNT = 5
# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1
... ...
... ... @@ -13,16 +13,23 @@
# 每天爬取视频/帖子的数量控制
MAX_NOTES_PER_DAY = 1
# 指定B站视频ID列表
# 指定B站视频URL列表 (支持完整URL或BV号)
# 示例:
# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
# - BV号: "BV1d54y1g7db"
BILI_SPECIFIED_ID_LIST = [
"BV1d54y1g7db",
"https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click",
"BV1Sz4y1U77N",
"BV14Q4y1n7jz",
# ........................
]
# 指定B站用户ID列表
# 指定B站创作者URL列表 (支持完整URL或UID)
# 示例:
# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
# - UID: "20813884"
BILI_CREATOR_ID_LIST = [
"https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0",
"20813884",
# ........................
]
... ... @@ -34,6 +41,11 @@ END_DAY = "2024-01-01"
# 搜索模式
BILI_SEARCH_MODE = "normal"
# 视频清晰度(qn)配置,常见取值:
# 16=360p, 32=480p, 64=720p, 80=1080p, 112=1080p高码率, 116=1080p60, 120=4K
# 注意:更高清晰度需要账号/视频本身支持
BILI_QN = 80
# 是否爬取用户信息
CREATOR_MODE = True
... ...
... ... @@ -12,11 +12,19 @@
import os
# mysql config - 使用MindSpider的数据库配置
# MySQL connection settings. Each value can be overridden with an environment
# variable of the same name, mirroring the PostgreSQL settings in this file;
# the fallbacks are the local development defaults. Real credentials belong
# in the environment, never in this file.
MYSQL_DB_PWD = os.getenv("MYSQL_DB_PWD", "bettafish")
MYSQL_DB_USER = os.getenv("MYSQL_DB_USER", "bettafish")
MYSQL_DB_HOST = os.getenv("MYSQL_DB_HOST", "127.0.0.1")
# Kept as an int, as before; the environment only supplies strings.
MYSQL_DB_PORT = int(os.getenv("MYSQL_DB_PORT", "5444"))
MYSQL_DB_NAME = os.getenv("MYSQL_DB_NAME", "bettafish")

# Bundled view of the settings above.
mysql_db_config = {
    "user": MYSQL_DB_USER,
    "password": MYSQL_DB_PWD,
    "host": MYSQL_DB_HOST,
    "port": MYSQL_DB_PORT,
    "db_name": MYSQL_DB_NAME,
}
# redis config
... ... @@ -30,4 +38,24 @@ CACHE_TYPE_REDIS = "redis"
CACHE_TYPE_MEMORY = "memory"
# sqlite config
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")
\ No newline at end of file
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "database", "sqlite_tables.db")
sqlite_db_config = {
"db_path": SQLITE_DB_PATH
}
# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "bettafish")
POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "bettafish")
POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "127.0.0.1")
POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "5444")
POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "bettafish")
postgresql_db_config = {
"user": POSTGRESQL_DB_USER,
"password": POSTGRESQL_DB_PWD,
"host": POSTGRESQL_DB_HOST,
"port": POSTGRESQL_DB_PORT,
"db_name": POSTGRESQL_DB_NAME,
}
... ...
... ... @@ -11,15 +11,27 @@
# 抖音平台配置
PUBLISH_TIME_TYPE = 0
# 指定DY视频ID列表
# 指定DY视频URL列表 (支持多种格式)
# 支持格式:
# 1. 完整视频URL: "https://www.douyin.com/video/7525538910311632128"
# 2. 带modal_id的URL: "https://www.douyin.com/user/xxx?modal_id=7525538910311632128"
# 3. 搜索页带modal_id: "https://www.douyin.com/root/search/python?modal_id=7525538910311632128"
# 4. 短链接: "https://v.douyin.com/drIPtQ_WPWY/"
# 5. 纯视频ID: "7280854932641664319"
DY_SPECIFIED_ID_LIST = [
"7280854932641664319",
"7202432992642387233",
"https://www.douyin.com/video/7525538910311632128",
"https://v.douyin.com/drIPtQ_WPWY/",
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525538910311632128",
"7202432992642387233",
# ........................
]
# 指定DY用户ID列表
# 指定DY创作者URL列表 (支持完整URL或sec_user_id)
# 支持格式:
# 1. 完整创作者主页URL: "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main"
# 2. sec_user_id: "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
DY_CREATOR_ID_LIST = [
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
# ........................
]
... ...
... ... @@ -10,11 +10,22 @@
# 快手平台配置
# 指定快手视频ID列表
KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
# 指定快手视频URL列表 (支持完整URL或纯ID)
# 支持格式:
# 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
# 2. 纯视频ID: "3xf8enb8dbj6uig"
KS_SPECIFIED_ID_LIST = [
"https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
"3xf8enb8dbj6uig",
# ........................
]
# 指定快手用户ID列表
# 指定快手创作者URL列表 (支持完整URL或纯ID)
# 支持格式:
# 1. 创作者主页URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
# 2. 纯user_id: "3x4sm73aye7jq7i"
KS_CREATOR_ID_LIST = [
"https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
"3x4sm73aye7jq7i",
# ........................
]
... ...
... ... @@ -17,12 +17,16 @@ SORT_TYPE = "popularity_descending"
# 指定笔记URL列表, 必须要携带xsec_token参数
XHS_SPECIFIED_NOTE_URL_LIST = [
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
"https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed"
# ........................
]
# 指定用户ID列表
# 指定创作者URL列表 (支持完整URL或纯ID)
# 支持格式:
# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
# 2. 纯user_id: "63e36c9a000000002703502b"
XHS_CREATOR_ID_LIST = [
"63e36c9a000000002703502b",
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
"63e36c9a000000002703502b",
# ........................
]
... ...
# persist-1<persist1@126.com>
# 原因:将 db.py 改造为模块,移除直接执行入口,修复相对导入问题。
# 副作用:无
# 回滚策略:还原此文件。
import asyncio
import sys
from pathlib import Path
# Add project root to sys.path
project_root = Path(__file__).resolve().parents[1]
if str(project_root) not in sys.path:
sys.path.append(str(project_root))
from tools import utils
from database.db_session import create_tables
async def init_table_schema(db_type: str):
    """
    Create the database tables for the given database type.

    Delegates to ``create_tables`` and logs a message before and after.

    Args:
        db_type: The type of database, 'sqlite', 'mysql', or 'postgresql'.
    """
    log_info = utils.logger.info
    log_info(f"[init_table_schema] begin init {db_type} table schema ...")
    await create_tables(db_type)
    log_info(f"[init_table_schema] {db_type} table schema init successful")
async def init_db(db_type: str = None) -> None:
    """Initialize the table schema for ``db_type`` by calling ``init_table_schema``."""
    await init_table_schema(db_type)
async def close() -> None:
    """
    Placeholder for closing database connections if needed in the future.

    Currently does nothing.
    """
    pass
... ...
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker
from contextlib import asynccontextmanager
from .models import Base
import config
from config.db_config import mysql_db_config, sqlite_db_config, postgresql_db_config
# Keep a cache of engines
_engines = {}
async def create_database_if_not_exists(db_type: str):
    """Create the configured MySQL/PostgreSQL database when it is missing.

    A temporary engine is opened against the server (for PostgreSQL against
    the ``postgres`` maintenance database), the database is created if
    needed, and the engine is always disposed afterwards -- also when a
    statement fails. Any other ``db_type`` is a no-op.

    Args:
        db_type: ``"mysql"``/``"db"`` for MySQL, ``"postgresql"`` for PostgreSQL.

    NOTE: the database name itself cannot be sent as a bound parameter in
    ``CREATE DATABASE``; it is taken verbatim from the project configuration.
    """
    # Function-scope import keeps this change self-contained.
    from urllib.parse import quote

    def _escape(value) -> str:
        # Percent-encode credentials so characters such as '@', ':' or '/'
        # cannot break the URL. Assumes the configured values are raw,
        # i.e. not already percent-encoded.
        return quote(str(value), safe="")

    if db_type == "mysql" or db_type == "db":
        # Connect to the server without selecting a database.
        server_url = (
            f"mysql+asyncmy://{_escape(mysql_db_config['user'])}:{_escape(mysql_db_config['password'])}"
            f"@{mysql_db_config['host']}:{mysql_db_config['port']}"
        )
        engine = create_async_engine(server_url, echo=False)
        try:
            async with engine.connect() as conn:
                await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']}"))
        finally:
            await engine.dispose()
    elif db_type == "postgresql":
        # Connect to the default "postgres" database to create the target one.
        server_url = (
            f"postgresql+asyncpg://{_escape(postgresql_db_config['user'])}:{_escape(postgresql_db_config['password'])}"
            f"@{postgresql_db_config['host']}:{postgresql_db_config['port']}/postgres"
        )
        # The engine runs in AUTOCOMMIT mode for the CREATE DATABASE below.
        engine = create_async_engine(server_url, echo=False, isolation_level="AUTOCOMMIT")
        try:
            async with engine.connect() as conn:
                # There is no CREATE DATABASE IF NOT EXISTS here, so look the
                # name up first; it is passed as a bound parameter.
                result = await conn.execute(
                    text("SELECT 1 FROM pg_database WHERE datname = :name"),
                    {"name": postgresql_db_config['db_name']},
                )
                exists = result.scalar() is not None
                if not exists:
                    # Close the transaction the SELECT opened on the connection.
                    await conn.commit()
                    await conn.execute(text(f"CREATE DATABASE {postgresql_db_config['db_name']}"))
        finally:
            await engine.dispose()
def get_async_engine(db_type: str = None):
    """Return the cached async SQLAlchemy engine for ``db_type``, creating it on first use.

    Args:
        db_type: ``"sqlite"``, ``"mysql"``/``"db"``, ``"postgresql"``,
            ``"json"`` or ``"csv"``. ``None`` means ``config.SAVE_DATA_OPTION``.

    Returns:
        The engine for the database type, or ``None`` for the file-based
        options ``"json"`` and ``"csv"``.

    Raises:
        ValueError: If ``db_type`` is not one of the supported values.
    """
    # Function-scope import keeps this change self-contained.
    from urllib.parse import quote

    def _escape(value) -> str:
        # Percent-encode credentials so characters such as '@', ':' or '/'
        # cannot break the URL. Assumes the configured values are raw,
        # i.e. not already percent-encoded.
        return quote(str(value), safe="")

    if db_type is None:
        db_type = config.SAVE_DATA_OPTION

    if db_type in _engines:
        return _engines[db_type]

    if db_type in ["json", "csv"]:
        return None

    if db_type == "sqlite":
        db_url = f"sqlite+aiosqlite:///{sqlite_db_config['db_path']}"
    elif db_type == "mysql" or db_type == "db":
        db_url = (
            f"mysql+asyncmy://{_escape(mysql_db_config['user'])}:{_escape(mysql_db_config['password'])}"
            f"@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
        )
    elif db_type == "postgresql":
        db_url = (
            f"postgresql+asyncpg://{_escape(postgresql_db_config['user'])}:{_escape(postgresql_db_config['password'])}"
            f"@{postgresql_db_config['host']}:{postgresql_db_config['port']}/{postgresql_db_config['db_name']}"
        )
    else:
        raise ValueError(f"Unsupported database type: {db_type}")

    engine = create_async_engine(db_url, echo=False)
    # Cached under the requested key.
    _engines[db_type] = engine
    return engine
async def create_tables(db_type: str = None):
    """Ensure the database exists, then create all tables declared on ``Base``.

    ``db_type`` falls back to ``config.SAVE_DATA_OPTION`` when it is ``None``.
    Nothing is created when no engine is available for the type.
    """
    target = config.SAVE_DATA_OPTION if db_type is None else db_type
    await create_database_if_not_exists(target)

    engine = get_async_engine(target)
    if not engine:
        return
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
@asynccontextmanager
async def get_session() -> AsyncSession:
    """Yield a session bound to the engine for ``config.SAVE_DATA_OPTION``.

    The session is committed when the ``async with`` body finishes normally,
    rolled back (and the exception re-raised) when it fails, and closed in
    either case. ``None`` is yielded when no engine exists for the
    configured option.
    """
    engine = get_async_engine(config.SAVE_DATA_OPTION)
    if engine:
        session_factory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
        session = session_factory()
        try:
            yield session
            await session.commit()
        except Exception:
            await session.rollback()
            raise
        finally:
            await session.close()
    else:
        yield None
\ No newline at end of file
... ...
from sqlalchemy import create_engine, Column, Integer, Text, String, BigInteger
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
Base = declarative_base()
class BilibiliVideo(Base):
    """ORM model mapped to the ``bilibili_video`` table."""

    __tablename__ = 'bilibili_video'
    id = Column(Integer, primary_key=True)
    # Required, indexed and unique: one row per video id.
    video_id = Column(BigInteger, nullable=False, index=True, unique=True)
    video_url = Column(Text, nullable=False)
    user_id = Column(BigInteger, index=True)
    nickname = Column(Text)
    avatar = Column(Text)
    liked_count = Column(Integer)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    video_type = Column(Text)
    title = Column(Text)
    desc = Column(Text)
    create_time = Column(BigInteger, index=True)
    # The counters below are declared as Text, unlike liked_count (Integer).
    disliked_count = Column(Text)
    video_play_count = Column(Text)
    video_favorite_count = Column(Text)
    video_share_count = Column(Text)
    video_coin_count = Column(Text)
    video_danmaku = Column(Text)
    video_comment = Column(Text)
    video_cover_url = Column(Text)
    # Defaults to an empty string when not supplied.
    source_keyword = Column(Text, default='')
# ORM model mapped to the `bilibili_video_comment` table.
# `comment_id` and `video_id` are indexed BigInteger columns.
class BilibiliVideoComment(Base):
__tablename__ = 'bilibili_video_comment'
id = Column(Integer, primary_key=True)
# NOTE(review): `user_id` is String(255) here but BigInteger in BilibiliVideo - confirm intended.
user_id = Column(String(255))
nickname = Column(Text)
sex = Column(Text)
sign = Column(Text)
avatar = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
comment_id = Column(BigInteger, index=True)
video_id = Column(BigInteger, index=True)
content = Column(Text)
create_time = Column(BigInteger)
sub_comment_count = Column(Text)
parent_comment_id = Column(String(255))
# Stored as text; defaults to the string '0'.
like_count = Column(Text, default='0')
# ORM model mapped to the `bilibili_up_info` table; `user_id` is indexed.
class BilibiliUpInfo(Base):
__tablename__ = 'bilibili_up_info'
id = Column(Integer, primary_key=True)
user_id = Column(BigInteger, index=True)
nickname = Column(Text)
sex = Column(Text)
sign = Column(Text)
avatar = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
total_fans = Column(Integer)
total_liked = Column(Integer)
user_rank = Column(Integer)
# NOTE(review): Integer column - presumably used as a 0/1 flag; confirm against the writer.
is_official = Column(Integer)
# ORM model mapped to the `bilibili_contact_info` table.
# Each row pairs an `up_*` side with a `fan_*` side; both id columns are indexed.
class BilibiliContactInfo(Base):
__tablename__ = 'bilibili_contact_info'
id = Column(Integer, primary_key=True)
up_id = Column(BigInteger, index=True)
fan_id = Column(BigInteger, index=True)
up_name = Column(Text)
fan_name = Column(Text)
up_sign = Column(Text)
fan_sign = Column(Text)
up_avatar = Column(Text)
fan_avatar = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
# ORM model mapped to the `bilibili_up_dynamic` table; `dynamic_id` is indexed.
class BilibiliUpDynamic(Base):
__tablename__ = 'bilibili_up_dynamic'
id = Column(Integer, primary_key=True)
dynamic_id = Column(BigInteger, index=True)
user_id = Column(String(255))
user_name = Column(Text)
text = Column(Text)
# Attribute name shadows the builtin `type` inside the class body only.
type = Column(Text)
pub_ts = Column(BigInteger)
total_comments = Column(Integer)
total_forwards = Column(Integer)
total_liked = Column(Integer)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
# ORM model mapped to the `douyin_aweme` table.
# `aweme_id` and `create_time` are indexed.
class DouyinAweme(Base):
__tablename__ = 'douyin_aweme'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
sec_uid = Column(String(255))
short_user_id = Column(String(255))
user_unique_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
user_signature = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
aweme_id = Column(BigInteger, index=True)
aweme_type = Column(Text)
title = Column(Text)
desc = Column(Text)
create_time = Column(BigInteger, index=True)
# NOTE(review): interaction counters are Text columns - presumably raw platform strings; confirm.
liked_count = Column(Text)
comment_count = Column(Text)
share_count = Column(Text)
collected_count = Column(Text)
aweme_url = Column(Text)
cover_url = Column(Text)
video_download_url = Column(Text)
music_download_url = Column(Text)
note_download_url = Column(Text)
# Defaults to an empty string when no value is supplied on insert.
source_keyword = Column(Text, default='')
# ORM model mapped to the `douyin_aweme_comment` table.
# `comment_id` and `aweme_id` are indexed.
class DouyinAwemeComment(Base):
__tablename__ = 'douyin_aweme_comment'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
sec_uid = Column(String(255))
short_user_id = Column(String(255))
user_unique_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
user_signature = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
comment_id = Column(BigInteger, index=True)
aweme_id = Column(BigInteger, index=True)
content = Column(Text)
create_time = Column(BigInteger)
sub_comment_count = Column(Text)
parent_comment_id = Column(String(255))
# Stored as text; defaults to the string '0'.
like_count = Column(Text, default='0')
# Defaults to an empty string. NOTE(review): serialization format of the picture list is not visible here.
pictures = Column(Text, default='')
# ORM model mapped to the `dy_creator` table. No column besides the primary key is indexed.
class DyCreator(Base):
__tablename__ = 'dy_creator'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
desc = Column(Text)
gender = Column(Text)
follows = Column(Text)
fans = Column(Text)
interaction = Column(Text)
# NOTE(review): String(255) while the other counters are Text - confirm this is intentional.
videos_count = Column(String(255))
# ORM model mapped to the `kuaishou_video` table.
# `video_id` (string) and `create_time` are indexed.
class KuaishouVideo(Base):
__tablename__ = 'kuaishou_video'
id = Column(Integer, primary_key=True)
user_id = Column(String(64))
nickname = Column(Text)
avatar = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
video_id = Column(String(255), index=True)
video_type = Column(Text)
title = Column(Text)
desc = Column(Text)
create_time = Column(BigInteger, index=True)
liked_count = Column(Text)
# NOTE(review): spelled `viewd_count`; the attribute name is also the column name,
# so renaming it would need a schema migration.
viewd_count = Column(Text)
video_url = Column(Text)
video_cover_url = Column(Text)
video_play_url = Column(Text)
# Defaults to an empty string when no value is supplied on insert.
source_keyword = Column(Text, default='')
# ORM model mapped to the `kuaishou_video_comment` table.
# `comment_id` and `video_id` are indexed.
class KuaishouVideoComment(Base):
__tablename__ = 'kuaishou_video_comment'
id = Column(Integer, primary_key=True)
# NOTE(review): Text here, whereas KuaishouVideo.user_id is String(64) - confirm intended.
user_id = Column(Text)
nickname = Column(Text)
avatar = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
comment_id = Column(BigInteger, index=True)
video_id = Column(String(255), index=True)
content = Column(Text)
create_time = Column(BigInteger)
sub_comment_count = Column(Text)
# ORM model mapped to the `weibo_note` table.
# `note_id`, `create_time` and `create_date_time` are indexed.
class WeiboNote(Base):
__tablename__ = 'weibo_note'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
gender = Column(Text)
profile_url = Column(Text)
ip_location = Column(Text, default='')
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
note_id = Column(BigInteger, index=True)
content = Column(Text)
# Creation time is kept twice: as a BigInteger and as a String(255).
# NOTE(review): presumably epoch timestamp plus formatted date string - confirm.
create_time = Column(BigInteger, index=True)
create_date_time = Column(String(255), index=True)
liked_count = Column(Text)
comments_count = Column(Text)
shared_count = Column(Text)
note_url = Column(Text)
source_keyword = Column(Text, default='')
# ORM model mapped to the `weibo_note_comment` table.
# `comment_id`, `note_id` and `create_date_time` are indexed.
class WeiboNoteComment(Base):
__tablename__ = 'weibo_note_comment'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
gender = Column(Text)
profile_url = Column(Text)
ip_location = Column(Text, default='')
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
comment_id = Column(BigInteger, index=True)
note_id = Column(BigInteger, index=True)
content = Column(Text)
create_time = Column(BigInteger)
create_date_time = Column(String(255), index=True)
comment_like_count = Column(Text)
sub_comment_count = Column(Text)
parent_comment_id = Column(String(255))
# ORM model mapped to the `weibo_creator` table. No column besides the primary key is indexed.
class WeiboCreator(Base):
__tablename__ = 'weibo_creator'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
desc = Column(Text)
gender = Column(Text)
follows = Column(Text)
fans = Column(Text)
# NOTE(review): a list stored in a Text column - serialization format is not visible here.
tag_list = Column(Text)
# ORM model mapped to the `xhs_creator` table. No column besides the primary key is indexed.
class XhsCreator(Base):
__tablename__ = 'xhs_creator'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
desc = Column(Text)
gender = Column(Text)
follows = Column(Text)
fans = Column(Text)
interaction = Column(Text)
# NOTE(review): a list stored in a Text column - serialization format is not visible here.
tag_list = Column(Text)
# ORM model mapped to the `xhs_note` table.
# `note_id` (string) and `time` are indexed.
class XhsNote(Base):
__tablename__ = 'xhs_note'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
note_id = Column(String(255), index=True)
# Attribute name shadows the builtin `type` inside the class body only.
type = Column(Text)
title = Column(Text)
desc = Column(Text)
video_url = Column(Text)
time = Column(BigInteger, index=True)
last_update_time = Column(BigInteger)
liked_count = Column(Text)
collected_count = Column(Text)
comment_count = Column(Text)
share_count = Column(Text)
# NOTE(review): lists stored in Text columns - serialization format is not visible here.
image_list = Column(Text)
tag_list = Column(Text)
note_url = Column(Text)
source_keyword = Column(Text, default='')
xsec_token = Column(Text)
# ORM model mapped to the `xhs_note_comment` table.
# `comment_id` and `create_time` are indexed; `note_id` is not.
class XhsNoteComment(Base):
__tablename__ = 'xhs_note_comment'
id = Column(Integer, primary_key=True)
user_id = Column(String(255))
nickname = Column(Text)
avatar = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
comment_id = Column(String(255), index=True)
create_time = Column(BigInteger, index=True)
note_id = Column(String(255))
content = Column(Text)
# Integer here, while `like_count` below is Text.
sub_comment_count = Column(Integer)
pictures = Column(Text)
parent_comment_id = Column(String(255))
like_count = Column(Text)
# ORM model mapped to the `tieba_note` table.
# `note_id` and `publish_time` are indexed; several text columns default to ''.
class TiebaNote(Base):
__tablename__ = 'tieba_note'
id = Column(Integer, primary_key=True)
# NOTE(review): length 644 is unusual (TiebaComment.note_id uses 255) - confirm it
# matches the deployed table before changing it.
note_id = Column(String(644), index=True)
title = Column(Text)
desc = Column(Text)
note_url = Column(Text)
publish_time = Column(String(255), index=True)
user_link = Column(Text, default='')
user_nickname = Column(Text, default='')
user_avatar = Column(Text, default='')
tieba_id = Column(String(255), default='')
tieba_name = Column(Text)
tieba_link = Column(Text)
# NOTE(review): "replay" is presumably "reply"; the attribute name is the column name,
# so renaming it would need a schema migration.
total_replay_num = Column(Integer, default=0)
total_replay_page = Column(Integer, default=0)
ip_location = Column(Text, default='')
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
source_keyword = Column(Text, default='')
# ORM model mapped to the `tieba_comment` table.
# `comment_id`, `publish_time` and `note_id` are indexed.
class TiebaComment(Base):
__tablename__ = 'tieba_comment'
id = Column(Integer, primary_key=True)
comment_id = Column(String(255), index=True)
# Defaults to an empty string when no value is supplied on insert.
parent_comment_id = Column(String(255), default='')
content = Column(Text)
user_link = Column(Text, default='')
user_nickname = Column(Text, default='')
user_avatar = Column(Text, default='')
tieba_id = Column(String(255), default='')
tieba_name = Column(Text)
tieba_link = Column(Text)
publish_time = Column(String(255), index=True)
ip_location = Column(Text, default='')
sub_comment_count = Column(Integer, default=0)
note_id = Column(String(255), index=True)
note_url = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
# ORM model mapped to the `tieba_creator` table. No column besides the primary key is indexed.
class TiebaCreator(Base):
__tablename__ = 'tieba_creator'
id = Column(Integer, primary_key=True)
user_id = Column(String(64))
user_name = Column(Text)
nickname = Column(Text)
avatar = Column(Text)
ip_location = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
gender = Column(Text)
follows = Column(Text)
fans = Column(Text)
registration_duration = Column(Text)
# ORM model mapped to the `zhihu_content` table.
# `content_id` and `created_time` are indexed; the two counters default to 0.
class ZhihuContent(Base):
__tablename__ = 'zhihu_content'
id = Column(Integer, primary_key=True)
content_id = Column(String(64), index=True)
content_type = Column(Text)
content_text = Column(Text)
content_url = Column(Text)
question_id = Column(String(255))
title = Column(Text)
desc = Column(Text)
# Times are stored as strings here (String(32) / Text), not as integers.
created_time = Column(String(32), index=True)
updated_time = Column(Text)
voteup_count = Column(Integer, default=0)
comment_count = Column(Integer, default=0)
# Unlike most sibling models, no '' default is declared for this column.
source_keyword = Column(Text)
user_id = Column(String(255))
user_link = Column(Text)
user_nickname = Column(Text)
user_avatar = Column(Text)
user_url_token = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
# persist-1 <persist1@126.com>
# Reason: fix an ORM model definition error so the model matches the database table schema.
# Side effects: none.
# Rollback strategy: revert this line.
#
# ORM model mapped to the `zhihu_comment` table.
# `comment_id`, `publish_time` and `content_id` are indexed; the counters default to 0.
class ZhihuComment(Base):
__tablename__ = 'zhihu_comment'
id = Column(Integer, primary_key=True)
comment_id = Column(String(64), index=True)
parent_comment_id = Column(String(64))
content = Column(Text)
publish_time = Column(String(32), index=True)
ip_location = Column(Text)
sub_comment_count = Column(Integer, default=0)
like_count = Column(Integer, default=0)
dislike_count = Column(Integer, default=0)
content_id = Column(String(64), index=True)
content_type = Column(Text)
user_id = Column(String(64))
user_link = Column(Text)
user_nickname = Column(Text)
user_avatar = Column(Text)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
# ORM model mapped to the `zhihu_creator` table.
# `user_id` is unique and indexed; every counter column defaults to 0.
class ZhihuCreator(Base):
__tablename__ = 'zhihu_creator'
id = Column(Integer, primary_key=True)
user_id = Column(String(64), unique=True, index=True)
user_link = Column(Text)
user_nickname = Column(Text)
user_avatar = Column(Text)
url_token = Column(Text)
gender = Column(Text)
ip_location = Column(Text)
follows = Column(Integer, default=0)
fans = Column(Integer, default=0)
# NOTE(review): spelled `anwser_count`; the attribute name is also the column name,
# so renaming it would need a schema migration.
anwser_count = Column(Integer, default=0)
video_count = Column(Integer, default=0)
question_count = Column(Integer, default=0)
article_count = Column(Integer, default=0)
column_count = Column(Integer, default=0)
get_voteup_count = Column(Integer, default=0)
add_ts = Column(BigInteger)
last_modify_ts = Column(BigInteger)
\ No newline at end of file
... ...
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2024/4/6 14:54
# @Desc : mediacrawler db 管理
import asyncio
from typing import Dict
from urllib.parse import urlparse
import aiofiles
import aiomysql
import config
from async_db import AsyncMysqlDB
from async_sqlite_db import AsyncSqliteDB
from tools import utils
from var import db_conn_pool_var, media_crawler_db_var
async def init_mediacrawler_db():
    """
    Create the aiomysql connection pool and publish it via context variables.

    The pool itself is stored in ``db_conn_pool_var``; an ``AsyncMysqlDB``
    wrapper built on top of it is stored in ``media_crawler_db_var``.
    Returns:

    """
    connection_settings = {
        "host": config.MYSQL_DB_HOST,
        "port": config.MYSQL_DB_PORT,
        "user": config.MYSQL_DB_USER,
        "password": config.MYSQL_DB_PWD,
        "db": config.MYSQL_DB_NAME,
        "autocommit": True,
    }
    mysql_pool = await aiomysql.create_pool(**connection_settings)
    mysql_db = AsyncMysqlDB(mysql_pool)

    # Expose both the raw pool and the CRUD wrapper through the context variables.
    db_conn_pool_var.set(mysql_pool)
    media_crawler_db_var.set(mysql_db)
async def init_sqlite_db():
    """
    Build the SQLite database object and publish it via ``media_crawler_db_var``.

    The object is constructed from ``config.SQLITE_DB_PATH``.
    Returns:

    """
    # Store the SQLite wrapper in the context variable read by the rest of the module.
    media_crawler_db_var.set(AsyncSqliteDB(config.SQLITE_DB_PATH))
async def init_db():
    """
    Initialise the database object for the configured storage backend.

    ``config.SAVE_DATA_OPTION == "sqlite"`` selects SQLite; every other value
    falls through to the MySQL connection pool.
    Returns:

    """
    utils.logger.info("[init_db] start init mediacrawler db connect object")
    if config.SAVE_DATA_OPTION != "sqlite":
        await init_mediacrawler_db()
        utils.logger.info("[init_db] end init mysql db connect object")
        return
    await init_sqlite_db()
    utils.logger.info("[init_db] end init sqlite db connect object")
async def close():
    """
    Close the database connection used by the crawler.

    For ``config.SAVE_DATA_OPTION == "sqlite"`` nothing is closed here and
    only a log line is written. For every other value, the aiomysql pool
    stored in ``db_conn_pool_var`` (if any) is closed and then awaited until
    its connections have actually been released.
    Returns:

    """
    utils.logger.info("[close] close mediacrawler db connection")
    if config.SAVE_DATA_OPTION == "sqlite":
        # Per the original note: the SQLite connection is closed automatically
        # when the AsyncSqliteDB object is destroyed.
        utils.logger.info("[close] sqlite db connection will be closed automatically")
        return

    # Passing a default avoids LookupError when close() runs before the pool
    # was ever stored in the context variable.
    db_pool: aiomysql.Pool = db_conn_pool_var.get(None)
    if db_pool is not None:
        db_pool.close()
        # Pool.close() only marks the connections for closing; wait_closed()
        # blocks until they are really shut down.
        await db_pool.wait_closed()
        utils.logger.info("[close] mysql db pool closed")
async def init_table_schema(db_type: str = None):
    """
    Initialise the database table schema.

    Intended for the first time the tables need to be created: running it
    again removes the existing tables together with all of their data.

    Args:
        db_type: Database type, ``'sqlite'`` or ``'mysql'``. ``'db'`` is also
            accepted as the legacy spelling of ``'mysql'``. When ``None`` the
            value of ``config.SAVE_DATA_OPTION`` is used.

    Raises:
        ValueError: If the resolved database type is not supported.
        Exception: If an existing SQLite file can be neither removed nor renamed.

    Returns:

    """
    # Fall back to the configured backend when none was given explicitly.
    if db_type is None:
        db_type = config.SAVE_DATA_OPTION

    if db_type == "sqlite":
        utils.logger.info("[init_table_schema] begin init sqlite table schema ...")

        # Remove a possibly corrupted database file left over from an earlier run.
        import os
        if os.path.exists(config.SQLITE_DB_PATH):
            try:
                os.remove(config.SQLITE_DB_PATH)
                utils.logger.info(f"[init_table_schema] removed existing sqlite db file: {config.SQLITE_DB_PATH}")
            except Exception as e:
                utils.logger.warning(f"[init_table_schema] failed to remove existing sqlite db file: {e}")
                # Removal failed: move the old file out of the way instead.
                try:
                    backup_path = f"{config.SQLITE_DB_PATH}.backup_{utils.get_current_timestamp()}"
                    os.rename(config.SQLITE_DB_PATH, backup_path)
                    utils.logger.info(f"[init_table_schema] renamed existing sqlite db file to: {backup_path}")
                except Exception as rename_e:
                    utils.logger.error(f"[init_table_schema] failed to rename existing sqlite db file: {rename_e}")
                    raise rename_e

        await init_sqlite_db()
        async_db_obj: AsyncSqliteDB = media_crawler_db_var.get()
        async with aiofiles.open("schema/sqlite_tables.sql", mode="r", encoding="utf-8") as f:
            schema_sql = await f.read()
        await async_db_obj.executescript(schema_sql)
        utils.logger.info("[init_table_schema] sqlite table schema init successful")

    elif db_type in ("mysql", "db"):
        # "db" is the historical SAVE_DATA_OPTION value for MySQL, so the
        # "config" path must not reject it.
        utils.logger.info("[init_table_schema] begin init mysql table schema ...")
        await init_mediacrawler_db()
        try:
            async_db_obj: AsyncMysqlDB = media_crawler_db_var.get()
            async with aiofiles.open("schema/tables.sql", mode="r", encoding="utf-8") as f:
                schema_sql = await f.read()
            await async_db_obj.execute(schema_sql)
            utils.logger.info("[init_table_schema] mysql table schema init successful")
        finally:
            # Release the pool even when reading or executing the schema fails.
            await close()

    else:
        utils.logger.error(f"[init_table_schema] 不支持的数据库类型: {db_type}")
        raise ValueError(f"不支持的数据库类型: {db_type},支持的类型: sqlite, mysql")
def show_database_options():
    """
    Print the menu of supported database options to standard output.
    """
    menu_lines = (
        "\n=== MediaCrawler 数据库初始化工具 ===",
        "支持的数据库类型:",
        "1. sqlite - SQLite 数据库 (轻量级,无需额外配置)",
        "2. mysql - MySQL 数据库 (需要配置数据库连接信息)",
        "3. config - 使用配置文件中的设置",
        "4. exit - 退出程序",
        "="*50,
    )
    # One print call; the joined text is identical to printing line by line.
    print("\n".join(menu_lines))
def get_user_choice():
    """
    Prompt until the user enters one of the supported database options.

    The answer is stripped and lower-cased before it is checked.

    Returns:
        str: One of 'sqlite', 'mysql', 'config' or 'exit'.
    """
    accepted = ('sqlite', 'mysql', 'config', 'exit')
    while True:
        answer = input("请输入数据库类型 (sqlite/mysql/config/exit): ").strip().lower()
        if answer not in accepted:
            print("❌ 无效的选择,请输入: sqlite, mysql, config 或 exit")
            continue
        return answer
async def main():
    """
    Interactive entry point: show the menu, read one choice and initialise the schema.

    ``KeyboardInterrupt`` and any other exception are reported on standard
    output instead of propagating; other exceptions are also logged.
    """
    try:
        show_database_options()
        # get_user_choice() only returns valid answers, so one pass is enough.
        selected = get_user_choice()
        if selected == 'exit':
            print("👋 程序已退出")
            return

        if selected == 'config':
            print(f"📋 使用配置文件中的设置: {config.SAVE_DATA_OPTION}")
            await init_table_schema()
        else:
            print(f"🚀 开始初始化 {selected.upper()} 数据库...")
            await init_table_schema(selected)
        print("✅ 数据库表结构初始化完成!")
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断操作")
    except Exception as e:
        print(f"\n❌ 初始化失败: {str(e)}")
        utils.logger.error(f"[main] 数据库初始化失败: {str(e)}")
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards. Calling asyncio.get_event_loop() without a
    # running loop is deprecated.
    asyncio.run(main())
import {defineConfig} from 'vitepress'
// https://vitepress.dev/reference/site-config
// Tags injected into the <head> of every page: the gtag.js loader and its inline bootstrap.
const analyticsHead = [
    ['script', {async: '', src: 'https://www.googletagmanager.com/gtag/js?id=G-5TK7GF3KK1'}],
    [
        'script',
        {},
        `window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-5TK7GF3KK1');`
    ]
]

// Entries of the top navigation bar.
const topNav = [
    {text: '首页', link: '/'},
    {text: '联系我', link: '/作者介绍'},
    {text: '支持我', link: '/知识付费介绍'}
]

// Sidebar groups, in display order.
const sidebarGroups = [
    {text: '作者介绍', link: '/作者介绍'},
    {
        text: 'MediaCrawler使用文档',
        items: [
            {text: '基本使用', link: '/'},
            {text: '常见问题汇总', link: '/常见问题'},
            {text: 'IP代理使用', link: '/代理使用'},
            {text: '词云图使用', link: '/词云图使用配置'},
            {text: '项目目录结构', link: '/项目代码结构'},
            {text: '手机号登录说明', link: '/手机号登录说明'}
        ]
    },
    {
        text: '知识付费',
        items: [
            {text: '知识付费介绍', link: '/知识付费介绍'},
            {text: 'MediaCrawlerPro订阅', link: '/mediacrawlerpro订阅'},
            {
                text: 'MediaCrawler源码剖析课',
                link: 'https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh'
            },
            {text: '知识星球文章专栏', link: '/知识星球介绍'},
            {text: '开发者咨询服务', link: '/开发者咨询'}
        ]
    },
    {text: 'MediaCrawler项目交流群', link: '/微信交流群'},
    {
        text: '爬虫入门教程分享',
        items: [
            {text: "我写的爬虫入门教程", link: 'https://github.com/NanmiCoder/CrawlerTutorial'}
        ]
    },
    {
        text: 'MediaCrawler捐赠名单',
        items: [
            {text: "捐赠名单", link: '/捐赠名单'}
        ]
    }
]

// Site-level VitePress configuration, assembled from the pieces above.
export default defineConfig({
    title: "MediaCrawler自媒体爬虫",
    description: "小红书爬虫,抖音爬虫, 快手爬虫, B站爬虫, 微博爬虫,百度贴吧爬虫,知乎爬虫...。 ",
    lastUpdated: true,
    base: '/MediaCrawler/',
    head: analyticsHead,
    themeConfig: {
        editLink: {
            pattern: 'https://github.com/NanmiCoder/MediaCrawler/tree/main/docs/:path'
        },
        search: {
            provider: 'local'
        },
        // https://vitepress.dev/reference/default-theme-config
        nav: topNav,
        sidebar: sidebarGroups,
        socialLinks: [
            {icon: 'github', link: 'https://github.com/NanmiCoder/MediaCrawler'}
        ]
    }
})
... ...
<!-- Dynamic ad component inserted into the right-hand outline navigation of VitePress. -->
<script setup>
import { ref, onMounted, onUnmounted } from 'vue'

// Ads available for display; filled once the component has mounted.
const ads = ref([])
// Index into `ads` of the ad currently shown.
const currentAdIndex = ref(0)
// Handle of the rotation timer, kept so it can be cancelled on unmount.
let intervalId = null

// Return the list of ads. Currently a static in-code list behind an async function.
const fetchAds = async () => {
  return [
    {
      id: 1,
      imageUrl: 'https://github.com/NanmiCoder/MediaCrawler/raw/main/docs/static/images/auto_test.png',
      landingUrl: 'https://item.jd.com/10124939676219.html',
      text: '给好朋友虫师新书站台推荐 - 基于Python的自动化测试框架设计'
    }
  ]
}

// Advance to the next ad, wrapping around at the end of the list.
const nextAd = () => {
  const total = ads.value.length
  // `x % 0` is NaN in JavaScript; leave the index untouched while the list is empty.
  if (total === 0) return
  currentAdIndex.value = (currentAdIndex.value + 1) % total
}

// Load the ads, then rotate every 3 seconds.
onMounted(async () => {
  ads.value = await fetchAds()
  intervalId = setInterval(nextAd, 3000)
})

// Stop the rotation timer when the component goes away.
onUnmounted(() => {
  if (intervalId) clearInterval(intervalId)
})
</script>
<template>
<div class="vp-ad-carousel">
<template v-if="ads.length > 0">
<div class="ad-content">
<a :href="ads[currentAdIndex].landingUrl" target="_blank" rel="noopener noreferrer">
<img :src="ads[currentAdIndex].imageUrl" :alt="ads[currentAdIndex].text" class="ad-image">
<p class="ad-text">{{ ads[currentAdIndex].text }}</p>
</a>
</div>
</template>
<p v-else class="loading">Loading ads...</p>
</div>
</template>
<style scoped>
.vp-ad-carousel {
margin-top: 1rem;
padding: 1rem;
background-color: var(--vp-c-bg-soft);
border-radius: 8px;
font-size: 0.875rem;
line-height: 1.5;
}
.ad-content {
display: flex;
flex-direction: column;
align-items: center;
}
.ad-image {
max-width: 130px;
height: auto;
margin-bottom: 0.5rem;
}
.ad-text {
text-align: center;
color: var(--vp-c-text-1);
}
.loading {
text-align: center;
color: var(--vp-c-text-2);
}
a {
text-decoration: none;
color: inherit;
}
</style>
... ...
<!--.vitepress/theme/MyLayout.vue-->
<script setup>
// Wrapper around the default VitePress layout; the template below fills its
// `aside-bottom` slot with the DynamicAds component.
import DefaultTheme from 'vitepress/theme'
import DynamicAds from './DynamicAds.vue'
// Take the stock Layout component from the default theme.
const { Layout } = DefaultTheme
</script>
<template>
<Layout>
<template #aside-bottom>
<DynamicAds />
</template>
</Layout>
</template>
\ No newline at end of file
... ...
/* .vitepress/theme/custom.css */
/**
* Component: Sidebar
* -------------------------------------------------------------------------- */
:root {
--vp-sidebar-width: 285px;
--vp-sidebar-bg-color: var(--vp-c-bg-alt);
}
\ No newline at end of file
... ...
// .vitepress/theme/index.js
import DefaultTheme from 'vitepress/theme'
import MyLayout from './MyLayout.vue'
// Theme entry: extends the default VitePress theme and swaps in a custom Layout.
export default {
extends: DefaultTheme,
// Override Layout with the wrapper component that injects content into slots
Layout: MyLayout
}
\ No newline at end of file
... ...
# CDP模式使用指南
## 概述
CDP(Chrome DevTools Protocol)模式是一种高级的反检测爬虫技术,通过控制用户现有的Chrome/Edge浏览器来进行网页爬取。与传统的Playwright自动化相比,CDP模式具有以下优势:
### 🎯 主要优势
1. **真实浏览器环境**: 使用用户实际安装的浏览器,包含所有扩展、插件和个人设置
2. **更好的反检测能力**: 浏览器指纹更加真实,难以被网站检测为自动化工具
3. **保留用户状态**: 自动继承用户的登录状态、Cookie和浏览历史
4. **扩展支持**: 可以利用用户安装的广告拦截器、代理扩展等工具
5. **更自然的行为**: 浏览器行为模式更接近真实用户
## 快速开始
### 1. 启用CDP模式
在 `config/base_config.py` 中设置:
```python
# 启用CDP模式
ENABLE_CDP_MODE = True
# CDP调试端口(可选,默认9222)
CDP_DEBUG_PORT = 9222
# 是否在无头模式下运行(建议设为False以获得最佳反检测效果)
CDP_HEADLESS = False
# 程序结束时是否自动关闭浏览器
AUTO_CLOSE_BROWSER = True
```
### 2. 运行测试
```bash
# 运行CDP功能测试
python examples/cdp_example.py
# 运行小红书爬虫(CDP模式)
python main.py
```
## 配置选项详解
### 基础配置
| 配置项 | 类型 | 默认值 | 说明 |
|--------|------|--------|------|
| `ENABLE_CDP_MODE` | bool | False | 是否启用CDP模式 |
| `CDP_DEBUG_PORT` | int | 9222 | CDP调试端口 |
| `CDP_HEADLESS` | bool | False | CDP模式下的无头模式 |
| `AUTO_CLOSE_BROWSER` | bool | True | 程序结束时是否关闭浏览器 |
### 高级配置
| 配置项 | 类型 | 默认值 | 说明 |
|--------|------|--------|------|
| `CUSTOM_BROWSER_PATH` | str | "" | 自定义浏览器路径 |
| `BROWSER_LAUNCH_TIMEOUT` | int | 30 | 浏览器启动超时时间(秒) |
### 自定义浏览器路径
如果系统自动检测失败,可以手动指定浏览器路径:
```python
# Windows示例
CUSTOM_BROWSER_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# macOS示例
CUSTOM_BROWSER_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
# Linux示例
CUSTOM_BROWSER_PATH = "/usr/bin/google-chrome"
```
## 支持的浏览器
### Windows
- Google Chrome (稳定版、Beta、Dev、Canary)
- Microsoft Edge (稳定版、Beta、Dev、Canary)
### macOS
- Google Chrome (稳定版、Beta、Dev、Canary)
- Microsoft Edge (稳定版、Beta、Dev、Canary)
### Linux
- Google Chrome / Chromium
- Microsoft Edge
## 使用示例
### 基本使用
```python
import asyncio
from playwright.async_api import async_playwright
from tools.cdp_browser import CDPBrowserManager
async def main():
cdp_manager = CDPBrowserManager()
async with async_playwright() as playwright:
# 启动CDP浏览器
browser_context = await cdp_manager.launch_and_connect(
playwright=playwright,
user_agent="自定义User-Agent",
headless=False
)
# 创建页面并访问网站
page = await browser_context.new_page()
await page.goto("https://example.com")
# 执行爬取操作...
# 清理资源
await cdp_manager.cleanup()
asyncio.run(main())
```
### 在爬虫中使用
CDP模式已集成到所有平台爬虫中,只需启用配置即可:
```python
# 在config/base_config.py中
ENABLE_CDP_MODE = True
# 然后正常运行爬虫
python main.py
```
## 故障排除
### 常见问题
#### 1. 浏览器检测失败
**错误**: `未找到可用的浏览器`
**解决方案**:
- 确保已安装Chrome或Edge浏览器
- 检查浏览器是否在标准路径下
- 使用`CUSTOM_BROWSER_PATH`指定浏览器路径
#### 2. 端口被占用
**错误**: `无法找到可用的端口`
**解决方案**:
- 关闭其他使用调试端口的程序
- 修改`CDP_DEBUG_PORT`为其他端口
- 系统会自动尝试下一个可用端口
#### 3. 浏览器启动超时
**错误**: `浏览器在30秒内未能启动`
**解决方案**:
- 增加`BROWSER_LAUNCH_TIMEOUT`
- 检查系统资源是否充足
- 尝试关闭其他占用资源的程序
#### 4. CDP连接失败
**错误**: `CDP连接失败`
**解决方案**:
- 检查防火墙设置
- 确保localhost访问正常
- 尝试重启浏览器
### 调试技巧
#### 1. 启用详细日志
```python
import logging
logging.basicConfig(level=logging.DEBUG)
```
#### 2. 手动测试CDP连接
```bash
# 手动启动Chrome
chrome --remote-debugging-port=9222
# 访问调试页面
curl http://localhost:9222/json
```
#### 3. 检查浏览器进程
```bash
# Windows
tasklist | findstr chrome
# macOS/Linux
ps aux | grep chrome
```
## 最佳实践
### 1. 反检测优化
- 保持`CDP_HEADLESS = False`以获得最佳反检测效果
- 使用真实的User-Agent字符串
- 避免过于频繁的请求
### 2. 性能优化
- 合理设置`AUTO_CLOSE_BROWSER`
- 复用浏览器实例而不是频繁重启
- 监控内存使用情况
### 3. 安全考虑
- 不要在生产环境中保存敏感Cookie
- 定期清理浏览器数据
- 注意用户隐私保护
### 4. 兼容性
- 测试不同浏览器版本的兼容性
- 准备回退方案(标准Playwright模式)
- 监控目标网站的反爬策略变化
## 技术原理
CDP模式的工作原理:
1. **浏览器检测**: 自动扫描系统中的Chrome/Edge安装路径
2. **进程启动**: 使用`--remote-debugging-port`参数启动浏览器
3. **CDP连接**: 通过WebSocket连接到浏览器的调试接口
4. **Playwright集成**: 使用`connect_over_cdp`方法(Playwright Python API,对应JS中的`connectOverCDP`)接管浏览器控制
5. **上下文管理**: 创建或复用浏览器上下文进行操作
这种方式绕过了传统WebDriver的检测机制,提供了更加隐蔽的自动化能力。
## 更新日志
### v1.0.0
- 初始版本发布
- 支持Windows和macOS的Chrome/Edge检测
- 集成到所有平台爬虫
- 提供完整的配置选项和错误处理
## 贡献
欢迎提交Issue和Pull Request来改进CDP模式功能。
## 许可证
本功能遵循项目的整体许可证条款,仅供学习和研究使用。
... ...
\n
———
》),
)÷(1-
”,
)、
=(
:
&
*
一一
~~~~
.
.一
./
--
=″
[*]
}>
[⑤]]
[①D]
c]
ng昉
//
[②e]
[②g]
={
}
,也
[①⑥]
[②B]
[①a]
[④a]
[①③]
[③h]
③]
1.
--
[②b]
’‘
×××
[①⑧]
0:2
=[
[⑤b]
[②c]
[④b]
[②③]
[③a]
[④c]
[①⑤]
[①⑦]
[①g]
∈[
[①⑨]
[①④]
[①c]
[②f]
[②⑧]
[②①]
[①C]
[③c]
[③g]
[②⑤]
[②②]
一.
[①h]
.数
[]
[①B]
数/
[①i]
[③e]
[①①]
[④d]
[④e]
[③b]
[⑤a]
[①A]
[②⑧]
[②⑦]
[①d]
[②j]
〕〔
][
://
′∈
[②④
[⑤e]
12%
b]
...
...................
…………………………………………………③
ZXFITL
[③F]
[①o]
]∧′=[
∪φ∈
′|
{-
②c
[③①]
R.L.
[①E]
Ψ
-[*]-
.日
[②d]
[②
[②⑦]
[②②]
[③e]
[①i]
[①B]
[①h]
[①d]
[①g]
[①②]
[②a]
f]
[⑩]
a]
[①e]
[②h]
[②⑥]
[③d]
[②⑩]
e]
元/吨
[②⑩]
2.3%
5:0
[①]
::
[②]
[③]
[④]
[⑤]
[⑥]
[⑦]
[⑧]
[⑨]
……
——
?
,
'
?
·
———
──
?
<
>
[
]
(
)
-
+
×
/
В
"
;
#
@
γ
μ
φ
φ.
×
Δ
sub
exp
sup
sub
Lex
+ξ
++
-β
<±
<Δ
<λ
<φ
<<
=
=☆
=-
>λ
_
~±
~+
[⑤f]
[⑤d]
[②i]
[②G]
[①f]
LI
[-
......
[③⑩]
第二
一番
一直
一个
一些
许多
有的是
也就是说
末##末
哎呀
哎哟
俺们
按照
吧哒
罢了
本着
比方
比如
鄙人
彼此
别的
别说
并且
不比
不成
不单
不但
不独
不管
不光
不过
不仅
不拘
不论
不怕
不然
不如
不特
不惟
不问
不只
朝着
趁着
除此之外
除非
除了
此间
此外
从而
但是
当着
的话
等等
叮咚
对于
多少
而况
而且
而是
而外
而言
而已
尔后
反过来
反过来说
反之
非但
非徒
否则
嘎登
各个
各位
各种
各自
根据
故此
固然
关于
果然
果真
哈哈
何处
何况
何时
哼唷
呼哧
还是
还有
换句话说
换言之
或是
或者
极了
及其
及至
即便
即或
即令
即若
即使
几时
既然
既是
继而
加之
假如
假若
假使
鉴于
较之
接着
结果
紧接着
进而
尽管
经过
就是
就是说
具体地说
具体说来
开始
开外
可见
可是
可以
况且
来着
例如
连同
两者
另外
另一方面
慢说
漫说
每当
莫若
某个
某些
哪边
哪儿
哪个
哪里
哪年
哪怕
哪天
哪些
哪样
那边
那儿
那个
那会儿
那里
那么
那么些
那么样
那时
那些
那样
乃至
你们
宁可
宁肯
宁愿
啪达
旁人
凭借
其次
其二
其他
其它
其一
其余
其中
起见
起见
岂但
恰恰相反
前后
前者
然而
然后
然则
人家
任何
任凭
如此
如果
如何
如其
如若
如上所述
若非
若是
上下
尚且
设若
设使
甚而
甚么
甚至
省得
时候
什么
什么样
使得
是的
首先
谁知
顺着
似的
虽然
虽说
虽则
随着
所以
他们
他人
它们
她们
倘或
倘然
倘若
倘使
通过
同时
万一
为何
为了
为什么
为着
嗡嗡
我们
呜呼
乌乎
无论
无宁
毋宁
相对而言
向着
沿
沿着
要不
要不然
要不是
要么
要是
也罢
也好
一般
一旦
一方面
一来
一切
一样
一则
依照
以便
以及
以免
以至
以至于
以致
抑或
因此
因而
因为
由此可见
由于
有的
有关
有些
于是
于是乎
与此同时
与否
与其
越是
云云
再说
再者
在下
咱们
怎么
怎么办
怎么样
怎样
照着
这边
这儿
这个
这会儿
这就是说
这里
这么
这么点儿
这么些
这么样
这时
这些
这样
正如
之类
之所以
之一
只是
只限
只要
只有
至于
诸位
着呢
自从
自个儿
自各儿
自己
自家
自身
综上所述
总的来看
总的来说
总的说来
总而言之
总之
纵令
纵然
纵使
遵照
作为
喔唷
... ...
# MediaCrawler使用方法
## 创建并激活 python 虚拟环境
> 如果是爬取抖音和知乎,需要提前安装nodejs环境,版本大于等于:`16`即可 <br>
```shell
# 进入项目根目录
cd MediaCrawler
# 创建虚拟环境
# 我的python版本是:3.9.6,requirements.txt中的库是基于这个版本的,如果是其他python版本,可能requirements.txt中的库不兼容,自行解决一下。
python -m venv venv
# macos & linux 激活虚拟环境
source venv/bin/activate
# windows 激活虚拟环境
venv\Scripts\activate
```
## 安装依赖库
```shell
pip install -r requirements.txt
```
## 安装 playwright浏览器驱动
```shell
playwright install
```
## 运行爬虫程序
```shell
### 项目默认是没有开启评论爬取模式,如需评论请在config/base_config.py中的 ENABLE_GET_COMMENTS 变量修改
### 一些其他支持项,也可以在config/base_config.py查看功能,写的有中文注释
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
python main.py --platform xhs --lt qrcode --type search
# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
python main.py --platform xhs --lt qrcode --type detail
# 使用SQLite数据库存储数据(推荐个人用户使用)
python main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
# 使用MySQL数据库存储数据
python main.py --platform xhs --lt qrcode --type search --save_data_option db
# 打开对应APP扫二维码登录
# 其他平台爬虫使用示例,执行下面的命令查看
python main.py --help
```
## 💾 数据存储
支持多种数据存储方式:
- **CSV 文件**: 支持保存至 CSV (位于 `data/` 目录下)
- **JSON 文件**: 支持保存至 JSON (位于 `data/` 目录下)
- **数据库存储**
- 使用 `--init_db` 参数进行数据库初始化 (使用 `--init_db` 时,无需其他可选参数)
- **SQLite 数据库**: 轻量级数据库,无需服务器,适合个人使用 (推荐)
1. 初始化: `--init_db sqlite`
2. 数据存储: `--save_data_option sqlite`
- **MySQL 数据库**: 支持保存至关系型数据库 MySQL (需提前创建数据库)
1. 初始化: `--init_db mysql`
2. 数据存储: `--save_data_option db` (db 参数为兼容历史更新保留)
## 免责声明
> **免责声明:**
>
> 大家请以学习为目的使用本仓库,爬虫违法违规的案件:https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China <br>
>
>本项目的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。
... ...
# 订阅MediaCrawlerPro版本源码访问权限
## 获取Pro版本的访问权限
> MediaCrawler开源超过一年了,相信该仓库帮过不少朋友低门槛的学习和了解爬虫。维护真的耗费了大量精力和人力 <br>
>
> 所以Pro版本不会开源,可以订阅Pro版本让我更加有动力去更新。<br>
>
> 如果感兴趣可以加我微信,订阅Pro版本访问权限哦,有门槛💰。<br>
>
> 仅针对想学习Pro版本源码实现的用户,如果是公司或者商业化盈利性质的就不要加我了,谢谢🙏
>
> 代码设计拓展性强,可以自己扩展更多的爬虫平台,更多的数据存储方式,相信对你架构这种爬虫代码有所帮助。
>
>
> **MediaCrawlerPro项目主页地址**
> [MediaCrawlerPro Github主页地址](https://github.com/MediaCrawlerPro)
扫描下方我的个人微信,备注:pro版本(如果图片展示不出来,可以直接添加我的微信号:relakkes)
![relakkes_weichat.JPG](static/images/relakkes_weichat.jpg)
## Pro版本诞生的背景
[MediaCrawler](https://github.com/NanmiCoder/MediaCrawler)这个项目开源至今获得了大量的关注,同时也暴露出来了一系列问题,比如:
- 能否支持多账号?
- 能否在linux部署?
- 能否去掉playwright的依赖?
- 有没有更简单的部署方法?
- 有没有针对新手、上手门槛更低的方法?
诸如上面的此类问题,想要在原有项目上去动刀,无疑是增加了复杂度,可能导致后续的维护更加困难。
出于可持续维护、简便易用、部署简单等目的,对MediaCrawler进行彻底重构。
## 项目介绍
### [MediaCrawler](https://github.com/NanmiCoder/MediaCrawler)的Pro版本python实现
**小红书爬虫**、**抖音爬虫**、**快手爬虫**、**B站爬虫**、**微博爬虫**、**百度贴吧**、**知乎爬虫**...。
支持多种平台的爬虫,支持多种数据的爬取,支持多种数据的存储,最重要的是**完美支持多账号+IP代理池,让你的爬虫更加稳定**。
相较于MediaCrawler,Pro版本最大的变化:
- 去掉了playwright的依赖,不再将Playwright集成到爬虫主干中,依赖过重。
- 增加了Docker,Docker-compose的方式部署,让部署更加简单。
- 多账号+IP代理池的支持,让爬虫更加稳定。
- 新增签名服务,解耦签名逻辑,让爬虫更加灵活。
... ...
# 代理 IP 使用说明
> 还是得跟大家再次强调下,不要对一些自媒体平台进行大规模爬虫或其他非法行为,要踩缝纫机的哦🤣
## 简易的流程图
![代理 IP 使用流程图](static/images/代理IP%20流程图.drawio.png)
## 选择一个代理IP提供商
### 快代理
[快代理使用文档](快代理使用文档.md)
### 豌豆HTTP文档查看
[豌豆HTTP使用文档](豌豆HTTP使用文档.md)
\ No newline at end of file
... ...