Dollars

feat: migrate MediaCrawler to git submodule and enhance MindSpider automation

... ... @@ -340,6 +340,7 @@ test_results/
# Ai操作指引文件
OperationGuidance/
db_data/
insight_engine_streamlit_reports/
media_engine_streamlit_reports/
query_engine_streamlit_reports/
... ...
... ... @@ -107,19 +107,34 @@ sqlite_db_config = {{
"db_path": SQLITE_DB_PATH
}}
# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "{pg_password}")
POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "{pg_user}")
POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "{pg_host}")
POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "{pg_port}")
POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "{pg_db_name}")
# mongodb config
MONGODB_HOST = os.getenv("MONGODB_HOST", "localhost")
MONGODB_PORT = os.getenv("MONGODB_PORT", 27017)
MONGODB_USER = os.getenv("MONGODB_USER", "")
MONGODB_PWD = os.getenv("MONGODB_PWD", "")
MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME", "media_crawler")
postgresql_db_config = {{
"user": POSTGRESQL_DB_USER,
"password": POSTGRESQL_DB_PWD,
"host": POSTGRESQL_DB_HOST,
"port": POSTGRESQL_DB_PORT,
"db_name": POSTGRESQL_DB_NAME,
mongodb_config = {{
"host": MONGODB_HOST,
"port": int(MONGODB_PORT),
"user": MONGODB_USER,
"password": MONGODB_PWD,
"db_name": MONGODB_DB_NAME,
}}
# postgres config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
POSTGRES_DB_PWD = os.getenv("POSTGRES_DB_PWD", "{pg_password}")
POSTGRES_DB_USER = os.getenv("POSTGRES_DB_USER", "{pg_user}")
POSTGRES_DB_HOST = os.getenv("POSTGRES_DB_HOST", "{pg_host}")
POSTGRES_DB_PORT = os.getenv("POSTGRES_DB_PORT", "{pg_port}")
POSTGRES_DB_NAME = os.getenv("POSTGRES_DB_NAME", "{pg_db_name}")
postgres_db_config = {{
"user": POSTGRES_DB_USER,
"password": POSTGRES_DB_PWD,
"host": POSTGRES_DB_HOST,
"port": POSTGRES_DB_PORT,
"db_name": POSTGRES_DB_NAME,
}}
'''
... ... @@ -154,7 +169,7 @@ postgresql_db_config = {{
# 判断数据库类型,确定 SAVE_DATA_OPTION
db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
is_postgresql = db_dialect in ("postgresql", "postgres")
save_data_option = "postgresql" if is_postgresql else "db"
save_data_option = "postgres" if is_postgresql else "db"
base_config_path = self.mediacrawler_path / "config" / "base_config.py"
... ... @@ -238,7 +253,7 @@ postgresql_db_config = {{
# 判断数据库类型,确定 save_data_option
db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
is_postgresql = db_dialect in ("postgresql", "postgres")
save_data_option = "postgresql" if is_postgresql else "db"
save_data_option = "postgres" if is_postgresql else "db"
# 构建命令
cmd = [
... ... @@ -401,7 +416,7 @@ postgresql_db_config = {{
total_stats["keyword_results"][keyword] = {}
total_stats["keyword_results"][keyword][platform] = result
logger.info(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论")
logger.info(f" ✅ 爬取成功")
else:
total_stats["failed_tasks"] += len(keywords)
total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords)
... ... @@ -433,15 +448,12 @@ postgresql_db_config = {{
finish_message += f"\n 成功: {total_stats['successful_tasks']}"
finish_message += f"\n 失败: {total_stats['failed_tasks']}"
finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%"
finish_message += f"\n 总内容: {total_stats['total_notes']} 条"
finish_message += f"\n 总评论: {total_stats['total_comments']} 条"
logger.info(finish_message)
platform_summary_message = f"\n 各平台统计:"
platform_summary_message = f"\n📈 各平台统计:"
for platform, stats in total_stats["platform_summary"].items():
success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0
platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), "
platform_summary_message += f"{stats['total_notes']} 条内容"
platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%)"
logger.info(platform_summary_message)
return total_stats
... ...
... ... @@ -186,8 +186,8 @@ flowchart TB
- 记录任务状态、进度、结果等
5. **平台内容表**(继承自MediaCrawler)
- xhs_note - 小红书笔记(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754)
- douyin_aweme - 抖音视频
- xhs_note - 小红书笔记
- douyin_aweme - 抖音视频
- kuaishou_video - 快手视频
- bilibili_video - B站视频
- weibo_note - 微博帖子
... ... @@ -204,13 +204,27 @@ flowchart TB
- 操作系统:Windows/Linux/macOS
### 1. 克隆项目
### 1. 克隆项目与获取子模块
MindSpider 作为 BettaFish 的核心组件运行。请克隆 BettaFish 主项目并同步获取 `MediaCrawler` 爬虫子模块。
**方式一:克隆时直接获取(推荐)**
```bash
git clone --recurse-submodules https://github.com/666ghj/BettaFish.git
cd BettaFish/MindSpider
```
**方式二:已克隆主项目后补充拉取**
如果你已经克隆了 BettaFish 但 `MindSpider/DeepSentimentCrawling/MediaCrawler` 目录为空,请在**项目根目录**运行:
```bash
git submodule update --init --recursive
```
> **注意**:MediaCrawler 的 Python 依赖会在首次运行 `python main.py` 时由系统自动检测并静默安装到当前环境。
### 2. 创建并激活环境
#### Conda配置方法
... ... @@ -316,7 +330,7 @@ python main.py --broad-topic --date 2024-01-15
**首次使用每个平台都需要登录,这是最关键的步骤:**
1. **小红书登录**(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754)
1. **小红书登录**
```bash
# 测试小红书爬取(会弹出二维码)
python main.py --deep-sentiment --platforms xhs --test
... ...
... ... @@ -165,6 +165,21 @@ class MindSpider:
logger.exception(f"数据库初始化异常: {e}")
return False
def _ensure_database_ready(self) -> bool:
    """Make sure the database tables exist, auto-initializing them when absent.

    Returns True when the connection works and the tables are present
    (creating them on the fly if needed); False on any failure.
    """
    # A working connection is a hard prerequisite — bail out early without it.
    if not self.check_database_connection():
        logger.error("数据库连接失败,无法继续")
        return False
    # Tables already there: nothing more to do.
    if self.check_database_tables():
        return True
    # Missing tables: attempt a one-shot automatic initialization.
    logger.warning("数据库表不存在,自动初始化中...")
    if self.initialize_database():
        logger.info("数据库表自动初始化成功")
        return True
    logger.error("数据库自动初始化失败")
    return False
def check_dependencies(self) -> bool:
"""检查依赖环境"""
logger.info("检查依赖环境...")
... ... @@ -184,19 +199,69 @@ class MindSpider:
logger.info("请运行: pip install -r requirements.txt")
return False
# 检查MediaCrawler依赖
# 检查并安装MediaCrawler依赖
mediacrawler_path = self.deep_sentiment_path / "MediaCrawler"
if not mediacrawler_path.exists():
logger.error("错误:找不到MediaCrawler目录")
return False
# 自动安装MediaCrawler的依赖
self._install_mediacrawler_dependencies()
logger.info("依赖环境检查通过")
return True
def _install_mediacrawler_dependencies(self) -> bool:
    """Install the MediaCrawler submodule's Python dependencies when needed.

    A `.deps_installed` marker file next to requirements.txt records the last
    successful install; pip is re-run only when requirements.txt is newer
    than the marker. Returns True on success (or when already up to date),
    False when requirements.txt is missing or installation fails.
    """
    crawler_dir = self.deep_sentiment_path / "MediaCrawler"
    req_file = crawler_dir / "requirements.txt"
    if not req_file.exists():
        logger.warning(f"MediaCrawler requirements.txt 不存在: {req_file}")
        return False

    # Skip reinstalling when the marker is at least as new as requirements.txt.
    marker = crawler_dir / ".deps_installed"
    if marker.exists() and marker.stat().st_mtime >= req_file.stat().st_mtime:
        logger.debug("MediaCrawler依赖已安装,跳过")
        return True

    logger.info("正在安装MediaCrawler依赖...")
    pip_cmd = [sys.executable, "-m", "pip", "install", "-r", str(req_file), "-q"]
    try:
        proc = subprocess.run(
            pip_cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5-minute cap so a stuck pip cannot hang startup
        )
        if proc.returncode != 0:
            logger.error(f"MediaCrawler依赖安装失败: {proc.stderr}")
            return False
        # Record the successful install for the mtime check above.
        marker.touch()
        logger.info("MediaCrawler依赖安装成功")
        return True
    except subprocess.TimeoutExpired:
        logger.error("MediaCrawler依赖安装超时")
        return False
    except Exception as e:
        logger.exception(f"MediaCrawler依赖安装异常: {e}")
        return False
def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool:
"""运行BroadTopicExtraction模块"""
logger.info("运行BroadTopicExtraction模块...")
# 自动检查并初始化数据库表
if not self._ensure_database_ready():
return False
if not extract_date:
extract_date = date.today()
... ... @@ -234,6 +299,10 @@ class MindSpider:
"""运行DeepSentimentCrawling模块"""
logger.info("运行DeepSentimentCrawling模块...")
# 自动检查并初始化数据库表
if not self._ensure_database_ready():
return False
if not target_date:
target_date = date.today()
... ... @@ -282,6 +351,10 @@ class MindSpider:
"""运行完整工作流程"""
logger.info("开始完整的MindSpider工作流程")
# 自动检查并初始化数据库表(确保独立调用时也能自动初始化)
if not self._ensure_database_ready():
return False
if not target_date:
target_date = date.today()
... ...
... ... @@ -35,9 +35,11 @@ jieba==0.42.1
pymysql==1.1.0
aiomysql==0.2.0
aiosqlite==0.21.0
motor>=3.3.0
redis>=4.6.0
SQLAlchemy==2.0.35
asyncpg==0.29.0
psycopg[binary]>=3.1.0
cryptography==42.0.7
# ===== 爬虫相关 =====
... ... @@ -67,6 +69,7 @@ xgboost>=2.0.0
# NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
# ===== 工具库 =====
typer>=0.9.0
python-dotenv>=1.0.0
python-dateutil>=2.8.2
pytz>=2023.3
... ...