Dollars

feat: migrate MediaCrawler to git submodule and enhance MindSpider automation

@@ -340,6 +340,7 @@ test_results/ @@ -340,6 +340,7 @@ test_results/
340 # Ai操作指引文件 340 # Ai操作指引文件
341 OperationGuidance/ 341 OperationGuidance/
342 342
  343 +db_data/
343 insight_engine_streamlit_reports/ 344 insight_engine_streamlit_reports/
344 media_engine_streamlit_reports/ 345 media_engine_streamlit_reports/
345 query_engine_streamlit_reports/ 346 query_engine_streamlit_reports/
@@ -107,19 +107,34 @@ sqlite_db_config = {{ @@ -107,19 +107,34 @@ sqlite_db_config = {{
107 "db_path": SQLITE_DB_PATH 107 "db_path": SQLITE_DB_PATH
108 }} 108 }}
109 109
110 -# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量  
111 -POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "{pg_password}")  
112 -POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "{pg_user}")  
113 -POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "{pg_host}")  
114 -POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "{pg_port}")  
115 -POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "{pg_db_name}") 110 +# mongodb config
  111 +MONGODB_HOST = os.getenv("MONGODB_HOST", "localhost")
  112 +MONGODB_PORT = os.getenv("MONGODB_PORT", 27017)
  113 +MONGODB_USER = os.getenv("MONGODB_USER", "")
  114 +MONGODB_PWD = os.getenv("MONGODB_PWD", "")
  115 +MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME", "media_crawler")
116 116
117 -postgresql_db_config = {{  
118 - "user": POSTGRESQL_DB_USER,  
119 - "password": POSTGRESQL_DB_PWD,  
120 - "host": POSTGRESQL_DB_HOST,  
121 - "port": POSTGRESQL_DB_PORT,  
122 - "db_name": POSTGRESQL_DB_NAME, 117 +mongodb_config = {{
  118 + "host": MONGODB_HOST,
  119 + "port": int(MONGODB_PORT),
  120 + "user": MONGODB_USER,
  121 + "password": MONGODB_PWD,
  122 + "db_name": MONGODB_DB_NAME,
  123 +}}
  124 +
  125 +# postgres config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
  126 +POSTGRES_DB_PWD = os.getenv("POSTGRES_DB_PWD", "{pg_password}")
  127 +POSTGRES_DB_USER = os.getenv("POSTGRES_DB_USER", "{pg_user}")
  128 +POSTGRES_DB_HOST = os.getenv("POSTGRES_DB_HOST", "{pg_host}")
  129 +POSTGRES_DB_PORT = os.getenv("POSTGRES_DB_PORT", "{pg_port}")
  130 +POSTGRES_DB_NAME = os.getenv("POSTGRES_DB_NAME", "{pg_db_name}")
  131 +
  132 +postgres_db_config = {{
  133 + "user": POSTGRES_DB_USER,
  134 + "password": POSTGRES_DB_PWD,
  135 + "host": POSTGRES_DB_HOST,
  136 + "port": POSTGRES_DB_PORT,
  137 + "db_name": POSTGRES_DB_NAME,
123 }} 138 }}
124 139
125 ''' 140 '''
@@ -154,7 +169,7 @@ postgresql_db_config = {{ @@ -154,7 +169,7 @@ postgresql_db_config = {{
154 # 判断数据库类型,确定 SAVE_DATA_OPTION 169 # 判断数据库类型,确定 SAVE_DATA_OPTION
155 db_dialect = (config.settings.DB_DIALECT or "mysql").lower() 170 db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
156 is_postgresql = db_dialect in ("postgresql", "postgres") 171 is_postgresql = db_dialect in ("postgresql", "postgres")
157 - save_data_option = "postgresql" if is_postgresql else "db" 172 + save_data_option = "postgres" if is_postgresql else "db"
158 173
159 base_config_path = self.mediacrawler_path / "config" / "base_config.py" 174 base_config_path = self.mediacrawler_path / "config" / "base_config.py"
160 175
@@ -238,7 +253,7 @@ postgresql_db_config = {{ @@ -238,7 +253,7 @@ postgresql_db_config = {{
238 # 判断数据库类型,确定 save_data_option 253 # 判断数据库类型,确定 save_data_option
239 db_dialect = (config.settings.DB_DIALECT or "mysql").lower() 254 db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
240 is_postgresql = db_dialect in ("postgresql", "postgres") 255 is_postgresql = db_dialect in ("postgresql", "postgres")
241 - save_data_option = "postgresql" if is_postgresql else "db" 256 + save_data_option = "postgres" if is_postgresql else "db"
242 257
243 # 构建命令 258 # 构建命令
244 cmd = [ 259 cmd = [
@@ -401,7 +416,7 @@ postgresql_db_config = {{ @@ -401,7 +416,7 @@ postgresql_db_config = {{
401 total_stats["keyword_results"][keyword] = {} 416 total_stats["keyword_results"][keyword] = {}
402 total_stats["keyword_results"][keyword][platform] = result 417 total_stats["keyword_results"][keyword][platform] = result
403 418
404 - logger.info(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论") 419 + logger.info(f" ✅ 爬取成功")
405 else: 420 else:
406 total_stats["failed_tasks"] += len(keywords) 421 total_stats["failed_tasks"] += len(keywords)
407 total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) 422 total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords)
@@ -433,15 +448,12 @@ postgresql_db_config = {{ @@ -433,15 +448,12 @@ postgresql_db_config = {{
433 finish_message += f"\n 成功: {total_stats['successful_tasks']}" 448 finish_message += f"\n 成功: {total_stats['successful_tasks']}"
434 finish_message += f"\n 失败: {total_stats['failed_tasks']}" 449 finish_message += f"\n 失败: {total_stats['failed_tasks']}"
435 finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%" 450 finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%"
436 - finish_message += f"\n 总内容: {total_stats['total_notes']} 条"  
437 - finish_message += f"\n 总评论: {total_stats['total_comments']} 条"  
438 logger.info(finish_message) 451 logger.info(finish_message)
439 452
440 - platform_summary_message = f"\n 各平台统计:" 453 + platform_summary_message = f"\n📈 各平台统计:"
441 for platform, stats in total_stats["platform_summary"].items(): 454 for platform, stats in total_stats["platform_summary"].items():
442 success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0 455 success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0
443 - platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), "  
444 - platform_summary_message += f"{stats['total_notes']} 条内容" 456 + platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%)"
445 logger.info(platform_summary_message) 457 logger.info(platform_summary_message)
446 458
447 return total_stats 459 return total_stats
@@ -186,8 +186,8 @@ flowchart TB @@ -186,8 +186,8 @@ flowchart TB
186 - 记录任务状态、进度、结果等 186 - 记录任务状态、进度、结果等
187 187
188 5. **平台内容表**(继承自MediaCrawler) 188 5. **平台内容表**(继承自MediaCrawler)
189 - - xhs_note - 小红书笔记(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754)  
190 - - douyin_aweme - 抖音视频 189 + - xhs_note - 小红书笔记
  190 + - douyin_aweme - 抖音视频
191 - kuaishou_video - 快手视频 191 - kuaishou_video - 快手视频
192 - bilibili_video - B站视频 192 - bilibili_video - B站视频
193 - weibo_note - 微博帖子 193 - weibo_note - 微博帖子
@@ -204,13 +204,27 @@ flowchart TB @@ -204,13 +204,27 @@ flowchart TB
204 - 操作系统:Windows/Linux/macOS 204 - 操作系统:Windows/Linux/macOS
205 205
206 206
207 -### 1. 克隆项目 207 +### 1. 克隆项目与获取子模块
  208 +
  209 +MindSpider 作为 BettaFish 的核心组件运行。请克隆 BettaFish 主项目并同步获取 `MediaCrawler` 爬虫子模块。
  210 +
  211 +**方式一:克隆时直接获取(推荐)**
  212 +
  213 +```bash
  214 +git clone --recurse-submodules https://github.com/666ghj/BettaFish.git
  215 +cd BettaFish/MindSpider
  216 +```
  217 +
  218 +**方式二:已克隆主项目后补充拉取**
  219 +
  220 +如果你已经克隆了 BettaFish 但 `MindSpider/DeepSentimentCrawling/MediaCrawler` 目录为空,请在**项目根目录**运行:
208 221
209 ```bash 222 ```bash
210 -git clone https://github.com/yourusername/MindSpider.git  
211 -cd MindSpider 223 +git submodule update --init --recursive
212 ``` 224 ```
213 225
  226 +> **注意**:MediaCrawler 的 Python 依赖会在首次运行 `python main.py` 时由系统自动检测并静默安装到当前环境。
  227 +
214 ### 2. 创建并激活环境 228 ### 2. 创建并激活环境
215 229
216 #### Conda配置方法 230 #### Conda配置方法
@@ -316,7 +330,7 @@ python main.py --broad-topic --date 2024-01-15 @@ -316,7 +330,7 @@ python main.py --broad-topic --date 2024-01-15
316 330
317 **首次使用每个平台都需要登录,这是最关键的步骤:** 331 **首次使用每个平台都需要登录,这是最关键的步骤:**
318 332
319 -1. **小红书登录**(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754) 333 +1. **小红书登录**
320 ```bash 334 ```bash
321 # 测试小红书爬取(会弹出二维码) 335 # 测试小红书爬取(会弹出二维码)
322 python main.py --deep-sentiment --platforms xhs --test 336 python main.py --deep-sentiment --platforms xhs --test
@@ -165,6 +165,21 @@ class MindSpider: @@ -165,6 +165,21 @@ class MindSpider:
165 logger.exception(f"数据库初始化异常: {e}") 165 logger.exception(f"数据库初始化异常: {e}")
166 return False 166 return False
167 167
  168 + def _ensure_database_ready(self) -> bool:
  169 + """确保数据库表已就绪,如不存在则自动初始化"""
  170 + if not self.check_database_connection():
  171 + logger.error("数据库连接失败,无法继续")
  172 + return False
  173 +
  174 + if not self.check_database_tables():
  175 + logger.warning("数据库表不存在,自动初始化中...")
  176 + if not self.initialize_database():
  177 + logger.error("数据库自动初始化失败")
  178 + return False
  179 + logger.info("数据库表自动初始化成功")
  180 +
  181 + return True
  182 +
168 def check_dependencies(self) -> bool: 183 def check_dependencies(self) -> bool:
169 """检查依赖环境""" 184 """检查依赖环境"""
170 logger.info("检查依赖环境...") 185 logger.info("检查依赖环境...")
@@ -184,19 +199,69 @@ class MindSpider: @@ -184,19 +199,69 @@ class MindSpider:
184 logger.info("请运行: pip install -r requirements.txt") 199 logger.info("请运行: pip install -r requirements.txt")
185 return False 200 return False
186 201
187 - # 检查MediaCrawler依赖 202 + # 检查并安装MediaCrawler依赖
188 mediacrawler_path = self.deep_sentiment_path / "MediaCrawler" 203 mediacrawler_path = self.deep_sentiment_path / "MediaCrawler"
189 if not mediacrawler_path.exists(): 204 if not mediacrawler_path.exists():
190 logger.error("错误:找不到MediaCrawler目录") 205 logger.error("错误:找不到MediaCrawler目录")
191 return False 206 return False
192 207
  208 + # 自动安装MediaCrawler的依赖
  209 + self._install_mediacrawler_dependencies()
  210 +
193 logger.info("依赖环境检查通过") 211 logger.info("依赖环境检查通过")
194 return True 212 return True
195 213
  214 + def _install_mediacrawler_dependencies(self) -> bool:
  215 + """自动安装MediaCrawler子模块的依赖"""
  216 + mediacrawler_req = self.deep_sentiment_path / "MediaCrawler" / "requirements.txt"
  217 +
  218 + if not mediacrawler_req.exists():
  219 + logger.warning(f"MediaCrawler requirements.txt 不存在: {mediacrawler_req}")
  220 + return False
  221 +
  222 + # 检查是否已安装过(使用标记文件)
  223 + marker_file = self.deep_sentiment_path / "MediaCrawler" / ".deps_installed"
  224 + req_mtime = mediacrawler_req.stat().st_mtime
  225 +
  226 + if marker_file.exists():
  227 + marker_mtime = marker_file.stat().st_mtime
  228 + if marker_mtime >= req_mtime:
  229 + logger.debug("MediaCrawler依赖已安装,跳过")
  230 + return True
  231 +
  232 + logger.info("正在安装MediaCrawler依赖...")
  233 + try:
  234 + result = subprocess.run(
  235 + [sys.executable, "-m", "pip", "install", "-r", str(mediacrawler_req), "-q"],
  236 + capture_output=True,
  237 + text=True,
  238 + timeout=300 # 5分钟超时
  239 + )
  240 +
  241 + if result.returncode == 0:
  242 + # 创建标记文件
  243 + marker_file.touch()
  244 + logger.info("MediaCrawler依赖安装成功")
  245 + return True
  246 + else:
  247 + logger.error(f"MediaCrawler依赖安装失败: {result.stderr}")
  248 + return False
  249 +
  250 + except subprocess.TimeoutExpired:
  251 + logger.error("MediaCrawler依赖安装超时")
  252 + return False
  253 + except Exception as e:
  254 + logger.exception(f"MediaCrawler依赖安装异常: {e}")
  255 + return False
  256 +
196 def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool: 257 def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool:
197 """运行BroadTopicExtraction模块""" 258 """运行BroadTopicExtraction模块"""
198 logger.info("运行BroadTopicExtraction模块...") 259 logger.info("运行BroadTopicExtraction模块...")
199 260
  261 + # 自动检查并初始化数据库表
  262 + if not self._ensure_database_ready():
  263 + return False
  264 +
200 if not extract_date: 265 if not extract_date:
201 extract_date = date.today() 266 extract_date = date.today()
202 267
@@ -234,6 +299,10 @@ class MindSpider: @@ -234,6 +299,10 @@ class MindSpider:
234 """运行DeepSentimentCrawling模块""" 299 """运行DeepSentimentCrawling模块"""
235 logger.info("运行DeepSentimentCrawling模块...") 300 logger.info("运行DeepSentimentCrawling模块...")
236 301
  302 + # 自动检查并初始化数据库表
  303 + if not self._ensure_database_ready():
  304 + return False
  305 +
237 if not target_date: 306 if not target_date:
238 target_date = date.today() 307 target_date = date.today()
239 308
@@ -282,6 +351,10 @@ class MindSpider: @@ -282,6 +351,10 @@ class MindSpider:
282 """运行完整工作流程""" 351 """运行完整工作流程"""
283 logger.info("开始完整的MindSpider工作流程") 352 logger.info("开始完整的MindSpider工作流程")
284 353
  354 + # 自动检查并初始化数据库表(确保独立调用时也能自动初始化)
  355 + if not self._ensure_database_ready():
  356 + return False
  357 +
285 if not target_date: 358 if not target_date:
286 target_date = date.today() 359 target_date = date.today()
287 360
@@ -35,9 +35,11 @@ jieba==0.42.1 @@ -35,9 +35,11 @@ jieba==0.42.1
35 pymysql==1.1.0 35 pymysql==1.1.0
36 aiomysql==0.2.0 36 aiomysql==0.2.0
37 aiosqlite==0.21.0 37 aiosqlite==0.21.0
  38 +motor>=3.3.0
38 redis>=4.6.0 39 redis>=4.6.0
39 SQLAlchemy==2.0.35 40 SQLAlchemy==2.0.35
40 asyncpg==0.29.0 41 asyncpg==0.29.0
  42 +psycopg[binary]>=3.1.0
41 cryptography==42.0.7 43 cryptography==42.0.7
42 44
43 # ===== 爬虫相关 ===== 45 # ===== 爬虫相关 =====
@@ -67,6 +69,7 @@ xgboost>=2.0.0 @@ -67,6 +69,7 @@ xgboost>=2.0.0
67 # NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126 69 # NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
68 70
69 # ===== 工具库 ===== 71 # ===== 工具库 =====
  72 +typer>=0.9.0
70 python-dotenv>=1.0.0 73 python-dotenv>=1.0.0
71 python-dateutil>=2.8.2 74 python-dateutil>=2.8.2
72 pytz>=2023.3 75 pytz>=2023.3