feat: migrate MediaCrawler to git submodule and enhance MindSpider automation
Showing 5 changed files with 130 additions and 27 deletions
| @@ -340,6 +340,7 @@ test_results/ | @@ -340,6 +340,7 @@ test_results/ | ||
| 340 | # Ai操作指引文件 | 340 | # Ai操作指引文件 |
| 341 | OperationGuidance/ | 341 | OperationGuidance/ |
| 342 | 342 | ||
| 343 | +db_data/ | ||
| 343 | insight_engine_streamlit_reports/ | 344 | insight_engine_streamlit_reports/ |
| 344 | media_engine_streamlit_reports/ | 345 | media_engine_streamlit_reports/ |
| 345 | query_engine_streamlit_reports/ | 346 | query_engine_streamlit_reports/ |
| @@ -107,19 +107,34 @@ sqlite_db_config = {{ | @@ -107,19 +107,34 @@ sqlite_db_config = {{ | ||
| 107 | "db_path": SQLITE_DB_PATH | 107 | "db_path": SQLITE_DB_PATH |
| 108 | }} | 108 | }} |
| 109 | 109 | ||
| 110 | -# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量 | ||
| 111 | -POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "{pg_password}") | ||
| 112 | -POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "{pg_user}") | ||
| 113 | -POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "{pg_host}") | ||
| 114 | -POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "{pg_port}") | ||
| 115 | -POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "{pg_db_name}") | 110 | +# mongodb config |
| 111 | +MONGODB_HOST = os.getenv("MONGODB_HOST", "localhost") | ||
| 112 | +MONGODB_PORT = os.getenv("MONGODB_PORT", 27017) | ||
| 113 | +MONGODB_USER = os.getenv("MONGODB_USER", "") | ||
| 114 | +MONGODB_PWD = os.getenv("MONGODB_PWD", "") | ||
| 115 | +MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME", "media_crawler") | ||
| 116 | 116 | ||
| 117 | -postgresql_db_config = {{ | ||
| 118 | - "user": POSTGRESQL_DB_USER, | ||
| 119 | - "password": POSTGRESQL_DB_PWD, | ||
| 120 | - "host": POSTGRESQL_DB_HOST, | ||
| 121 | - "port": POSTGRESQL_DB_PORT, | ||
| 122 | - "db_name": POSTGRESQL_DB_NAME, | 117 | +mongodb_config = {{ |
| 118 | + "host": MONGODB_HOST, | ||
| 119 | + "port": int(MONGODB_PORT), | ||
| 120 | + "user": MONGODB_USER, | ||
| 121 | + "password": MONGODB_PWD, | ||
| 122 | + "db_name": MONGODB_DB_NAME, | ||
| 123 | +}} | ||
| 124 | + | ||
| 125 | +# postgres config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量 | ||
| 126 | +POSTGRES_DB_PWD = os.getenv("POSTGRES_DB_PWD", "{pg_password}") | ||
| 127 | +POSTGRES_DB_USER = os.getenv("POSTGRES_DB_USER", "{pg_user}") | ||
| 128 | +POSTGRES_DB_HOST = os.getenv("POSTGRES_DB_HOST", "{pg_host}") | ||
| 129 | +POSTGRES_DB_PORT = os.getenv("POSTGRES_DB_PORT", "{pg_port}") | ||
| 130 | +POSTGRES_DB_NAME = os.getenv("POSTGRES_DB_NAME", "{pg_db_name}") | ||
| 131 | + | ||
| 132 | +postgres_db_config = {{ | ||
| 133 | + "user": POSTGRES_DB_USER, | ||
| 134 | + "password": POSTGRES_DB_PWD, | ||
| 135 | + "host": POSTGRES_DB_HOST, | ||
| 136 | + "port": POSTGRES_DB_PORT, | ||
| 137 | + "db_name": POSTGRES_DB_NAME, | ||
| 123 | }} | 138 | }} |
| 124 | 139 | ||
| 125 | ''' | 140 | ''' |
| @@ -154,7 +169,7 @@ postgresql_db_config = {{ | @@ -154,7 +169,7 @@ postgresql_db_config = {{ | ||
| 154 | # 判断数据库类型,确定 SAVE_DATA_OPTION | 169 | # 判断数据库类型,确定 SAVE_DATA_OPTION |
| 155 | db_dialect = (config.settings.DB_DIALECT or "mysql").lower() | 170 | db_dialect = (config.settings.DB_DIALECT or "mysql").lower() |
| 156 | is_postgresql = db_dialect in ("postgresql", "postgres") | 171 | is_postgresql = db_dialect in ("postgresql", "postgres") |
| 157 | - save_data_option = "postgresql" if is_postgresql else "db" | 172 | + save_data_option = "postgres" if is_postgresql else "db" |
| 158 | 173 | ||
| 159 | base_config_path = self.mediacrawler_path / "config" / "base_config.py" | 174 | base_config_path = self.mediacrawler_path / "config" / "base_config.py" |
| 160 | 175 | ||
| @@ -238,7 +253,7 @@ postgresql_db_config = {{ | @@ -238,7 +253,7 @@ postgresql_db_config = {{ | ||
| 238 | # 判断数据库类型,确定 save_data_option | 253 | # 判断数据库类型,确定 save_data_option |
| 239 | db_dialect = (config.settings.DB_DIALECT or "mysql").lower() | 254 | db_dialect = (config.settings.DB_DIALECT or "mysql").lower() |
| 240 | is_postgresql = db_dialect in ("postgresql", "postgres") | 255 | is_postgresql = db_dialect in ("postgresql", "postgres") |
| 241 | - save_data_option = "postgresql" if is_postgresql else "db" | 256 | + save_data_option = "postgres" if is_postgresql else "db" |
| 242 | 257 | ||
| 243 | # 构建命令 | 258 | # 构建命令 |
| 244 | cmd = [ | 259 | cmd = [ |
| @@ -401,7 +416,7 @@ postgresql_db_config = {{ | @@ -401,7 +416,7 @@ postgresql_db_config = {{ | ||
| 401 | total_stats["keyword_results"][keyword] = {} | 416 | total_stats["keyword_results"][keyword] = {} |
| 402 | total_stats["keyword_results"][keyword][platform] = result | 417 | total_stats["keyword_results"][keyword][platform] = result |
| 403 | 418 | ||
| 404 | - logger.info(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论") | 419 | + logger.info(f" ✅ 爬取成功") |
| 405 | else: | 420 | else: |
| 406 | total_stats["failed_tasks"] += len(keywords) | 421 | total_stats["failed_tasks"] += len(keywords) |
| 407 | total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) | 422 | total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) |
| @@ -433,15 +448,12 @@ postgresql_db_config = {{ | @@ -433,15 +448,12 @@ postgresql_db_config = {{ | ||
| 433 | finish_message += f"\n 成功: {total_stats['successful_tasks']}" | 448 | finish_message += f"\n 成功: {total_stats['successful_tasks']}" |
| 434 | finish_message += f"\n 失败: {total_stats['failed_tasks']}" | 449 | finish_message += f"\n 失败: {total_stats['failed_tasks']}" |
| 435 | finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%" | 450 | finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%" |
| 436 | - finish_message += f"\n 总内容: {total_stats['total_notes']} 条" | ||
| 437 | - finish_message += f"\n 总评论: {total_stats['total_comments']} 条" | ||
| 438 | logger.info(finish_message) | 451 | logger.info(finish_message) |
| 439 | 452 | ||
| 440 | - platform_summary_message = f"\n� 各平台统计:" | 453 | + platform_summary_message = f"\n📈 各平台统计:" |
| 441 | for platform, stats in total_stats["platform_summary"].items(): | 454 | for platform, stats in total_stats["platform_summary"].items(): |
| 442 | success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0 | 455 | success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0 |
| 443 | - platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), " | ||
| 444 | - platform_summary_message += f"{stats['total_notes']} 条内容" | 456 | + platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%)" |
| 445 | logger.info(platform_summary_message) | 457 | logger.info(platform_summary_message) |
| 446 | 458 | ||
| 447 | return total_stats | 459 | return total_stats |
| @@ -186,8 +186,8 @@ flowchart TB | @@ -186,8 +186,8 @@ flowchart TB | ||
| 186 | - 记录任务状态、进度、结果等 | 186 | - 记录任务状态、进度、结果等 |
| 187 | 187 | ||
| 188 | 5. **平台内容表**(继承自MediaCrawler) | 188 | 5. **平台内容表**(继承自MediaCrawler) |
| 189 | - - xhs_note - 小红书笔记(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754) | ||
| 190 | - - douyin_aweme - 抖音视频 | 189 | + - xhs_note - 小红书笔记 |
| 190 | + - douyin_aweme - 抖音视频 | ||
| 191 | - kuaishou_video - 快手视频 | 191 | - kuaishou_video - 快手视频 |
| 192 | - bilibili_video - B站视频 | 192 | - bilibili_video - B站视频 |
| 193 | - weibo_note - 微博帖子 | 193 | - weibo_note - 微博帖子 |
| @@ -204,13 +204,27 @@ flowchart TB | @@ -204,13 +204,27 @@ flowchart TB | ||
| 204 | - 操作系统:Windows/Linux/macOS | 204 | - 操作系统:Windows/Linux/macOS |
| 205 | 205 | ||
| 206 | 206 | ||
| 207 | -### 1. 克隆项目 | 207 | +### 1. 克隆项目与获取子模块 |
| 208 | + | ||
| 209 | +MindSpider 作为 BettaFish 的核心组件运行。请克隆 BettaFish 主项目并同步获取 `MediaCrawler` 爬虫子模块。 | ||
| 210 | + | ||
| 211 | +**方式一:克隆时直接获取(推荐)** | ||
| 212 | + | ||
| 213 | +```bash | ||
| 214 | +git clone --recurse-submodules https://github.com/666ghj/BettaFish.git | ||
| 215 | +cd BettaFish/MindSpider | ||
| 216 | +``` | ||
| 217 | + | ||
| 218 | +**方式二:已克隆主项目后补充拉取** | ||
| 219 | + | ||
| 220 | +如果你已经克隆了 BettaFish 但 `MindSpider/DeepSentimentCrawling/MediaCrawler` 目录为空,请在**项目根目录**运行: | ||
| 208 | 221 | ||
| 209 | ```bash | 222 | ```bash |
| 210 | -git clone https://github.com/yourusername/MindSpider.git | ||
| 211 | -cd MindSpider | 223 | +git submodule update --init --recursive |
| 212 | ``` | 224 | ``` |
| 213 | 225 | ||
| 226 | +> **注意**:MediaCrawler 的 Python 依赖会在首次运行 `python main.py` 时由系统自动检测并静默安装到当前环境。 | ||
| 227 | + | ||
| 214 | ### 2. 创建并激活环境 | 228 | ### 2. 创建并激活环境 |
| 215 | 229 | ||
| 216 | #### Conda配置方法 | 230 | #### Conda配置方法 |
| @@ -316,7 +330,7 @@ python main.py --broad-topic --date 2024-01-15 | @@ -316,7 +330,7 @@ python main.py --broad-topic --date 2024-01-15 | ||
| 316 | 330 | ||
| 317 | **首次使用每个平台都需要登录,这是最关键的步骤:** | 331 | **首次使用每个平台都需要登录,这是最关键的步骤:** |
| 318 | 332 | ||
| 319 | -1. **小红书登录**(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754) | 333 | +1. **小红书登录** |
| 320 | ```bash | 334 | ```bash |
| 321 | # 测试小红书爬取(会弹出二维码) | 335 | # 测试小红书爬取(会弹出二维码) |
| 322 | python main.py --deep-sentiment --platforms xhs --test | 336 | python main.py --deep-sentiment --platforms xhs --test |
| @@ -165,6 +165,21 @@ class MindSpider: | @@ -165,6 +165,21 @@ class MindSpider: | ||
| 165 | logger.exception(f"数据库初始化异常: {e}") | 165 | logger.exception(f"数据库初始化异常: {e}") |
| 166 | return False | 166 | return False |
| 167 | 167 | ||
| 168 | + def _ensure_database_ready(self) -> bool: | ||
| 169 | + """确保数据库表已就绪,如不存在则自动初始化""" | ||
| 170 | + if not self.check_database_connection(): | ||
| 171 | + logger.error("数据库连接失败,无法继续") | ||
| 172 | + return False | ||
| 173 | + | ||
| 174 | + if not self.check_database_tables(): | ||
| 175 | + logger.warning("数据库表不存在,自动初始化中...") | ||
| 176 | + if not self.initialize_database(): | ||
| 177 | + logger.error("数据库自动初始化失败") | ||
| 178 | + return False | ||
| 179 | + logger.info("数据库表自动初始化成功") | ||
| 180 | + | ||
| 181 | + return True | ||
| 182 | + | ||
| 168 | def check_dependencies(self) -> bool: | 183 | def check_dependencies(self) -> bool: |
| 169 | """检查依赖环境""" | 184 | """检查依赖环境""" |
| 170 | logger.info("检查依赖环境...") | 185 | logger.info("检查依赖环境...") |
| @@ -184,19 +199,69 @@ class MindSpider: | @@ -184,19 +199,69 @@ class MindSpider: | ||
| 184 | logger.info("请运行: pip install -r requirements.txt") | 199 | logger.info("请运行: pip install -r requirements.txt") |
| 185 | return False | 200 | return False |
| 186 | 201 | ||
| 187 | - # 检查MediaCrawler依赖 | 202 | + # 检查并安装MediaCrawler依赖 |
| 188 | mediacrawler_path = self.deep_sentiment_path / "MediaCrawler" | 203 | mediacrawler_path = self.deep_sentiment_path / "MediaCrawler" |
| 189 | if not mediacrawler_path.exists(): | 204 | if not mediacrawler_path.exists(): |
| 190 | logger.error("错误:找不到MediaCrawler目录") | 205 | logger.error("错误:找不到MediaCrawler目录") |
| 191 | return False | 206 | return False |
| 192 | 207 | ||
| 208 | + # 自动安装MediaCrawler的依赖 | ||
| 209 | + self._install_mediacrawler_dependencies() | ||
| 210 | + | ||
| 193 | logger.info("依赖环境检查通过") | 211 | logger.info("依赖环境检查通过") |
| 194 | return True | 212 | return True |
| 195 | 213 | ||
| 214 | + def _install_mediacrawler_dependencies(self) -> bool: | ||
| 215 | + """自动安装MediaCrawler子模块的依赖""" | ||
| 216 | + mediacrawler_req = self.deep_sentiment_path / "MediaCrawler" / "requirements.txt" | ||
| 217 | + | ||
| 218 | + if not mediacrawler_req.exists(): | ||
| 219 | + logger.warning(f"MediaCrawler requirements.txt 不存在: {mediacrawler_req}") | ||
| 220 | + return False | ||
| 221 | + | ||
| 222 | + # 检查是否已安装过(使用标记文件) | ||
| 223 | + marker_file = self.deep_sentiment_path / "MediaCrawler" / ".deps_installed" | ||
| 224 | + req_mtime = mediacrawler_req.stat().st_mtime | ||
| 225 | + | ||
| 226 | + if marker_file.exists(): | ||
| 227 | + marker_mtime = marker_file.stat().st_mtime | ||
| 228 | + if marker_mtime >= req_mtime: | ||
| 229 | + logger.debug("MediaCrawler依赖已安装,跳过") | ||
| 230 | + return True | ||
| 231 | + | ||
| 232 | + logger.info("正在安装MediaCrawler依赖...") | ||
| 233 | + try: | ||
| 234 | + result = subprocess.run( | ||
| 235 | + [sys.executable, "-m", "pip", "install", "-r", str(mediacrawler_req), "-q"], | ||
| 236 | + capture_output=True, | ||
| 237 | + text=True, | ||
| 238 | + timeout=300 # 5分钟超时 | ||
| 239 | + ) | ||
| 240 | + | ||
| 241 | + if result.returncode == 0: | ||
| 242 | + # 创建标记文件 | ||
| 243 | + marker_file.touch() | ||
| 244 | + logger.info("MediaCrawler依赖安装成功") | ||
| 245 | + return True | ||
| 246 | + else: | ||
| 247 | + logger.error(f"MediaCrawler依赖安装失败: {result.stderr}") | ||
| 248 | + return False | ||
| 249 | + | ||
| 250 | + except subprocess.TimeoutExpired: | ||
| 251 | + logger.error("MediaCrawler依赖安装超时") | ||
| 252 | + return False | ||
| 253 | + except Exception as e: | ||
| 254 | + logger.exception(f"MediaCrawler依赖安装异常: {e}") | ||
| 255 | + return False | ||
| 256 | + | ||
| 196 | def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool: | 257 | def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool: |
| 197 | """运行BroadTopicExtraction模块""" | 258 | """运行BroadTopicExtraction模块""" |
| 198 | logger.info("运行BroadTopicExtraction模块...") | 259 | logger.info("运行BroadTopicExtraction模块...") |
| 199 | 260 | ||
| 261 | + # 自动检查并初始化数据库表 | ||
| 262 | + if not self._ensure_database_ready(): | ||
| 263 | + return False | ||
| 264 | + | ||
| 200 | if not extract_date: | 265 | if not extract_date: |
| 201 | extract_date = date.today() | 266 | extract_date = date.today() |
| 202 | 267 | ||
| @@ -234,6 +299,10 @@ class MindSpider: | @@ -234,6 +299,10 @@ class MindSpider: | ||
| 234 | """运行DeepSentimentCrawling模块""" | 299 | """运行DeepSentimentCrawling模块""" |
| 235 | logger.info("运行DeepSentimentCrawling模块...") | 300 | logger.info("运行DeepSentimentCrawling模块...") |
| 236 | 301 | ||
| 302 | + # 自动检查并初始化数据库表 | ||
| 303 | + if not self._ensure_database_ready(): | ||
| 304 | + return False | ||
| 305 | + | ||
| 237 | if not target_date: | 306 | if not target_date: |
| 238 | target_date = date.today() | 307 | target_date = date.today() |
| 239 | 308 | ||
| @@ -282,6 +351,10 @@ class MindSpider: | @@ -282,6 +351,10 @@ class MindSpider: | ||
| 282 | """运行完整工作流程""" | 351 | """运行完整工作流程""" |
| 283 | logger.info("开始完整的MindSpider工作流程") | 352 | logger.info("开始完整的MindSpider工作流程") |
| 284 | 353 | ||
| 354 | + # 自动检查并初始化数据库表(确保独立调用时也能自动初始化) | ||
| 355 | + if not self._ensure_database_ready(): | ||
| 356 | + return False | ||
| 357 | + | ||
| 285 | if not target_date: | 358 | if not target_date: |
| 286 | target_date = date.today() | 359 | target_date = date.today() |
| 287 | 360 |
| @@ -35,9 +35,11 @@ jieba==0.42.1 | @@ -35,9 +35,11 @@ jieba==0.42.1 | ||
| 35 | pymysql==1.1.0 | 35 | pymysql==1.1.0 |
| 36 | aiomysql==0.2.0 | 36 | aiomysql==0.2.0 |
| 37 | aiosqlite==0.21.0 | 37 | aiosqlite==0.21.0 |
| 38 | +motor>=3.3.0 | ||
| 38 | redis>=4.6.0 | 39 | redis>=4.6.0 |
| 39 | SQLAlchemy==2.0.35 | 40 | SQLAlchemy==2.0.35 |
| 40 | asyncpg==0.29.0 | 41 | asyncpg==0.29.0 |
| 42 | +psycopg[binary]>=3.1.0 | ||
| 41 | cryptography==42.0.7 | 43 | cryptography==42.0.7 |
| 42 | 44 | ||
| 43 | # ===== 爬虫相关 ===== | 45 | # ===== 爬虫相关 ===== |
| @@ -67,6 +69,7 @@ xgboost>=2.0.0 | @@ -67,6 +69,7 @@ xgboost>=2.0.0 | ||
| 67 | # NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126 | 69 | # NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126 |
| 68 | 70 | ||
| 69 | # ===== 工具库 ===== | 71 | # ===== 工具库 ===== |
| 72 | +typer>=0.9.0 | ||
| 70 | python-dotenv>=1.0.0 | 73 | python-dotenv>=1.0.0 |
| 71 | python-dateutil>=2.8.2 | 74 | python-dateutil>=2.8.2 |
| 72 | pytz>=2023.3 | 75 | pytz>=2023.3 |
-
Please register or log in to post a comment