1. Sync MediaCrawler to the latest version
2. Fix database NOT NULL errors
3. Support the PostgreSQL database
4. Standardize environment variable and configuration usage
5. Standardize installation on uv
6. Use loguru
Showing 11 changed files with 740 additions and 577 deletions.
@@ -7,11 +7,12 @@ BroadTopicExtraction模块 - 数据库管理器
 
 import sys
 import json
-from datetime import datetime, date
+from datetime import datetime, date, timedelta
 from pathlib import Path
 from typing import List, Dict, Optional
-import pymysql
-from pymysql.cursors import DictCursor
+from sqlalchemy import create_engine, text
+from sqlalchemy.engine import Engine
+from loguru import logger
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent
@@ -22,37 +23,44 @@ try:
 except ImportError:
     raise ImportError("无法导入config.py配置文件")
 
+from config import settings
+
 class DatabaseManager:
     """数据库管理器"""
 
     def __init__(self):
         """初始化数据库管理器"""
-        self.connection = None
+        self.engine: Engine = None
         self.connect()
 
     def connect(self):
         """连接数据库"""
         try:
-            self.connection = pymysql.connect(
-                host=config.DB_HOST,
-                port=config.DB_PORT,
-                user=config.DB_USER,
-                password=config.DB_PASSWORD,
-                database=config.DB_NAME,
-                charset=config.DB_CHARSET,
-                autocommit=True,
-                cursorclass=DictCursor
-            )
-            print(f"成功连接到数据库: {config.DB_NAME}")
+            dialect = (settings.DB_DIALECT or "mysql").lower()
+            if dialect in ("postgresql", "postgres"):
+                url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
+            else:
+                url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
+            self.engine = create_engine(url, future=True)
+            logger.info(f"成功连接到数据库: {settings.DB_NAME}")
+        except ModuleNotFoundError as e:
+            missing: str = str(e)
+            if "psycopg" in missing:
+                logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
+            elif "pymysql" in missing:
+                logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
+            else:
+                logger.error(f"数据库连接失败(缺少驱动): {e}")
+            raise
         except Exception as e:
-            print(f"数据库连接失败: {e}")
+            logger.error(f"数据库连接失败: {e}")
             raise
 
     def close(self):
         """关闭数据库连接"""
-        if self.connection:
-            self.connection.close()
-            print("数据库连接已关闭")
+        if self.engine:
+            self.engine.dispose()
+            logger.info("数据库连接已关闭")
 
     def __enter__(self):
         return self
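Both rewritten connection managers in this PR build the SQLAlchemy URL the same way, so the pattern is worth seeing in isolation. A minimal sketch under the assumptions visible in the diff (a `settings` object exposing DB_DIALECT, DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, and DB_CHARSET):

```python
# Sketch of the dialect switch used by connect() above; `settings`
# is assumed to match the project's config module.
from sqlalchemy import create_engine

def make_engine(settings):
    dialect = (settings.DB_DIALECT or "mysql").lower()
    if dialect in ("postgresql", "postgres"):
        # psycopg v3 driver: uv pip install "psycopg[binary]"
        url = (f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}"
               f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}")
    else:
        # PyMySQL driver: uv pip install pymysql
        url = (f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}"
               f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
               f"?charset={settings.DB_CHARSET}")
    # future=True opts into the SQLAlchemy 2.0-style API used below.
    return create_engine(url, future=True)
```

One caveat worth flagging in review: passwords containing URL-reserved characters (@, /, :) will break these f-strings; `sqlalchemy.engine.URL.create()` escapes components safely.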
@@ -79,48 +87,49 @@ class DatabaseManager:
         current_timestamp = int(datetime.now().timestamp())
 
         try:
-            cursor = self.connection.cursor()
-
-            # 先删除当天所有的新闻记录(覆盖模式)
-            delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
-            deleted_count = cursor.execute(delete_query, (crawl_date,))
-            if deleted_count > 0:
-                print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")
-
-            # 批量插入新记录
             saved_count = 0
+            # 先独立事务执行删除,防止后续插入失败导致无法清理
+            with self.engine.begin() as conn:
+                deleted = conn.execute(text("DELETE FROM daily_news WHERE crawl_date = :d"), {"d": crawl_date}).rowcount
+                if deleted and deleted > 0:
+                    logger.info(f"覆盖模式:删除了当天已有的 {deleted} 条新闻记录")
+
+            # 逐条插入,单条失败不影响后续(每条独立事务)
             for news_item in news_data:
                 try:
-                    # 简化的新闻ID生成
                     news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
-
-                    # 插入新记录
-                    insert_query = """
-                    INSERT INTO daily_news (
-                        news_id, source_platform, title, url, crawl_date,
-                        rank_position, add_ts
-                    ) VALUES (%s, %s, %s, %s, %s, %s, %s)
-                    """
-                    cursor.execute(insert_query, (
-                        news_id,
-                        news_item.get('source', 'unknown'),
-                        news_item.get('title', ''),
-                        news_item.get('url', ''),
-                        crawl_date,
-                        news_item.get('rank', None),
-                        current_timestamp
-                    ))
+                    title_val = (news_item.get("title", "") or "")
+                    if len(title_val) > 500:
+                        title_val = title_val[:500]
+                    with self.engine.begin() as conn:
+                        conn.execute(
+                            text(
+                                """
+                                INSERT INTO daily_news (
+                                    news_id, source_platform, title, url, crawl_date,
+                                    rank_position, add_ts, last_modify_ts
+                                ) VALUES (:news_id, :source_platform, :title, :url, :crawl_date, :rank_position, :add_ts, :last_modify_ts)
+                                """
+                            ),
+                            {
+                                "news_id": news_id,
+                                "source_platform": news_item.get("source", "unknown"),
+                                "title": title_val,
+                                "url": news_item.get("url", ""),
+                                "crawl_date": crawl_date,
+                                "rank_position": news_item.get("rank", None),
+                                "add_ts": current_timestamp,
+                                "last_modify_ts": current_timestamp,
+                            },
+                        )
                     saved_count += 1
-
                 except Exception as e:
-                    print(f"保存单条新闻失败: {e}")
+                    logger.warning(f"保存单条新闻失败: {e}")
                     continue
-
-            print(f"成功保存 {saved_count} 条新闻记录")
+            logger.info(f"成功保存 {saved_count} 条新闻记录")
            return saved_count
-
        except Exception as e:
-            print(f"保存新闻数据失败: {e}")
+            logger.exception(f"保存新闻数据失败: {e}")
            return 0
 
     def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
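The rewritten insert loop opens a separate `engine.begin()` transaction per row, so one bad record (for example a NOT NULL violation, point 2 of the summary) is rolled back alone while the rest of the batch still commits. Reduced to its skeleton, with table and columns abbreviated from the diff:

```python
from sqlalchemy import text

def insert_rows(engine, rows):
    saved = 0
    for row in rows:
        try:
            # engine.begin() commits on success and rolls back on
            # exception, so a failed row cannot poison later ones.
            with engine.begin() as conn:
                conn.execute(
                    text("INSERT INTO daily_news (news_id, title) VALUES (:news_id, :title)"),
                    {"news_id": row["news_id"], "title": (row.get("title") or "")[:500]},
                )
            saved += 1
        except Exception:
            continue  # skip only the offending row
    return saved
```

Per-row transactions cost extra round trips compared with one batched `executemany`, a fair trade at hot-list scale (tens of rows per day).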
@@ -136,15 +145,13 @@ class DatabaseManager:
         if not crawl_date:
             crawl_date = date.today()
 
-        query = """
-        SELECT * FROM daily_news
-        WHERE crawl_date = %s
-        ORDER BY rank_position ASC
-        """
-
-        cursor = self.connection.cursor()
-        cursor.execute(query, (crawl_date,))
-        return cursor.fetchall()
+        query = (
+            "SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC"
+        )
+        with self.engine.connect() as conn:
+            result = conn.execute(text(query), {"d": crawl_date})
+            rows = result.mappings().all()
+            return rows
 
     # ==================== 话题数据操作 ====================
 
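Throughout the file, pymysql's `%s` placeholders become named `:param` bindings inside `sqlalchemy.text()`, and DictCursor rows become `.mappings()` results. The two idioms side by side, assuming an `engine` built as in the sketch above:

```python
from datetime import date
from sqlalchemy import text

# Before (pymysql): cursor.execute("... WHERE crawl_date = %s", (d,))
# After (SQLAlchemy): values are still bound, not interpolated, so the
# query remains safe against SQL injection.
stmt = text("SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC")
with engine.connect() as conn:
    rows = conn.execute(stmt, {"d": date.today()}).mappings().all()
    # Rows are read-only RowMapping objects; copy before mutating:
    records = [dict(r) for r in rows]
```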
@@ -166,37 +173,31 @@ class DatabaseManager:
         current_timestamp = int(datetime.now().timestamp())
 
         try:
-            cursor = self.connection.cursor()
-
-            # 检查今天是否已有记录
-            check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(check_query, (extract_date,))
-            existing = cursor.fetchone()
-
             keywords_json = json.dumps(keywords, ensure_ascii=False)
-
-            if existing:
-                # 更新现有记录
-                update_query = """
-                UPDATE daily_topics
-                SET keywords = %s, summary = %s, add_ts = %s
-                WHERE extract_date = %s
-                """
-                cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
-                print(f"更新了 {extract_date} 的话题分析")
-            else:
-                # 插入新记录
-                insert_query = """
-                INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
-                VALUES (%s, %s, %s, %s)
-                """
-                cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
-                print(f"保存了 {extract_date} 的话题分析")
-
+            with self.engine.begin() as conn:
+                check = conn.execute(
+                    text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"),
+                    {"d": extract_date, "tid": "summary"},
+                ).first()
+                if check:
+                    conn.execute(
+                        text(
+                            "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid"
+                        ),
+                        {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"},
+                    )
+                    logger.info(f"更新了 {extract_date} 的话题分析")
+                else:
+                    conn.execute(
+                        text(
+                            "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)"
+                        ),
+                        {"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp},
+                    )
+                    logger.info(f"保存了 {extract_date} 的话题分析")
            return True
-
        except Exception as e:
-            print(f"保存话题分析失败: {e}")
+            logger.exception(f"保存话题分析失败: {e}")
            return False
 
     def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
@@ -213,20 +214,15 @@ class DatabaseManager:
             extract_date = date.today()
 
         try:
-            cursor = self.connection.cursor()
-            query = "SELECT * FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(query, (extract_date,))
-            result = cursor.fetchone()
-
-            if result:
-                # 解析关键词JSON
-                result['keywords'] = json.loads(result['keywords'])
-                return result
-            else:
+            with self.engine.connect() as conn:
+                result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first()
+                if result:
+                    result = dict(result)  # 转为可变dict以支持item赋值
+                    result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else []
+                    return result
                 return None
-
         except Exception as e:
-            print(f"获取话题分析失败: {e}")
+            logger.exception(f"获取话题分析失败: {e}")
             return None
 
     def get_recent_topics(self, days: int = 7) -> List[Dict]:
@@ -240,23 +236,24 @@ class DatabaseManager:
             话题分析列表
         """
         try:
-            cursor = self.connection.cursor()
-            query = """
-            SELECT * FROM daily_topics
-            WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            ORDER BY extract_date DESC
-            """
-            cursor.execute(query, (days,))
-            results = cursor.fetchall()
-
-            # 解析每个结果的关键词JSON
-            for result in results:
-                result['keywords'] = json.loads(result['keywords'])
-
-            return results
-
+            start_date = date.today() - timedelta(days=days)
+            with self.engine.connect() as conn:
+                results = conn.execute(
+                    text(
+                        """
+                        SELECT * FROM daily_topics
+                        WHERE extract_date >= :start_date
+                        ORDER BY extract_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).mappings().all()
+                results = [dict(r) for r in results]  # RowMapping是只读的,先转为可变dict(与keyword_manager一致)
+                for r in results:
+                    r["keywords"] = json.loads(r["keywords"]) if r.get("keywords") else []
+                return results
        except Exception as e:
-            print(f"获取最近话题分析失败: {e}")
+            logger.exception(f"获取最近话题分析失败: {e}")
            return []
 
     # ==================== 统计查询 ====================
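`DATE_SUB(CURDATE(), INTERVAL n DAY)` is MySQL syntax that PostgreSQL rejects; computing the cutoff client-side with `timedelta` keeps one query text valid on both backends. The idea in isolation:

```python
from datetime import date, timedelta
from sqlalchemy import text

days = 7
start_date = date.today() - timedelta(days=days)  # dialect-neutral cutoff
stmt = text(
    "SELECT * FROM daily_topics WHERE extract_date >= :start_date ORDER BY extract_date DESC"
)
# Binding {"start_date": start_date} passes a plain Python date, which
# both the pymysql and psycopg drivers convert natively.
```

(One fix folded into the hunk above: `.mappings()` rows are read-only, so they are copied into plain dicts before the `keywords` assignment, matching what this same PR already does in keyword_manager.)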
@@ -264,56 +260,48 @@ class DatabaseManager:
     def get_summary_stats(self, days: int = 7) -> Dict:
         """获取统计摘要"""
         try:
-            cursor = self.connection.cursor()
-
-            # 新闻统计
-            news_query = """
-            SELECT
-                crawl_date,
-                COUNT(*) as news_count,
-                COUNT(DISTINCT source_platform) as platforms_count
-            FROM daily_news
-            WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            GROUP BY crawl_date
-            ORDER BY crawl_date DESC
-            """
-            cursor.execute(news_query, (days,))
-            news_stats = cursor.fetchall()
-
-            # 话题统计
-            topics_query = """
-            SELECT
-                extract_date,
-                keywords,
-                CHAR_LENGTH(summary) as summary_length
-            FROM daily_topics
-            WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            ORDER BY extract_date DESC
-            """
-            cursor.execute(topics_query, (days,))
-            topics_stats = cursor.fetchall()
-
-            return {
-                'news_stats': news_stats,
-                'topics_stats': topics_stats
-            }
-
+            start_date = date.today() - timedelta(days=days)
+            with self.engine.connect() as conn:
+                news_stats = conn.execute(
+                    text(
+                        """
+                        SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms_count
+                        FROM daily_news
+                        WHERE crawl_date >= :start_date
+                        GROUP BY crawl_date
+                        ORDER BY crawl_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).all()
+                topics_stats = conn.execute(
+                    text(
+                        """
+                        SELECT extract_date, keywords, CHAR_LENGTH(topic_description) as summary_length
+                        FROM daily_topics
+                        WHERE extract_date >= :start_date
+                        ORDER BY extract_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).all()
+                return {"news_stats": news_stats, "topics_stats": topics_stats}
        except Exception as e:
-            print(f"获取统计摘要失败: {e}")
-            return {'news_stats': [], 'topics_stats': []}
+            logger.exception(f"获取统计摘要失败: {e}")
+            return {"news_stats": [], "topics_stats": []}
 
 if __name__ == "__main__":
     # 测试数据库管理器
     with DatabaseManager() as db:
         # 测试获取新闻
         news = db.get_daily_news()
-        print(f"今日新闻数量: {len(news)}")
+        logger.info(f"今日新闻数量: {len(news)}")
 
         # 测试获取话题
         topics = db.get_daily_topics()
         if topics:
-            print(f"今日话题关键词: {topics['keywords']}")
+            logger.info(f"今日话题关键词: {topics['keywords']}")
         else:
-            print("今日暂无话题分析")
+            logger.info("今日暂无话题分析")
 
-    print("简化数据库管理器测试完成!")
+    logger.info("简化数据库管理器测试完成!")
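Point 6 of the summary replaces every `print()` in these modules with loguru, which needs no setup for sensible stderr output; levels (info/warning/error/exception) replace ad-hoc message prefixes. A minimal sketch — the extra file sink and its path are illustrative, not part of this PR:

```python
from loguru import logger

logger.info("成功连接到数据库: {}", "mindspider")  # default stderr sink, formatted

try:
    1 / 0
except ZeroDivisionError:
    # logger.exception logs at ERROR level and appends the traceback,
    # which is why the rewritten except blocks prefer it over error().
    logger.exception("保存新闻数据失败")

# Hypothetical extra sink with daily rotation:
logger.add("logs/mindspider_{time:YYYY-MM-DD}.log", rotation="00:00", level="INFO")
```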
@@ -11,6 +11,7 @@ import argparse
 from datetime import datetime, date
 from pathlib import Path
 from typing import List, Dict, Optional
+from loguru import logger
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent
@@ -21,8 +22,8 @@ try:
     from BroadTopicExtraction.topic_extractor import TopicExtractor
     from BroadTopicExtraction.database_manager import DatabaseManager
 except ImportError as e:
-    print(f"导入模块失败: {e}")
-    print("请确保在项目根目录运行,并且已安装所有依赖")
+    logger.exception(f"导入模块失败: {e}")
+    logger.error("请确保在项目根目录运行,并且已安装所有依赖")
     sys.exit(1)
 
 class BroadTopicExtraction:
@@ -34,7 +35,7 @@ class BroadTopicExtraction:
         self.topic_extractor = TopicExtractor()
         self.db_manager = DatabaseManager()
 
-        print("BroadTopicExtraction 初始化完成")
+        logger.info("BroadTopicExtraction 初始化完成")
 
     def close(self):
         """关闭资源"""
@@ -68,21 +69,22 @@
         Returns:
             包含完整提取结果的字典
         """
-        print("\n" + "=" * 80)
-        print("MindSpider AI爬虫 - 每日话题提取")
-        print("=" * 80)
-        print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-        print(f"目标日期: {date.today()}")
+        extraction_result_message = ""
+        extraction_result_message += "\nMindSpider AI爬虫 - 每日话题提取\n"
+        extraction_result_message += f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
+        extraction_result_message += f"目标日期: {date.today()}\n"
 
         if news_sources:
-            print(f"指定平台: {len(news_sources)} 个")
+            extraction_result_message += f"指定平台: {len(news_sources)} 个\n"
             for source in news_sources:
                 source_name = SOURCE_NAMES.get(source, source)
-                print(f"  - {source_name}")
+                extraction_result_message += f"  - {source_name}\n"
         else:
-            print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
+            extraction_result_message += f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台\n"
 
-        print(f"关键词数: 最多 {max_keywords} 个")
+        extraction_result_message += f"关键词数: 最多 {max_keywords} 个\n"
+
+        logger.info(extraction_result_message)
 
         extraction_result = {
             'success': False,
@@ -96,7 +98,7 @@
 
         try:
             # 步骤1: 收集新闻
-            print("\n【步骤1】收集热点新闻...")
+            logger.info("【步骤1】收集热点新闻...")
             news_result = await self.news_collector.collect_and_save_news(
                 sources=news_sources
             )
@@ -112,7 +114,7 @@
                 raise Exception("新闻收集失败或没有获取到新闻")
 
             # 步骤2: 提取关键词和生成总结
-            print("\n【步骤2】提取关键词和生成总结...")
+            logger.info("【步骤2】提取关键词和生成总结...")
             keywords, summary = self.topic_extractor.extract_keywords_and_summary(
                 news_result['news_list'],
                 max_keywords=max_keywords
@@ -126,10 +128,10 @@
             }
 
             if not keywords:
-                print("警告: 没有提取到有效关键词")
+                logger.warning("警告: 没有提取到有效关键词")
 
             # 步骤3: 保存到数据库
-            print("\n【步骤3】保存分析结果到数据库...")
+            logger.info("【步骤3】保存分析结果到数据库...")
             save_success = self.db_manager.save_daily_topics(
                 keywords, summary, date.today()
             )
@@ -141,56 +143,47 @@
             extraction_result['success'] = True
             extraction_result['end_time'] = datetime.now().isoformat()
 
-            print("\n" + "=" * 80)
-            print("每日话题提取流程完成!")
-            print("=" * 80)
+            logger.info("每日话题提取流程完成!")
 
             return extraction_result
 
         except Exception as e:
-            print(f"\n话题提取流程失败: {e}")
+            logger.exception(f"话题提取流程失败: {e}")
             extraction_result['error'] = str(e)
             extraction_result['end_time'] = datetime.now().isoformat()
             return extraction_result
 
     def print_extraction_results(self, extraction_result: Dict):
         """打印提取结果"""
-        print("\n" + "=" * 80)
-        print("话题提取结果报告")
-        print("=" * 80)
-
-        if not extraction_result['success']:
-            print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
-            return
+        extraction_result_message = ""
 
         # 新闻收集结果
         news_data = extraction_result.get('news_collection', {})
-        print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
-        print(f"   成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
+        extraction_result_message += f"\n📰 新闻收集: {news_data.get('total_news', 0)} 条新闻\n"
+        extraction_result_message += f"   成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}\n"
 
         # 话题提取结果
         topic_data = extraction_result.get('topic_extraction', {})
         keywords = topic_data.get('keywords', [])
         summary = topic_data.get('summary', '')
 
-        print(f"\n🔑 提取关键词: {len(keywords)} 个")
+        extraction_result_message += f"\n🔑 提取关键词: {len(keywords)} 个\n"
         if keywords:
             # 每行显示5个关键词
             for i in range(0, len(keywords), 5):
                 keyword_group = keywords[i:i+5]
-                print(f"   {', '.join(keyword_group)}")
+                extraction_result_message += f"   {', '.join(keyword_group)}\n"
 
-        print(f"\n📝 新闻总结:")
-        print(f"   {summary}")
+        extraction_result_message += f"\n📝 新闻总结:\n   {summary}\n"
 
         # 数据库保存结果
         db_data = extraction_result.get('database_save', {})
         if db_data.get('success'):
-            print(f"\n💾 数据库保存: 成功")
+            extraction_result_message += f"\n💾 数据库保存: 成功\n"
         else:
-            print(f"\n💾 数据库保存: 失败")
+            extraction_result_message += f"\n💾 数据库保存: 失败\n"
 
-        print("\n" + "=" * 80)
+        logger.info(extraction_result_message)
 
     def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
         """
@@ -207,7 +200,7 @@ class BroadTopicExtraction:
             topics_data = self.db_manager.get_daily_topics(extract_date)
 
             if not topics_data:
-                print(f"没有找到 {extract_date or date.today()} 的话题数据")
+                logger.info(f"没有找到 {extract_date or date.today()} 的话题数据")
                 return []
 
             keywords = topics_data['keywords']
@@ -215,11 +208,11 @@ class BroadTopicExtraction:
             # 生成搜索关键词
             search_keywords = self.topic_extractor.get_search_keywords(keywords)
 
-            print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
+            logger.info(f"准备了 {len(search_keywords)} 个关键词用于爬取")
             return search_keywords
 
         except Exception as e:
-            print(f"获取爬取关键词失败: {e}")
+            logger.error(f"获取爬取关键词失败: {e}")
             return []
 
| 225 | def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]: | 218 | def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]: |
| @@ -227,7 +220,7 @@ class BroadTopicExtraction: | @@ -227,7 +220,7 @@ class BroadTopicExtraction: | ||
| 227 | try: | 220 | try: |
| 228 | return self.db_manager.get_daily_topics(target_date) | 221 | return self.db_manager.get_daily_topics(target_date) |
| 229 | except Exception as e: | 222 | except Exception as e: |
| 230 | - print(f"获取每日分析失败: {e}") | 223 | + logger.error(f"获取每日分析失败: {e}") |
| 231 | return None | 224 | return None |
| 232 | 225 | ||
| 233 | def get_recent_analysis(self, days: int = 7) -> List[Dict]: | 226 | def get_recent_analysis(self, days: int = 7) -> List[Dict]: |
@@ -235,7 +228,7 @@ class BroadTopicExtraction:
         try:
             return self.db_manager.get_recent_topics(days)
         except Exception as e:
-            print(f"获取最近分析失败: {e}")
+            logger.error(f"获取最近分析失败: {e}")
             return []
 
 # ==================== 命令行工具 ====================
@@ -260,17 +253,17 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
         news_data = result.get('news_collection', {})
         topic_data = result.get('topic_extraction', {})
 
-        print(f"✅ 话题提取成功完成!")
-        print(f"   收集新闻: {news_data.get('total_news', 0)} 条")
-        print(f"   提取关键词: {len(topic_data.get('keywords', []))} 个")
-        print(f"   生成总结: {len(topic_data.get('summary', ''))} 字符")
+        logger.info(f"✅ 话题提取成功完成!")
+        logger.info(f"   收集新闻: {news_data.get('total_news', 0)} 条")
+        logger.info(f"   提取关键词: {len(topic_data.get('keywords', []))} 个")
+        logger.info(f"   生成总结: {len(topic_data.get('summary', ''))} 字符")
 
         # 获取爬取关键词
         crawling_keywords = extractor.get_keywords_for_crawling()
 
         if crawling_keywords:
-            print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
-            print(f"   {', '.join(crawling_keywords)}")
+            logger.info(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
+            logger.info(f"   {', '.join(crawling_keywords)}")
 
             # 保存关键词到文件
             keywords_file = project_root / "data" / "daily_keywords.txt"
@@ -279,16 +272,16 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
             with open(keywords_file, 'w', encoding='utf-8') as f:
                 f.write('\n'.join(crawling_keywords))
 
-            print(f"   关键词已保存到: {keywords_file}")
+            logger.info(f"   关键词已保存到: {keywords_file}")
 
             return True
 
         else:
-            print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
+            logger.error(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
             return False
 
     except Exception as e:
-        print(f"❌ 执行过程中发生错误: {e}")
+        logger.error(f"❌ 执行过程中发生错误: {e}")
         return False
 
 def main():
@@ -304,14 +297,14 @@ def main():
 
     # 显示支持的新闻源
     if args.list_sources:
-        print("支持的新闻源平台:")
+        logger.info("支持的新闻源平台:")
         for source, name in SOURCE_NAMES.items():
-            print(f"  {source:<25} {name}")
+            logger.info(f"  {source:<25} {name}")
         return
 
     # 验证参数
     if args.keywords < 1 or args.keywords > 200:
-        print("关键词数量应在1-200之间")
+        logger.error("关键词数量应在1-200之间")
         sys.exit(1)
 
     # 运行提取
@@ -325,7 +318,7 @@ def main():
         sys.exit(0 if success else 1)
 
     except KeyboardInterrupt:
-        print("\n用户中断操作")
+        logger.info("用户中断操作")
         sys.exit(1)
 
 if __name__ == "__main__":
@@ -18,19 +18,20 @@ sys.path.append(str(project_root))
 
 try:
     import config
+    from config import settings
 except ImportError:
-    raise ImportError("无法导入config.py配置文件")
+    raise ImportError("无法导入settings.py配置文件")
 
 class TopicExtractor:
     """话题提取器"""
-    
+
     def __init__(self):
         """初始化话题提取器"""
         self.client = OpenAI(
-            api_key=config.DEEPSEEK_API_KEY,
-            base_url="https://api.deepseek.com"
+            api_key=settings.MINDSPIDER_API_KEY,
+            base_url=settings.MINDSPIDER_BASE_URL
         )
-        self.model = "deepseek-chat"
+        self.model = settings.MINDSPIDER_MODEL_NAME
 
     def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
         """
@@ -11,8 +11,8 @@ from datetime import date, timedelta, datetime
 from pathlib import Path
 from typing import List, Dict, Optional
 import random
-import pymysql
-from pymysql.cursors import DictCursor
+from sqlalchemy import create_engine, text
+from sqlalchemy.engine import Engine
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent
@@ -23,30 +23,38 @@ try:
 except ImportError:
     raise ImportError("无法导入config.py配置文件")
 
+from config import settings
+from loguru import logger
+
 class KeywordManager:
     """关键词管理器"""
 
     def __init__(self):
         """初始化关键词管理器"""
-        self.connection = None
+        self.engine: Engine = None
         self.connect()
 
     def connect(self):
         """连接数据库"""
         try:
-            self.connection = pymysql.connect(
-                host=config.DB_HOST,
-                port=config.DB_PORT,
-                user=config.DB_USER,
-                password=config.DB_PASSWORD,
-                database=config.DB_NAME,
-                charset=config.DB_CHARSET,
-                autocommit=True,
-                cursorclass=DictCursor
-            )
-            print(f"关键词管理器成功连接到数据库: {config.DB_NAME}")
+            dialect = (settings.DB_DIALECT or "mysql").lower()
+            if dialect in ("postgresql", "postgres"):
+                url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
+            else:
+                url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
+            self.engine = create_engine(url, future=True)
+            logger.info(f"关键词管理器成功连接到数据库: {settings.DB_NAME}")
+        except ModuleNotFoundError as e:
+            missing: str = str(e)
+            if "psycopg" in missing:
+                logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
+            elif "pymysql" in missing:
+                logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
+            else:
+                logger.error(f"数据库连接失败(缺少驱动): {e}")
+            raise
         except Exception as e:
-            print(f"关键词管理器数据库连接失败: {e}")
+            logger.exception(f"关键词管理器数据库连接失败: {e}")
             raise
 
     def get_latest_keywords(self, target_date: date = None, max_keywords: int = 100) -> List[str]:
@@ -63,24 +71,24 @@ class KeywordManager:
         if not target_date:
             target_date = date.today()
 
-        print(f"正在获取 {target_date} 的关键词...")
+        logger.info(f"正在获取 {target_date} 的关键词...")
 
         # 首先尝试获取指定日期的关键词
         topics_data = self.get_daily_topics(target_date)
 
         if topics_data and topics_data.get('keywords'):
             keywords = topics_data['keywords']
-            print(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")
+            logger.info(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")
 
             # 如果关键词太多,随机选择指定数量
             if len(keywords) > max_keywords:
                 keywords = random.sample(keywords, max_keywords)
-                print(f"随机选择了 {max_keywords} 个关键词")
+                logger.info(f"随机选择了 {max_keywords} 个关键词")
 
             return keywords
 
         # 如果没有当天的关键词,尝试获取最近几天的
-        print(f"{target_date} 没有关键词数据,尝试获取最近的关键词...")
+        logger.info(f"{target_date} 没有关键词数据,尝试获取最近的关键词...")
         recent_topics = self.get_recent_topics(days=7)
 
         if recent_topics:
@@ -95,11 +103,11 @@ class KeywordManager:
             if len(unique_keywords) > max_keywords:
                 unique_keywords = random.sample(unique_keywords, max_keywords)
 
-            print(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
+            logger.info(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
             return unique_keywords
 
         # 如果都没有,返回默认关键词
-        print("没有找到任何关键词数据,使用默认关键词")
+        logger.info("没有找到任何关键词数据,使用默认关键词")
         return self._get_default_keywords()
 
     def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
@@ -116,20 +124,22 @@ class KeywordManager:
             extract_date = date.today()
 
         try:
-            cursor = self.connection.cursor()
-            query = "SELECT * FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(query, (extract_date,))
-            result = cursor.fetchone()
+            with self.engine.connect() as conn:
+                result = conn.execute(
+                    text("SELECT * FROM daily_topics WHERE extract_date = :d"),
+                    {"d": extract_date},
+                ).mappings().first()
 
             if result:
-                # 解析关键词JSON
-                result['keywords'] = json.loads(result['keywords'])
+                # 转为可变dict再赋值
+                result = dict(result)
+                result['keywords'] = json.loads(result['keywords']) if result.get('keywords') else []
                 return result
             else:
                 return None
 
         except Exception as e:
-            print(f"获取话题分析失败: {e}")
+            logger.exception(f"获取话题分析失败: {e}")
             return None
 
     def get_recent_topics(self, days: int = 7) -> List[Dict]:
@@ -143,23 +153,28 @@ class KeywordManager:
             话题分析列表
         """
         try:
-            cursor = self.connection.cursor()
-            query = """
-            SELECT * FROM daily_topics
-            WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            ORDER BY extract_date DESC
-            """
-            cursor.execute(query, (days,))
-            results = cursor.fetchall()
+            start_date = date.today() - timedelta(days=days)
+            with self.engine.connect() as conn:
+                results = conn.execute(
+                    text(
+                        """
+                        SELECT * FROM daily_topics
+                        WHERE extract_date >= :start_date
+                        ORDER BY extract_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).mappings().all()
 
-            # 解析每个结果的关键词JSON
+            # 转为可变dict列表再处理
+            results = [dict(r) for r in results]
             for result in results:
-                result['keywords'] = json.loads(result['keywords'])
+                result['keywords'] = json.loads(result['keywords']) if result.get('keywords') else []
 
             return results
 
         except Exception as e:
-            print(f"获取最近话题分析失败: {e}")
+            logger.exception(f"获取最近话题分析失败: {e}")
             return []
 
     def _get_default_keywords(self) -> List[str]:
@@ -190,8 +205,8 @@ class KeywordManager:
         keywords = self.get_latest_keywords(target_date, max_keywords)
 
         if keywords:
-            print(f"为 {len(platforms)} 个平台准备了相同的 {len(keywords)} 个关键词")
-            print(f"每个关键词将在所有平台上进行爬取")
+            logger.info(f"为 {len(platforms)} 个平台准备了相同的 {len(keywords)} 个关键词")
+            logger.info(f"每个关键词将在所有平台上进行爬取")
 
         return keywords
 
@@ -210,7 +225,7 @@ class KeywordManager:
         """
         keywords = self.get_latest_keywords(target_date, max_keywords)
 
-        print(f"为平台 {platform} 准备了 {len(keywords)} 个关键词(与其他平台相同)")
+        logger.info(f"为平台 {platform} 准备了 {len(keywords)} 个关键词(与其他平台相同)")
         return keywords
 
     def _filter_keywords_by_platform(self, keywords: List[str], platform: str) -> List[str]:
@@ -290,9 +305,9 @@ class KeywordManager:
 
     def close(self):
         """关闭数据库连接"""
-        if self.connection:
-            self.connection.close()
-            print("关键词管理器数据库连接已关闭")
+        if self.engine:
+            self.engine.dispose()
+            logger.info("关键词管理器数据库连接已关闭")
 
     def __enter__(self):
         return self
@@ -305,16 +320,16 @@ if __name__ == "__main__":
     with KeywordManager() as km:
         # 测试获取关键词
         keywords = km.get_latest_keywords(max_keywords=20)
-        print(f"获取到的关键词: {keywords}")
+        logger.info(f"获取到的关键词: {keywords}")
 
         # 测试平台分配
         platforms = ['xhs', 'dy', 'bili']
         distribution = km.distribute_keywords_by_platform(keywords, platforms)
         for platform, kws in distribution.items():
-            print(f"{platform}: {kws}")
+            logger.info(f"{platform}: {kws}")
 
         # 测试爬取摘要
         summary = km.get_crawling_summary()
-        print(f"爬取摘要: {summary}")
+        logger.info(f"爬取摘要: {summary}")
 
-    print("关键词管理器测试完成!")
+    logger.info("关键词管理器测试完成!")
@@ -13,6 +13,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Optional
 import json
+from loguru import logger
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent
@@ -36,11 +37,15 @@ class PlatformCrawler:
         if not self.mediacrawler_path.exists():
             raise FileNotFoundError(f"MediaCrawler目录不存在: {self.mediacrawler_path}")
 
-        print(f"初始化平台爬虫管理器,MediaCrawler路径: {self.mediacrawler_path}")
+        logger.info(f"初始化平台爬虫管理器,MediaCrawler路径: {self.mediacrawler_path}")
 
     def configure_mediacrawler_db(self):
-        """配置MediaCrawler使用我们的MySQL数据库"""
+        """配置MediaCrawler使用我们的数据库(MySQL或PostgreSQL)"""
         try:
+            # 判断数据库类型
+            db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
+            is_postgresql = db_dialect in ("postgresql", "postgres")
+
             # 修改MediaCrawler的数据库配置
             db_config_path = self.mediacrawler_path / "config" / "db_config.py"
 
@@ -48,7 +53,14 @@ class PlatformCrawler:
             with open(db_config_path, 'r', encoding='utf-8') as f:
                 content = f.read()
 
-            # 替换数据库配置
+            # PostgreSQL配置值:如果使用PostgreSQL则使用MindSpider配置,否则使用默认值或环境变量
+            pg_password = config.settings.DB_PASSWORD if is_postgresql else "bettafish"
+            pg_user = config.settings.DB_USER if is_postgresql else "bettafish"
+            pg_host = config.settings.DB_HOST if is_postgresql else "127.0.0.1"
+            pg_port = config.settings.DB_PORT if is_postgresql else 5432
+            pg_db_name = config.settings.DB_NAME if is_postgresql else "bettafish"
+
+            # 替换数据库配置 - 使用MindSpider的数据库配置
             new_config = f'''# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
@@ -63,11 +75,19 @@ class PlatformCrawler:
 import os
 
 # mysql config - 使用MindSpider的数据库配置
-MYSQL_DB_PWD = "{config.DB_PASSWORD}"
-MYSQL_DB_USER = "{config.DB_USER}"
-MYSQL_DB_HOST = "{config.DB_HOST}"
-MYSQL_DB_PORT = {config.DB_PORT}
-MYSQL_DB_NAME = "{config.DB_NAME}"
+MYSQL_DB_PWD = "{config.settings.DB_PASSWORD}"
+MYSQL_DB_USER = "{config.settings.DB_USER}"
+MYSQL_DB_HOST = "{config.settings.DB_HOST}"
+MYSQL_DB_PORT = {config.settings.DB_PORT}
+MYSQL_DB_NAME = "{config.settings.DB_NAME}"
+
+mysql_db_config = {{
+    "user": MYSQL_DB_USER,
+    "password": MYSQL_DB_PWD,
+    "host": MYSQL_DB_HOST,
+    "port": MYSQL_DB_PORT,
+    "db_name": MYSQL_DB_NAME,
+}}
 
 
 # redis config
@@ -81,17 +101,39 @@ CACHE_TYPE_REDIS = "redis"
 CACHE_TYPE_MEMORY = "memory"
 
 # sqlite config
-SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")'''
+SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "database", "sqlite_tables.db")
+
+sqlite_db_config = {{
+    "db_path": SQLITE_DB_PATH
+}}
+
+# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
+POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "{pg_password}")
+POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "{pg_user}")
+POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "{pg_host}")
+POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "{pg_port}")
+POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "{pg_db_name}")
+
+postgresql_db_config = {{
+    "user": POSTGRESQL_DB_USER,
+    "password": POSTGRESQL_DB_PWD,
+    "host": POSTGRESQL_DB_HOST,
+    "port": POSTGRESQL_DB_PORT,
+    "db_name": POSTGRESQL_DB_NAME,
+}}
+
+'''
 
             # 写入新配置
             with open(db_config_path, 'w', encoding='utf-8') as f:
                 f.write(new_config)
 
-            print("已配置MediaCrawler使用MindSpider数据库")
+            db_type = "PostgreSQL" if is_postgresql else "MySQL"
+            logger.info(f"已配置MediaCrawler使用MindSpider {db_type}数据库")
             return True
 
         except Exception as e:
-            print(f"配置MediaCrawler数据库失败: {e}")
+            logger.exception(f"配置MediaCrawler数据库失败: {e}")
             return False
 
     def create_base_config(self, platform: str, keywords: List[str],
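In the generated db_config.py, each PostgreSQL value is read from the environment first and only falls back to the literal baked in at generation time. Worth noting in review: `os.getenv` always returns a string, so `POSTGRESQL_DB_PORT` ends up as `"5432"` rather than `5432`, unlike the `MYSQL_DB_PORT` integer literal above it. The pattern in isolation:

```python
import os

# Environment variable wins; the generated literal is only a default.
POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "5432")
port = int(POSTGRESQL_DB_PORT)  # consumers needing a number must cast
```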
@@ -109,6 +151,11 @@ class PlatformCrawler:
             是否配置成功
         """
         try:
+            # 判断数据库类型,确定 SAVE_DATA_OPTION
+            db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
+            is_postgresql = db_dialect in ("postgresql", "postgres")
+            save_data_option = "postgresql" if is_postgresql else "db"
+
             base_config_path = self.mediacrawler_path / "config" / "base_config.py"
 
             # 将关键词列表转换为逗号分隔的字符串
@@ -130,7 +177,7 @@ class PlatformCrawler:
                 elif line.startswith('CRAWLER_TYPE = '):
                     new_lines.append(f'CRAWLER_TYPE = "{crawler_type}"  # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)')
                 elif line.startswith('SAVE_DATA_OPTION = '):
-                    new_lines.append('SAVE_DATA_OPTION = "db"  # csv or db or json or sqlite')
+                    new_lines.append(f'SAVE_DATA_OPTION = "{save_data_option}"  # csv or db or json or sqlite or postgresql')
                 elif line.startswith('CRAWLER_MAX_NOTES_COUNT = '):
                     new_lines.append(f'CRAWLER_MAX_NOTES_COUNT = {max_notes}')
                 elif line.startswith('ENABLE_GET_COMMENTS = '):
@@ -146,11 +193,11 @@ class PlatformCrawler:
             with open(base_config_path, 'w', encoding='utf-8') as f:
                 f.write('\n'.join(new_lines))
 
-            print(f"已配置 {platform} 平台,关键词数量: {len(keywords)}")
+            logger.info(f"已配置 {platform} 平台,爬取类型: {crawler_type},关键词数量: {len(keywords)},最大爬取数量: {max_notes},保存数据方式: {save_data_option}")
             return True
 
         except Exception as e:
-            print(f"创建基础配置失败: {e}")
+            logger.exception(f"创建基础配置失败: {e}")
             return False
 
     def run_crawler(self, platform: str, keywords: List[str],
| @@ -173,8 +220,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -173,8 +220,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 173 | if not keywords: | 220 | if not keywords: |
| 174 | raise ValueError("关键词列表不能为空") | 221 | raise ValueError("关键词列表不能为空") |
| 175 | 222 | ||
| 176 | - print(f"\n开始爬取平台: {platform}") | ||
| 177 | - print(f"关键词: {keywords[:5]}{'...' if len(keywords) > 5 else ''} (共{len(keywords)}个)") | 223 | + start_message = f"\n开始爬取平台: {platform}" |
| 224 | + start_message += f"\n关键词: {keywords[:5]}{'...' if len(keywords) > 5 else ''} (共{len(keywords)}个)" | ||
| 225 | + logger.info(start_message) | ||
| 178 | 226 | ||
| 179 | start_time = datetime.now() | 227 | start_time = datetime.now() |
| 180 | 228 | ||
| @@ -187,22 +235,27 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -187,22 +235,27 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 187 | if not self.create_base_config(platform, keywords, "search", max_notes): | 235 | if not self.create_base_config(platform, keywords, "search", max_notes): |
| 188 | return {"success": False, "error": "基础配置创建失败"} | 236 | return {"success": False, "error": "基础配置创建失败"} |
| 189 | 237 | ||
| 238 | + # 判断数据库类型,确定 save_data_option | ||
| 239 | + db_dialect = (config.settings.DB_DIALECT or "mysql").lower() | ||
| 240 | + is_postgresql = db_dialect in ("postgresql", "postgres") | ||
| 241 | + save_data_option = "postgresql" if is_postgresql else "db" | ||
| 242 | + | ||
| 190 | # 构建命令 | 243 | # 构建命令 |
| 191 | cmd = [ | 244 | cmd = [ |
| 192 | sys.executable, "main.py", | 245 | sys.executable, "main.py", |
| 193 | "--platform", platform, | 246 | "--platform", platform, |
| 194 | "--lt", login_type, | 247 | "--lt", login_type, |
| 195 | "--type", "search", | 248 | "--type", "search", |
| 196 | - "--save_data_option", "db" | 249 | + "--save_data_option", save_data_option, |
| 197 | ] | 250 | ] |
| 198 | 251 | ||
| 199 | - print(f"执行命令: {' '.join(cmd)}") | 252 | + logger.info(f"执行命令: {' '.join(cmd)}") |
| 200 | 253 | ||
| 201 | # 切换到MediaCrawler目录并执行 | 254 | # 切换到MediaCrawler目录并执行 |
| 202 | result = subprocess.run( | 255 | result = subprocess.run( |
| 203 | cmd, | 256 | cmd, |
| 204 | cwd=self.mediacrawler_path, | 257 | cwd=self.mediacrawler_path, |
| 205 | - timeout=1800 # 30分钟超时 | 258 | + timeout=3600 # 60分钟超时 |
| 206 | ) | 259 | ) |
| 207 | 260 | ||
| 208 | end_time = datetime.now() | 261 | end_time = datetime.now() |
| @@ -226,17 +279,17 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -226,17 +279,17 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 226 | self.crawl_stats[platform] = crawl_stats | 279 | self.crawl_stats[platform] = crawl_stats |
| 227 | 280 | ||
| 228 | if result.returncode == 0: | 281 | if result.returncode == 0: |
| 229 | - print(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒") | 282 | + logger.info(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒") |
| 230 | else: | 283 | else: |
| 231 | - print(f"❌ {platform} 爬取失败,返回码: {result.returncode}") | 284 | + logger.error(f"❌ {platform} 爬取失败,返回码: {result.returncode}") |
| 232 | 285 | ||
| 233 | return crawl_stats | 286 | return crawl_stats |
| 234 | 287 | ||
| 235 | except subprocess.TimeoutExpired: | 288 | except subprocess.TimeoutExpired: |
| 236 | - print(f"❌ {platform} 爬取超时") | 289 | + logger.exception(f"❌ {platform} 爬取超时") |
| 237 | return {"success": False, "error": "爬取超时", "platform": platform} | 290 | return {"success": False, "error": "爬取超时", "platform": platform} |
| 238 | except Exception as e: | 291 | except Exception as e: |
| 239 | - print(f"❌ {platform} 爬取异常: {e}") | 292 | + logger.exception(f"❌ {platform} 爬取异常: {e}") |
| 240 | return {"success": False, "error": str(e), "platform": platform} | 293 | return {"success": False, "error": str(e), "platform": platform} |
| 241 | 294 | ||
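A sketch of the same invocation with output capture — an assumption on my part, since the `subprocess.run` call above does not capture stdout/stderr even though `_parse_crawl_output` below expects log lines; the CLI flags are exactly the ones built above:

```python
import subprocess
import sys

def run_mediacrawler(mediacrawler_path: str, platform: str, login_type: str,
                     save_data_option: str, timeout: int = 3600):
    """Run MediaCrawler's main.py; return (returncode, stdout lines, stderr lines)."""
    cmd = [
        sys.executable, "main.py",
        "--platform", platform,
        "--lt", login_type,
        "--type", "search",
        "--save_data_option", save_data_option,
    ]
    result = subprocess.run(
        cmd,
        cwd=mediacrawler_path,
        capture_output=True,  # collect logs so the parser has input to work with
        text=True,
        timeout=timeout,      # raises subprocess.TimeoutExpired when exceeded
    )
    return result.returncode, result.stdout.splitlines(), result.stderr.splitlines()
```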
| 242 | def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict: | 295 | def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict: |
| @@ -291,10 +344,14 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -291,10 +344,14 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 291 | Returns: | 344 | Returns: |
| 292 | 总体爬取统计 | 345 | 总体爬取统计 |
| 293 | """ | 346 | """ |
| 294 | - print(f"\n🚀 开始全平台关键词爬取") | ||
| 295 | - print(f" 关键词数量: {len(keywords)}") | ||
| 296 | - print(f" 平台数量: {len(platforms)}") | ||
| 297 | - print(f" 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}") | 347 | + |
| 348 | + start_message = f"\n🚀 开始全平台关键词爬取" | ||
| 349 | + start_message += f"\n 关键词数量: {len(keywords)}" | ||
| 350 | + start_message += f"\n 平台数量: {len(platforms)}" | ||
| 351 | + start_message += f"\n 登录方式: {login_type}" | ||
| 352 | + start_message += f"\n 每个关键词在每个平台的最大爬取数量: {max_notes_per_keyword}" | ||
| 353 | + start_message += f"\n 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}" | ||
| 354 | + logger.info(start_message) | ||
| 298 | 355 | ||
| 299 | total_stats = { | 356 | total_stats = { |
| 300 | "total_keywords": len(keywords), | 357 | "total_keywords": len(keywords), |
| @@ -319,8 +376,8 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -319,8 +376,8 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 319 | 376 | ||
| 320 | # 对每个平台一次性爬取所有关键词 | 377 | # 对每个平台一次性爬取所有关键词 |
| 321 | for platform in platforms: | 378 | for platform in platforms: |
| 322 | - print(f"\n📝 在 {platform} 平台爬取所有关键词") | ||
| 323 | - print(f" 关键词: {', '.join(keywords[:5])}{'...' if len(keywords) > 5 else ''}") | 379 | + logger.info(f"\n📝 在 {platform} 平台爬取所有关键词") |
| 380 | + logger.info(f" 关键词: {', '.join(keywords[:5])}{'...' if len(keywords) > 5 else ''}") | ||
| 324 | 381 | ||
| 325 | try: | 382 | try: |
| 326 | # 一次性传递所有关键词给平台 | 383 | # 一次性传递所有关键词给平台 |
| @@ -344,7 +401,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -344,7 +401,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 344 | total_stats["keyword_results"][keyword] = {} | 401 | total_stats["keyword_results"][keyword] = {} |
| 345 | total_stats["keyword_results"][keyword][platform] = result | 402 | total_stats["keyword_results"][keyword][platform] = result |
| 346 | 403 | ||
| 347 | - print(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论") | 404 | + logger.info(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论") |
| 348 | else: | 405 | else: |
| 349 | total_stats["failed_tasks"] += len(keywords) | 406 | total_stats["failed_tasks"] += len(keywords) |
| 350 | total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) | 407 | total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) |
| @@ -355,7 +412,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -355,7 +412,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 355 | total_stats["keyword_results"][keyword] = {} | 412 | total_stats["keyword_results"][keyword] = {} |
| 356 | total_stats["keyword_results"][keyword][platform] = result | 413 | total_stats["keyword_results"][keyword][platform] = result |
| 357 | 414 | ||
| 358 | - print(f" ❌ 失败: {result.get('error', '未知错误')}") | 415 | + logger.error(f" ❌ 失败: {result.get('error', '未知错误')}") |
| 359 | 416 | ||
| 360 | except Exception as e: | 417 | except Exception as e: |
| 361 | total_stats["failed_tasks"] += len(keywords) | 418 | total_stats["failed_tasks"] += len(keywords) |
| @@ -368,22 +425,24 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -368,22 +425,24 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 368 | total_stats["keyword_results"][keyword] = {} | 425 | total_stats["keyword_results"][keyword] = {} |
| 369 | total_stats["keyword_results"][keyword][platform] = error_result | 426 | total_stats["keyword_results"][keyword][platform] = error_result |
| 370 | 427 | ||
| 371 | - print(f" ❌ 异常: {e}") | 428 | + logger.error(f" ❌ 异常: {e}") |
| 372 | 429 | ||
| 373 | # 打印详细统计 | 430 | # 打印详细统计 |
| 374 | - print(f"\n📊 全平台关键词爬取完成!") | ||
| 375 | - print(f" 总任务: {total_stats['total_tasks']}") | ||
| 376 | - print(f" 成功: {total_stats['successful_tasks']}") | ||
| 377 | - print(f" 失败: {total_stats['failed_tasks']}") | ||
| 378 | - print(f" 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%") | ||
| 379 | - print(f" 总内容: {total_stats['total_notes']} 条") | ||
| 380 | - print(f" 总评论: {total_stats['total_comments']} 条") | 431 | + finish_message = f"\n📊 全平台关键词爬取完成!" |
| 432 | + finish_message += f"\n 总任务: {total_stats['total_tasks']}" | ||
| 433 | + finish_message += f"\n 成功: {total_stats['successful_tasks']}" | ||
| 434 | + finish_message += f"\n 失败: {total_stats['failed_tasks']}" | ||
| 435 | + finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%" | ||
| 436 | + finish_message += f"\n 总内容: {total_stats['total_notes']} 条" | ||
| 437 | + finish_message += f"\n 总评论: {total_stats['total_comments']} 条" | ||
| 438 | + logger.info(finish_message) | ||
| 381 | 439 | ||
| 382 | - print(f"\n📈 各平台统计:") | 440 | + platform_summary_message = f"\n📈 各平台统计:" |
| 383 | for platform, stats in total_stats["platform_summary"].items(): | 441 | for platform, stats in total_stats["platform_summary"].items(): |
| 384 | success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0 | 442 | success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0 |
| 385 | - print(f" {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), " | ||
| 386 | - f"{stats['total_notes']} 条内容") | 443 | + platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), " |
| 444 | + platform_summary_message += f"{stats['total_notes']} 条内容" | ||
| 445 | + logger.info(platform_summary_message) | ||
| 387 | 446 | ||
| 388 | return total_stats | 447 | return total_stats |
| 389 | 448 | ||
| @@ -403,9 +462,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | @@ -403,9 +462,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem | ||
| 403 | try: | 462 | try: |
| 404 | with open(log_path, 'w', encoding='utf-8') as f: | 463 | with open(log_path, 'w', encoding='utf-8') as f: |
| 405 | json.dump(self.crawl_stats, f, ensure_ascii=False, indent=2) | 464 | json.dump(self.crawl_stats, f, ensure_ascii=False, indent=2) |
| 406 | - print(f"爬取日志已保存到: {log_path}") | 465 | + logger.info(f"爬取日志已保存到: {log_path}") |
| 407 | except Exception as e: | 466 | except Exception as e: |
| 408 | - print(f"保存爬取日志失败: {e}") | 467 | + logger.exception(f"保存爬取日志失败: {e}") |
| 409 | 468 | ||
| 410 | if __name__ == "__main__": | 469 | if __name__ == "__main__": |
| 411 | # 测试平台爬虫管理器 | 470 | # 测试平台爬虫管理器 |
| @@ -415,5 +474,5 @@ if __name__ == "__main__": | @@ -415,5 +474,5 @@ if __name__ == "__main__": | ||
| 415 | test_keywords = ["科技", "AI", "编程"] | 474 | test_keywords = ["科技", "AI", "编程"] |
| 416 | result = crawler.run_crawler("xhs", test_keywords, max_notes=5) | 475 | result = crawler.run_crawler("xhs", test_keywords, max_notes=5) |
| 417 | 476 | ||
| 418 | - print(f"测试结果: {result}") | ||
| 419 | - print("平台爬虫管理器测试完成!") | 477 | + logger.info(f"测试结果: {result}") |
| 478 | + logger.info("平台爬虫管理器测试完成!") |
| @@ -217,26 +217,54 @@ git clone https://github.com/yourusername/MindSpider.git | @@ -217,26 +217,54 @@ git clone https://github.com/yourusername/MindSpider.git | ||
| 217 | cd MindSpider | 217 | cd MindSpider |
| 218 | ``` | 218 | ``` |
| 219 | 219 | ||
| 220 | -### 2. 创建并激活Conda环境 | 220 | +### 2. 创建并激活环境 |
| 221 | + | ||
| 222 | +#### Conda配置方法 | ||
| 221 | 225 | ||
| 222 | ```bash | 226 | ```bash |
| 227 | +# 创建名为 pytorch_python11 的conda环境并指定Python版本 | ||
| 223 | conda create -n pytorch_python11 python=3.11 | 228 | conda create -n pytorch_python11 python=3.11 |
| 229 | +# 激活该环境 | ||
| 224 | conda activate pytorch_python11 | 230 | conda activate pytorch_python11 |
| 225 | ``` | 231 | ``` |
| 226 | 232 | ||
| 233 | +#### UV配置方法 | ||
| 234 | + | ||
| 235 | +> UV 是一款快速、轻量的 Python 包与虚拟环境管理工具,适合依赖少、追求便捷管理的场景。参考:https://github.com/astral-sh/uv | ||
| 236 | + | ||
| 237 | +- 安装uv(如未安装) | ||
| 238 | +```bash | ||
| 239 | +pip install uv | ||
| 240 | +``` | ||
| 241 | +- 创建虚拟环境并激活 | ||
| 242 | +```bash | ||
| 243 | +uv venv --python 3.11 # 创建3.11环境 | ||
| 244 | +source .venv/bin/activate # Linux/macOS | ||
| 245 | +# 或 | ||
| 246 | +.venv\Scripts\activate # Windows | ||
| 247 | +``` | ||
| 248 | + | ||
| 249 | + | ||
| 227 | ### 3. 安装依赖 | 250 | ### 3. 安装依赖 |
| 228 | 251 | ||
| 229 | ```bash | 252 | ```bash |
| 230 | # 安装Python依赖 | 253 | # 安装Python依赖 |
| 231 | pip install -r requirements.txt | 254 | pip install -r requirements.txt |
| 232 | 255 | ||
| 256 | +# 或使用 uv 安装(速度更快) | ||
| 258 | +uv pip install -r requirements.txt | ||
| 259 | + | ||
| 260 | + | ||
| 233 | # 安装Playwright浏览器驱动 | 261 | # 安装Playwright浏览器驱动 |
| 234 | playwright install | 262 | playwright install |
| 235 | ``` | 263 | ``` |
| 236 | 264 | ||
| 237 | ### 4. 配置系统 | 265 | ### 4. 配置系统 |
| 238 | 266 | ||
| 239 | -编辑 `config.py` 文件,设置数据库和API配置: | 267 | +复制.env.example文件为.env文件,放置在项目根目录。编辑 `.env` 文件,设置数据库和API配置: |
| 240 | 268 | ||
| 241 | ```python | 269 | ```python |
| 242 | # MySQL数据库配置 | 270 | # MySQL数据库配置 |
| @@ -248,7 +276,9 @@ DB_NAME = "mindspider" | @@ -248,7 +276,9 @@ DB_NAME = "mindspider" | ||
| 248 | DB_CHARSET = "utf8mb4" | 276 | DB_CHARSET = "utf8mb4" |
| 249 | 277 | ||
| 250 | # DeepSeek API密钥 | 278 | # DeepSeek API密钥 |
| 251 | -DEEPSEEK_API_KEY = "your_deepseek_api_key" | 279 | +MINDSPIDER_BASE_URL=your_api_base_url |
| 280 | +MINDSPIDER_API_KEY=sk-your-key | ||
| 281 | +MINDSPIDER_MODEL_NAME=deepseek-chat | ||
| 252 | ``` | 282 | ``` |
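A quick way to confirm the `.env` is picked up — a hedged sketch using the pydantic-settings `Settings` class added in `config.py` (diffed further below). Note that `DB_DIALECT` (defaulting to `mysql`; set it to `postgresql` for PG) is also read from `.env`, even though the sample above does not list it:

```python
from config import settings

print(settings.DB_DIALECT)             # "mysql" (default) or "postgresql"
print(settings.MINDSPIDER_MODEL_NAME)  # e.g. "deepseek-chat"
assert settings.MINDSPIDER_API_KEY, "MINDSPIDER_API_KEY 未配置"
```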
| 253 | 283 | ||
| 254 | ### 5. 初始化系统 | 284 | ### 5. 初始化系统 |
| @@ -418,6 +448,11 @@ python main.py --status | @@ -418,6 +448,11 @@ python main.py --status | ||
| 418 | ```bash | 448 | ```bash |
| 419 | # 重新安装 | 449 | # 重新安装 |
| 420 | pip install playwright | 450 | pip install playwright |
| 451 | + | ||
| 452 | +# 或使用 uv | ||
| 453 | + | ||
| 454 | +uv pip install playwright | ||
| 455 | + | ||
| 421 | playwright install | 456 | playwright install |
| 422 | ``` | 457 | ``` |
| 423 | 458 |
| @@ -3,13 +3,33 @@ | @@ -3,13 +3,33 @@ | ||
| 3 | 存储数据库连接信息和API密钥 | 3 | 存储数据库连接信息和API密钥 |
| 4 | """ | 4 | """ |
| 5 | 5 | ||
| 6 | -# MySQL数据库配置 | ||
| 7 | -DB_HOST = "your_host" | ||
| 8 | -DB_PORT = 3306 | ||
| 9 | -DB_USER = "your_username" | ||
| 10 | -DB_PASSWORD = "your_password" | ||
| 11 | -DB_NAME = "mindspider" | ||
| 12 | -DB_CHARSET = "utf8mb4" | ||
| 13 | - | ||
| 14 | -# DeepSeek API密钥 | ||
| 15 | -DEEPSEEK_API_KEY = "your_deepseek_api_key" | 6 | +from pydantic_settings import BaseSettings, SettingsConfigDict |
| 7 | +from typing import Optional | ||
| 8 | +from pydantic import Field | ||
| 9 | +from pathlib import Path | ||
| 10 | + | ||
| 11 | +# 计算 .env 优先级:优先当前工作目录,其次项目根目录(MindSpider 的上级目录) | ||
| 12 | +PROJECT_ROOT: Path = Path(__file__).resolve().parents[1] | ||
| 13 | +CWD_ENV: Path = Path.cwd() / ".env" | ||
| 14 | +ENV_FILE: str = str(CWD_ENV if CWD_ENV.exists() else (PROJECT_ROOT / ".env")) | ||
| 15 | + | ||
| 16 | +class Settings(BaseSettings): | ||
| 17 | + """全局配置管理,优先从环境变量和.env加载。支持MySQL/PostgreSQL统一数据库参数命名。""" | ||
| 18 | + DB_DIALECT: str = Field("mysql", description="数据库类型,支持'mysql'或'postgresql'") | ||
| 19 | + DB_HOST: str = Field("your_host", description="数据库主机名或IP地址") | ||
| 20 | + DB_PORT: int = Field(3306, description="数据库端口号") | ||
| 21 | + DB_USER: str = Field("your_username", description="数据库用户名") | ||
| 22 | + DB_PASSWORD: str = Field("your_password", description="数据库密码") | ||
| 23 | + DB_NAME: str = Field("mindspider", description="数据库名称") | ||
| 24 | + DB_CHARSET: str = Field("utf8mb4", description="数据库字符集") | ||
| 25 | + MINDSPIDER_API_KEY: Optional[str] = Field(None, description="MINDSPIDER API密钥") | ||
| 26 | + MINDSPIDER_BASE_URL: Optional[str] = Field("https://api.deepseek.com", description="MINDSPIDER API基础URL,推荐deepseek-chat模型使用https://api.deepseek.com") | ||
| 27 | + MINDSPIDER_MODEL_NAME: Optional[str] = Field("deepseek-chat", description="MINDSPIDER API模型名称, 推荐deepseek-chat") | ||
| 28 | + | ||
| 29 | +    model_config = SettingsConfigDict( | ||
| 30 | +        env_file=ENV_FILE, | ||
| 31 | +        env_prefix="", | ||
| 32 | +        case_sensitive=False, | ||
| 33 | +        extra="allow", | ||
| 34 | +    ) | ||
| 34 | + | ||
| 35 | +settings = Settings() |
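Usage sketch for the `Settings` object above: derive the synchronous SQLAlchemy URL from `DB_DIALECT` the same way the `DatabaseManager.connect` implementations in this commit do (credentials are placeholders read from `.env`):

```python
from sqlalchemy import create_engine, text

from config import settings

def build_sync_url() -> str:
    dialect = (settings.DB_DIALECT or "mysql").lower()
    if dialect in ("postgresql", "postgres"):
        return (f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}"
                f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}")
    return (f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}"
            f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
            f"?charset={settings.DB_CHARSET}")

engine = create_engine(build_sync_url(), future=True)
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))  # connection smoke test
```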
| @@ -11,8 +11,13 @@ import argparse | @@ -11,8 +11,13 @@ import argparse | ||
| 11 | from datetime import date, datetime | 11 | from datetime import date, datetime |
| 12 | from pathlib import Path | 12 | from pathlib import Path |
| 13 | import subprocess | 13 | import subprocess |
| 14 | +import asyncio | ||
| 14 | import pymysql | 15 | import pymysql |
| 15 | from pymysql.cursors import DictCursor | 16 | from pymysql.cursors import DictCursor |
| 17 | +from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine | ||
| 18 | +from sqlalchemy import inspect, text | ||
| 19 | +from config import settings | ||
| 20 | +from loguru import logger | ||
| 16 | 21 | ||
| 17 | # 添加项目根目录到路径 | 22 | # 添加项目根目录到路径 |
| 18 | project_root = Path(__file__).parent | 23 | project_root = Path(__file__).parent |
| @@ -21,8 +26,8 @@ sys.path.append(str(project_root)) | @@ -21,8 +26,8 @@ sys.path.append(str(project_root)) | ||
| 21 | try: | 26 | try: |
| 22 | import config | 27 | import config |
| 23 | except ImportError: | 28 | except ImportError: |
| 24 | - print("错误:无法导入config.py配置文件") | ||
| 25 | - print("请确保项目根目录下存在config.py文件,并包含数据库和API配置信息") | 29 | + logger.error("错误:无法导入config.py配置文件") |
| 30 | + logger.error("请确保项目根目录下存在config.py文件,并包含数据库和API配置信息") | ||
| 26 | sys.exit(1) | 31 | sys.exit(1) |
| 27 | 32 | ||
| 28 | class MindSpider: | 33 | class MindSpider: |
| @@ -35,99 +40,110 @@ class MindSpider: | @@ -35,99 +40,110 @@ class MindSpider: | ||
| 35 | self.deep_sentiment_path = self.project_root / "DeepSentimentCrawling" | 40 | self.deep_sentiment_path = self.project_root / "DeepSentimentCrawling" |
| 36 | self.schema_path = self.project_root / "schema" | 41 | self.schema_path = self.project_root / "schema" |
| 37 | 42 | ||
| 38 | - print("MindSpider AI爬虫项目") | ||
| 39 | - print(f"项目路径: {self.project_root}") | 43 | + logger.info("MindSpider AI爬虫项目") |
| 44 | + logger.info(f"项目路径: {self.project_root}") | ||
| 40 | 45 | ||
| 41 | def check_config(self) -> bool: | 46 | def check_config(self) -> bool: |
| 42 | """检查基础配置""" | 47 | """检查基础配置""" |
| 43 | - print("\n检查基础配置...") | 48 | + logger.info("检查基础配置...") |
| 44 | 49 | ||
| 45 | - # 检查config.py配置项 | 50 | + # 检查settings配置项 |
| 46 | required_configs = [ | 51 | required_configs = [ |
| 47 | 'DB_HOST', 'DB_PORT', 'DB_USER', 'DB_PASSWORD', 'DB_NAME', 'DB_CHARSET', | 52 | 'DB_HOST', 'DB_PORT', 'DB_USER', 'DB_PASSWORD', 'DB_NAME', 'DB_CHARSET', |
| 48 | - 'DEEPSEEK_API_KEY' | 53 | + 'MINDSPIDER_API_KEY', 'MINDSPIDER_BASE_URL', 'MINDSPIDER_MODEL_NAME' |
| 49 | ] | 54 | ] |
| 50 | 55 | ||
| 51 | missing_configs = [] | 56 | missing_configs = [] |
| 52 | for config_name in required_configs: | 57 | for config_name in required_configs: |
| 53 | - if not hasattr(config, config_name) or not getattr(config, config_name): | 58 | + if not hasattr(settings, config_name) or not getattr(settings, config_name): |
| 54 | missing_configs.append(config_name) | 59 | missing_configs.append(config_name) |
| 55 | 60 | ||
| 56 | if missing_configs: | 61 | if missing_configs: |
| 57 | - print(f"配置缺失: {', '.join(missing_configs)}") | ||
| 58 | - print("请检查config.py文件中的配置信息") | 62 | + logger.error(f"配置缺失: {', '.join(missing_configs)}") |
| 63 | + logger.error("请检查.env文件或环境变量中的配置信息") | ||
| 59 | return False | 64 | return False |
| 60 | 65 | ||
| 61 | - print("基础配置检查通过") | 66 | + logger.info("基础配置检查通过") |
| 62 | return True | 67 | return True |
| 63 | 68 | ||
| 64 | def check_database_connection(self) -> bool: | 69 | def check_database_connection(self) -> bool: |
| 65 | """检查数据库连接""" | 70 | """检查数据库连接""" |
| 66 | - print("\n检查数据库连接...") | ||
| 67 | - | ||
| 68 | - try: | ||
| 69 | - connection = pymysql.connect( | ||
| 70 | - host=config.DB_HOST, | ||
| 71 | - port=config.DB_PORT, | ||
| 72 | - user=config.DB_USER, | ||
| 73 | - password=config.DB_PASSWORD, | ||
| 74 | - database=config.DB_NAME, | ||
| 75 | - charset=config.DB_CHARSET, | ||
| 76 | - cursorclass=DictCursor | 71 | + logger.info("检查数据库连接...") |
| 72 | + | ||
| 73 | + def build_async_url() -> str: | ||
| 74 | + dialect = (settings.DB_DIALECT or "mysql").lower() | ||
| 75 | + if dialect in ("postgresql", "postgres"): | ||
| 76 | + return f"postgresql+asyncpg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}" | ||
| 77 | + # 默认使用 mysql 异步驱动 asyncmy | ||
| 78 | + return ( | ||
| 79 | + f"mysql+asyncmy://{settings.DB_USER}:{settings.DB_PASSWORD}" | ||
| 80 | + f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}" | ||
| 77 | ) | 81 | ) |
| 78 | - connection.close() | ||
| 79 | - print("数据库连接正常") | 82 | + |
| 83 | + async def _test_connection(db_url: str) -> None: | ||
| 84 | + engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True) | ||
| 85 | + try: | ||
| 86 | + async with engine.connect() as conn: | ||
| 87 | + await conn.execute(text("SELECT 1")) | ||
| 88 | + finally: | ||
| 89 | + await engine.dispose() | ||
| 90 | + | ||
| 91 | + try: | ||
| 92 | + db_url: str = build_async_url() | ||
| 93 | + asyncio.run(_test_connection(db_url)) | ||
| 94 | + logger.info("数据库连接正常") | ||
| 80 | return True | 95 | return True |
| 81 | except Exception as e: | 96 | except Exception as e: |
| 82 | - print(f"数据库连接失败: {e}") | 97 | + logger.exception(f"数据库连接失败: {e}") |
| 83 | return False | 98 | return False |
| 84 | 99 | ||
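For orientation, the driver split this commit settles on — blocking drivers for the database managers, async drivers for these health checks; all four are assumed to be installed separately, as the error handlers elsewhere in the commit point out:

```python
# Sync managers use blocking drivers; the async health checks use asyncio-native ones.
SYNC_URL_PREFIX = {"mysql": "mysql+pymysql", "postgresql": "postgresql+psycopg"}
ASYNC_URL_PREFIX = {"mysql": "mysql+asyncmy", "postgresql": "postgresql+asyncpg"}
```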
| 85 | def check_database_tables(self) -> bool: | 100 | def check_database_tables(self) -> bool: |
| 86 | """检查数据库表是否存在""" | 101 | """检查数据库表是否存在""" |
| 87 | - print("\n检查数据库表...") | ||
| 88 | - | ||
| 89 | - try: | ||
| 90 | - connection = pymysql.connect( | ||
| 91 | - host=config.DB_HOST, | ||
| 92 | - port=config.DB_PORT, | ||
| 93 | - user=config.DB_USER, | ||
| 94 | - password=config.DB_PASSWORD, | ||
| 95 | - database=config.DB_NAME, | ||
| 96 | - charset=config.DB_CHARSET, | ||
| 97 | - cursorclass=DictCursor | 102 | + logger.info("检查数据库表...") |
| 103 | + | ||
| 104 | + def build_async_url() -> str: | ||
| 105 | + dialect = (settings.DB_DIALECT or "mysql").lower() | ||
| 106 | + if dialect in ("postgresql", "postgres"): | ||
| 107 | + return f"postgresql+asyncpg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}" | ||
| 108 | + return ( | ||
| 109 | + f"mysql+asyncmy://{settings.DB_USER}:{settings.DB_PASSWORD}" | ||
| 110 | + f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}" | ||
| 98 | ) | 111 | ) |
| 99 | - | ||
| 100 | - cursor = connection.cursor() | ||
| 101 | - | ||
| 102 | - # 检查核心表是否存在 | 112 | + |
| 113 | + async def _check_tables(db_url: str) -> list[str]: | ||
| 114 | + engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True) | ||
| 115 | + try: | ||
| 116 | + async with engine.connect() as conn: | ||
| 117 | + def _get_tables(sync_conn): | ||
| 118 | + return inspect(sync_conn).get_table_names() | ||
| 119 | + tables = await conn.run_sync(_get_tables) | ||
| 120 | + return tables | ||
| 121 | + finally: | ||
| 122 | + await engine.dispose() | ||
| 123 | + | ||
| 124 | + try: | ||
| 125 | + db_url: str = build_async_url() | ||
| 126 | + existing_tables = asyncio.run(_check_tables(db_url)) | ||
| 103 | required_tables = ['daily_news', 'daily_topics'] | 127 | required_tables = ['daily_news', 'daily_topics'] |
| 104 | - cursor.execute("SHOW TABLES") | ||
| 105 | - existing_tables = [row[f'Tables_in_{config.DB_NAME}'] for row in cursor.fetchall()] | ||
| 106 | - | ||
| 107 | - missing_tables = [table for table in required_tables if table not in existing_tables] | ||
| 108 | - | ||
| 109 | - connection.close() | ||
| 110 | - | 128 | + missing_tables = [t for t in required_tables if t not in existing_tables] |
| 111 | if missing_tables: | 129 | if missing_tables: |
| 112 | - print(f"缺少数据库表: {', '.join(missing_tables)}") | 130 | + logger.error(f"缺少数据库表: {', '.join(missing_tables)}") |
| 113 | return False | 131 | return False |
| 114 | - else: | ||
| 115 | - print("数据库表检查通过") | ||
| 116 | - return True | ||
| 117 | - | 132 | + logger.info("数据库表检查通过") |
| 133 | + return True | ||
| 118 | except Exception as e: | 134 | except Exception as e: |
| 119 | - print(f"检查数据库表失败: {e}") | 135 | + logger.exception(f"检查数据库表失败: {e}") |
| 120 | return False | 136 | return False |
| 121 | 137 | ||
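The `run_sync` bridge above is needed because SQLAlchemy's `inspect()` is synchronous-only; a standalone sketch of the same pattern (placeholder URL):

```python
import asyncio

from sqlalchemy import inspect
from sqlalchemy.ext.asyncio import create_async_engine

async def list_tables(db_url: str) -> list[str]:
    """inspect() cannot run on an AsyncConnection, so bridge it via conn.run_sync()."""
    engine = create_async_engine(db_url, pool_pre_ping=True)
    try:
        async with engine.connect() as conn:
            return await conn.run_sync(lambda sync_conn: inspect(sync_conn).get_table_names())
    finally:
        await engine.dispose()

# print(asyncio.run(list_tables("mysql+asyncmy://user:pwd@localhost:3306/mindspider")))
```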
| 122 | def initialize_database(self) -> bool: | 138 | def initialize_database(self) -> bool: |
| 123 | """初始化数据库""" | 139 | """初始化数据库""" |
| 124 | - print("\n初始化数据库...") | 140 | + logger.info("初始化数据库...") |
| 125 | 141 | ||
| 126 | try: | 142 | try: |
| 127 | # 运行数据库初始化脚本 | 143 | # 运行数据库初始化脚本 |
| 128 | init_script = self.schema_path / "init_database.py" | 144 | init_script = self.schema_path / "init_database.py" |
| 129 | if not init_script.exists(): | 145 | if not init_script.exists(): |
| 130 | - print("错误:找不到数据库初始化脚本") | 146 | + logger.error("错误:找不到数据库初始化脚本") |
| 131 | return False | 147 | return False |
| 132 | 148 | ||
| 133 | result = subprocess.run( | 149 | result = subprocess.run( |
| @@ -138,19 +154,19 @@ class MindSpider: | @@ -138,19 +154,19 @@ class MindSpider: | ||
| 138 | ) | 154 | ) |
| 139 | 155 | ||
| 140 | if result.returncode == 0: | 156 | if result.returncode == 0: |
| 141 | - print("数据库初始化成功") | 157 | + logger.info("数据库初始化成功") |
| 142 | return True | 158 | return True |
| 143 | else: | 159 | else: |
| 144 | - print(f"数据库初始化失败: {result.stderr}") | 160 | + logger.error(f"数据库初始化失败: {result.stderr}") |
| 145 | return False | 161 | return False |
| 146 | 162 | ||
| 147 | except Exception as e: | 163 | except Exception as e: |
| 148 | - print(f"数据库初始化异常: {e}") | 164 | + logger.exception(f"数据库初始化异常: {e}") |
| 149 | return False | 165 | return False |
| 150 | 166 | ||
| 151 | def check_dependencies(self) -> bool: | 167 | def check_dependencies(self) -> bool: |
| 152 | """检查依赖环境""" | 168 | """检查依赖环境""" |
| 153 | - print("\n检查依赖环境...") | 169 | + logger.info("检查依赖环境...") |
| 154 | 170 | ||
| 155 | # 检查Python包 | 171 | # 检查Python包 |
| 156 | required_packages = ['pymysql', 'requests', 'playwright'] | 172 | required_packages = ['pymysql', 'requests', 'playwright'] |
| @@ -163,22 +179,22 @@ class MindSpider: | @@ -163,22 +179,22 @@ class MindSpider: | ||
| 163 | missing_packages.append(package) | 179 | missing_packages.append(package) |
| 164 | 180 | ||
| 165 | if missing_packages: | 181 | if missing_packages: |
| 166 | - print(f"缺少Python包: {', '.join(missing_packages)}") | ||
| 167 | - print("请运行: pip install -r requirements.txt") | 182 | + logger.error(f"缺少Python包: {', '.join(missing_packages)}") |
| 183 | + logger.info("请运行: pip install -r requirements.txt") | ||
| 168 | return False | 184 | return False |
| 169 | 185 | ||
| 170 | # 检查MediaCrawler依赖 | 186 | # 检查MediaCrawler依赖 |
| 171 | mediacrawler_path = self.deep_sentiment_path / "MediaCrawler" | 187 | mediacrawler_path = self.deep_sentiment_path / "MediaCrawler" |
| 172 | if not mediacrawler_path.exists(): | 188 | if not mediacrawler_path.exists(): |
| 173 | - print("错误:找不到MediaCrawler目录") | 189 | + logger.error("错误:找不到MediaCrawler目录") |
| 174 | return False | 190 | return False |
| 175 | 191 | ||
| 176 | - print("依赖环境检查通过") | 192 | + logger.info("依赖环境检查通过") |
| 177 | return True | 193 | return True |
| 178 | 194 | ||
| 179 | def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool: | 195 | def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool: |
| 180 | """运行BroadTopicExtraction模块""" | 196 | """运行BroadTopicExtraction模块""" |
| 181 | - print(f"\n运行BroadTopicExtraction模块...") | 197 | + logger.info("运行BroadTopicExtraction模块...") |
| 182 | 198 | ||
| 183 | if not extract_date: | 199 | if not extract_date: |
| 184 | extract_date = date.today() | 200 | extract_date = date.today() |
| @@ -186,11 +202,10 @@ class MindSpider: | @@ -186,11 +202,10 @@ class MindSpider: | ||
| 186 | try: | 202 | try: |
| 187 | cmd = [ | 203 | cmd = [ |
| 188 | sys.executable, "main.py", | 204 | sys.executable, "main.py", |
| 189 | - "--date", extract_date.strftime("%Y-%m-%d"), | ||
| 190 | "--keywords", str(keywords_count) | 205 | "--keywords", str(keywords_count) |
| 191 | ] | 206 | ] |
| 192 | 207 | ||
| 193 | - print(f"执行命令: {' '.join(cmd)}") | 208 | + logger.info(f"执行命令: {' '.join(cmd)}") |
| 194 | 209 | ||
| 195 | result = subprocess.run( | 210 | result = subprocess.run( |
| 196 | cmd, | 211 | cmd, |
| @@ -199,24 +214,24 @@ class MindSpider: | @@ -199,24 +214,24 @@ class MindSpider: | ||
| 199 | ) | 214 | ) |
| 200 | 215 | ||
| 201 | if result.returncode == 0: | 216 | if result.returncode == 0: |
| 202 | - print("BroadTopicExtraction模块执行成功") | 217 | + logger.info("BroadTopicExtraction模块执行成功") |
| 203 | return True | 218 | return True |
| 204 | else: | 219 | else: |
| 205 | - print(f"BroadTopicExtraction模块执行失败,返回码: {result.returncode}") | 220 | + logger.error(f"BroadTopicExtraction模块执行失败,返回码: {result.returncode}") |
| 206 | return False | 221 | return False |
| 207 | 222 | ||
| 208 | except subprocess.TimeoutExpired: | 223 | except subprocess.TimeoutExpired: |
| 209 | - print("BroadTopicExtraction模块执行超时") | 224 | + logger.error("BroadTopicExtraction模块执行超时") |
| 210 | return False | 225 | return False |
| 211 | except Exception as e: | 226 | except Exception as e: |
| 212 | - print(f"BroadTopicExtraction模块执行异常: {e}") | 227 | + logger.exception(f"BroadTopicExtraction模块执行异常: {e}") |
| 213 | return False | 228 | return False |
| 214 | 229 | ||
| 215 | def run_deep_sentiment_crawling(self, target_date: date = None, platforms: list = None, | 230 | def run_deep_sentiment_crawling(self, target_date: date = None, platforms: list = None, |
| 216 | max_keywords: int = 50, max_notes: int = 50, | 231 | max_keywords: int = 50, max_notes: int = 50, |
| 217 | test_mode: bool = False) -> bool: | 232 | test_mode: bool = False) -> bool: |
| 218 | """运行DeepSentimentCrawling模块""" | 233 | """运行DeepSentimentCrawling模块""" |
| 219 | - print(f"\n运行DeepSentimentCrawling模块...") | 234 | + logger.info("运行DeepSentimentCrawling模块...") |
| 220 | 235 | ||
| 221 | if not target_date: | 236 | if not target_date: |
| 222 | target_date = date.today() | 237 | target_date = date.today() |
| @@ -238,7 +253,7 @@ class MindSpider: | @@ -238,7 +253,7 @@ class MindSpider: | ||
| 238 | if test_mode: | 253 | if test_mode: |
| 239 | cmd.append("--test") | 254 | cmd.append("--test") |
| 240 | 255 | ||
| 241 | - print(f"执行命令: {' '.join(cmd)}") | 256 | + logger.info(f"执行命令: {' '.join(cmd)}") |
| 242 | 257 | ||
| 243 | result = subprocess.run( | 258 | result = subprocess.run( |
| 244 | cmd, | 259 | cmd, |
| @@ -247,78 +262,78 @@ class MindSpider: | @@ -247,78 +262,78 @@ class MindSpider: | ||
| 247 | ) | 262 | ) |
| 248 | 263 | ||
| 249 | if result.returncode == 0: | 264 | if result.returncode == 0: |
| 250 | - print("DeepSentimentCrawling模块执行成功") | 265 | + logger.info("DeepSentimentCrawling模块执行成功") |
| 251 | return True | 266 | return True |
| 252 | else: | 267 | else: |
| 253 | - print(f"DeepSentimentCrawling模块执行失败,返回码: {result.returncode}") | 268 | + logger.error(f"DeepSentimentCrawling模块执行失败,返回码: {result.returncode}") |
| 254 | return False | 269 | return False |
| 255 | 270 | ||
| 256 | except subprocess.TimeoutExpired: | 271 | except subprocess.TimeoutExpired: |
| 257 | - print("DeepSentimentCrawling模块执行超时") | 272 | + logger.error("DeepSentimentCrawling模块执行超时") |
| 258 | return False | 273 | return False |
| 259 | except Exception as e: | 274 | except Exception as e: |
| 260 | - print(f"DeepSentimentCrawling模块执行异常: {e}") | 275 | + logger.exception(f"DeepSentimentCrawling模块执行异常: {e}") |
| 261 | return False | 276 | return False |
| 262 | 277 | ||
| 263 | def run_complete_workflow(self, target_date: date = None, platforms: list = None, | 278 | def run_complete_workflow(self, target_date: date = None, platforms: list = None, |
| 264 | keywords_count: int = 100, max_keywords: int = 50, | 279 | keywords_count: int = 100, max_keywords: int = 50, |
| 265 | max_notes: int = 50, test_mode: bool = False) -> bool: | 280 | max_notes: int = 50, test_mode: bool = False) -> bool: |
| 266 | """运行完整工作流程""" | 281 | """运行完整工作流程""" |
| 267 | - print(f"\n开始完整的MindSpider工作流程") | 282 | + logger.info("开始完整的MindSpider工作流程") |
| 268 | 283 | ||
| 269 | if not target_date: | 284 | if not target_date: |
| 270 | target_date = date.today() | 285 | target_date = date.today() |
| 271 | 286 | ||
| 272 | - print(f"目标日期: {target_date}") | ||
| 273 | - print(f"平台列表: {platforms if platforms else '所有支持的平台'}") | ||
| 274 | - print(f"测试模式: {'是' if test_mode else '否'}") | 287 | + logger.info(f"目标日期: {target_date}") |
| 288 | + logger.info(f"平台列表: {platforms if platforms else '所有支持的平台'}") | ||
| 289 | + logger.info(f"测试模式: {'是' if test_mode else '否'}") | ||
| 275 | 290 | ||
| 276 | # 第一步:运行话题提取 | 291 | # 第一步:运行话题提取 |
| 277 | - print(f"\n=== 第一步:话题提取 ===") | 292 | + logger.info("=== 第一步:话题提取 ===") |
| 278 | if not self.run_broad_topic_extraction(target_date, keywords_count): | 293 | if not self.run_broad_topic_extraction(target_date, keywords_count): |
| 279 | - print("话题提取失败,终止流程") | 294 | + logger.error("话题提取失败,终止流程") |
| 280 | return False | 295 | return False |
| 281 | 296 | ||
| 282 | # 第二步:运行情感爬取 | 297 | # 第二步:运行情感爬取 |
| 283 | - print(f"\n=== 第二步:情感爬取 ===") | 298 | + logger.info("=== 第二步:情感爬取 ===") |
| 284 | if not self.run_deep_sentiment_crawling(target_date, platforms, max_keywords, max_notes, test_mode): | 299 | if not self.run_deep_sentiment_crawling(target_date, platforms, max_keywords, max_notes, test_mode): |
| 285 | - print("情感爬取失败,但话题提取已完成") | 300 | + logger.error("情感爬取失败,但话题提取已完成") |
| 286 | return False | 301 | return False |
| 287 | 302 | ||
| 288 | - print(f"\n完整工作流程执行成功!") | 303 | + logger.info("完整工作流程执行成功!") |
| 289 | return True | 304 | return True |
| 290 | 305 | ||
| 291 | def show_status(self): | 306 | def show_status(self): |
| 292 | """显示项目状态""" | 307 | """显示项目状态""" |
| 293 | - print(f"\nMindSpider项目状态:") | ||
| 294 | - print(f"项目路径: {self.project_root}") | 308 | + logger.info("MindSpider项目状态:") |
| 309 | + logger.info(f"项目路径: {self.project_root}") | ||
| 295 | 310 | ||
| 296 | # 配置状态 | 311 | # 配置状态 |
| 297 | config_ok = self.check_config() | 312 | config_ok = self.check_config() |
| 298 | - print(f"配置状态: {'正常' if config_ok else '异常'}") | 313 | + logger.info(f"配置状态: {'正常' if config_ok else '异常'}") |
| 299 | 314 | ||
| 300 | # 数据库状态 | 315 | # 数据库状态 |
| 301 | if config_ok: | 316 | if config_ok: |
| 302 | db_conn_ok = self.check_database_connection() | 317 | db_conn_ok = self.check_database_connection() |
| 303 | - print(f"数据库连接: {'正常' if db_conn_ok else '异常'}") | 318 | + logger.info(f"数据库连接: {'正常' if db_conn_ok else '异常'}") |
| 304 | 319 | ||
| 305 | if db_conn_ok: | 320 | if db_conn_ok: |
| 306 | db_tables_ok = self.check_database_tables() | 321 | db_tables_ok = self.check_database_tables() |
| 307 | - print(f"数据库表: {'正常' if db_tables_ok else '需要初始化'}") | 322 | + logger.info(f"数据库表: {'正常' if db_tables_ok else '需要初始化'}") |
| 308 | 323 | ||
| 309 | # 依赖状态 | 324 | # 依赖状态 |
| 310 | deps_ok = self.check_dependencies() | 325 | deps_ok = self.check_dependencies() |
| 311 | - print(f"依赖环境: {'正常' if deps_ok else '异常'}") | 326 | + logger.info(f"依赖环境: {'正常' if deps_ok else '异常'}") |
| 312 | 327 | ||
| 313 | # 模块状态 | 328 | # 模块状态 |
| 314 | broad_topic_exists = self.broad_topic_path.exists() | 329 | broad_topic_exists = self.broad_topic_path.exists() |
| 315 | deep_sentiment_exists = self.deep_sentiment_path.exists() | 330 | deep_sentiment_exists = self.deep_sentiment_path.exists() |
| 316 | - print(f"BroadTopicExtraction模块: {'存在' if broad_topic_exists else '缺失'}") | ||
| 317 | - print(f"DeepSentimentCrawling模块: {'存在' if deep_sentiment_exists else '缺失'}") | 331 | + logger.info(f"BroadTopicExtraction模块: {'存在' if broad_topic_exists else '缺失'}") |
| 332 | + logger.info(f"DeepSentimentCrawling模块: {'存在' if deep_sentiment_exists else '缺失'}") | ||
| 318 | 333 | ||
| 319 | def setup_project(self) -> bool: | 334 | def setup_project(self) -> bool: |
| 320 | """项目初始化设置""" | 335 | """项目初始化设置""" |
| 321 | - print(f"\n开始MindSpider项目初始化...") | 336 | + logger.info("开始MindSpider项目初始化...") |
| 322 | 337 | ||
| 323 | # 1. 检查配置 | 338 | # 1. 检查配置 |
| 324 | if not self.check_config(): | 339 | if not self.check_config(): |
| @@ -334,11 +349,11 @@ class MindSpider: | @@ -334,11 +349,11 @@ class MindSpider: | ||
| 334 | 349 | ||
| 335 | # 4. 检查并初始化数据库表 | 350 | # 4. 检查并初始化数据库表 |
| 336 | if not self.check_database_tables(): | 351 | if not self.check_database_tables(): |
| 337 | - print("需要初始化数据库表...") | 352 | + logger.info("需要初始化数据库表...") |
| 338 | if not self.initialize_database(): | 353 | if not self.initialize_database(): |
| 339 | return False | 354 | return False |
| 340 | 355 | ||
| 341 | - print(f"\nMindSpider项目初始化完成!") | 356 | + logger.info("MindSpider项目初始化完成!") |
| 342 | return True | 357 | return True |
| 343 | 358 | ||
| 344 | def main(): | 359 | def main(): |
| @@ -373,7 +388,7 @@ def main(): | @@ -373,7 +388,7 @@ def main(): | ||
| 373 | try: | 388 | try: |
| 374 | target_date = datetime.strptime(args.date, "%Y-%m-%d").date() | 389 | target_date = datetime.strptime(args.date, "%Y-%m-%d").date() |
| 375 | except ValueError: | 390 | except ValueError: |
| 376 | - print("错误:日期格式不正确,请使用 YYYY-MM-DD 格式") | 391 | + logger.error("错误:日期格式不正确,请使用 YYYY-MM-DD 格式") |
| 377 | return | 392 | return |
| 378 | 393 | ||
| 379 | # 创建MindSpider实例 | 394 | # 创建MindSpider实例 |
| @@ -388,17 +403,17 @@ def main(): | @@ -388,17 +403,17 @@ def main(): | ||
| 388 | # 项目设置 | 403 | # 项目设置 |
| 389 | if args.setup: | 404 | if args.setup: |
| 390 | if spider.setup_project(): | 405 | if spider.setup_project(): |
| 391 | - print("项目设置完成,可以开始使用MindSpider!") | 406 | + logger.info("项目设置完成,可以开始使用MindSpider!") |
| 392 | else: | 407 | else: |
| 393 | - print("项目设置失败,请检查配置和环境") | 408 | + logger.error("项目设置失败,请检查配置和环境") |
| 394 | return | 409 | return |
| 395 | 410 | ||
| 396 | # 初始化数据库 | 411 | # 初始化数据库 |
| 397 | if args.init_db: | 412 | if args.init_db: |
| 398 | if spider.initialize_database(): | 413 | if spider.initialize_database(): |
| 399 | - print("数据库初始化成功") | 414 | + logger.info("数据库初始化成功") |
| 400 | else: | 415 | else: |
| 401 | - print("数据库初始化失败") | 416 | + logger.error("数据库初始化失败") |
| 402 | return | 417 | return |
| 403 | 418 | ||
| 404 | # 运行模块 | 419 | # 运行模块 |
| @@ -415,16 +430,16 @@ def main(): | @@ -415,16 +430,16 @@ def main(): | ||
| 415 | ) | 430 | ) |
| 416 | else: | 431 | else: |
| 417 | # 默认运行完整工作流程 | 432 | # 默认运行完整工作流程 |
| 418 | - print("运行完整MindSpider工作流程...") | 433 | + logger.info("运行完整MindSpider工作流程...") |
| 419 | spider.run_complete_workflow( | 434 | spider.run_complete_workflow( |
| 420 | target_date, args.platforms, args.keywords_count, | 435 | target_date, args.platforms, args.keywords_count, |
| 421 | args.max_keywords, args.max_notes, args.test | 436 | args.max_keywords, args.max_notes, args.test |
| 422 | ) | 437 | ) |
| 423 | 438 | ||
| 424 | except KeyboardInterrupt: | 439 | except KeyboardInterrupt: |
| 425 | - print("\n用户中断操作") | 440 | + logger.info("用户中断操作") |
| 426 | except Exception as e: | 441 | except Exception as e: |
| 427 | - print(f"\n执行出错: {e}") | 442 | + logger.exception(f"执行出错: {e}") |
| 428 | 443 | ||
| 429 | if __name__ == "__main__": | 444 | if __name__ == "__main__": |
| 430 | main() | 445 | main() |
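A usage sketch of the class above outside the CLI (hypothetical driver script; the method names and signatures are as defined in this file):

```python
from datetime import date

from main import MindSpider  # this module: main.py at the project root

spider = MindSpider()
if spider.setup_project():          # checks config, DB connection, tables, dependencies
    spider.run_complete_workflow(
        target_date=date.today(),
        platforms=["xhs"],          # limit to one platform for a quick dry run
        test_mode=True,
    )
```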
| @@ -7,6 +7,8 @@ | @@ -7,6 +7,8 @@ | ||
| 7 | pymysql==1.1.0 | 7 | pymysql==1.1.0 |
| 8 | aiomysql==0.2.0 | 8 | aiomysql==0.2.0 |
| 9 | aiosqlite==0.21.0 | 9 | aiosqlite==0.21.0 |
| 10 | +asyncpg | ||
| 11 | +asyncmy | ||
| 12 | +sqlalchemy | ||
| 10 | 12 | ||
| 11 | # =============================== | 13 | # =============================== |
| 12 | # HTTP请求和网络 | 14 | # HTTP请求和网络 |
| @@ -42,6 +44,8 @@ wordcloud==1.9.3 | @@ -42,6 +44,8 @@ wordcloud==1.9.3 | ||
| 42 | matplotlib==3.9.0 | 44 | matplotlib==3.9.0 |
| 43 | parsel==1.9.1 | 45 | parsel==1.9.1 |
| 44 | pyexecjs==1.5.1 | 46 | pyexecjs==1.5.1 |
| 47 | +typer>=0.12.3 | ||
| 48 | +pyhumps==3.8.0 | ||
| 45 | 49 | ||
| 46 | # =============================== | 50 | # =============================== |
| 47 | # 工具包 | 51 | # 工具包 |
| @@ -7,10 +7,12 @@ MindSpider AI爬虫项目 - 数据库管理工具 | @@ -7,10 +7,12 @@ MindSpider AI爬虫项目 - 数据库管理工具 | ||
| 7 | 7 | ||
| 8 | import os | 8 | import os |
| 9 | import sys | 9 | import sys |
| 10 | -import pymysql | 10 | +from sqlalchemy import create_engine, text, inspect |
| 11 | +from sqlalchemy.engine import Engine | ||
| 11 | import argparse | 12 | import argparse |
| 12 | from pathlib import Path | 13 | from pathlib import Path |
| 13 | from datetime import datetime, timedelta | 14 | from datetime import datetime, timedelta |
| 15 | +from loguru import logger | ||
| 14 | 16 | ||
| 15 | # 添加项目根目录到路径 | 17 | # 添加项目根目录到路径 |
| 16 | project_root = Path(__file__).parent.parent | 18 | project_root = Path(__file__).parent.parent |
| @@ -19,125 +21,132 @@ sys.path.append(str(project_root)) | @@ -19,125 +21,132 @@ sys.path.append(str(project_root)) | ||
| 19 | try: | 21 | try: |
| 20 | import config | 22 | import config |
| 21 | except ImportError: | 23 | except ImportError: |
| 22 | - print("错误: 无法导入config.py配置文件") | 24 | + logger.error("错误: 无法导入config.py配置文件") |
| 23 | sys.exit(1) | 25 | sys.exit(1) |
| 24 | 26 | ||
| 27 | +from config import settings | ||
| 28 | + | ||
| 25 | class DatabaseManager: | 29 | class DatabaseManager: |
| 26 | def __init__(self): | 30 | def __init__(self): |
| 27 | - self.connection = None | 31 | + self.engine: Engine = None |
| 28 | self.connect() | 32 | self.connect() |
| 29 | 33 | ||
| 30 | def connect(self): | 34 | def connect(self): |
| 31 | """连接数据库""" | 35 | """连接数据库""" |
| 32 | try: | 36 | try: |
| 33 | - self.connection = pymysql.connect( | ||
| 34 | - host=config.DB_HOST, | ||
| 35 | - port=config.DB_PORT, | ||
| 36 | - user=config.DB_USER, | ||
| 37 | - password=config.DB_PASSWORD, | ||
| 38 | - database=config.DB_NAME, | ||
| 39 | - charset=config.DB_CHARSET, | ||
| 40 | - autocommit=True | ||
| 41 | - ) | ||
| 42 | - print(f"成功连接到数据库: {config.DB_NAME}") | 37 | + dialect = (settings.DB_DIALECT or "mysql").lower() |
| 38 | + if dialect in ("postgresql", "postgres"): | ||
| 39 | + url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}" | ||
| 40 | + else: | ||
| 41 | + url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}" | ||
| 42 | + self.engine = create_engine(url, future=True) | ||
| 43 | + logger.info(f"成功连接到数据库: {settings.DB_NAME}") | ||
| 43 | except Exception as e: | 44 | except Exception as e: |
| 44 | - print(f"数据库连接失败: {e}") | 45 | + logger.error(f"数据库连接失败: {e}") |
| 45 | sys.exit(1) | 46 | sys.exit(1) |
| 46 | 47 | ||
| 47 | def close(self): | 48 | def close(self): |
| 48 | """关闭数据库连接""" | 49 | """关闭数据库连接""" |
| 49 | - if self.connection: | ||
| 50 | - self.connection.close() | 50 | + if self.engine: |
| 51 | + self.engine.dispose() | ||
| 51 | 52 | ||
| 52 | def show_tables(self): | 53 | def show_tables(self): |
| 53 | """显示所有表""" | 54 | """显示所有表""" |
| 54 | - print("\n" + "=" * 60) | ||
| 55 | - print("数据库表列表") | ||
| 56 | - print("=" * 60) | 55 | + data_list_message = "" |
| 56 | +        data_list_message += "\n" + "=" * 60 + "\n" | ||
| 57 | +        data_list_message += "数据库表列表" + "\n" | ||
| 58 | +        data_list_message += "=" * 60 + "\n" | ||
| 57 | 60 | ||
| 58 | - cursor = self.connection.cursor() | ||
| 59 | - cursor.execute("SHOW TABLES") | ||
| 60 | - tables = cursor.fetchall() | 61 | + inspector = inspect(self.engine) |
| 62 | + tables = inspector.get_table_names() | ||
| 61 | 63 | ||
| 62 | if not tables: | 64 | if not tables: |
| 63 | - print("数据库中没有表") | 65 | + logger.info("数据库中没有表") |
| 64 | return | 66 | return |
| 65 | 67 | ||
| 66 | # 分类显示表 | 68 | # 分类显示表 |
| 67 | mindspider_tables = [] | 69 | mindspider_tables = [] |
| 68 | mediacrawler_tables = [] | 70 | mediacrawler_tables = [] |
| 69 | 71 | ||
| 70 | - for table in tables: | ||
| 71 | - table_name = table[0] | 72 | + for table_name in tables: |
| 72 | if table_name in ['daily_news', 'daily_topics', 'topic_news_relation', 'crawling_tasks']: | 73 | if table_name in ['daily_news', 'daily_topics', 'topic_news_relation', 'crawling_tasks']: |
| 73 | mindspider_tables.append(table_name) | 74 | mindspider_tables.append(table_name) |
| 74 | else: | 75 | else: |
| 75 | mediacrawler_tables.append(table_name) | 76 | mediacrawler_tables.append(table_name) |
| 76 | 77 | ||
| 77 | - print("MindSpider核心表:") | 78 | + data_list_message += "MindSpider核心表:" |
| 79 | + data_list_message += "\n" | ||
| 78 | for table in mindspider_tables: | 80 | for table in mindspider_tables: |
| 79 | - cursor.execute(f"SELECT COUNT(*) FROM {table}") | ||
| 80 | - count = cursor.fetchone()[0] | ||
| 81 | - print(f" - {table:<25} ({count:>6} 条记录)") | 81 | + with self.engine.connect() as conn: |
| 82 | + count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one() | ||
| 83 | + data_list_message += f" - {table:<25} ({count:>6} 条记录)" | ||
| 84 | + data_list_message += "\n" | ||
| 82 | 85 | ||
| 83 | - print("\nMediaCrawler平台表:") | 86 | + data_list_message += "\nMediaCrawler平台表:" |
| 87 | + data_list_message += "\n" | ||
| 84 | for table in mediacrawler_tables: | 88 | for table in mediacrawler_tables: |
| 85 | try: | 89 | try: |
| 86 | - cursor.execute(f"SELECT COUNT(*) FROM {table}") | ||
| 87 | - count = cursor.fetchone()[0] | ||
| 88 | - print(f" - {table:<25} ({count:>6} 条记录)") | 90 | + with self.engine.connect() as conn: |
| 91 | + count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one() | ||
| 92 | + data_list_message += f" - {table:<25} ({count:>6} 条记录)" | ||
| 93 | + data_list_message += "\n" | ||
| 89 | except: | 94 | except: |
| 90 | - print(f" - {table:<25} (查询失败)") | 95 | + data_list_message += f" - {table:<25} (查询失败)" |
| 96 | + data_list_message += "\n" | ||
| 97 | + logger.info(data_list_message) | ||
| 91 | 98 | ||
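The message-buffer pattern used throughout this file (accumulate lines, then one `logger.info`) keeps each report a single loguru record. A hypothetical helper that would make the pattern reusable across `show_tables`, `show_statistics`, and `show_recent_data`:

```python
from loguru import logger

class ReportBuilder:
    """Collect report lines, then emit them as one log record."""
    def __init__(self, title: str, width: int = 60):
        self._lines = ["", "=" * width, title, "=" * width]

    def add(self, line: str) -> "ReportBuilder":
        self._lines.append(line)
        return self

    def emit(self) -> None:
        logger.info("\n".join(self._lines))

# ReportBuilder("数据库表列表").add(" - daily_news (42 条记录)").emit()
```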
| 92 | def show_statistics(self): | 99 | def show_statistics(self): |
| 93 | """显示数据统计""" | 100 | """显示数据统计""" |
| 94 | - print("\n" + "=" * 60) | ||
| 95 | - print("数据统计") | ||
| 96 | - print("=" * 60) | ||
| 97 | - | ||
| 98 | - cursor = self.connection.cursor() | 101 | + data_statistics_message = "" |
| 102 | +        data_statistics_message += "\n" + "=" * 60 + "\n" | ||
| 103 | +        data_statistics_message += "数据统计" + "\n" | ||
| 104 | +        data_statistics_message += "=" * 60 | ||
| 105 | + data_statistics_message += "\n" | ||
| 99 | 106 | ||
| 100 | try: | 107 | try: |
| 101 | # 新闻统计 | 108 | # 新闻统计 |
| 102 | - cursor.execute("SELECT COUNT(*) FROM daily_news") | ||
| 103 | - news_count = cursor.fetchone()[0] | ||
| 104 | - | ||
| 105 | - cursor.execute("SELECT COUNT(DISTINCT crawl_date) FROM daily_news") | ||
| 106 | - news_days = cursor.fetchone()[0] | ||
| 107 | - | ||
| 108 | - cursor.execute("SELECT COUNT(DISTINCT source_platform) FROM daily_news") | ||
| 109 | - platforms = cursor.fetchone()[0] | ||
| 110 | - | ||
| 111 | - print(f"新闻数据:") | ||
| 112 | - print(f" - 总新闻数: {news_count}") | ||
| 113 | - print(f" - 覆盖天数: {news_days}") | ||
| 114 | - print(f" - 新闻平台: {platforms}") | 109 | + with self.engine.connect() as conn: |
| 110 | + news_count = conn.execute(text("SELECT COUNT(*) FROM daily_news")).scalar_one() | ||
| 111 | + news_days = conn.execute(text("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")).scalar_one() | ||
| 112 | + platforms = conn.execute(text("SELECT COUNT(DISTINCT source_platform) FROM daily_news")).scalar_one() | ||
| 115 | 113 | ||
| 114 | + data_statistics_message += "新闻数据:" | ||
| 115 | + data_statistics_message += "\n" | ||
| 116 | + data_statistics_message += f" - 总新闻数: {news_count}" | ||
| 117 | + data_statistics_message += "\n" | ||
| 118 | + data_statistics_message += f" - 覆盖天数: {news_days}" | ||
| 119 | + data_statistics_message += "\n" | ||
| 120 | + data_statistics_message += f" - 新闻平台: {platforms}" | ||
| 121 | + data_statistics_message += "\n" | ||
| 116 | # 话题统计 | 122 | # 话题统计 |
| 117 | - cursor.execute("SELECT COUNT(*) FROM daily_topics") | ||
| 118 | - topic_count = cursor.fetchone()[0] | 123 | + with self.engine.connect() as conn: |
| 124 | + topic_count = conn.execute(text("SELECT COUNT(*) FROM daily_topics")).scalar_one() | ||
| 125 | + topic_days = conn.execute(text("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")).scalar_one() | ||
| 119 | 126 | ||
| 120 | - cursor.execute("SELECT COUNT(DISTINCT extract_date) FROM daily_topics") | ||
| 121 | - topic_days = cursor.fetchone()[0] | ||
| 122 | - | ||
| 123 | - print(f"\n话题数据:") | ||
| 124 | - print(f" - 总话题数: {topic_count}") | ||
| 125 | - print(f" - 提取天数: {topic_days}") | 127 | + data_statistics_message += "话题数据:" |
| 128 | + data_statistics_message += "\n" | ||
| 129 | + data_statistics_message += f" - 总话题数: {topic_count}" | ||
| 130 | + data_statistics_message += "\n" | ||
| 131 | + data_statistics_message += f" - 提取天数: {topic_days}" | ||
| 132 | + data_statistics_message += "\n" | ||
| 126 | 133 | ||
| 127 | # 爬取任务统计 | 134 | # 爬取任务统计 |
| 128 | - cursor.execute("SELECT COUNT(*) FROM crawling_tasks") | ||
| 129 | - task_count = cursor.fetchone()[0] | ||
| 130 | - | ||
| 131 | - cursor.execute("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status") | ||
| 132 | - task_status = cursor.fetchall() | 135 | + with self.engine.connect() as conn: |
| 136 | + task_count = conn.execute(text("SELECT COUNT(*) FROM crawling_tasks")).scalar_one() | ||
| 137 | + task_status = conn.execute(text("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")).all() | ||
| 133 | 138 | ||
| 134 | - print(f"\n爬取任务:") | ||
| 135 | - print(f" - 总任务数: {task_count}") | 139 | + data_statistics_message += "爬取任务:" |
| 140 | + data_statistics_message += "\n" | ||
| 141 | + data_statistics_message += f" - 总任务数: {task_count}" | ||
| 142 | + data_statistics_message += "\n" | ||
| 136 | for status, count in task_status: | 143 | for status, count in task_status: |
| 137 | - print(f" - {status}: {count}") | 144 | + data_statistics_message += f" - {status}: {count}" |
| 145 | + data_statistics_message += "\n" | ||
| 138 | 146 | ||
| 139 | # 爬取内容统计 | 147 | # 爬取内容统计 |
| 140 | - print(f"\n平台内容统计:") | 148 | + data_statistics_message += "平台内容统计:" |
| 149 | + data_statistics_message += "\n" | ||
| 141 | platform_tables = { | 150 | platform_tables = { |
| 142 | 'xhs_note': '小红书', | 151 | 'xhs_note': '小红书', |
| 143 | 'douyin_aweme': '抖音', | 152 | 'douyin_aweme': '抖音', |
| @@ -150,60 +159,78 @@ class DatabaseManager: | @@ -150,60 +159,78 @@ class DatabaseManager: | ||
| 150 | 159 | ||
| 151 | for table, platform in platform_tables.items(): | 160 | for table, platform in platform_tables.items(): |
| 152 | try: | 161 | try: |
| 153 | - cursor.execute(f"SELECT COUNT(*) FROM {table}") | ||
| 154 | - count = cursor.fetchone()[0] | ||
| 155 | - print(f" - {platform}: {count}") | 162 | + with self.engine.connect() as conn: |
| 163 | + count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one() | ||
| 164 | + data_statistics_message += f" - {platform}: {count}" | ||
| 165 | + data_statistics_message += "\n" | ||
| 156 | except: | 166 | except: |
| 157 | - print(f" - {platform}: 表不存在") | ||
| 158 | - | 167 | + data_statistics_message += f" - {platform}: 表不存在" |
| 168 | + data_statistics_message += "\n" | ||
| 169 | + logger.info(data_statistics_message) | ||
| 159 | except Exception as e: | 170 | except Exception as e: |
| 160 | - print(f"统计查询失败: {e}") | 171 | + data_statistics_message += f"统计查询失败: {e}" |
| 172 | + data_statistics_message += "\n" | ||
| 173 | + logger.error(data_statistics_message) | ||
| 161 | 174 | ||
| 162 | def show_recent_data(self, days=7): | 175 | def show_recent_data(self, days=7): |
| 163 | """显示最近几天的数据""" | 176 | """显示最近几天的数据""" |
| 164 | - print(f"\n" + "=" * 60) | ||
| 165 | - print(f"最近{days}天的数据") | ||
| 166 | - print("=" * 60) | ||
| 167 | - | ||
| 168 | - cursor = self.connection.cursor() | 177 | + data_recent_message = "" |
| 178 | +        data_recent_message += "\n" + "=" * 60 + "\n" | ||
| 179 | +        data_recent_message += f"最近{days}天的数据" + "\n" | ||
| 180 | +        data_recent_message += "=" * 60 + "\n" | ||
| 169 | 181 | ||
| 182 | + from datetime import date, timedelta | ||
| 183 | + start_date = date.today() - timedelta(days=days) | ||
| 170 | # 最近的新闻 | 184 | # 最近的新闻 |
| 171 | - cursor.execute(""" | ||
| 172 | - SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms | ||
| 173 | - FROM daily_news | ||
| 174 | - WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY) | ||
| 175 | - GROUP BY crawl_date | ||
| 176 | - ORDER BY crawl_date DESC | ||
| 177 | - """, (days,)) | ||
| 178 | - | ||
| 179 | - news_data = cursor.fetchall() | 185 | + with self.engine.connect() as conn: |
| 186 | + news_data = conn.execute( | ||
| 187 | + text( | ||
| 188 | + """ | ||
| 189 | + SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms | ||
| 190 | + FROM daily_news | ||
| 191 | + WHERE crawl_date >= :start_date | ||
| 192 | + GROUP BY crawl_date | ||
| 193 | + ORDER BY crawl_date DESC | ||
| 194 | + """ | ||
| 195 | + ), | ||
| 196 | + {"start_date": start_date}, | ||
| 197 | + ).all() | ||
| 180 | if news_data: | 198 | if news_data: |
| 181 | - print("每日新闻统计:") | 199 | + data_recent_message += "每日新闻统计:" |
| 200 | + data_recent_message += "\n" | ||
| 182 | for date, count, platforms in news_data: | 201 | for date, count, platforms in news_data: |
| 183 | - print(f" {date}: {count} 条新闻, {platforms} 个平台") | 202 | + data_recent_message += f" {date}: {count} 条新闻, {platforms} 个平台" |
| 203 | + data_recent_message += "\n" | ||
| 184 | 204 | ||
| 185 | # 最近的话题 | 205 | # 最近的话题 |
| 186 | - cursor.execute(""" | ||
| 187 | - SELECT extract_date, COUNT(*) as topic_count | ||
| 188 | - FROM daily_topics | ||
| 189 | - WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY) | ||
| 190 | - GROUP BY extract_date | ||
| 191 | - ORDER BY extract_date DESC | ||
| 192 | - """, (days,)) | ||
| 193 | - | ||
| 194 | - topic_data = cursor.fetchall() | 206 | + with self.engine.connect() as conn: |
| 207 | + topic_data = conn.execute( | ||
| 208 | + text( | ||
| 209 | + """ | ||
| 210 | + SELECT extract_date, COUNT(*) as topic_count | ||
| 211 | + FROM daily_topics | ||
| 212 | + WHERE extract_date >= :start_date | ||
| 213 | + GROUP BY extract_date | ||
| 214 | + ORDER BY extract_date DESC | ||
| 215 | + """ | ||
| 216 | + ), | ||
| 217 | + {"start_date": start_date}, | ||
| 218 | + ).all() | ||
| 195 | if topic_data: | 219 | if topic_data: |
| 196 | - print("\n每日话题统计:") | 220 | + data_recent_message += "每日话题统计:" |
| 221 | + data_recent_message += "\n" | ||
| 197 | for date, count in topic_data: | 222 | for date, count in topic_data: |
| 198 | - print(f" {date}: {count} 个话题") | 223 | + data_recent_message += f" {date}: {count} 个话题" |
| 224 | + data_recent_message += "\n" | ||
| 225 | +        logger.info(data_recent_message)  # 统一在方法末尾输出,即使当天没有话题数据也能看到统计 | ||
| 199 | 226 | ||
| 200 | def cleanup_old_data(self, days=90, dry_run=True): | 227 | def cleanup_old_data(self, days=90, dry_run=True): |
| 201 | """清理旧数据""" | 228 | """清理旧数据""" |
| 202 | - print(f"\n" + "=" * 60) | ||
| 203 | - print(f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})") | ||
| 204 | - print("=" * 60) | 229 | + cleanup_message = "" |
| 230 | +        cleanup_message += "\n" + "=" * 60 + "\n" | ||
| 231 | +        cleanup_message += f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})" + "\n" | ||
| 232 | +        cleanup_message += "=" * 60 | ||
| 205 | 233 | ||
| 206 | - cursor = self.connection.cursor() | ||
| 207 | cutoff_date = datetime.now() - timedelta(days=days) | 234 | cutoff_date = datetime.now() - timedelta(days=days) |
| 208 | 235 | ||
| 209 | # 检查要删除的数据 | 236 | # 检查要删除的数据 |
| @@ -213,20 +240,25 @@ class DatabaseManager: | @@ -213,20 +240,25 @@ class DatabaseManager: | ||
| 213 | ("crawling_tasks", f"SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date < '{cutoff_date.date()}'") | 240 | ("crawling_tasks", f"SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date < '{cutoff_date.date()}'") |
| 214 | ] | 241 | ] |
| 215 | 242 | ||
| 216 | - for table, query in cleanup_queries: | ||
| 217 | - cursor.execute(query) | ||
| 218 | - count = cursor.fetchone()[0] | ||
| 219 | - if count > 0: | ||
| 220 | - print(f" {table}: {count} 条记录将被删除") | ||
| 221 | - if not dry_run: | ||
| 222 | - delete_query = query.replace("SELECT COUNT(*)", "DELETE") | ||
| 223 | - cursor.execute(delete_query) | ||
| 224 | - print(f" 已删除 {count} 条记录") | ||
| 225 | - else: | ||
| 226 | - print(f" {table}: 无需清理") | 243 | + with self.engine.begin() as conn: |
| 244 | + for table, query in cleanup_queries: | ||
| 245 | + count = conn.execute(text(query)).scalar_one() | ||
| 246 | + if count > 0: | ||
| 247 | + cleanup_message += f" {table}: {count} 条记录将被删除" | ||
| 248 | + cleanup_message += "\n" | ||
| 249 | + if not dry_run: | ||
| 250 | + delete_query = query.replace("SELECT COUNT(*)", "DELETE") | ||
| 251 | + conn.execute(text(delete_query)) | ||
| 252 | + cleanup_message += f" 已删除 {count} 条记录" | ||
| 253 | + cleanup_message += "\n" | ||
| 254 | + else: | ||
| 255 | + cleanup_message += f" {table}: 无需清理" | ||
| 256 | + cleanup_message += "\n" | ||
| 227 | 257 | ||
| 228 | if dry_run: | 258 | if dry_run: |
| 229 | - print("\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。") | 259 | + cleanup_message += "\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。" |
| 260 | + cleanup_message += "\n" | ||
| 261 | + logger.info(cleanup_message) | ||
| 230 | 262 | ||
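cleanup_old_data derives each DELETE from its COUNT query by string replacement and runs everything inside engine.begin(), which matters in SQLAlchemy 2.x: begin() commits on successful exit, whereas engine.connect() would roll the deletes back. A condensed sketch of the dry-run/execute pattern; unlike the diff, this version binds the cutoff as a parameter instead of interpolating it into the SQL string:

    # Sketch of the preview/delete pattern; table names come from the fixed
    # cleanup_queries list above, not user input.
    from sqlalchemy import text

    def preview_or_delete(engine, table: str, cutoff, dry_run: bool = True) -> int:
        count_sql = f"SELECT COUNT(*) FROM {table} WHERE scheduled_date < :cutoff"
        with engine.begin() as conn:  # begin() commits; connect() would not
            count = conn.execute(text(count_sql), {"cutoff": cutoff}).scalar_one()
            if count and not dry_run:
                # Reuse the WHERE clause by rewriting only the verb.
                conn.execute(
                    text(count_sql.replace("SELECT COUNT(*)", "DELETE")),
                    {"cutoff": cutoff},
                )
        return count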
| 231 | def main(): | 263 | def main(): |
| 232 | parser = argparse.ArgumentParser(description="MindSpider数据库管理工具") | 264 | parser = argparse.ArgumentParser(description="MindSpider数据库管理工具") |
| @@ -9,6 +9,7 @@ import os | @@ -9,6 +9,7 @@ import os | ||
| 9 | import sys | 9 | import sys |
| 10 | import pymysql | 10 | import pymysql |
| 11 | from pathlib import Path | 11 | from pathlib import Path |
| 12 | +from MindSpider.config import settings | ||
| 12 | 13 | ||
| 13 | # 添加项目根目录到路径 | 14 | # 添加项目根目录到路径 |
| 14 | project_root = Path(__file__).parent.parent | 15 | project_root = Path(__file__).parent.parent |
| @@ -26,14 +27,14 @@ def create_database_connection(): | @@ -26,14 +27,14 @@ def create_database_connection(): | ||
| 26 | """创建数据库连接""" | 27 | """创建数据库连接""" |
| 27 | try: | 28 | try: |
| 28 | connection = pymysql.connect( | 29 | connection = pymysql.connect( |
| 29 | - host=config.DB_HOST, | ||
| 30 | - port=config.DB_PORT, | ||
| 31 | - user=config.DB_USER, | ||
| 32 | - password=config.DB_PASSWORD, | ||
| 33 | - charset=config.DB_CHARSET, | 30 | + host=settings.db_host, |
| 31 | + port=settings.db_port, | ||
| 32 | + user=settings.db_user, | ||
| 33 | + password=settings.db_password, | ||
| 34 | + charset=settings.db_charset, | ||
| 34 | autocommit=True | 35 | autocommit=True |
| 35 | ) | 36 | ) |
| 36 | - print(f"成功连接到MySQL服务器: {config.DB_HOST}:{config.DB_PORT}") | 37 | + print(f"成功连接到MySQL服务器: {settings.db_host}:{settings.db_port}") |
| 37 | return connection | 38 | return connection |
| 38 | except Exception as e: | 39 | except Exception as e: |
| 39 | print(f"连接数据库失败: {e}") | 40 | print(f"连接数据库失败: {e}") |
| @@ -43,9 +44,9 @@ def create_database(connection): | @@ -43,9 +44,9 @@ def create_database(connection): | ||
| 43 | """创建数据库""" | 44 | """创建数据库""" |
| 44 | try: | 45 | try: |
| 45 | cursor = connection.cursor() | 46 | cursor = connection.cursor() |
| 46 | - cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{config.DB_NAME}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci") | ||
| 47 | - cursor.execute(f"USE `{config.DB_NAME}`") | ||
| 48 | - print(f"数据库 '{config.DB_NAME}' 创建/选择成功") | 47 | + cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{settings.db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci") |
| 48 | + cursor.execute(f"USE `{settings.db_name}`") | ||
| 49 | + print(f"数据库 '{settings.db_name}' 创建/选择成功") | ||
| 49 | return True | 50 | return True |
| 50 | except Exception as e: | 51 | except Exception as e: |
| 51 | print(f"创建数据库失败: {e}") | 52 | print(f"创建数据库失败: {e}") |
| @@ -56,18 +57,18 @@ def execute_sql_file(connection, sql_file_path, description=""): | @@ -56,18 +57,18 @@ def execute_sql_file(connection, sql_file_path, description=""): | ||
| 56 | if not os.path.exists(sql_file_path): | 57 | if not os.path.exists(sql_file_path): |
| 57 | print(f"警告: SQL文件不存在: {sql_file_path}") | 58 | print(f"警告: SQL文件不存在: {sql_file_path}") |
| 58 | return False | 59 | return False |
| 59 | - | 60 | + |
| 60 | try: | 61 | try: |
| 61 | cursor = connection.cursor() | 62 | cursor = connection.cursor() |
| 62 | with open(sql_file_path, 'r', encoding='utf-8') as f: | 63 | with open(sql_file_path, 'r', encoding='utf-8') as f: |
| 63 | sql_content = f.read() | 64 | sql_content = f.read() |
| 64 | - | 65 | + |
| 65 | # 分割SQL语句(简单实现,按分号分割) | 66 | # 分割SQL语句(简单实现,按分号分割) |
| 66 | sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()] | 67 | sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()] |
| 67 | - | 68 | + |
| 68 | success_count = 0 | 69 | success_count = 0 |
| 69 | error_count = 0 | 70 | error_count = 0 |
| 70 | - | 71 | + |
| 71 | for stmt in sql_statements: | 72 | for stmt in sql_statements: |
| 72 | if not stmt or stmt.startswith('--'): | 73 | if not stmt or stmt.startswith('--'): |
| 73 | continue | 74 | continue |
| @@ -77,10 +78,10 @@ def execute_sql_file(connection, sql_file_path, description=""): | @@ -77,10 +78,10 @@ def execute_sql_file(connection, sql_file_path, description=""): | ||
| 77 | except Exception as e: | 78 | except Exception as e: |
| 78 | error_count += 1 | 79 | error_count += 1 |
| 79 | print(f"执行SQL语句失败: {str(e)[:100]}...") | 80 | print(f"执行SQL语句失败: {str(e)[:100]}...") |
| 80 | - | 81 | + |
| 81 | print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句") | 82 | print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句") |
| 82 | return error_count == 0 | 83 | return error_count == 0 |
| 83 | - | 84 | + |
| 84 | except Exception as e: | 85 | except Exception as e: |
| 85 | print(f"执行SQL文件失败 {sql_file_path}: {e}") | 86 | print(f"执行SQL文件失败 {sql_file_path}: {e}") |
| 86 | return False | 87 | return False |
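execute_sql_file splits the script on semicolons, which the inline comment itself flags as a simple implementation. One caveat worth noting: the split attaches any "--" comment to the statement that follows it, so the startswith('--') check then skips that statement entirely. A small illustration of the behavior, using a made-up two-statement script:

    # Illustration of the semicolon split above (not project code).
    sql_content = "CREATE TABLE t (id INT); -- note\nINSERT INTO t VALUES (1);"
    statements = [s.strip() for s in sql_content.split(";") if s.strip()]
    # statements[1] == "-- note\nINSERT INTO t VALUES (1)": it starts with
    # "--", so the loop in execute_sql_file would skip the INSERT entirely.
    for stmt in statements:
        if not stmt or stmt.startswith("--"):
            continue
        print(stmt)  # only the CREATE TABLE survives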
| @@ -90,44 +91,44 @@ def main(): | @@ -90,44 +91,44 @@ def main(): | ||
| 90 | print("=" * 60) | 91 | print("=" * 60) |
| 91 | print("MindSpider AI爬虫项目 - 数据库初始化") | 92 | print("MindSpider AI爬虫项目 - 数据库初始化") |
| 92 | print("=" * 60) | 93 | print("=" * 60) |
| 93 | - | 94 | + |
| 94 | # 检查配置 | 95 | # 检查配置 |
| 95 | print("检查数据库配置...") | 96 | print("检查数据库配置...") |
| 96 | - print(f"数据库主机: {config.DB_HOST}") | ||
| 97 | - print(f"数据库端口: {config.DB_PORT}") | ||
| 98 | - print(f"数据库名称: {config.DB_NAME}") | ||
| 99 | - print(f"数据库用户: {config.DB_USER}") | ||
| 100 | - print(f"字符集: {config.DB_CHARSET}") | 97 | + print(f"数据库主机: {settings.db_host}") |
| 98 | + print(f"数据库端口: {settings.db_port}") | ||
| 99 | + print(f"数据库名称: {settings.db_name}") | ||
| 100 | + print(f"数据库用户: {settings.db_user}") | ||
| 101 | + print(f"字符集: {settings.db_charset}") | ||
| 101 | print() | 102 | print() |
| 102 | - | 103 | + |
| 103 | # 创建数据库连接 | 104 | # 创建数据库连接 |
| 104 | print("正在连接数据库...") | 105 | print("正在连接数据库...") |
| 105 | connection = create_database_connection() | 106 | connection = create_database_connection() |
| 106 | if not connection: | 107 | if not connection: |
| 107 | print("数据库初始化失败!") | 108 | print("数据库初始化失败!") |
| 108 | return False | 109 | return False |
| 109 | - | 110 | + |
| 110 | try: | 111 | try: |
| 111 | # 创建数据库 | 112 | # 创建数据库 |
| 112 | print("正在创建/选择数据库...") | 113 | print("正在创建/选择数据库...") |
| 113 | if not create_database(connection): | 114 | if not create_database(connection): |
| 114 | return False | 115 | return False |
| 115 | - | 116 | + |
| 116 | # 获取SQL文件路径 | 117 | # 获取SQL文件路径 |
| 117 | schema_dir = Path(__file__).parent | 118 | schema_dir = Path(__file__).parent |
| 118 | mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql" | 119 | mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql" |
| 119 | mindspider_sql = schema_dir / "mindspider_tables.sql" | 120 | mindspider_sql = schema_dir / "mindspider_tables.sql" |
| 120 | - | 121 | + |
| 121 | print() | 122 | print() |
| 122 | print("开始执行SQL脚本...") | 123 | print("开始执行SQL脚本...") |
| 123 | - | 124 | + |
| 124 | # 1. 执行MediaCrawler的原始表结构 | 125 | # 1. 执行MediaCrawler的原始表结构 |
| 125 | if mediacrawler_sql.exists(): | 126 | if mediacrawler_sql.exists(): |
| 126 | print("1. 创建MediaCrawler基础表...") | 127 | print("1. 创建MediaCrawler基础表...") |
| 127 | execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表") | 128 | execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表") |
| 128 | else: | 129 | else: |
| 129 | print("警告: MediaCrawler SQL文件不存在,跳过基础表创建") | 130 | print("警告: MediaCrawler SQL文件不存在,跳过基础表创建") |
| 130 | - | 131 | + |
| 131 | # 2. 执行MindSpider扩展表结构 | 132 | # 2. 执行MindSpider扩展表结构 |
| 132 | print("2. 创建MindSpider扩展表...") | 133 | print("2. 创建MindSpider扩展表...") |
| 133 | if mindspider_sql.exists(): | 134 | if mindspider_sql.exists(): |
| @@ -135,18 +136,18 @@ def main(): | @@ -135,18 +136,18 @@ def main(): | ||
| 135 | else: | 136 | else: |
| 136 | print("错误: MindSpider SQL文件不存在") | 137 | print("错误: MindSpider SQL文件不存在") |
| 137 | return False | 138 | return False |
| 138 | - | 139 | + |
| 139 | print() | 140 | print() |
| 140 | print("=" * 60) | 141 | print("=" * 60) |
| 141 | print("数据库初始化完成!") | 142 | print("数据库初始化完成!") |
| 142 | print("=" * 60) | 143 | print("=" * 60) |
| 143 | - | 144 | + |
| 144 | # 显示创建的表 | 145 | # 显示创建的表 |
| 145 | cursor = connection.cursor() | 146 | cursor = connection.cursor() |
| 146 | cursor.execute("SHOW TABLES") | 147 | cursor.execute("SHOW TABLES") |
| 147 | tables = cursor.fetchall() | 148 | tables = cursor.fetchall() |
| 148 | - | ||
| 149 | - print(f"数据库 '{config.DB_NAME}' 中共创建了 {len(tables)} 个表:") | 149 | + |
| 150 | + print(f"数据库 '{settings.db_name}' 中共创建了 {len(tables)} 个表:") | ||
| 150 | for table in tables: | 151 | for table in tables: |
| 151 | print(f" - {table[0]}") | 152 | print(f" - {table[0]}") |
| 152 | 153 |
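The verification step relies on MySQL's SHOW TABLES; this init script stays pymysql/MySQL-only even though the manager module now supports PostgreSQL. A dialect-neutral alternative sketch via information_schema (not part of this diff), assuming the MySQL case where table_schema equals the database name (on PostgreSQL it would typically be 'public'):

    # Sketch: portable table listing through information_schema.
    cursor.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = %s",
        (settings.db_name,),
    )
    for (table_name,) in cursor.fetchall():
        print(f" - {table_name}")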