Doiiars

1. Sync MediaCrawler to the latest upstream version
2. Fix database NOT NULL errors
3. Add PostgreSQL support
4. Standardize environment-variable and configuration usage
5. Standardize installation on uv
6. Switch logging to loguru
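
Since every module below swaps `print` for loguru, here is a minimal setup sketch. Loguru logs to stderr out of the box; the file sink, path, and rotation policy shown are illustrative assumptions, not part of this PR:

```python
from loguru import logger

# Optional file sink on top of the default stderr sink.
# Path and rotation size are assumptions for illustration.
logger.add("logs/mindspider.log", rotation="10 MB", level="INFO", encoding="utf-8")

logger.info("成功连接到数据库: mindspider")  # plain progress message
logger.warning("保存单条新闻失败: ...")      # recoverable per-item failure
try:
    raise RuntimeError("boom")
except RuntimeError:
    logger.exception("保存新闻数据失败")     # ERROR level plus full traceback
```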
@@ -7,11 +7,12 @@ BroadTopicExtraction模块 - 数据库管理器
 
 import sys
 import json
-from datetime import datetime, date
+from datetime import datetime, date, timedelta
 from pathlib import Path
 from typing import List, Dict, Optional
-import pymysql
-from pymysql.cursors import DictCursor
+from sqlalchemy import create_engine, text
+from sqlalchemy.engine import Engine
+from loguru import logger
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent
@@ -22,37 +23,44 @@ try:
 except ImportError:
     raise ImportError("无法导入config.py配置文件")
 
+from config import settings
+
 class DatabaseManager:
     """数据库管理器"""
 
     def __init__(self):
         """初始化数据库管理器"""
-        self.connection = None
+        self.engine: Engine = None
         self.connect()
 
     def connect(self):
         """连接数据库"""
         try:
-            self.connection = pymysql.connect(
-                host=config.DB_HOST,
-                port=config.DB_PORT,
-                user=config.DB_USER,
-                password=config.DB_PASSWORD,
-                database=config.DB_NAME,
-                charset=config.DB_CHARSET,
-                autocommit=True,
-                cursorclass=DictCursor
-            )
-            print(f"成功连接到数据库: {config.DB_NAME}")
+            dialect = (settings.DB_DIALECT or "mysql").lower()
+            if dialect in ("postgresql", "postgres"):
+                url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
+            else:
+                url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
+            self.engine = create_engine(url, future=True)
+            logger.info(f"成功连接到数据库: {settings.DB_NAME}")
+        except ModuleNotFoundError as e:
+            missing: str = str(e)
+            if "psycopg" in missing:
+                logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
+            elif "pymysql" in missing:
+                logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
+            else:
+                logger.error(f"数据库连接失败(缺少驱动): {e}")
+            raise
         except Exception as e:
-            print(f"数据库连接失败: {e}")
+            logger.error(f"数据库连接失败: {e}")
             raise
 
     def close(self):
         """关闭数据库连接"""
-        if self.connection:
-            self.connection.close()
-            print("数据库连接已关闭")
+        if self.engine:
+            self.engine.dispose()
+            logger.info("数据库连接已关闭")
 
     def __enter__(self):
         return self
@@ -79,48 +87,49 @@ class DatabaseManager:
         current_timestamp = int(datetime.now().timestamp())
 
         try:
-            cursor = self.connection.cursor()
-
-            # 先删除当天所有的新闻记录(覆盖模式)
-            delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
-            deleted_count = cursor.execute(delete_query, (crawl_date,))
-            if deleted_count > 0:
-                print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")
-
-            # 批量插入新记录
             saved_count = 0
+            # 先独立事务执行删除,防止后续插入失败导致无法清理
+            with self.engine.begin() as conn:
+                deleted = conn.execute(text("DELETE FROM daily_news WHERE crawl_date = :d"), {"d": crawl_date}).rowcount
+                if deleted and deleted > 0:
+                    logger.info(f"覆盖模式:删除了当天已有的 {deleted} 条新闻记录")
+
+            # 逐条插入,单条失败不影响后续(每条独立事务)
             for news_item in news_data:
                 try:
-                    # 简化的新闻ID生成
                     news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
-
-                    # 插入新记录
-                    insert_query = """
-                    INSERT INTO daily_news (
-                        news_id, source_platform, title, url, crawl_date,
-                        rank_position, add_ts
-                    ) VALUES (%s, %s, %s, %s, %s, %s, %s)
-                    """
-                    cursor.execute(insert_query, (
-                        news_id,
-                        news_item.get('source', 'unknown'),
-                        news_item.get('title', ''),
-                        news_item.get('url', ''),
-                        crawl_date,
-                        news_item.get('rank', None),
-                        current_timestamp
-                    ))
+                    title_val = (news_item.get("title", "") or "")
+                    if len(title_val) > 500:
+                        title_val = title_val[:500]
+                    with self.engine.begin() as conn:
+                        conn.execute(
+                            text(
+                                """
+                                INSERT INTO daily_news (
+                                    news_id, source_platform, title, url, crawl_date,
+                                    rank_position, add_ts, last_modify_ts
+                                ) VALUES (:news_id, :source_platform, :title, :url, :crawl_date, :rank_position, :add_ts, :last_modify_ts)
+                                """
+                            ),
+                            {
+                                "news_id": news_id,
+                                "source_platform": news_item.get("source", "unknown"),
+                                "title": title_val,
+                                "url": news_item.get("url", ""),
+                                "crawl_date": crawl_date,
+                                "rank_position": news_item.get("rank", None),
+                                "add_ts": current_timestamp,
+                                "last_modify_ts": current_timestamp,
+                            },
+                        )
                    saved_count += 1
-
                 except Exception as e:
-                    print(f"保存单条新闻失败: {e}")
+                    logger.warning(f"保存单条新闻失败: {e}")
                     continue
-
-            print(f"成功保存 {saved_count} 条新闻记录")
+            logger.info(f"成功保存 {saved_count} 条新闻记录")
             return saved_count
-
         except Exception as e:
-            print(f"保存新闻数据失败: {e}")
+            logger.exception(f"保存新闻数据失败: {e}")
             return 0
 
     def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
@@ -136,15 +145,13 @@ class DatabaseManager:
         if not crawl_date:
             crawl_date = date.today()
 
-        query = """
-        SELECT * FROM daily_news
-        WHERE crawl_date = %s
-        ORDER BY rank_position ASC
-        """
-
-        cursor = self.connection.cursor()
-        cursor.execute(query, (crawl_date,))
-        return cursor.fetchall()
+        query = (
+            "SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC"
+        )
+        with self.engine.connect() as conn:
+            result = conn.execute(text(query), {"d": crawl_date})
+            rows = result.mappings().all()
+            return rows
 
     # ==================== 话题数据操作 ====================
 
@@ -166,37 +173,31 @@ class DatabaseManager:
         current_timestamp = int(datetime.now().timestamp())
 
         try:
-            cursor = self.connection.cursor()
-
-            # 检查今天是否已有记录
-            check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(check_query, (extract_date,))
-            existing = cursor.fetchone()
-
             keywords_json = json.dumps(keywords, ensure_ascii=False)
-
-            if existing:
-                # 更新现有记录
-                update_query = """
-                UPDATE daily_topics
-                SET keywords = %s, summary = %s, add_ts = %s
-                WHERE extract_date = %s
-                """
-                cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
-                print(f"更新了 {extract_date} 的话题分析")
-            else:
-                # 插入新记录
-                insert_query = """
-                INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
-                VALUES (%s, %s, %s, %s)
-                """
-                cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
-                print(f"保存了 {extract_date} 的话题分析")
-
+            with self.engine.begin() as conn:
+                check = conn.execute(
+                    text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"),
+                    {"d": extract_date, "tid": "summary"},
+                ).first()
+                if check:
+                    conn.execute(
+                        text(
+                            "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid"
+                        ),
+                        {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"},
+                    )
+                    logger.info(f"更新了 {extract_date} 的话题分析")
+                else:
+                    conn.execute(
+                        text(
+                            "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)"
+                        ),
+                        {"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp},
+                    )
+                    logger.info(f"保存了 {extract_date} 的话题分析")
             return True
-
         except Exception as e:
-            print(f"保存话题分析失败: {e}")
+            logger.exception(f"保存话题分析失败: {e}")
             return False
 
     def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
@@ -213,20 +214,15 @@ class DatabaseManager:
             extract_date = date.today()
 
         try:
-            cursor = self.connection.cursor()
-            query = "SELECT * FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(query, (extract_date,))
-            result = cursor.fetchone()
-
-            if result:
-                # 解析关键词JSON
-                result['keywords'] = json.loads(result['keywords'])
-                return result
-            else:
+            with self.engine.connect() as conn:
+                result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first()
+                if result:
+                    result = dict(result)  # 转为可变dict以支持item赋值
+                    result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else []
+                    return result
                 return None
-
         except Exception as e:
-            print(f"获取话题分析失败: {e}")
+            logger.exception(f"获取话题分析失败: {e}")
             return None
 
     def get_recent_topics(self, days: int = 7) -> List[Dict]:
@@ -240,23 +236,23 @@ class DatabaseManager:
             话题分析列表
         """
         try:
-            cursor = self.connection.cursor()
-            query = """
-            SELECT * FROM daily_topics
-            WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            ORDER BY extract_date DESC
-            """
-            cursor.execute(query, (days,))
-            results = cursor.fetchall()
-
-            # 解析每个结果的关键词JSON
-            for result in results:
-                result['keywords'] = json.loads(result['keywords'])
-
-            return results
-
+            start_date = date.today() - timedelta(days=days)
+            with self.engine.connect() as conn:
+                results = conn.execute(
+                    text(
+                        """
+                        SELECT * FROM daily_topics
+                        WHERE extract_date >= :start_date
+                        ORDER BY extract_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).mappings().all()
+                for r in results:
+                    r["keywords"] = json.loads(r["keywords"]) if r.get("keywords") else []
+                return results
         except Exception as e:
-            print(f"获取最近话题分析失败: {e}")
+            logger.exception(f"获取最近话题分析失败: {e}")
             return []
 
     # ==================== 统计查询 ====================
@@ -264,56 +260,48 @@ class DatabaseManager:
     def get_summary_stats(self, days: int = 7) -> Dict:
         """获取统计摘要"""
         try:
-            cursor = self.connection.cursor()
-
-            # 新闻统计
-            news_query = """
-            SELECT
-                crawl_date,
-                COUNT(*) as news_count,
-                COUNT(DISTINCT source_platform) as platforms_count
-            FROM daily_news
-            WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            GROUP BY crawl_date
-            ORDER BY crawl_date DESC
-            """
-            cursor.execute(news_query, (days,))
-            news_stats = cursor.fetchall()
-
-            # 话题统计
-            topics_query = """
-            SELECT
-                extract_date,
-                keywords,
-                CHAR_LENGTH(summary) as summary_length
-            FROM daily_topics
-            WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            ORDER BY extract_date DESC
-            """
-            cursor.execute(topics_query, (days,))
-            topics_stats = cursor.fetchall()
-
-            return {
-                'news_stats': news_stats,
-                'topics_stats': topics_stats
-            }
-
+            start_date = date.today() - timedelta(days=days)
+            with self.engine.connect() as conn:
+                news_stats = conn.execute(
+                    text(
+                        """
+                        SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms_count
+                        FROM daily_news
+                        WHERE crawl_date >= :start_date
+                        GROUP BY crawl_date
+                        ORDER BY crawl_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).all()
+                topics_stats = conn.execute(
+                    text(
+                        """
+                        SELECT extract_date, keywords, CHAR_LENGTH(topic_description) as summary_length
+                        FROM daily_topics
+                        WHERE extract_date >= :start_date
+                        ORDER BY extract_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).all()
+                return {"news_stats": news_stats, "topics_stats": topics_stats}
         except Exception as e:
-            print(f"获取统计摘要失败: {e}")
-            return {'news_stats': [], 'topics_stats': []}
+            logger.exception(f"获取统计摘要失败: {e}")
+            return {"news_stats": [], "topics_stats": []}
 
 if __name__ == "__main__":
     # 测试数据库管理器
     with DatabaseManager() as db:
         # 测试获取新闻
         news = db.get_daily_news()
-        print(f"今日新闻数量: {len(news)}")
+        logger.info(f"今日新闻数量: {len(news)}")
 
         # 测试获取话题
         topics = db.get_daily_topics()
         if topics:
-            print(f"今日话题关键词: {topics['keywords']}")
+            logger.info(f"今日话题关键词: {topics['keywords']}")
         else:
-            print("今日暂无话题分析")
+            logger.info("今日暂无话题分析")
 
-    print("简化数据库管理器测试完成!")
+    logger.info("简化数据库管理器测试完成!")
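
The pattern worth noting in the rewrite above is the transaction scoping: `engine.begin()` commits on clean exit and rolls back on an exception, and the insert loop opens one transaction per row so a single bad record cannot poison the batch. A self-contained sketch of that behavior (the SQLite in-memory backend and table are chosen purely for illustration):

```python
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///:memory:", future=True)  # any backend behaves the same
with engine.begin() as conn:  # commits on success, rolls back on exception
    conn.execute(text("CREATE TABLE t (id INTEGER PRIMARY KEY, title TEXT)"))

rows = [{"id": 1, "title": "a"}, {"id": 1, "title": "dup"}, {"id": 2, "title": "b"}]
saved = 0
for row in rows:
    try:
        # One transaction per row: the duplicate key only loses that row,
        # not the whole batch -- mirroring save_daily_news above.
        with engine.begin() as conn:
            conn.execute(text("INSERT INTO t (id, title) VALUES (:id, :title)"), row)
        saved += 1
    except Exception:
        continue  # the failed row's transaction was already rolled back

print(saved)  # 2
```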
@@ -11,6 +11,7 @@ import argparse
 from datetime import datetime, date
 from pathlib import Path
 from typing import List, Dict, Optional
+from loguru import logger
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent
@@ -21,8 +22,8 @@ try:
     from BroadTopicExtraction.topic_extractor import TopicExtractor
     from BroadTopicExtraction.database_manager import DatabaseManager
 except ImportError as e:
-    print(f"导入模块失败: {e}")
-    print("请确保在项目根目录运行,并且已安装所有依赖")
+    logger.exception(f"导入模块失败: {e}")
+    logger.error("请确保在项目根目录运行,并且已安装所有依赖")
     sys.exit(1)
 
 class BroadTopicExtraction:
@@ -34,7 +35,7 @@ class BroadTopicExtraction:
         self.topic_extractor = TopicExtractor()
         self.db_manager = DatabaseManager()
 
-        print("BroadTopicExtraction 初始化完成")
+        logger.info("BroadTopicExtraction 初始化完成")
 
     def close(self):
         """关闭资源"""
@@ -68,21 +69,22 @@ class BroadTopicExtraction:
         Returns:
             包含完整提取结果的字典
         """
-        print("\n" + "=" * 80)
-        print("MindSpider AI爬虫 - 每日话题提取")
-        print("=" * 80)
-        print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-        print(f"目标日期: {date.today()}")
+        extraction_result_message = ""
+        extraction_result_message += "\nMindSpider AI爬虫 - 每日话题提取\n"
+        extraction_result_message += f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
+        extraction_result_message += f"目标日期: {date.today()}\n"
 
         if news_sources:
-            print(f"指定平台: {len(news_sources)} 个")
+            extraction_result_message += f"指定平台: {len(news_sources)} 个\n"
             for source in news_sources:
                 source_name = SOURCE_NAMES.get(source, source)
-                print(f"  - {source_name}")
+                extraction_result_message += f"  - {source_name}\n"
         else:
-            print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
+            extraction_result_message += f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台\n"
 
-        print(f"关键词数: 最多 {max_keywords} 个")
+        extraction_result_message += f"关键词数: 最多 {max_keywords} 个\n"
+
+        logger.info(extraction_result_message)
 
         extraction_result = {
             'success': False,
@@ -96,7 +98,7 @@ class BroadTopicExtraction:
 
         try:
             # 步骤1: 收集新闻
-            print("\n【步骤1】收集热点新闻...")
+            logger.info("【步骤1】收集热点新闻...")
             news_result = await self.news_collector.collect_and_save_news(
                 sources=news_sources
             )
@@ -112,7 +114,7 @@ class BroadTopicExtraction:
                 raise Exception("新闻收集失败或没有获取到新闻")
 
             # 步骤2: 提取关键词和生成总结
-            print("\n【步骤2】提取关键词和生成总结...")
+            logger.info("【步骤2】提取关键词和生成总结...")
             keywords, summary = self.topic_extractor.extract_keywords_and_summary(
                 news_result['news_list'],
                 max_keywords=max_keywords
@@ -126,10 +128,10 @@ class BroadTopicExtraction:
             }
 
             if not keywords:
-                print("警告: 没有提取到有效关键词")
+                logger.warning("警告: 没有提取到有效关键词")
 
             # 步骤3: 保存到数据库
-            print("\n【步骤3】保存分析结果到数据库...")
+            logger.info("【步骤3】保存分析结果到数据库...")
             save_success = self.db_manager.save_daily_topics(
                 keywords, summary, date.today()
             )
@@ -141,56 +143,47 @@ class BroadTopicExtraction:
             extraction_result['success'] = True
             extraction_result['end_time'] = datetime.now().isoformat()
 
-            print("\n" + "=" * 80)
-            print("每日话题提取流程完成!")
-            print("=" * 80)
+            logger.info("每日话题提取流程完成!")
 
             return extraction_result
 
         except Exception as e:
-            print(f"\n话题提取流程失败: {e}")
+            logger.exception(f"话题提取流程失败: {e}")
             extraction_result['error'] = str(e)
             extraction_result['end_time'] = datetime.now().isoformat()
             return extraction_result
 
     def print_extraction_results(self, extraction_result: Dict):
         """打印提取结果"""
-        print("\n" + "=" * 80)
-        print("话题提取结果报告")
-        print("=" * 80)
-
-        if not extraction_result['success']:
-            print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
-            return
+        extraction_result_message = ""
 
         # 新闻收集结果
         news_data = extraction_result.get('news_collection', {})
-        print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
-        print(f"   成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
+        extraction_result_message += f"\n📰 新闻收集: {news_data.get('total_news', 0)} 条新闻\n"
+        extraction_result_message += f"   成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}\n"
 
         # 话题提取结果
         topic_data = extraction_result.get('topic_extraction', {})
         keywords = topic_data.get('keywords', [])
         summary = topic_data.get('summary', '')
 
-        print(f"\n🔑 提取关键词: {len(keywords)} 个")
+        extraction_result_message += f"\n🔑 提取关键词: {len(keywords)} 个\n"
         if keywords:
             # 每行显示5个关键词
             for i in range(0, len(keywords), 5):
                 keyword_group = keywords[i:i+5]
-                print(f"   {', '.join(keyword_group)}")
+                extraction_result_message += f"   {', '.join(keyword_group)}\n"
 
-        print(f"\n📝 新闻总结:")
-        print(f"   {summary}")
+        extraction_result_message += f"\n📝 新闻总结:\n   {summary}\n"
 
         # 数据库保存结果
         db_data = extraction_result.get('database_save', {})
         if db_data.get('success'):
-            print(f"\n💾 数据库保存: 成功")
+            extraction_result_message += f"\n💾 数据库保存: 成功\n"
         else:
-            print(f"\n💾 数据库保存: 失败")
+            extraction_result_message += f"\n💾 数据库保存: 失败\n"
 
-        print("\n" + "=" * 80)
+        logger.info(extraction_result_message)
 
     def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
         """
@@ -207,7 +200,7 @@ class BroadTopicExtraction:
             topics_data = self.db_manager.get_daily_topics(extract_date)
 
             if not topics_data:
-                print(f"没有找到 {extract_date or date.today()} 的话题数据")
+                logger.info(f"没有找到 {extract_date or date.today()} 的话题数据")
                 return []
 
             keywords = topics_data['keywords']
@@ -215,11 +208,11 @@ class BroadTopicExtraction:
             # 生成搜索关键词
             search_keywords = self.topic_extractor.get_search_keywords(keywords)
 
-            print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
+            logger.info(f"准备了 {len(search_keywords)} 个关键词用于爬取")
             return search_keywords
 
         except Exception as e:
-            print(f"获取爬取关键词失败: {e}")
+            logger.error(f"获取爬取关键词失败: {e}")
             return []
 
     def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
@@ -227,7 +220,7 @@ class BroadTopicExtraction:
         try:
             return self.db_manager.get_daily_topics(target_date)
         except Exception as e:
-            print(f"获取每日分析失败: {e}")
+            logger.error(f"获取每日分析失败: {e}")
             return None
 
     def get_recent_analysis(self, days: int = 7) -> List[Dict]:
@@ -235,7 +228,7 @@ class BroadTopicExtraction:
         try:
             return self.db_manager.get_recent_topics(days)
         except Exception as e:
-            print(f"获取最近分析失败: {e}")
+            logger.error(f"获取最近分析失败: {e}")
             return []
 
 # ==================== 命令行工具 ====================
@@ -260,17 +253,17 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
             news_data = result.get('news_collection', {})
             topic_data = result.get('topic_extraction', {})
 
-            print(f"✅ 话题提取成功完成!")
-            print(f"   收集新闻: {news_data.get('total_news', 0)} 条")
-            print(f"   提取关键词: {len(topic_data.get('keywords', []))} 个")
-            print(f"   生成总结: {len(topic_data.get('summary', ''))} 字符")
+            logger.info(f"✅ 话题提取成功完成!")
+            logger.info(f"   收集新闻: {news_data.get('total_news', 0)} 条")
+            logger.info(f"   提取关键词: {len(topic_data.get('keywords', []))} 个")
+            logger.info(f"   生成总结: {len(topic_data.get('summary', ''))} 字符")
 
             # 获取爬取关键词
             crawling_keywords = extractor.get_keywords_for_crawling()
 
             if crawling_keywords:
-                print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
-                print(f"   {', '.join(crawling_keywords)}")
+                logger.info(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
+                logger.info(f"   {', '.join(crawling_keywords)}")
 
                 # 保存关键词到文件
                 keywords_file = project_root / "data" / "daily_keywords.txt"
@@ -279,16 +272,16 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
                 with open(keywords_file, 'w', encoding='utf-8') as f:
                     f.write('\n'.join(crawling_keywords))
 
-                print(f"   关键词已保存到: {keywords_file}")
+                logger.info(f"   关键词已保存到: {keywords_file}")
 
             return True
 
         else:
-            print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
+            logger.error(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
             return False
 
     except Exception as e:
-        print(f"❌ 执行过程中发生错误: {e}")
+        logger.error(f"❌ 执行过程中发生错误: {e}")
         return False
 
 def main():
@@ -304,14 +297,14 @@ def main():
 
     # 显示支持的新闻源
     if args.list_sources:
-        print("支持的新闻源平台:")
+        logger.info("支持的新闻源平台:")
         for source, name in SOURCE_NAMES.items():
-            print(f"  {source:<25} {name}")
+            logger.info(f"  {source:<25} {name}")
         return
 
     # 验证参数
     if args.keywords < 1 or args.keywords > 200:
-        print("关键词数量应在1-200之间")
+        logger.error("关键词数量应在1-200之间")
         sys.exit(1)
 
     # 运行提取
@@ -325,7 +318,7 @@ def main():
         sys.exit(0 if success else 1)
 
     except KeyboardInterrupt:
-        print("\n用户中断操作")
+        logger.info("用户中断操作")
         sys.exit(1)
 
 if __name__ == "__main__":
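
The argparse setup itself is outside this hunk; judging from `args.list_sources` and `args.keywords`, the entry point would be driven roughly like this (the script path and flag spellings are assumptions inferred from the code above):

```bash
# List the supported news-source platforms
python BroadTopicExtraction/main.py --list-sources

# Extract topics with at most 50 keywords (valid range is 1-200)
python BroadTopicExtraction/main.py --keywords 50
```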
@@ -18,19 +18,20 @@ sys.path.append(str(project_root))
 
 try:
     import config
+    from config import settings
 except ImportError:
-    raise ImportError("无法导入config.py配置文件")
+    raise ImportError("无法导入settings.py配置文件")
 
 class TopicExtractor:
     """话题提取器"""
-    
+
     def __init__(self):
         """初始化话题提取器"""
         self.client = OpenAI(
-            api_key=config.DEEPSEEK_API_KEY,
-            base_url="https://api.deepseek.com"
+            api_key=settings.MINDSPIDER_API_KEY,
+            base_url=settings.MINDSPIDER_BASE_URL
         )
-        self.model = "deepseek-chat"
+        self.model = settings.MINDSPIDER_MODEL_NAME
 
     def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
         """
@@ -11,8 +11,8 @@ from datetime import date, timedelta, datetime
 from pathlib import Path
 from typing import List, Dict, Optional
 import random
-import pymysql
-from pymysql.cursors import DictCursor
+from sqlalchemy import create_engine, text
+from sqlalchemy.engine import Engine
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent
@@ -23,30 +23,38 @@ try:
 except ImportError:
     raise ImportError("无法导入config.py配置文件")
 
+from config import settings
+from loguru import logger
+
 class KeywordManager:
     """关键词管理器"""
 
     def __init__(self):
         """初始化关键词管理器"""
-        self.connection = None
+        self.engine: Engine = None
         self.connect()
 
     def connect(self):
         """连接数据库"""
         try:
-            self.connection = pymysql.connect(
-                host=config.DB_HOST,
-                port=config.DB_PORT,
-                user=config.DB_USER,
-                password=config.DB_PASSWORD,
-                database=config.DB_NAME,
-                charset=config.DB_CHARSET,
-                autocommit=True,
-                cursorclass=DictCursor
-            )
-            print(f"关键词管理器成功连接到数据库: {config.DB_NAME}")
+            dialect = (settings.DB_DIALECT or "mysql").lower()
+            if dialect in ("postgresql", "postgres"):
+                url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
+            else:
+                url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
+            self.engine = create_engine(url, future=True)
+            logger.info(f"关键词管理器成功连接到数据库: {settings.DB_NAME}")
+        except ModuleNotFoundError as e:
+            missing: str = str(e)
+            if "psycopg" in missing:
+                logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
+            elif "pymysql" in missing:
+                logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
+            else:
+                logger.error(f"数据库连接失败(缺少驱动): {e}")
+            raise
         except Exception as e:
-            print(f"关键词管理器数据库连接失败: {e}")
+            logger.exception(f"关键词管理器数据库连接失败: {e}")
             raise
 
     def get_latest_keywords(self, target_date: date = None, max_keywords: int = 100) -> List[str]:
@@ -63,24 +71,24 @@ class KeywordManager:
         if not target_date:
             target_date = date.today()
 
-        print(f"正在获取 {target_date} 的关键词...")
+        logger.info(f"正在获取 {target_date} 的关键词...")
 
         # 首先尝试获取指定日期的关键词
         topics_data = self.get_daily_topics(target_date)
 
         if topics_data and topics_data.get('keywords'):
             keywords = topics_data['keywords']
-            print(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")
+            logger.info(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")
 
             # 如果关键词太多,随机选择指定数量
             if len(keywords) > max_keywords:
                 keywords = random.sample(keywords, max_keywords)
-                print(f"随机选择了 {max_keywords} 个关键词")
+                logger.info(f"随机选择了 {max_keywords} 个关键词")
 
             return keywords
 
         # 如果没有当天的关键词,尝试获取最近几天的
-        print(f"{target_date} 没有关键词数据,尝试获取最近的关键词...")
+        logger.info(f"{target_date} 没有关键词数据,尝试获取最近的关键词...")
         recent_topics = self.get_recent_topics(days=7)
 
         if recent_topics:
@@ -95,11 +103,11 @@ class KeywordManager:
             if len(unique_keywords) > max_keywords:
                 unique_keywords = random.sample(unique_keywords, max_keywords)
 
-            print(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
+            logger.info(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
             return unique_keywords
 
         # 如果都没有,返回默认关键词
-        print("没有找到任何关键词数据,使用默认关键词")
+        logger.info("没有找到任何关键词数据,使用默认关键词")
         return self._get_default_keywords()
 
     def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
@@ -116,20 +124,22 @@ class KeywordManager:
             extract_date = date.today()
 
         try:
-            cursor = self.connection.cursor()
-            query = "SELECT * FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(query, (extract_date,))
-            result = cursor.fetchone()
+            with self.engine.connect() as conn:
+                result = conn.execute(
+                    text("SELECT * FROM daily_topics WHERE extract_date = :d"),
+                    {"d": extract_date},
+                ).mappings().first()
 
             if result:
-                # 解析关键词JSON
-                result['keywords'] = json.loads(result['keywords'])
+                # 转为可变dict再赋值
+                result = dict(result)
+                result['keywords'] = json.loads(result['keywords']) if result.get('keywords') else []
                 return result
             else:
                 return None
 
         except Exception as e:
-            print(f"获取话题分析失败: {e}")
+            logger.exception(f"获取话题分析失败: {e}")
             return None
 
     def get_recent_topics(self, days: int = 7) -> List[Dict]:
@@ -143,23 +153,28 @@ class KeywordManager:
             话题分析列表
         """
         try:
-            cursor = self.connection.cursor()
-            query = """
-            SELECT * FROM daily_topics
-            WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-            ORDER BY extract_date DESC
-            """
-            cursor.execute(query, (days,))
-            results = cursor.fetchall()
+            start_date = date.today() - timedelta(days=days)
+            with self.engine.connect() as conn:
+                results = conn.execute(
+                    text(
+                        """
+                        SELECT * FROM daily_topics
+                        WHERE extract_date >= :start_date
+                        ORDER BY extract_date DESC
+                        """
+                    ),
+                    {"start_date": start_date},
+                ).mappings().all()
 
-            # 解析每个结果的关键词JSON
+            # 转为可变dict列表再处理
+            results = [dict(r) for r in results]
             for result in results:
-                result['keywords'] = json.loads(result['keywords'])
+                result['keywords'] = json.loads(result['keywords']) if result.get('keywords') else []
 
             return results
 
         except Exception as e:
-            print(f"获取最近话题分析失败: {e}")
+            logger.exception(f"获取最近话题分析失败: {e}")
             return []
 
     def _get_default_keywords(self) -> List[str]:
@@ -190,8 +205,8 @@ class KeywordManager:
         keywords = self.get_latest_keywords(target_date, max_keywords)
 
         if keywords:
-            print(f"为 {len(platforms)} 个平台准备了相同的 {len(keywords)} 个关键词")
-            print(f"每个关键词将在所有平台上进行爬取")
+            logger.info(f"为 {len(platforms)} 个平台准备了相同的 {len(keywords)} 个关键词")
+            logger.info(f"每个关键词将在所有平台上进行爬取")
 
         return keywords
 
@@ -210,7 +225,7 @@ class KeywordManager:
         """
         keywords = self.get_latest_keywords(target_date, max_keywords)
 
-        print(f"为平台 {platform} 准备了 {len(keywords)} 个关键词(与其他平台相同)")
+        logger.info(f"为平台 {platform} 准备了 {len(keywords)} 个关键词(与其他平台相同)")
         return keywords
 
     def _filter_keywords_by_platform(self, keywords: List[str], platform: str) -> List[str]:
@@ -290,9 +305,9 @@ class KeywordManager:
 
     def close(self):
         """关闭数据库连接"""
-        if self.connection:
-            self.connection.close()
-            print("关键词管理器数据库连接已关闭")
+        if self.engine:
+            self.engine.dispose()
+            logger.info("关键词管理器数据库连接已关闭")
 
     def __enter__(self):
         return self
@@ -305,16 +320,16 @@ if __name__ == "__main__":
     with KeywordManager() as km:
         # 测试获取关键词
         keywords = km.get_latest_keywords(max_keywords=20)
-        print(f"获取到的关键词: {keywords}")
+        logger.info(f"获取到的关键词: {keywords}")
 
         # 测试平台分配
         platforms = ['xhs', 'dy', 'bili']
         distribution = km.distribute_keywords_by_platform(keywords, platforms)
         for platform, kws in distribution.items():
-            print(f"{platform}: {kws}")
+            logger.info(f"{platform}: {kws}")
 
         # 测试爬取摘要
         summary = km.get_crawling_summary()
-        print(f"爬取摘要: {summary}")
+        logger.info(f"爬取摘要: {summary}")
 
-    print("关键词管理器测试完成!")
+    logger.info("关键词管理器测试完成!")
@@ -13,6 +13,7 @@ from datetime import datetime @@ -13,6 +13,7 @@ from datetime import datetime
13 from pathlib import Path 13 from pathlib import Path
14 from typing import List, Dict, Optional 14 from typing import List, Dict, Optional
15 import json 15 import json
  16 +from loguru import logger
16 17
17 # 添加项目根目录到路径 18 # 添加项目根目录到路径
18 project_root = Path(__file__).parent.parent 19 project_root = Path(__file__).parent.parent
@@ -36,11 +37,15 @@ class PlatformCrawler: @@ -36,11 +37,15 @@ class PlatformCrawler:
36 if not self.mediacrawler_path.exists(): 37 if not self.mediacrawler_path.exists():
37 raise FileNotFoundError(f"MediaCrawler目录不存在: {self.mediacrawler_path}") 38 raise FileNotFoundError(f"MediaCrawler目录不存在: {self.mediacrawler_path}")
38 39
39 - print(f"初始化平台爬虫管理器,MediaCrawler路径: {self.mediacrawler_path}") 40 + logger.info(f"初始化平台爬虫管理器,MediaCrawler路径: {self.mediacrawler_path}")
40 41
41 def configure_mediacrawler_db(self): 42 def configure_mediacrawler_db(self):
42 - """配置MediaCrawler使用我们的MySQL数据库""" 43 + """配置MediaCrawler使用我们的数据库(MySQL或PostgreSQL)"""
43 try: 44 try:
  45 + # 判断数据库类型
  46 + db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
  47 + is_postgresql = db_dialect in ("postgresql", "postgres")
  48 +
44 # 修改MediaCrawler的数据库配置 49 # 修改MediaCrawler的数据库配置
45 db_config_path = self.mediacrawler_path / "config" / "db_config.py" 50 db_config_path = self.mediacrawler_path / "config" / "db_config.py"
46 51
@@ -48,7 +53,14 @@ class PlatformCrawler: @@ -48,7 +53,14 @@ class PlatformCrawler:
48 with open(db_config_path, 'r', encoding='utf-8') as f: 53 with open(db_config_path, 'r', encoding='utf-8') as f:
49 content = f.read() 54 content = f.read()
50 55
51 - # 替换数据库配置 56 + # PostgreSQL配置值:如果使用PostgreSQL则使用MindSpider配置,否则使用默认值或环境变量
  57 + pg_password = config.settings.DB_PASSWORD if is_postgresql else "bettafish"
  58 + pg_user = config.settings.DB_USER if is_postgresql else "bettafish"
  59 + pg_host = config.settings.DB_HOST if is_postgresql else "127.0.0.1"
  60 + pg_port = config.settings.DB_PORT if is_postgresql else 5432
  61 + pg_db_name = config.settings.DB_NAME if is_postgresql else "bettafish"
  62 +
  63 + # 替换数据库配置 - 使用MindSpider的数据库配置
52 new_config = f'''# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: 64 new_config = f'''# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
53 # 1. 不得用于任何商业用途。 65 # 1. 不得用于任何商业用途。
54 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 66 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
@@ -63,11 +75,19 @@ class PlatformCrawler: @@ -63,11 +75,19 @@ class PlatformCrawler:
63 import os 75 import os
64 76
65 # mysql config - 使用MindSpider的数据库配置 77 # mysql config - 使用MindSpider的数据库配置
66 -MYSQL_DB_PWD = "{config.DB_PASSWORD}"  
67 -MYSQL_DB_USER = "{config.DB_USER}"  
68 -MYSQL_DB_HOST = "{config.DB_HOST}"  
69 -MYSQL_DB_PORT = {config.DB_PORT}  
70 -MYSQL_DB_NAME = "{config.DB_NAME}" 78 +MYSQL_DB_PWD = "{config.settings.DB_PASSWORD}"
  79 +MYSQL_DB_USER = "{config.settings.DB_USER}"
  80 +MYSQL_DB_HOST = "{config.settings.DB_HOST}"
  81 +MYSQL_DB_PORT = {config.settings.DB_PORT}
  82 +MYSQL_DB_NAME = "{config.settings.DB_NAME}"
  83 +
  84 +mysql_db_config = {{
  85 + "user": MYSQL_DB_USER,
  86 + "password": MYSQL_DB_PWD,
  87 + "host": MYSQL_DB_HOST,
  88 + "port": MYSQL_DB_PORT,
  89 + "db_name": MYSQL_DB_NAME,
  90 +}}
71 91
72 92
73 # redis config 93 # redis config
@@ -81,17 +101,39 @@ CACHE_TYPE_REDIS = "redis" @@ -81,17 +101,39 @@ CACHE_TYPE_REDIS = "redis"
81 CACHE_TYPE_MEMORY = "memory" 101 CACHE_TYPE_MEMORY = "memory"
82 102
83 # sqlite config 103 # sqlite config
84 -SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")''' 104 +SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "database", "sqlite_tables.db")
  105 +
  106 +sqlite_db_config = {{
  107 + "db_path": SQLITE_DB_PATH
  108 +}}
  109 +
  110 +# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
  111 +POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "{pg_password}")
  112 +POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "{pg_user}")
  113 +POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "{pg_host}")
  114 +POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "{pg_port}")
  115 +POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "{pg_db_name}")
  116 +
  117 +postgresql_db_config = {{
  118 + "user": POSTGRESQL_DB_USER,
  119 + "password": POSTGRESQL_DB_PWD,
  120 + "host": POSTGRESQL_DB_HOST,
  121 + "port": POSTGRESQL_DB_PORT,
  122 + "db_name": POSTGRESQL_DB_NAME,
  123 +}}
  124 +
  125 +'''
85 126
86 # 写入新配置 127 # 写入新配置
87 with open(db_config_path, 'w', encoding='utf-8') as f: 128 with open(db_config_path, 'w', encoding='utf-8') as f:
88 f.write(new_config) 129 f.write(new_config)
89 130
90 - print("已配置MediaCrawler使用MindSpider数据库") 131 + db_type = "PostgreSQL" if is_postgresql else "MySQL"
  132 + logger.info(f"已配置MediaCrawler使用MindSpider {db_type}数据库")
91 return True 133 return True
92 134
93 except Exception as e: 135 except Exception as e:
94 - print(f"配置MediaCrawler数据库失败: {e}") 136 + logger.exception(f"配置MediaCrawler数据库失败: {e}")
95 return False 137 return False
96 138
97 def create_base_config(self, platform: str, keywords: List[str], 139 def create_base_config(self, platform: str, keywords: List[str],
@@ -109,6 +151,11 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -109,6 +151,11 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
109 是否配置成功 151 是否配置成功
110 """ 152 """
111 try: 153 try:
  154 + # 判断数据库类型,确定 SAVE_DATA_OPTION
  155 + db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
  156 + is_postgresql = db_dialect in ("postgresql", "postgres")
  157 + save_data_option = "postgresql" if is_postgresql else "db"
  158 +
112 base_config_path = self.mediacrawler_path / "config" / "base_config.py" 159 base_config_path = self.mediacrawler_path / "config" / "base_config.py"
113 160
114 # 将关键词列表转换为逗号分隔的字符串 161 # 将关键词列表转换为逗号分隔的字符串
@@ -130,7 +177,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -130,7 +177,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
130 elif line.startswith('CRAWLER_TYPE = '): 177 elif line.startswith('CRAWLER_TYPE = '):
131 new_lines.append(f'CRAWLER_TYPE = "{crawler_type}" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)') 178 new_lines.append(f'CRAWLER_TYPE = "{crawler_type}" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)')
132 elif line.startswith('SAVE_DATA_OPTION = '): 179 elif line.startswith('SAVE_DATA_OPTION = '):
133 - new_lines.append('SAVE_DATA_OPTION = "db" # csv or db or json or sqlite') 180 + new_lines.append(f'SAVE_DATA_OPTION = "{save_data_option}" # csv or db or json or sqlite or postgresql')
134 elif line.startswith('CRAWLER_MAX_NOTES_COUNT = '): 181 elif line.startswith('CRAWLER_MAX_NOTES_COUNT = '):
135 new_lines.append(f'CRAWLER_MAX_NOTES_COUNT = {max_notes}') 182 new_lines.append(f'CRAWLER_MAX_NOTES_COUNT = {max_notes}')
136 elif line.startswith('ENABLE_GET_COMMENTS = '): 183 elif line.startswith('ENABLE_GET_COMMENTS = '):
@@ -146,11 +193,11 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -146,11 +193,11 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
146 with open(base_config_path, 'w', encoding='utf-8') as f: 193 with open(base_config_path, 'w', encoding='utf-8') as f:
147 f.write('\n'.join(new_lines)) 194 f.write('\n'.join(new_lines))
148 195
149 - print(f"已配置 {platform} 平台,关键词数量: {len(keywords)}") 196 + logger.info(f"已配置 {platform} 平台,爬取类型: {crawler_type},关键词数量: {len(keywords)},最大爬取数量: {max_notes},保存数据方式: {save_data_option}")
150 return True 197 return True
151 198
152 except Exception as e: 199 except Exception as e:
153 - print(f"创建基础配置失败: {e}") 200 + logger.exception(f"创建基础配置失败: {e}")
154 return False 201 return False
155 202
156 def run_crawler(self, platform: str, keywords: List[str], 203 def run_crawler(self, platform: str, keywords: List[str],
@@ -173,8 +220,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -173,8 +220,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
173 if not keywords: 220 if not keywords:
174 raise ValueError("关键词列表不能为空") 221 raise ValueError("关键词列表不能为空")
175 222
176 - print(f"\n开始爬取平台: {platform}")  
177 - print(f"关键词: {keywords[:5]}{'...' if len(keywords) > 5 else ''} (共{len(keywords)}个)") 223 + start_message = f"\n开始爬取平台: {platform}"
  224 + start_message += f"\n关键词: {keywords[:5]}{'...' if len(keywords) > 5 else ''} (共{len(keywords)}个)"
  225 + logger.info(start_message)
178 226
179 start_time = datetime.now() 227 start_time = datetime.now()
180 228
@@ -187,22 +235,27 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -187,22 +235,27 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
187 if not self.create_base_config(platform, keywords, "search", max_notes): 235 if not self.create_base_config(platform, keywords, "search", max_notes):
188 return {"success": False, "error": "基础配置创建失败"} 236 return {"success": False, "error": "基础配置创建失败"}
189 237
  238 + # 判断数据库类型,确定 save_data_option
  239 + db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
  240 + is_postgresql = db_dialect in ("postgresql", "postgres")
  241 + save_data_option = "postgresql" if is_postgresql else "db"
  242 +
190 # 构建命令 243 # 构建命令
191 cmd = [ 244 cmd = [
192 sys.executable, "main.py", 245 sys.executable, "main.py",
193 "--platform", platform, 246 "--platform", platform,
194 "--lt", login_type, 247 "--lt", login_type,
195 "--type", "search", 248 "--type", "search",
196 - "--save_data_option", "db" 249 + "--save_data_option", save_data_option
197 ] 250 ]
198 251
199 - print(f"执行命令: {' '.join(cmd)}") 252 + logger.info(f"执行命令: {' '.join(cmd)}")
200 253
201 # 切换到MediaCrawler目录并执行 254 # 切换到MediaCrawler目录并执行
202 result = subprocess.run( 255 result = subprocess.run(
203 cmd, 256 cmd,
204 cwd=self.mediacrawler_path, 257 cwd=self.mediacrawler_path,
205 - timeout=1800 # 30分钟超时 258 + timeout=3600 # 60分钟超时
206 ) 259 )
207 260
208 end_time = datetime.now() 261 end_time = datetime.now()
@@ -226,17 +279,17 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -226,17 +279,17 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
226 self.crawl_stats[platform] = crawl_stats 279 self.crawl_stats[platform] = crawl_stats
227 280
228 if result.returncode == 0: 281 if result.returncode == 0:
229 - print(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒") 282 + logger.info(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒")
230 else: 283 else:
231 - print(f"❌ {platform} 爬取失败,返回码: {result.returncode}") 284 + logger.error(f"❌ {platform} 爬取失败,返回码: {result.returncode}")
232 285
233 return crawl_stats 286 return crawl_stats
234 287
235 except subprocess.TimeoutExpired: 288 except subprocess.TimeoutExpired:
236 - print(f"❌ {platform} 爬取超时") 289 + logger.exception(f"❌ {platform} 爬取超时")
237 return {"success": False, "error": "爬取超时", "platform": platform} 290 return {"success": False, "error": "爬取超时", "platform": platform}
238 except Exception as e: 291 except Exception as e:
239 - print(f"❌ {platform} 爬取异常: {e}") 292 + logger.exception(f"❌ {platform} 爬取异常: {e}")
240 return {"success": False, "error": str(e), "platform": platform} 293 return {"success": False, "error": str(e), "platform": platform}
241 294
242 def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict: 295 def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict:
@@ -291,10 +344,14 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -291,10 +344,14 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
291 Returns: 344 Returns:
292 总体爬取统计 345 总体爬取统计
293 """ 346 """
294 - print(f"\n🚀 开始全平台关键词爬取")  
295 - print(f" 关键词数量: {len(keywords)}")  
296 - print(f" 平台数量: {len(platforms)}")  
297 - print(f" 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}") 347 +
  348 + start_message = f"\n🚀 开始全平台关键词爬取"
  349 + start_message += f"\n 关键词数量: {len(keywords)}"
  350 + start_message += f"\n 平台数量: {len(platforms)}"
  351 + start_message += f"\n 登录方式: {login_type}"
  352 + start_message += f"\n 每个关键词在每个平台的最大爬取数量: {max_notes_per_keyword}"
  353 + start_message += f"\n 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}"
  354 + logger.info(start_message)
298 355
299 total_stats = { 356 total_stats = {
300 "total_keywords": len(keywords), 357 "total_keywords": len(keywords),
@@ -319,8 +376,8 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -319,8 +376,8 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
319 376
320 # 对每个平台一次性爬取所有关键词 377 # 对每个平台一次性爬取所有关键词
321 for platform in platforms: 378 for platform in platforms:
322 - print(f"\n📝 在 {platform} 平台爬取所有关键词")  
323 - print(f" 关键词: {', '.join(keywords[:5])}{'...' if len(keywords) > 5 else ''}") 379 + logger.info(f"\n📝 在 {platform} 平台爬取所有关键词")
  380 + logger.info(f" 关键词: {', '.join(keywords[:5])}{'...' if len(keywords) > 5 else ''}")
324 381
325 try: 382 try:
326 # 一次性传递所有关键词给平台 383 # 一次性传递所有关键词给平台
@@ -344,7 +401,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -344,7 +401,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
344 total_stats["keyword_results"][keyword] = {} 401 total_stats["keyword_results"][keyword] = {}
345 total_stats["keyword_results"][keyword][platform] = result 402 total_stats["keyword_results"][keyword][platform] = result
346 403
347 - print(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论") 404 + logger.info(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论")
348 else: 405 else:
349 total_stats["failed_tasks"] += len(keywords) 406 total_stats["failed_tasks"] += len(keywords)
350 total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) 407 total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords)
@@ -355,7 +412,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -355,7 +412,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
355 total_stats["keyword_results"][keyword] = {} 412 total_stats["keyword_results"][keyword] = {}
356 total_stats["keyword_results"][keyword][platform] = result 413 total_stats["keyword_results"][keyword][platform] = result
357 414
358 - print(f" ❌ 失败: {result.get('error', '未知错误')}") 415 + logger.error(f" ❌ 失败: {result.get('error', '未知错误')}")
359 416
360 except Exception as e: 417 except Exception as e:
361 total_stats["failed_tasks"] += len(keywords) 418 total_stats["failed_tasks"] += len(keywords)
@@ -368,22 +425,24 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -368,22 +425,24 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
368 total_stats["keyword_results"][keyword] = {} 425 total_stats["keyword_results"][keyword] = {}
369 total_stats["keyword_results"][keyword][platform] = error_result 426 total_stats["keyword_results"][keyword][platform] = error_result
370 427
371 - print(f" ❌ 异常: {e}") 428 + logger.error(f" ❌ 异常: {e}")
372 429
373 # 打印详细统计 430 # 打印详细统计
374 - print(f"\n📊 全平台关键词爬取完成!")  
375 - print(f" 总任务: {total_stats['total_tasks']}")  
376 - print(f" 成功: {total_stats['successful_tasks']}")  
377 - print(f" 失败: {total_stats['failed_tasks']}")  
378 - print(f" 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%")  
379 - print(f" 总内容: {total_stats['total_notes']} 条")  
380 - print(f" 总评论: {total_stats['total_comments']} 条") 431 + finish_message = f"\n📊 全平台关键词爬取完成!"
  432 + finish_message += f"\n 总任务: {total_stats['total_tasks']}"
  433 + finish_message += f"\n 成功: {total_stats['successful_tasks']}"
  434 + finish_message += f"\n 失败: {total_stats['failed_tasks']}"
  435 + finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%"
  436 + finish_message += f"\n 总内容: {total_stats['total_notes']} 条"
  437 + finish_message += f"\n 总评论: {total_stats['total_comments']} 条"
  438 + logger.info(finish_message)
381 439
382 - print(f"\n📈 各平台统计:") 440 + platform_summary_message = "\n📈 各平台统计:"
383 for platform, stats in total_stats["platform_summary"].items(): 441 for platform, stats in total_stats["platform_summary"].items():
384 success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0 442 success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0
385 - print(f" {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), "  
386 - f"{stats['total_notes']} 条内容") 443 + platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), "
  444 + platform_summary_message += f"{stats['total_notes']} 条内容"
  445 + logger.info(platform_summary_message)
387 446
388 return total_stats 447 return total_stats
389 448
@@ -403,9 +462,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem @@ -403,9 +462,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
403 try: 462 try:
404 with open(log_path, 'w', encoding='utf-8') as f: 463 with open(log_path, 'w', encoding='utf-8') as f:
405 json.dump(self.crawl_stats, f, ensure_ascii=False, indent=2) 464 json.dump(self.crawl_stats, f, ensure_ascii=False, indent=2)
406 - print(f"爬取日志已保存到: {log_path}") 465 + logger.info(f"爬取日志已保存到: {log_path}")
407 except Exception as e: 466 except Exception as e:
408 - print(f"保存爬取日志失败: {e}") 467 + logger.exception(f"保存爬取日志失败: {e}")
409 468
410 if __name__ == "__main__": 469 if __name__ == "__main__":
411 # 测试平台爬虫管理器 470 # 测试平台爬虫管理器
@@ -415,5 +474,5 @@ if __name__ == "__main__": @@ -415,5 +474,5 @@ if __name__ == "__main__":
415 test_keywords = ["科技", "AI", "编程"] 474 test_keywords = ["科技", "AI", "编程"]
416 result = crawler.run_crawler("xhs", test_keywords, max_notes=5) 475 result = crawler.run_crawler("xhs", test_keywords, max_notes=5)
417 476
418 - print(f"测试结果: {result}")  
419 - print("平台爬虫管理器测试完成!") 477 + logger.info(f"测试结果: {result}")
  478 + logger.info("平台爬虫管理器测试完成!")
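With every module now routing output through loguru, the PR relies on loguru's default stderr sink. If persistent log files are wanted, a shared sink could be configured once at the entry point; the following is a minimal sketch, assuming a hypothetical `logs/` directory and rotation policy that are not part of this diff:

```python
# Hypothetical loguru setup; the diff itself only uses loguru's default stderr sink.
import sys
from loguru import logger

logger.remove()                       # drop the default sink before adding our own
logger.add(sys.stderr, level="INFO")  # keep console output at INFO and above
logger.add(
    "logs/mindspider_{time:YYYY-MM-DD}.log",  # assumed log path, one file per day
    rotation="00:00",       # start a new file at midnight
    retention="7 days",     # keep one week of history
    encoding="utf-8",       # log messages are largely Chinese, so be explicit
)

logger.info("成功连接到数据库: mindspider")  # mirrors the messages used in the diff
```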
@@ -217,26 +217,54 @@ git clone https://github.com/yourusername/MindSpider.git @@ -217,26 +217,54 @@ git clone https://github.com/yourusername/MindSpider.git
217 cd MindSpider 217 cd MindSpider
218 ``` 218 ```
219 219
220 -### 2. 创建并激活Conda环境 220 +### 2. 创建并激活环境
  221 +
  222 +#### Conda配置方法
221 225
222 ```bash 226 ```bash
  227 +# 创建名为 pytorch_python11 的conda环境并指定Python版本
223 conda create -n pytorch_python11 python=3.11 228 conda create -n pytorch_python11 python=3.11
  229 +# 激活该环境
224 conda activate pytorch_python11 230 conda activate pytorch_python11
225 ``` 231 ```
226 232
  233 +#### UV配置方法
  234 +
  235 +> UV 是一个快速、轻量级的 Python 包与虚拟环境管理工具,适合低依赖、便捷管理的场景。参考:https://github.com/astral-sh/uv
  236 +
  237 +- 安装uv(如未安装)
  238 +```bash
  239 +pip install uv
  240 +```
  241 +- 创建虚拟环境并激活
  242 +```bash
  243 +uv venv --python 3.11 # 创建3.11环境
  244 +source .venv/bin/activate # Linux/macOS
  245 +# 或
  246 +.venv\Scripts\activate # Windows
  247 +```
  248 +
227 ### 3. 安装依赖 250 ### 3. 安装依赖
228 251
229 ```bash 252 ```bash
230 # 安装Python依赖 253 # 安装Python依赖
231 pip install -r requirements.txt 254 pip install -r requirements.txt
232 255
  256 +
  257 +# 或使用 uv 安装(速度更快)
  258 +uv pip install -r requirements.txt
  259 +
233 # 安装Playwright浏览器驱动 261 # 安装Playwright浏览器驱动
234 playwright install 262 playwright install
235 ``` 263 ```
236 264
237 ### 4. 配置系统 265 ### 4. 配置系统
238 266
239 -编辑 `config.py` 文件,设置数据库和API配置: 267 +将 `.env.example` 复制为 `.env` 并放置在项目根目录,然后编辑 `.env` 文件,设置数据库和API配置:
240 268
241 ```python 269 ```python
242 # MySQL数据库配置 270 # MySQL数据库配置
@@ -248,7 +276,9 @@ DB_NAME = "mindspider" @@ -248,7 +276,9 @@ DB_NAME = "mindspider"
248 DB_CHARSET = "utf8mb4" 276 DB_CHARSET = "utf8mb4"
249 277
250 # DeepSeek API密钥 278 # DeepSeek API密钥
251 -DEEPSEEK_API_KEY = "your_deepseek_api_key" 279 +MINDSPIDER_BASE_URL=your_api_base_url
  280 +MINDSPIDER_API_KEY=sk-your-key
  281 +MINDSPIDER_MODEL_NAME=deepseek-chat
252 ``` 282 ```
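The sample above only shows MySQL-style values. Since the new `Settings` class (see the `config.py` diff further down) also reads a `DB_DIALECT` field, a PostgreSQL deployment's `.env` would plausibly look like the sketch below; host, port, and credentials are placeholders, not values from this PR:

```bash
# Hypothetical .env for a PostgreSQL deployment (DB_DIALECT defaults to "mysql")
DB_DIALECT=postgresql
DB_HOST=127.0.0.1
DB_PORT=5432
DB_USER=your_username
DB_PASSWORD=your_password
DB_NAME=mindspider

MINDSPIDER_BASE_URL=https://api.deepseek.com
MINDSPIDER_API_KEY=sk-your-key
MINDSPIDER_MODEL_NAME=deepseek-chat
```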
253 283
254 ### 5. 初始化系统 284 ### 5. 初始化系统
@@ -418,6 +448,11 @@ python main.py --status @@ -418,6 +448,11 @@ python main.py --status
418 ```bash 448 ```bash
419 # 重新安装 449 # 重新安装
420 pip install playwright 450 pip install playwright
  451 +
  452 +# 或使用 uv 重新安装
  453 +uv pip install playwright
  454 +
421 playwright install 456 playwright install
422 ``` 457 ```
423 458
@@ -3,13 +3,33 @@ @@ -3,13 +3,33 @@
3 存储数据库连接信息和API密钥 3 存储数据库连接信息和API密钥
4 """ 4 """
5 5
6 -# MySQL数据库配置  
7 -DB_HOST = "your_host"  
8 -DB_PORT = 3306  
9 -DB_USER = "your_username"  
10 -DB_PASSWORD = "your_password"  
11 -DB_NAME = "mindspider"  
12 -DB_CHARSET = "utf8mb4"  
13 -  
14 -# DeepSeek API密钥  
15 -DEEPSEEK_API_KEY = "your_deepseek_api_key" 6 +from pydantic_settings import BaseSettings
  7 +from typing import Optional
  8 +from pydantic import Field
  9 +from pathlib import Path
  10 +
  11 +# 计算 .env 优先级:优先当前工作目录,其次项目根目录(MindSpider 的上级目录)
  12 +PROJECT_ROOT: Path = Path(__file__).resolve().parents[1]
  13 +CWD_ENV: Path = Path.cwd() / ".env"
  14 +ENV_FILE: str = str(CWD_ENV if CWD_ENV.exists() else (PROJECT_ROOT / ".env"))
  15 +
  16 +class Settings(BaseSettings):
  17 + """全局配置管理,优先从环境变量和.env加载。支持MySQL/PostgreSQL统一数据库参数命名。"""
  18 + DB_DIALECT: str = Field("mysql", description="数据库类型,支持'mysql'或'postgresql'")
  19 + DB_HOST: str = Field("your_host", description="数据库主机名或IP地址")
  20 + DB_PORT: int = Field(3306, description="数据库端口号")
  21 + DB_USER: str = Field("your_username", description="数据库用户名")
  22 + DB_PASSWORD: str = Field("your_password", description="数据库密码")
  23 + DB_NAME: str = Field("mindspider", description="数据库名称")
  24 + DB_CHARSET: str = Field("utf8mb4", description="数据库字符集")
  25 + MINDSPIDER_API_KEY: Optional[str] = Field(None, description="MINDSPIDER API密钥")
  26 + MINDSPIDER_BASE_URL: Optional[str] = Field("https://api.deepseek.com", description="MINDSPIDER API基础URL,推荐deepseek-chat模型使用https://api.deepseek.com")
  27 + MINDSPIDER_MODEL_NAME: Optional[str] = Field("deepseek-chat", description="MINDSPIDER API模型名称, 推荐deepseek-chat")
  28 +
  29 + class Config:
  30 + env_file = ENV_FILE
  31 + env_prefix = ""
  32 + case_sensitive = False
  33 + extra = "allow"
  34 +
  35 +settings = Settings()
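For readers new to pydantic-settings: values resolve from real environment variables first, then the selected `.env` file, then the field defaults, and `DB_PORT` is coerced to `int` on load. A quick usage sketch; note that attribute access uses the declared UPPER_CASE names, since `case_sensitive = False` only relaxes environment-variable matching:

```python
# Sketch of consuming the Settings object defined above.
from config import settings

print(settings.DB_DIALECT)  # "mysql" unless DB_DIALECT is set in the env or .env
print(settings.DB_PORT)     # parsed to int by pydantic, e.g. "5432" -> 5432

# Resolution order: process env > .env file > field default, so e.g.
#   DB_PORT=5433 python main.py
# makes settings.DB_PORT == 5433 regardless of what .env says.
```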
@@ -11,8 +11,13 @@ import argparse @@ -11,8 +11,13 @@ import argparse
11 from datetime import date, datetime 11 from datetime import date, datetime
12 from pathlib import Path 12 from pathlib import Path
13 import subprocess 13 import subprocess
  14 +import asyncio
14 import pymysql 15 import pymysql
15 from pymysql.cursors import DictCursor 16 from pymysql.cursors import DictCursor
  17 +from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine
  18 +from sqlalchemy import inspect, text
  19 +from config import settings
  20 +from loguru import logger
16 21
17 # 添加项目根目录到路径 22 # 添加项目根目录到路径
18 project_root = Path(__file__).parent 23 project_root = Path(__file__).parent
@@ -21,8 +26,8 @@ sys.path.append(str(project_root)) @@ -21,8 +26,8 @@ sys.path.append(str(project_root))
21 try: 26 try:
22 import config 27 import config
23 except ImportError: 28 except ImportError:
24 - print("错误:无法导入config.py配置文件")  
25 - print("请确保项目根目录下存在config.py文件,并包含数据库和API配置信息") 29 + logger.error("错误:无法导入config.py配置文件")
  30 + logger.error("请确保项目根目录下存在config.py文件,并包含数据库和API配置信息")
26 sys.exit(1) 31 sys.exit(1)
27 32
28 class MindSpider: 33 class MindSpider:
@@ -35,99 +40,110 @@ class MindSpider: @@ -35,99 +40,110 @@ class MindSpider:
35 self.deep_sentiment_path = self.project_root / "DeepSentimentCrawling" 40 self.deep_sentiment_path = self.project_root / "DeepSentimentCrawling"
36 self.schema_path = self.project_root / "schema" 41 self.schema_path = self.project_root / "schema"
37 42
38 - print("MindSpider AI爬虫项目")  
39 - print(f"项目路径: {self.project_root}") 43 + logger.info("MindSpider AI爬虫项目")
  44 + logger.info(f"项目路径: {self.project_root}")
40 45
41 def check_config(self) -> bool: 46 def check_config(self) -> bool:
42 """检查基础配置""" 47 """检查基础配置"""
43 - print("\n检查基础配置...") 48 + logger.info("检查基础配置...")
44 49
45 - # 检查config.py配置项 50 + # 检查settings配置项
46 required_configs = [ 51 required_configs = [
47 'DB_HOST', 'DB_PORT', 'DB_USER', 'DB_PASSWORD', 'DB_NAME', 'DB_CHARSET', 52 'DB_HOST', 'DB_PORT', 'DB_USER', 'DB_PASSWORD', 'DB_NAME', 'DB_CHARSET',
48 - 'DEEPSEEK_API_KEY' 53 + 'MINDSPIDER_API_KEY', 'MINDSPIDER_BASE_URL', 'MINDSPIDER_MODEL_NAME'
49 ] 54 ]
50 55
51 missing_configs = [] 56 missing_configs = []
52 for config_name in required_configs: 57 for config_name in required_configs:
53 - if not hasattr(config, config_name) or not getattr(config, config_name): 58 + if not hasattr(settings, config_name) or not getattr(settings, config_name):
54 missing_configs.append(config_name) 59 missing_configs.append(config_name)
55 60
56 if missing_configs: 61 if missing_configs:
57 - print(f"配置缺失: {', '.join(missing_configs)}")  
58 - print("请检查config.py文件中的配置信息") 62 + logger.error(f"配置缺失: {', '.join(missing_configs)}")
  63 + logger.error("请检查config.py文件中的配置信息")
59 return False 64 return False
60 65
61 - print("基础配置检查通过") 66 + logger.info("基础配置检查通过")
62 return True 67 return True
63 68
64 def check_database_connection(self) -> bool: 69 def check_database_connection(self) -> bool:
65 """检查数据库连接""" 70 """检查数据库连接"""
66 - print("\n检查数据库连接...")  
67 -  
68 - try:  
69 - connection = pymysql.connect(  
70 - host=config.DB_HOST,  
71 - port=config.DB_PORT,  
72 - user=config.DB_USER,  
73 - password=config.DB_PASSWORD,  
74 - database=config.DB_NAME,  
75 - charset=config.DB_CHARSET,  
76 - cursorclass=DictCursor 71 + logger.info("检查数据库连接...")
  72 +
  73 + def build_async_url() -> str:
  74 + dialect = (settings.DB_DIALECT or "mysql").lower()
  75 + if dialect == "postgresql":
  76 + return f"postgresql+asyncpg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
  77 + # 默认使用 mysql 异步驱动 asyncmy
  78 + return (
  79 + f"mysql+asyncmy://{settings.DB_USER}:{settings.DB_PASSWORD}"
  80 + f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
77 ) 81 )
78 - connection.close()  
79 - print("数据库连接正常") 82 +
  83 + async def _test_connection(db_url: str) -> None:
  84 + engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True)
  85 + try:
  86 + async with engine.connect() as conn:
  87 + await conn.execute(text("SELECT 1"))
  88 + finally:
  89 + await engine.dispose()
  90 +
  91 + try:
  92 + db_url: str = build_async_url()
  93 + asyncio.run(_test_connection(db_url))
  94 + logger.info("数据库连接正常")
80 return True 95 return True
81 except Exception as e: 96 except Exception as e:
82 - print(f"数据库连接失败: {e}") 97 + logger.exception(f"数据库连接失败: {e}")
83 return False 98 return False
84 99
85 def check_database_tables(self) -> bool: 100 def check_database_tables(self) -> bool:
86 """检查数据库表是否存在""" 101 """检查数据库表是否存在"""
87 - print("\n检查数据库表...")  
88 -  
89 - try:  
90 - connection = pymysql.connect(  
91 - host=config.DB_HOST,  
92 - port=config.DB_PORT,  
93 - user=config.DB_USER,  
94 - password=config.DB_PASSWORD,  
95 - database=config.DB_NAME,  
96 - charset=config.DB_CHARSET,  
97 - cursorclass=DictCursor 102 + logger.info("检查数据库表...")
  103 +
  104 + def build_async_url() -> str:
  105 + dialect = (settings.DB_DIALECT or "mysql").lower()
  106 + if dialect == "postgresql":
  107 + return f"postgresql+asyncpg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
  108 + return (
  109 + f"mysql+asyncmy://{settings.DB_USER}:{settings.DB_PASSWORD}"
  110 + f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
98 ) 111 )
99 -  
100 - cursor = connection.cursor()  
101 -  
102 - # 检查核心表是否存在 112 +
  113 + async def _check_tables(db_url: str) -> list[str]:
  114 + engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True)
  115 + try:
  116 + async with engine.connect() as conn:
  117 + def _get_tables(sync_conn):
  118 + return inspect(sync_conn).get_table_names()
  119 + tables = await conn.run_sync(_get_tables)
  120 + return tables
  121 + finally:
  122 + await engine.dispose()
  123 +
  124 + try:
  125 + db_url: str = build_async_url()
  126 + existing_tables = asyncio.run(_check_tables(db_url))
103 required_tables = ['daily_news', 'daily_topics'] 127 required_tables = ['daily_news', 'daily_topics']
104 - cursor.execute("SHOW TABLES")  
105 - existing_tables = [row[f'Tables_in_{config.DB_NAME}'] for row in cursor.fetchall()]  
106 -  
107 - missing_tables = [table for table in required_tables if table not in existing_tables]  
108 -  
109 - connection.close()  
110 - 128 + missing_tables = [t for t in required_tables if t not in existing_tables]
111 if missing_tables: 129 if missing_tables:
112 - print(f"缺少数据库表: {', '.join(missing_tables)}") 130 + logger.error(f"缺少数据库表: {', '.join(missing_tables)}")
113 return False 131 return False
114 - else:  
115 - print("数据库表检查通过")  
116 - return True  
117 - 132 + logger.info("数据库表检查通过")
  133 + return True
118 except Exception as e: 134 except Exception as e:
119 - print(f"检查数据库表失败: {e}") 135 + logger.exception(f"检查数据库表失败: {e}")
120 return False 136 return False
121 137
122 def initialize_database(self) -> bool: 138 def initialize_database(self) -> bool:
123 """初始化数据库""" 139 """初始化数据库"""
124 - print("\n初始化数据库...") 140 + logger.info("初始化数据库...")
125 141
126 try: 142 try:
127 # 运行数据库初始化脚本 143 # 运行数据库初始化脚本
128 init_script = self.schema_path / "init_database.py" 144 init_script = self.schema_path / "init_database.py"
129 if not init_script.exists(): 145 if not init_script.exists():
130 - print("错误:找不到数据库初始化脚本") 146 + logger.error("错误:找不到数据库初始化脚本")
131 return False 147 return False
132 148
133 result = subprocess.run( 149 result = subprocess.run(
@@ -138,19 +154,19 @@ class MindSpider: @@ -138,19 +154,19 @@ class MindSpider:
138 ) 154 )
139 155
140 if result.returncode == 0: 156 if result.returncode == 0:
141 - print("数据库初始化成功") 157 + logger.info("数据库初始化成功")
142 return True 158 return True
143 else: 159 else:
144 - print(f"数据库初始化失败: {result.stderr}") 160 + logger.error(f"数据库初始化失败: {result.stderr}")
145 return False 161 return False
146 162
147 except Exception as e: 163 except Exception as e:
148 - print(f"数据库初始化异常: {e}") 164 + logger.exception(f"数据库初始化异常: {e}")
149 return False 165 return False
150 166
151 def check_dependencies(self) -> bool: 167 def check_dependencies(self) -> bool:
152 """检查依赖环境""" 168 """检查依赖环境"""
153 - print("\n检查依赖环境...") 169 + logger.info("检查依赖环境...")
154 170
155 # 检查Python包 171 # 检查Python包
156 required_packages = ['pymysql', 'requests', 'playwright'] 172 required_packages = ['pymysql', 'requests', 'playwright']
@@ -163,22 +179,22 @@ class MindSpider: @@ -163,22 +179,22 @@ class MindSpider:
163 missing_packages.append(package) 179 missing_packages.append(package)
164 180
165 if missing_packages: 181 if missing_packages:
166 - print(f"缺少Python包: {', '.join(missing_packages)}")  
167 - print("请运行: pip install -r requirements.txt") 182 + logger.error(f"缺少Python包: {', '.join(missing_packages)}")
  183 + logger.info("请运行: pip install -r requirements.txt")
168 return False 184 return False
169 185
170 # 检查MediaCrawler依赖 186 # 检查MediaCrawler依赖
171 mediacrawler_path = self.deep_sentiment_path / "MediaCrawler" 187 mediacrawler_path = self.deep_sentiment_path / "MediaCrawler"
172 if not mediacrawler_path.exists(): 188 if not mediacrawler_path.exists():
173 - print("错误:找不到MediaCrawler目录") 189 + logger.error("错误:找不到MediaCrawler目录")
174 return False 190 return False
175 191
176 - print("依赖环境检查通过") 192 + logger.info("依赖环境检查通过")
177 return True 193 return True
178 194
179 def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool: 195 def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool:
180 """运行BroadTopicExtraction模块""" 196 """运行BroadTopicExtraction模块"""
181 - print(f"\n运行BroadTopicExtraction模块...") 197 + logger.info("运行BroadTopicExtraction模块...")
182 198
183 if not extract_date: 199 if not extract_date:
184 extract_date = date.today() 200 extract_date = date.today()
@@ -186,11 +202,10 @@ class MindSpider: @@ -186,11 +202,10 @@ class MindSpider:
186 try: 202 try:
187 cmd = [ 203 cmd = [
188 sys.executable, "main.py", 204 sys.executable, "main.py",
189 - "--date", extract_date.strftime("%Y-%m-%d"),  
190 "--keywords", str(keywords_count) 205 "--keywords", str(keywords_count)
191 ] 206 ]
192 207
193 - print(f"执行命令: {' '.join(cmd)}") 208 + logger.info(f"执行命令: {' '.join(cmd)}")
194 209
195 result = subprocess.run( 210 result = subprocess.run(
196 cmd, 211 cmd,
@@ -199,24 +214,24 @@ class MindSpider: @@ -199,24 +214,24 @@ class MindSpider:
199 ) 214 )
200 215
201 if result.returncode == 0: 216 if result.returncode == 0:
202 - print("BroadTopicExtraction模块执行成功") 217 + logger.info("BroadTopicExtraction模块执行成功")
203 return True 218 return True
204 else: 219 else:
205 - print(f"BroadTopicExtraction模块执行失败,返回码: {result.returncode}") 220 + logger.error(f"BroadTopicExtraction模块执行失败,返回码: {result.returncode}")
206 return False 221 return False
207 222
208 except subprocess.TimeoutExpired: 223 except subprocess.TimeoutExpired:
209 - print("BroadTopicExtraction模块执行超时") 224 + logger.error("BroadTopicExtraction模块执行超时")
210 return False 225 return False
211 except Exception as e: 226 except Exception as e:
212 - print(f"BroadTopicExtraction模块执行异常: {e}") 227 + logger.exception(f"BroadTopicExtraction模块执行异常: {e}")
213 return False 228 return False
214 229
215 def run_deep_sentiment_crawling(self, target_date: date = None, platforms: list = None, 230 def run_deep_sentiment_crawling(self, target_date: date = None, platforms: list = None,
216 max_keywords: int = 50, max_notes: int = 50, 231 max_keywords: int = 50, max_notes: int = 50,
217 test_mode: bool = False) -> bool: 232 test_mode: bool = False) -> bool:
218 """运行DeepSentimentCrawling模块""" 233 """运行DeepSentimentCrawling模块"""
219 - print(f"\n运行DeepSentimentCrawling模块...") 234 + logger.info("运行DeepSentimentCrawling模块...")
220 235
221 if not target_date: 236 if not target_date:
222 target_date = date.today() 237 target_date = date.today()
@@ -238,7 +253,7 @@ class MindSpider: @@ -238,7 +253,7 @@ class MindSpider:
238 if test_mode: 253 if test_mode:
239 cmd.append("--test") 254 cmd.append("--test")
240 255
241 - print(f"执行命令: {' '.join(cmd)}") 256 + logger.info(f"执行命令: {' '.join(cmd)}")
242 257
243 result = subprocess.run( 258 result = subprocess.run(
244 cmd, 259 cmd,
@@ -247,78 +262,78 @@ class MindSpider: @@ -247,78 +262,78 @@ class MindSpider:
247 ) 262 )
248 263
249 if result.returncode == 0: 264 if result.returncode == 0:
250 - print("DeepSentimentCrawling模块执行成功") 265 + logger.info("DeepSentimentCrawling模块执行成功")
251 return True 266 return True
252 else: 267 else:
253 - print(f"DeepSentimentCrawling模块执行失败,返回码: {result.returncode}") 268 + logger.error(f"DeepSentimentCrawling模块执行失败,返回码: {result.returncode}")
254 return False 269 return False
255 270
256 except subprocess.TimeoutExpired: 271 except subprocess.TimeoutExpired:
257 - print("DeepSentimentCrawling模块执行超时") 272 + logger.error("DeepSentimentCrawling模块执行超时")
258 return False 273 return False
259 except Exception as e: 274 except Exception as e:
260 - print(f"DeepSentimentCrawling模块执行异常: {e}") 275 + logger.exception(f"DeepSentimentCrawling模块执行异常: {e}")
261 return False 276 return False
262 277
263 def run_complete_workflow(self, target_date: date = None, platforms: list = None, 278 def run_complete_workflow(self, target_date: date = None, platforms: list = None,
264 keywords_count: int = 100, max_keywords: int = 50, 279 keywords_count: int = 100, max_keywords: int = 50,
265 max_notes: int = 50, test_mode: bool = False) -> bool: 280 max_notes: int = 50, test_mode: bool = False) -> bool:
266 """运行完整工作流程""" 281 """运行完整工作流程"""
267 - print(f"\n开始完整的MindSpider工作流程") 282 + logger.info("开始完整的MindSpider工作流程")
268 283
269 if not target_date: 284 if not target_date:
270 target_date = date.today() 285 target_date = date.today()
271 286
272 - print(f"目标日期: {target_date}")  
273 - print(f"平台列表: {platforms if platforms else '所有支持的平台'}")  
274 - print(f"测试模式: {'是' if test_mode else '否'}") 287 + logger.info(f"目标日期: {target_date}")
  288 + logger.info(f"平台列表: {platforms if platforms else '所有支持的平台'}")
  289 + logger.info(f"测试模式: {'是' if test_mode else '否'}")
275 290
276 # 第一步:运行话题提取 291 # 第一步:运行话题提取
277 - print(f"\n=== 第一步:话题提取 ===") 292 + logger.info("=== 第一步:话题提取 ===")
278 if not self.run_broad_topic_extraction(target_date, keywords_count): 293 if not self.run_broad_topic_extraction(target_date, keywords_count):
279 - print("话题提取失败,终止流程") 294 + logger.error("话题提取失败,终止流程")
280 return False 295 return False
281 296
282 # 第二步:运行情感爬取 297 # 第二步:运行情感爬取
283 - print(f"\n=== 第二步:情感爬取 ===") 298 + logger.info("=== 第二步:情感爬取 ===")
284 if not self.run_deep_sentiment_crawling(target_date, platforms, max_keywords, max_notes, test_mode): 299 if not self.run_deep_sentiment_crawling(target_date, platforms, max_keywords, max_notes, test_mode):
285 - print("情感爬取失败,但话题提取已完成") 300 + logger.error("情感爬取失败,但话题提取已完成")
286 return False 301 return False
287 302
288 - print(f"\n完整工作流程执行成功!") 303 + logger.info("完整工作流程执行成功!")
289 return True 304 return True
290 305
291 def show_status(self): 306 def show_status(self):
292 """显示项目状态""" 307 """显示项目状态"""
293 - print(f"\nMindSpider项目状态:")  
294 - print(f"项目路径: {self.project_root}") 308 + logger.info("MindSpider项目状态:")
  309 + logger.info(f"项目路径: {self.project_root}")
295 310
296 # 配置状态 311 # 配置状态
297 config_ok = self.check_config() 312 config_ok = self.check_config()
298 - print(f"配置状态: {'正常' if config_ok else '异常'}") 313 + logger.info(f"配置状态: {'正常' if config_ok else '异常'}")
299 314
300 # 数据库状态 315 # 数据库状态
301 if config_ok: 316 if config_ok:
302 db_conn_ok = self.check_database_connection() 317 db_conn_ok = self.check_database_connection()
303 - print(f"数据库连接: {'正常' if db_conn_ok else '异常'}") 318 + logger.info(f"数据库连接: {'正常' if db_conn_ok else '异常'}")
304 319
305 if db_conn_ok: 320 if db_conn_ok:
306 db_tables_ok = self.check_database_tables() 321 db_tables_ok = self.check_database_tables()
307 - print(f"数据库表: {'正常' if db_tables_ok else '需要初始化'}") 322 + logger.info(f"数据库表: {'正常' if db_tables_ok else '需要初始化'}")
308 323
309 # 依赖状态 324 # 依赖状态
310 deps_ok = self.check_dependencies() 325 deps_ok = self.check_dependencies()
311 - print(f"依赖环境: {'正常' if deps_ok else '异常'}") 326 + logger.info(f"依赖环境: {'正常' if deps_ok else '异常'}")
312 327
313 # 模块状态 328 # 模块状态
314 broad_topic_exists = self.broad_topic_path.exists() 329 broad_topic_exists = self.broad_topic_path.exists()
315 deep_sentiment_exists = self.deep_sentiment_path.exists() 330 deep_sentiment_exists = self.deep_sentiment_path.exists()
316 - print(f"BroadTopicExtraction模块: {'存在' if broad_topic_exists else '缺失'}")  
317 - print(f"DeepSentimentCrawling模块: {'存在' if deep_sentiment_exists else '缺失'}") 331 + logger.info(f"BroadTopicExtraction模块: {'存在' if broad_topic_exists else '缺失'}")
  332 + logger.info(f"DeepSentimentCrawling模块: {'存在' if deep_sentiment_exists else '缺失'}")
318 333
319 def setup_project(self) -> bool: 334 def setup_project(self) -> bool:
320 """项目初始化设置""" 335 """项目初始化设置"""
321 - print(f"\n开始MindSpider项目初始化...") 336 + logger.info("开始MindSpider项目初始化...")
322 337
323 # 1. 检查配置 338 # 1. 检查配置
324 if not self.check_config(): 339 if not self.check_config():
@@ -334,11 +349,11 @@ class MindSpider: @@ -334,11 +349,11 @@ class MindSpider:
334 349
335 # 4. 检查并初始化数据库表 350 # 4. 检查并初始化数据库表
336 if not self.check_database_tables(): 351 if not self.check_database_tables():
337 - print("需要初始化数据库表...") 352 + logger.info("需要初始化数据库表...")
338 if not self.initialize_database(): 353 if not self.initialize_database():
339 return False 354 return False
340 355
341 - print(f"\nMindSpider项目初始化完成!") 356 + logger.info("MindSpider项目初始化完成!")
342 return True 357 return True
343 358
344 def main(): 359 def main():
@@ -373,7 +388,7 @@ def main(): @@ -373,7 +388,7 @@ def main():
373 try: 388 try:
374 target_date = datetime.strptime(args.date, "%Y-%m-%d").date() 389 target_date = datetime.strptime(args.date, "%Y-%m-%d").date()
375 except ValueError: 390 except ValueError:
376 - print("错误:日期格式不正确,请使用 YYYY-MM-DD 格式") 391 + logger.error("错误:日期格式不正确,请使用 YYYY-MM-DD 格式")
377 return 392 return
378 393
379 # 创建MindSpider实例 394 # 创建MindSpider实例
@@ -388,17 +403,17 @@ def main(): @@ -388,17 +403,17 @@ def main():
388 # 项目设置 403 # 项目设置
389 if args.setup: 404 if args.setup:
390 if spider.setup_project(): 405 if spider.setup_project():
391 - print("项目设置完成,可以开始使用MindSpider!") 406 + logger.info("项目设置完成,可以开始使用MindSpider!")
392 else: 407 else:
393 - print("项目设置失败,请检查配置和环境") 408 + logger.error("项目设置失败,请检查配置和环境")
394 return 409 return
395 410
396 # 初始化数据库 411 # 初始化数据库
397 if args.init_db: 412 if args.init_db:
398 if spider.initialize_database(): 413 if spider.initialize_database():
399 - print("数据库初始化成功") 414 + logger.info("数据库初始化成功")
400 else: 415 else:
401 - print("数据库初始化失败") 416 + logger.error("数据库初始化失败")
402 return 417 return
403 418
404 # 运行模块 419 # 运行模块
@@ -415,16 +430,16 @@ def main(): @@ -415,16 +430,16 @@ def main():
415 ) 430 )
416 else: 431 else:
417 # 默认运行完整工作流程 432 # 默认运行完整工作流程
418 - print("运行完整MindSpider工作流程...") 433 + logger.info("运行完整MindSpider工作流程...")
419 spider.run_complete_workflow( 434 spider.run_complete_workflow(
420 target_date, args.platforms, args.keywords_count, 435 target_date, args.platforms, args.keywords_count,
421 args.max_keywords, args.max_notes, args.test 436 args.max_keywords, args.max_notes, args.test
422 ) 437 )
423 438
424 except KeyboardInterrupt: 439 except KeyboardInterrupt:
425 - print("\n用户中断操作") 440 + logger.info("用户中断操作")
426 except Exception as e: 441 except Exception as e:
427 - print(f"\n执行出错: {e}") 442 + logger.exception(f"执行出错: {e}")
428 443
429 if __name__ == "__main__": 444 if __name__ == "__main__":
430 main() 445 main()
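One follow-up worth noting: `build_async_url` is now defined twice (once in `check_database_connection`, once in `check_database_tables`), and the async variants accept only `"postgresql"` while the sync builder also accepts `"postgres"`. A module-level helper could unify both; this is a sketch under the same `settings` fields, not part of the diff:

```python
# Sketch: one shared async-URL builder instead of two nested copies.
from config import settings

def build_async_url() -> str:
    """Async SQLAlchemy URL from settings; mysql+asyncmy is the default dialect."""
    dialect = (settings.DB_DIALECT or "mysql").lower()
    if dialect in ("postgresql", "postgres"):  # accept both spellings, like the sync path
        return (
            f"postgresql+asyncpg://{settings.DB_USER}:{settings.DB_PASSWORD}"
            f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
        )
    return (
        f"mysql+asyncmy://{settings.DB_USER}:{settings.DB_PASSWORD}"
        f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
        f"?charset={settings.DB_CHARSET}"
    )
```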
@@ -7,6 +7,8 @@ @@ -7,6 +7,8 @@
7 pymysql==1.1.0 7 pymysql==1.1.0
8 aiomysql==0.2.0 8 aiomysql==0.2.0
9 aiosqlite==0.21.0 9 aiosqlite==0.21.0
  10 +asyncpg
  11 +asyncmy
  12 +sqlalchemy
10 12
11 # =============================== 13 # ===============================
12 # HTTP请求和网络 14 # HTTP请求和网络
@@ -42,6 +44,8 @@ wordcloud==1.9.3 @@ -42,6 +44,8 @@ wordcloud==1.9.3
42 matplotlib==3.9.0 44 matplotlib==3.9.0
43 parsel==1.9.1 45 parsel==1.9.1
44 pyexecjs==1.5.1 46 pyexecjs==1.5.1
  47 +typer>=0.12.3
  48 +pyhumps==3.8.0
45 49
46 # =============================== 50 # ===============================
47 # 工具包 51 # 工具包
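The async URLs above reference `mysql+asyncmy://` and `postgresql+asyncpg://`, so both driver packages must be importable (hence the `asyncmy` line added alongside `asyncpg`). A standalone sketch to confirm the current environment actually has them:

```python
# Sketch: check that the drivers the new engine URLs depend on are installed.
import importlib.util

for package in ("sqlalchemy", "asyncmy", "asyncpg", "pymysql"):
    found = importlib.util.find_spec(package) is not None
    hint = "OK" if found else f"missing - try: uv pip install {package}"
    print(f"{package}: {hint}")
```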
@@ -7,10 +7,12 @@ MindSpider AI爬虫项目 - 数据库管理工具 @@ -7,10 +7,12 @@ MindSpider AI爬虫项目 - 数据库管理工具
7 7
8 import os 8 import os
9 import sys 9 import sys
10 -import pymysql 10 +from sqlalchemy import create_engine, text, inspect
  11 +from sqlalchemy.engine import Engine
11 import argparse 12 import argparse
12 from pathlib import Path 13 from pathlib import Path
13 from datetime import datetime, timedelta 14 from datetime import datetime, timedelta
  15 +from loguru import logger
14 16
15 # 添加项目根目录到路径 17 # 添加项目根目录到路径
16 project_root = Path(__file__).parent.parent 18 project_root = Path(__file__).parent.parent
@@ -19,125 +21,132 @@ sys.path.append(str(project_root)) @@ -19,125 +21,132 @@ sys.path.append(str(project_root))
19 try: 21 try:
20 import config 22 import config
21 except ImportError: 23 except ImportError:
22 - print("错误: 无法导入config.py配置文件") 24 + logger.error("错误: 无法导入config.py配置文件")
23 sys.exit(1) 25 sys.exit(1)
24 26
  27 +from config import settings
  28 +
25 class DatabaseManager: 29 class DatabaseManager:
26 def __init__(self): 30 def __init__(self):
27 - self.connection = None 31 + self.engine: Engine = None
28 self.connect() 32 self.connect()
29 33
30 def connect(self): 34 def connect(self):
31 """连接数据库""" 35 """连接数据库"""
32 try: 36 try:
33 - self.connection = pymysql.connect(  
34 - host=config.DB_HOST,  
35 - port=config.DB_PORT,  
36 - user=config.DB_USER,  
37 - password=config.DB_PASSWORD,  
38 - database=config.DB_NAME,  
39 - charset=config.DB_CHARSET,  
40 - autocommit=True  
41 - )  
42 - print(f"成功连接到数据库: {config.DB_NAME}") 37 + dialect = (settings.DB_DIALECT or "mysql").lower()
  38 + if dialect in ("postgresql", "postgres"):
  39 + url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
  40 + else:
  41 + url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
  42 + self.engine = create_engine(url, future=True)
  43 + logger.info(f"成功连接到数据库: {settings.DB_NAME}")
43 except Exception as e: 44 except Exception as e:
44 - print(f"数据库连接失败: {e}") 45 + logger.error(f"数据库连接失败: {e}")
45 sys.exit(1) 46 sys.exit(1)
46 47
47 def close(self): 48 def close(self):
48 """关闭数据库连接""" 49 """关闭数据库连接"""
49 - if self.connection:  
50 - self.connection.close() 50 + if self.engine:
  51 + self.engine.dispose()
51 52
52 def show_tables(self): 53 def show_tables(self):
53 """显示所有表""" 54 """显示所有表"""
54 - print("\n" + "=" * 60)  
55 - print("数据库表列表")  
56 - print("=" * 60) 55 + data_list_message = ""
  56 + data_list_message += "\n" + "=" * 60 + "\n"
  57 + data_list_message += "数据库表列表\n"
  58 + data_list_message += "=" * 60 + "\n"
57 60
58 - cursor = self.connection.cursor()  
59 - cursor.execute("SHOW TABLES")  
60 - tables = cursor.fetchall() 61 + inspector = inspect(self.engine)
  62 + tables = inspector.get_table_names()
61 63
62 if not tables: 64 if not tables:
63 - print("数据库中没有表") 65 + logger.info("数据库中没有表")
64 return 66 return
65 67
66 # 分类显示表 68 # 分类显示表
67 mindspider_tables = [] 69 mindspider_tables = []
68 mediacrawler_tables = [] 70 mediacrawler_tables = []
69 71
70 - for table in tables:  
71 - table_name = table[0] 72 + for table_name in tables:
72 if table_name in ['daily_news', 'daily_topics', 'topic_news_relation', 'crawling_tasks']: 73 if table_name in ['daily_news', 'daily_topics', 'topic_news_relation', 'crawling_tasks']:
73 mindspider_tables.append(table_name) 74 mindspider_tables.append(table_name)
74 else: 75 else:
75 mediacrawler_tables.append(table_name) 76 mediacrawler_tables.append(table_name)
76 77
77 - print("MindSpider核心表:") 78 + data_list_message += "MindSpider核心表:"
  79 + data_list_message += "\n"
78 for table in mindspider_tables: 80 for table in mindspider_tables:
79 - cursor.execute(f"SELECT COUNT(*) FROM {table}")  
80 - count = cursor.fetchone()[0]  
81 - print(f" - {table:<25} ({count:>6} 条记录)") 81 + with self.engine.connect() as conn:
  82 + count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
  83 + data_list_message += f" - {table:<25} ({count:>6} 条记录)"
  84 + data_list_message += "\n"
82 85
83 - print("\nMediaCrawler平台表:") 86 + data_list_message += "\nMediaCrawler平台表:"
  87 + data_list_message += "\n"
84 for table in mediacrawler_tables: 88 for table in mediacrawler_tables:
85 try: 89 try:
86 - cursor.execute(f"SELECT COUNT(*) FROM {table}")  
87 - count = cursor.fetchone()[0]  
88 - print(f" - {table:<25} ({count:>6} 条记录)") 90 + with self.engine.connect() as conn:
  91 + count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
  92 + data_list_message += f" - {table:<25} ({count:>6} 条记录)"
  93 + data_list_message += "\n"
89 except: 94 except:
90 - print(f" - {table:<25} (查询失败)") 95 + data_list_message += f" - {table:<25} (查询失败)"
  96 + data_list_message += "\n"
  97 + logger.info(data_list_message)
91 98
92 def show_statistics(self): 99 def show_statistics(self):
93 """显示数据统计""" 100 """显示数据统计"""
94 - print("\n" + "=" * 60)  
95 - print("数据统计")  
96 - print("=" * 60)  
97 -  
98 - cursor = self.connection.cursor() 101 + data_statistics_message = ""
  102 + data_statistics_message += "\n" + "=" * 60 + "\n"
  103 + data_statistics_message += "数据统计\n"
  104 + data_statistics_message += "=" * 60 + "\n"
99 106
100 try: 107 try:
101 # 新闻统计 108 # 新闻统计
102 - cursor.execute("SELECT COUNT(*) FROM daily_news")  
103 - news_count = cursor.fetchone()[0]  
104 -  
105 - cursor.execute("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")  
106 - news_days = cursor.fetchone()[0]  
107 -  
108 - cursor.execute("SELECT COUNT(DISTINCT source_platform) FROM daily_news")  
109 - platforms = cursor.fetchone()[0]  
110 -  
111 - print(f"新闻数据:")  
112 - print(f" - 总新闻数: {news_count}")  
113 - print(f" - 覆盖天数: {news_days}")  
114 - print(f" - 新闻平台: {platforms}") 109 + with self.engine.connect() as conn:
  110 + news_count = conn.execute(text("SELECT COUNT(*) FROM daily_news")).scalar_one()
  111 + news_days = conn.execute(text("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")).scalar_one()
  112 + platforms = conn.execute(text("SELECT COUNT(DISTINCT source_platform) FROM daily_news")).scalar_one()
115 113
  114 + data_statistics_message += "新闻数据:"
  115 + data_statistics_message += "\n"
  116 + data_statistics_message += f" - 总新闻数: {news_count}"
  117 + data_statistics_message += "\n"
  118 + data_statistics_message += f" - 覆盖天数: {news_days}"
  119 + data_statistics_message += "\n"
  120 + data_statistics_message += f" - 新闻平台: {platforms}"
  121 + data_statistics_message += "\n"
116 # 话题统计 122 # 话题统计
117 - cursor.execute("SELECT COUNT(*) FROM daily_topics")  
118 - topic_count = cursor.fetchone()[0] 123 + with self.engine.connect() as conn:
  124 + topic_count = conn.execute(text("SELECT COUNT(*) FROM daily_topics")).scalar_one()
  125 + topic_days = conn.execute(text("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")).scalar_one()
119 126
120 - cursor.execute("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")  
121 - topic_days = cursor.fetchone()[0]  
122 -  
123 - print(f"\n话题数据:")  
124 - print(f" - 总话题数: {topic_count}")  
125 - print(f" - 提取天数: {topic_days}") 127 + data_statistics_message += "话题数据:"
  128 + data_statistics_message += "\n"
  129 + data_statistics_message += f" - 总话题数: {topic_count}"
  130 + data_statistics_message += "\n"
  131 + data_statistics_message += f" - 提取天数: {topic_days}"
  132 + data_statistics_message += "\n"
126 133
127 # 爬取任务统计 134 # 爬取任务统计
128 - cursor.execute("SELECT COUNT(*) FROM crawling_tasks")  
129 - task_count = cursor.fetchone()[0]  
130 -  
131 - cursor.execute("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")  
132 - task_status = cursor.fetchall() 135 + with self.engine.connect() as conn:
  136 + task_count = conn.execute(text("SELECT COUNT(*) FROM crawling_tasks")).scalar_one()
  137 + task_status = conn.execute(text("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")).all()
133 138
134 - print(f"\n爬取任务:")  
135 - print(f" - 总任务数: {task_count}") 139 + data_statistics_message += "爬取任务:"
  140 + data_statistics_message += "\n"
  141 + data_statistics_message += f" - 总任务数: {task_count}"
  142 + data_statistics_message += "\n"
136 for status, count in task_status: 143 for status, count in task_status:
137 - print(f" - {status}: {count}") 144 + data_statistics_message += f" - {status}: {count}"
  145 + data_statistics_message += "\n"
138 146
139 # 爬取内容统计 147 # 爬取内容统计
140 - print(f"\n平台内容统计:") 148 + data_statistics_message += "平台内容统计:"
  149 + data_statistics_message += "\n"
141 platform_tables = { 150 platform_tables = {
142 'xhs_note': '小红书', 151 'xhs_note': '小红书',
143 'douyin_aweme': '抖音', 152 'douyin_aweme': '抖音',
@@ -150,60 +159,78 @@ class DatabaseManager: @@ -150,60 +159,78 @@ class DatabaseManager:
150 159
151 for table, platform in platform_tables.items(): 160 for table, platform in platform_tables.items():
152 try: 161 try:
153 - cursor.execute(f"SELECT COUNT(*) FROM {table}")  
154 - count = cursor.fetchone()[0]  
155 - print(f" - {platform}: {count}") 162 + with self.engine.connect() as conn:
  163 + count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
  164 + data_statistics_message += f" - {platform}: {count}"
  165 + data_statistics_message += "\n"
156 except: 166 except:
157 - print(f" - {platform}: 表不存在")  
158 - 167 + data_statistics_message += f" - {platform}: 表不存在"
  168 + data_statistics_message += "\n"
  169 + logger.info(data_statistics_message)
159 except Exception as e: 170 except Exception as e:
160 - print(f"统计查询失败: {e}") 171 + data_statistics_message += f"统计查询失败: {e}"
  172 + data_statistics_message += "\n"
  173 + logger.error(data_statistics_message)
161 174
162 def show_recent_data(self, days=7): 175 def show_recent_data(self, days=7):
163 """显示最近几天的数据""" 176 """显示最近几天的数据"""
164 - print(f"\n" + "=" * 60)  
165 - print(f"最近{days}天的数据")  
166 - print("=" * 60)  
167 -  
168 - cursor = self.connection.cursor() 177 + data_recent_message = ""
  178 + data_recent_message += "\n" + "=" * 60 + "\n"
  179 + data_recent_message += f"最近{days}天的数据\n"
  180 + data_recent_message += "=" * 60 + "\n"
169 181
  182 + from datetime import date, timedelta
  183 + start_date = date.today() - timedelta(days=days)
170 # 最近的新闻 184 # 最近的新闻
171 - cursor.execute("""  
172 - SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms  
173 - FROM daily_news  
174 - WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)  
175 - GROUP BY crawl_date  
176 - ORDER BY crawl_date DESC  
177 - """, (days,))  
178 -  
179 - news_data = cursor.fetchall() 185 + with self.engine.connect() as conn:
  186 + news_data = conn.execute(
  187 + text(
  188 + """
  189 + SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms
  190 + FROM daily_news
  191 + WHERE crawl_date >= :start_date
  192 + GROUP BY crawl_date
  193 + ORDER BY crawl_date DESC
  194 + """
  195 + ),
  196 + {"start_date": start_date},
  197 + ).all()
180 if news_data: 198 if news_data:
181 - print("每日新闻统计:") 199 + data_recent_message += "每日新闻统计:"
  200 + data_recent_message += "\n"
182 for date, count, platforms in news_data: 201 for date, count, platforms in news_data:
183 - print(f" {date}: {count} 条新闻, {platforms} 个平台") 202 + data_recent_message += f" {date}: {count} 条新闻, {platforms} 个平台"
  203 + data_recent_message += "\n"
184 204
185 # 最近的话题 205 # 最近的话题
186 - cursor.execute("""  
187 - SELECT extract_date, COUNT(*) as topic_count  
188 - FROM daily_topics  
189 - WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)  
190 - GROUP BY extract_date  
191 - ORDER BY extract_date DESC  
192 - """, (days,))  
193 -  
194 - topic_data = cursor.fetchall() 206 + with self.engine.connect() as conn:
  207 + topic_data = conn.execute(
  208 + text(
  209 + """
  210 + SELECT extract_date, COUNT(*) as topic_count
  211 + FROM daily_topics
  212 + WHERE extract_date >= :start_date
  213 + GROUP BY extract_date
  214 + ORDER BY extract_date DESC
  215 + """
  216 + ),
  217 + {"start_date": start_date},
  218 + ).all()
195 if topic_data: 219 if topic_data:
196 - print("\n每日话题统计:") 220 + data_recent_message += "每日话题统计:"
  221 + data_recent_message += "\n"
197 for date, count in topic_data: 222 for date, count in topic_data:
198 - print(f" {date}: {count} 个话题") 223 + data_recent_message += f" {date}: {count} 个话题"
  224 + data_recent_message += "\n"
  225 + logger.info(data_recent_message)
199 226
200 def cleanup_old_data(self, days=90, dry_run=True): 227 def cleanup_old_data(self, days=90, dry_run=True):
201 """清理旧数据""" 228 """清理旧数据"""
202 - print(f"\n" + "=" * 60)  
203 - print(f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})")  
204 - print("=" * 60) 229 + cleanup_message = ""
  230 + cleanup_message += "\n" + "=" * 60 + "\n"
  231 + cleanup_message += f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})\n"
  232 + cleanup_message += "=" * 60 + "\n"
205 233
206 - cursor = self.connection.cursor()  
207 cutoff_date = datetime.now() - timedelta(days=days) 234 cutoff_date = datetime.now() - timedelta(days=days)
208 235
209 # 检查要删除的数据 236 # 检查要删除的数据
@@ -213,20 +240,25 @@ class DatabaseManager: @@ -213,20 +240,25 @@ class DatabaseManager:
213 ("crawling_tasks", f"SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date < '{cutoff_date.date()}'") 240 ("crawling_tasks", f"SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date < '{cutoff_date.date()}'")
214 ] 241 ]
215 242
216 - for table, query in cleanup_queries:  
217 - cursor.execute(query)  
218 - count = cursor.fetchone()[0]  
219 - if count > 0:  
220 - print(f" {table}: {count} 条记录将被删除")  
221 - if not dry_run:  
222 - delete_query = query.replace("SELECT COUNT(*)", "DELETE")  
223 - cursor.execute(delete_query)  
224 - print(f" 已删除 {count} 条记录")  
225 - else:  
226 - print(f" {table}: 无需清理") 243 + with self.engine.begin() as conn:
  244 + for table, query in cleanup_queries:
  245 + count = conn.execute(text(query)).scalar_one()
  246 + if count > 0:
  247 + cleanup_message += f" {table}: {count} 条记录将被删除"
  248 + cleanup_message += "\n"
  249 + if not dry_run:
  250 + delete_query = query.replace("SELECT COUNT(*)", "DELETE")
  251 + conn.execute(text(delete_query))
  252 + cleanup_message += f" 已删除 {count} 条记录"
  253 + cleanup_message += "\n"
  254 + else:
  255 + cleanup_message += f" {table}: 无需清理"
  256 + cleanup_message += "\n"
227 257
228 if dry_run: 258 if dry_run:
229 - print("\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。") 259 + cleanup_message += "\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。"
  260 + cleanup_message += "\n"
  261 + logger.info(cleanup_message)
230 262
231 def main(): 263 def main():
232 parser = argparse.ArgumentParser(description="MindSpider数据库管理工具") 264 parser = argparse.ArgumentParser(description="MindSpider数据库管理工具")
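The recent-data queries above already use bound parameters (`:start_date`), but `cleanup_queries` still interpolates `cutoff_date` into the SQL text. A possible tightening, sketched as a free function over the same three tables and date columns this tool manages:

```python
# Sketch: cleanup with bound parameters instead of f-string dates in the SQL.
from datetime import datetime, timedelta
from sqlalchemy import text
from sqlalchemy.engine import Engine

def cleanup_old_rows(engine: Engine, days: int = 90, dry_run: bool = True) -> None:
    cutoff = (datetime.now() - timedelta(days=days)).date()
    targets = [  # (table, date column) pairs used elsewhere in this tool
        ("daily_news", "crawl_date"),
        ("daily_topics", "extract_date"),
        ("crawling_tasks", "scheduled_date"),
    ]
    with engine.begin() as conn:
        for table, column in targets:
            # identifiers cannot be bound parameters, but the cutoff date can
            count = conn.execute(
                text(f"SELECT COUNT(*) FROM {table} WHERE {column} < :cutoff"),
                {"cutoff": cutoff},
            ).scalar_one()
            if count and not dry_run:
                conn.execute(
                    text(f"DELETE FROM {table} WHERE {column} < :cutoff"),
                    {"cutoff": cutoff},
                )
```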
@@ -9,6 +9,7 @@ import os @@ -9,6 +9,7 @@ import os
9 import sys 9 import sys
10 import pymysql 10 import pymysql
11 from pathlib import Path 11 from pathlib import Path
  12 +from config import settings
12 13
13 # 添加项目根目录到路径 14 # 添加项目根目录到路径
14 project_root = Path(__file__).parent.parent 15 project_root = Path(__file__).parent.parent
@@ -26,14 +27,14 @@ def create_database_connection(): @@ -26,14 +27,14 @@ def create_database_connection():
26 """创建数据库连接""" 27 """创建数据库连接"""
27 try: 28 try:
28 connection = pymysql.connect( 29 connection = pymysql.connect(
29 - host=config.DB_HOST,  
30 - port=config.DB_PORT,  
31 - user=config.DB_USER,  
32 - password=config.DB_PASSWORD,  
33 - charset=config.DB_CHARSET, 30 + host=settings.DB_HOST,
  31 + port=settings.DB_PORT,
  32 + user=settings.DB_USER,
  33 + password=settings.DB_PASSWORD,
  34 + charset=settings.DB_CHARSET,
34 autocommit=True 35 autocommit=True
35 ) 36 )
36 - print(f"成功连接到MySQL服务器: {config.DB_HOST}:{config.DB_PORT}") 37 + print(f"成功连接到MySQL服务器: {settings.DB_HOST}:{settings.DB_PORT}")
37 return connection 38 return connection
38 except Exception as e: 39 except Exception as e:
39 print(f"连接数据库失败: {e}") 40 print(f"连接数据库失败: {e}")
@@ -43,9 +44,9 @@ def create_database(connection): @@ -43,9 +44,9 @@ def create_database(connection):
43 """创建数据库""" 44 """创建数据库"""
44 try: 45 try:
45 cursor = connection.cursor() 46 cursor = connection.cursor()
46 - cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{config.DB_NAME}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")  
47 - cursor.execute(f"USE `{config.DB_NAME}`")  
48 - print(f"数据库 '{config.DB_NAME}' 创建/选择成功") 47 + cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{settings.DB_NAME}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
  48 + cursor.execute(f"USE `{settings.DB_NAME}`")
  49 + print(f"数据库 '{settings.DB_NAME}' 创建/选择成功")
49 return True 50 return True
50 except Exception as e: 51 except Exception as e:
51 print(f"创建数据库失败: {e}") 52 print(f"创建数据库失败: {e}")
@@ -56,18 +57,18 @@ def execute_sql_file(connection, sql_file_path, description=""): @@ -56,18 +57,18 @@ def execute_sql_file(connection, sql_file_path, description=""):
56 if not os.path.exists(sql_file_path): 57 if not os.path.exists(sql_file_path):
57 print(f"警告: SQL文件不存在: {sql_file_path}") 58 print(f"警告: SQL文件不存在: {sql_file_path}")
58 return False 59 return False
59 - 60 +
60 try: 61 try:
61 cursor = connection.cursor() 62 cursor = connection.cursor()
62 with open(sql_file_path, 'r', encoding='utf-8') as f: 63 with open(sql_file_path, 'r', encoding='utf-8') as f:
63 sql_content = f.read() 64 sql_content = f.read()
64 - 65 +
65 # 分割SQL语句(简单实现,按分号分割) 66 # 分割SQL语句(简单实现,按分号分割)
66 sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()] 67 sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()]
67 - 68 +
68 success_count = 0 69 success_count = 0
69 error_count = 0 70 error_count = 0
70 - 71 +
71 for stmt in sql_statements: 72 for stmt in sql_statements:
72 if not stmt or stmt.startswith('--'): 73 if not stmt or stmt.startswith('--'):
73 continue 74 continue
@@ -77,10 +78,10 @@ def execute_sql_file(connection, sql_file_path, description=""): @@ -77,10 +78,10 @@ def execute_sql_file(connection, sql_file_path, description=""):
77 except Exception as e: 78 except Exception as e:
78 error_count += 1 79 error_count += 1
79 print(f"执行SQL语句失败: {str(e)[:100]}...") 80 print(f"执行SQL语句失败: {str(e)[:100]}...")
80 - 81 +
81 print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句") 82 print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句")
82 return error_count == 0 83 return error_count == 0
83 - 84 +
84 except Exception as e: 85 except Exception as e:
85 print(f"执行SQL文件失败 {sql_file_path}: {e}") 86 print(f"执行SQL文件失败 {sql_file_path}: {e}")
86 return False 87 return False
@@ -90,44 +91,44 @@ def main(): @@ -90,44 +91,44 @@ def main():
90 print("=" * 60) 91 print("=" * 60)
91 print("MindSpider AI爬虫项目 - 数据库初始化") 92 print("MindSpider AI爬虫项目 - 数据库初始化")
92 print("=" * 60) 93 print("=" * 60)
93 - 94 +
94 # 检查配置 95 # 检查配置
95 print("检查数据库配置...") 96 print("检查数据库配置...")
96 - print(f"数据库主机: {config.DB_HOST}")  
97 - print(f"数据库端口: {config.DB_PORT}")  
98 - print(f"数据库名称: {config.DB_NAME}")  
99 - print(f"数据库用户: {config.DB_USER}")  
100 - print(f"字符集: {config.DB_CHARSET}") 97 + print(f"数据库主机: {settings.DB_HOST}")
  98 + print(f"数据库端口: {settings.DB_PORT}")
  99 + print(f"数据库名称: {settings.DB_NAME}")
  100 + print(f"数据库用户: {settings.DB_USER}")
  101 + print(f"字符集: {settings.DB_CHARSET}")
101 print() 102 print()
102 - 103 +
103 # 创建数据库连接 104 # 创建数据库连接
104 print("正在连接数据库...") 105 print("正在连接数据库...")
105 connection = create_database_connection() 106 connection = create_database_connection()
106 if not connection: 107 if not connection:
107 print("数据库初始化失败!") 108 print("数据库初始化失败!")
108 return False 109 return False
109 - 110 +
110 try: 111 try:
111 # 创建数据库 112 # 创建数据库
112 print("正在创建/选择数据库...") 113 print("正在创建/选择数据库...")
113 if not create_database(connection): 114 if not create_database(connection):
114 return False 115 return False
115 - 116 +
116 # 获取SQL文件路径 117 # 获取SQL文件路径
117 schema_dir = Path(__file__).parent 118 schema_dir = Path(__file__).parent
118 mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql" 119 mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql"
119 mindspider_sql = schema_dir / "mindspider_tables.sql" 120 mindspider_sql = schema_dir / "mindspider_tables.sql"
120 - 121 +
121 print() 122 print()
122 print("开始执行SQL脚本...") 123 print("开始执行SQL脚本...")
123 - 124 +
124 # 1. 执行MediaCrawler的原始表结构 125 # 1. 执行MediaCrawler的原始表结构
125 if mediacrawler_sql.exists(): 126 if mediacrawler_sql.exists():
126 print("1. 创建MediaCrawler基础表...") 127 print("1. 创建MediaCrawler基础表...")
127 execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表") 128 execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表")
128 else: 129 else:
129 print("警告: MediaCrawler SQL文件不存在,跳过基础表创建") 130 print("警告: MediaCrawler SQL文件不存在,跳过基础表创建")
130 - 131 +
131 # 2. 执行MindSpider扩展表结构 132 # 2. 执行MindSpider扩展表结构
132 print("2. 创建MindSpider扩展表...") 133 print("2. 创建MindSpider扩展表...")
133 if mindspider_sql.exists(): 134 if mindspider_sql.exists():
@@ -135,18 +136,18 @@ def main(): @@ -135,18 +136,18 @@ def main():
135 else: 136 else:
136 print("错误: MindSpider SQL文件不存在") 137 print("错误: MindSpider SQL文件不存在")
137 return False 138 return False
138 - 139 +
139 print() 140 print()
140 print("=" * 60) 141 print("=" * 60)
141 print("数据库初始化完成!") 142 print("数据库初始化完成!")
142 print("=" * 60) 143 print("=" * 60)
143 - 144 +
144 # 显示创建的表 145 # 显示创建的表
145 cursor = connection.cursor() 146 cursor = connection.cursor()
146 cursor.execute("SHOW TABLES") 147 cursor.execute("SHOW TABLES")
147 tables = cursor.fetchall() 148 tables = cursor.fetchall()
148 -  
149 - print(f"数据库 '{config.DB_NAME}' 中共创建了 {len(tables)} 个表:") 149 +
  150 + print(f"数据库 '{settings.DB_NAME}' 中共创建了 {len(tables)} 个表:")
150 for table in tables: 151 for table in tables:
151 print(f" - {table[0]}") 152 print(f" - {table[0]}")
152 153