Doiiars
Committed by GitHub

Merge pull request #116 from DoiiarX/fix-news-id

修复news-id冲突问题
@@ -25,14 +25,15 @@ except ImportError: @@ -25,14 +25,15 @@ except ImportError:
25 25
26 from config import settings 26 from config import settings
27 27
  28 +
28 class DatabaseManager: 29 class DatabaseManager:
29 """数据库管理器""" 30 """数据库管理器"""
30 - 31 +
31 def __init__(self): 32 def __init__(self):
32 """初始化数据库管理器""" 33 """初始化数据库管理器"""
33 self.engine: Engine = None 34 self.engine: Engine = None
34 self.connect() 35 self.connect()
35 - 36 +
36 def connect(self): 37 def connect(self):
37 """连接数据库""" 38 """连接数据库"""
38 try: 39 try:
@@ -46,46 +47,47 @@ class DatabaseManager: @@ -46,46 +47,47 @@ class DatabaseManager:
46 except ModuleNotFoundError as e: 47 except ModuleNotFoundError as e:
47 missing: str = str(e) 48 missing: str = str(e)
48 if "psycopg" in missing: 49 if "psycopg" in missing:
49 - logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]") 50 + logger.error(
  51 + "数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
50 elif "pymysql" in missing: 52 elif "pymysql" in missing:
51 logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql") 53 logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
52 else: 54 else:
53 logger.error(f"数据库连接失败(缺少驱动): {e}") 55 logger.error(f"数据库连接失败(缺少驱动): {e}")
54 raise 56 raise
55 except Exception as e: 57 except Exception as e:
56 - logger.error(f"数据库连接失败: {e}") 58 + logger.exception(f"数据库连接失败: {e}")
57 raise 59 raise
58 - 60 +
59 def close(self): 61 def close(self):
60 """关闭数据库连接""" 62 """关闭数据库连接"""
61 if self.engine: 63 if self.engine:
62 self.engine.dispose() 64 self.engine.dispose()
63 logger.info("数据库连接已关闭") 65 logger.info("数据库连接已关闭")
64 - 66 +
65 def __enter__(self): 67 def __enter__(self):
66 return self 68 return self
67 - 69 +
68 def __exit__(self, exc_type, exc_val, exc_tb): 70 def __exit__(self, exc_type, exc_val, exc_tb):
69 self.close() 71 self.close()
70 - 72 +
71 # ==================== 新闻数据操作 ==================== 73 # ==================== 新闻数据操作 ====================
72 - 74 +
73 def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int: 75 def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int:
74 """ 76 """
75 保存每日新闻数据,如果当天已有数据则覆盖 77 保存每日新闻数据,如果当天已有数据则覆盖
76 - 78 +
77 Args: 79 Args:
78 news_data: 新闻数据列表 80 news_data: 新闻数据列表
79 crawl_date: 爬取日期,默认为今天 81 crawl_date: 爬取日期,默认为今天
80 - 82 +
81 Returns: 83 Returns:
82 保存的新闻数量 84 保存的新闻数量
83 """ 85 """
84 if not crawl_date: 86 if not crawl_date:
85 crawl_date = date.today() 87 crawl_date = date.today()
86 - 88 +
87 current_timestamp = int(datetime.now().timestamp()) 89 current_timestamp = int(datetime.now().timestamp())
88 - 90 +
89 try: 91 try:
90 saved_count = 0 92 saved_count = 0
91 # 先独立事务执行删除,防止后续插入失败导致无法清理 93 # 先独立事务执行删除,防止后续插入失败导致无法清理
@@ -97,7 +99,13 @@ class DatabaseManager: @@ -97,7 +99,13 @@ class DatabaseManager:
97 # 逐条插入,单条失败不影响后续(每条独立事务) 99 # 逐条插入,单条失败不影响后续(每条独立事务)
98 for news_item in news_data: 100 for news_item in news_data:
99 try: 101 try:
100 - news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}" 102 + # news_item.get('id') 已经是完整的 news_id(格式:source_item_id)
  103 + # 为了支持同一条新闻在不同日期出现,将 crawl_date 加入到 news_id 中
  104 + base_news_id = news_item.get(
  105 + 'id') or f"{news_item.get('source', 'unknown')}_rank_{news_item.get('rank', 0)}"
  106 + # 将日期格式化为字符串并加入到 news_id 中,确保全局唯一性
  107 + news_id = f"{base_news_id}_{crawl_date.strftime('%Y%m%d')}"
  108 +
101 title_val = (news_item.get("title", "") or "") 109 title_val = (news_item.get("title", "") or "")
102 if len(title_val) > 500: 110 if len(title_val) > 500:
103 title_val = title_val[:500] 111 title_val = title_val[:500]
@@ -124,27 +132,27 @@ class DatabaseManager: @@ -124,27 +132,27 @@ class DatabaseManager:
124 ) 132 )
125 saved_count += 1 133 saved_count += 1
126 except Exception as e: 134 except Exception as e:
127 - logger.warning(f"保存单条新闻失败: {e}") 135 + logger.exception(f"保存单条新闻失败: {e}")
128 continue 136 continue
129 logger.info(f"成功保存 {saved_count} 条新闻记录") 137 logger.info(f"成功保存 {saved_count} 条新闻记录")
130 return saved_count 138 return saved_count
131 except Exception as e: 139 except Exception as e:
132 logger.exception(f"保存新闻数据失败: {e}") 140 logger.exception(f"保存新闻数据失败: {e}")
133 return 0 141 return 0
134 - 142 +
135 def get_daily_news(self, crawl_date: date = None) -> List[Dict]: 143 def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
136 """ 144 """
137 获取每日新闻数据 145 获取每日新闻数据
138 - 146 +
139 Args: 147 Args:
140 crawl_date: 爬取日期,默认为今天 148 crawl_date: 爬取日期,默认为今天
141 - 149 +
142 Returns: 150 Returns:
143 新闻列表 151 新闻列表
144 """ 152 """
145 if not crawl_date: 153 if not crawl_date:
146 crawl_date = date.today() 154 crawl_date = date.today()
147 - 155 +
148 query = ( 156 query = (
149 "SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC" 157 "SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC"
150 ) 158 )
@@ -152,39 +160,43 @@ class DatabaseManager: @@ -152,39 +160,43 @@ class DatabaseManager:
152 result = conn.execute(text(query), {"d": crawl_date}) 160 result = conn.execute(text(query), {"d": crawl_date})
153 rows = result.mappings().all() 161 rows = result.mappings().all()
154 return rows 162 return rows
155 - 163 +
156 # ==================== 话题数据操作 ==================== 164 # ==================== 话题数据操作 ====================
157 - 165 +
158 def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool: 166 def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool:
159 """ 167 """
160 保存每日话题分析 168 保存每日话题分析
161 - 169 +
162 Args: 170 Args:
163 keywords: 话题关键词列表 171 keywords: 话题关键词列表
164 summary: 新闻分析总结 172 summary: 新闻分析总结
165 extract_date: 提取日期,默认为今天 173 extract_date: 提取日期,默认为今天
166 - 174 +
167 Returns: 175 Returns:
168 是否保存成功 176 是否保存成功
169 """ 177 """
170 if not extract_date: 178 if not extract_date:
171 extract_date = date.today() 179 extract_date = date.today()
172 - 180 +
173 current_timestamp = int(datetime.now().timestamp()) 181 current_timestamp = int(datetime.now().timestamp())
174 - 182 +
175 try: 183 try:
176 keywords_json = json.dumps(keywords, ensure_ascii=False) 184 keywords_json = json.dumps(keywords, ensure_ascii=False)
  185 + # 为了支持外键引用,topic_id 需要全局唯一,所以将日期加入到 topic_id 中
  186 + topic_id = f"summary_{extract_date.strftime('%Y%m%d')}"
  187 +
177 with self.engine.begin() as conn: 188 with self.engine.begin() as conn:
178 check = conn.execute( 189 check = conn.execute(
179 text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"), 190 text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"),
180 - {"d": extract_date, "tid": "summary"}, 191 + {"d": extract_date, "tid": topic_id},
181 ).first() 192 ).first()
182 if check: 193 if check:
183 conn.execute( 194 conn.execute(
184 text( 195 text(
185 "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid" 196 "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid"
186 ), 197 ),
187 - {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"}, 198 + {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp,
  199 + "d": extract_date, "tid": topic_id, "tn": "每日新闻分析"},
188 ) 200 )
189 logger.info(f"更新了 {extract_date} 的话题分析") 201 logger.info(f"更新了 {extract_date} 的话题分析")
190 else: 202 else:
@@ -192,30 +204,32 @@ class DatabaseManager: @@ -192,30 +204,32 @@ class DatabaseManager:
192 text( 204 text(
193 "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)" 205 "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)"
194 ), 206 ),
195 - {"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp}, 207 + {"d": extract_date, "tid": topic_id, "tn": "每日新闻分析", "k": keywords_json, "s": summary,
  208 + "ts": current_timestamp, "lmt": current_timestamp},
196 ) 209 )
197 logger.info(f"保存了 {extract_date} 的话题分析") 210 logger.info(f"保存了 {extract_date} 的话题分析")
198 return True 211 return True
199 except Exception as e: 212 except Exception as e:
200 logger.exception(f"保存话题分析失败: {e}") 213 logger.exception(f"保存话题分析失败: {e}")
201 return False 214 return False
202 - 215 +
203 def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]: 216 def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
204 """ 217 """
205 获取每日话题分析 218 获取每日话题分析
206 - 219 +
207 Args: 220 Args:
208 extract_date: 提取日期,默认为今天 221 extract_date: 提取日期,默认为今天
209 - 222 +
210 Returns: 223 Returns:
211 话题分析数据,如果不存在返回None 224 话题分析数据,如果不存在返回None
212 """ 225 """
213 if not extract_date: 226 if not extract_date:
214 extract_date = date.today() 227 extract_date = date.today()
215 - 228 +
216 try: 229 try:
217 with self.engine.connect() as conn: 230 with self.engine.connect() as conn:
218 - result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first() 231 + result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"),
  232 + {"d": extract_date}).mappings().first()
219 if result: 233 if result:
220 result = dict(result) # 转为可变dict以支持item赋值 234 result = dict(result) # 转为可变dict以支持item赋值
221 result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else [] 235 result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else []
@@ -224,14 +238,14 @@ class DatabaseManager: @@ -224,14 +238,14 @@ class DatabaseManager:
224 except Exception as e: 238 except Exception as e:
225 logger.exception(f"获取话题分析失败: {e}") 239 logger.exception(f"获取话题分析失败: {e}")
226 return None 240 return None
227 - 241 +
228 def get_recent_topics(self, days: int = 7) -> List[Dict]: 242 def get_recent_topics(self, days: int = 7) -> List[Dict]:
229 """ 243 """
230 获取最近几天的话题分析 244 获取最近几天的话题分析
231 - 245 +
232 Args: 246 Args:
233 days: 天数 247 days: 天数
234 - 248 +
235 Returns: 249 Returns:
236 话题分析列表 250 话题分析列表
237 """ 251 """
@@ -254,9 +268,9 @@ class DatabaseManager: @@ -254,9 +268,9 @@ class DatabaseManager:
254 except Exception as e: 268 except Exception as e:
255 logger.exception(f"获取最近话题分析失败: {e}") 269 logger.exception(f"获取最近话题分析失败: {e}")
256 return [] 270 return []
257 - 271 +
258 # ==================== 统计查询 ==================== 272 # ==================== 统计查询 ====================
259 - 273 +
260 def get_summary_stats(self, days: int = 7) -> Dict: 274 def get_summary_stats(self, days: int = 7) -> Dict:
261 """获取统计摘要""" 275 """获取统计摘要"""
262 try: 276 try:
@@ -290,18 +304,19 @@ class DatabaseManager: @@ -290,18 +304,19 @@ class DatabaseManager:
290 logger.exception(f"获取统计摘要失败: {e}") 304 logger.exception(f"获取统计摘要失败: {e}")
291 return {"news_stats": [], "topics_stats": []} 305 return {"news_stats": [], "topics_stats": []}
292 306
  307 +
293 if __name__ == "__main__": 308 if __name__ == "__main__":
294 # 测试数据库管理器 309 # 测试数据库管理器
295 with DatabaseManager() as db: 310 with DatabaseManager() as db:
296 # 测试获取新闻 311 # 测试获取新闻
297 news = db.get_daily_news() 312 news = db.get_daily_news()
298 logger.info(f"今日新闻数量: {len(news)}") 313 logger.info(f"今日新闻数量: {len(news)}")
299 - 314 +
300 # 测试获取话题 315 # 测试获取话题
301 topics = db.get_daily_topics() 316 topics = db.get_daily_topics()
302 if topics: 317 if topics:
303 logger.info(f"今日话题关键词: {topics['keywords']}") 318 logger.info(f"今日话题关键词: {topics['keywords']}")
304 else: 319 else:
305 logger.info("今日暂无话题分析") 320 logger.info("今日暂无话题分析")
306 - 321 +
307 logger.info("简化数据库管理器测试完成!") 322 logger.info("简化数据库管理器测试完成!")