Doiiars
Committed by GitHub

Merge pull request #116 from DoiiarX/fix-news-id

修复news-id冲突问题
@@ -25,6 +25,7 @@ except ImportError: @@ -25,6 +25,7 @@ except ImportError:
25 25
26 from config import settings 26 from config import settings
27 27
  28 +
28 class DatabaseManager: 29 class DatabaseManager:
29 """数据库管理器""" 30 """数据库管理器"""
30 31
@@ -46,14 +47,15 @@ class DatabaseManager: @@ -46,14 +47,15 @@ class DatabaseManager:
46 except ModuleNotFoundError as e: 47 except ModuleNotFoundError as e:
47 missing: str = str(e) 48 missing: str = str(e)
48 if "psycopg" in missing: 49 if "psycopg" in missing:
49 - logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]") 50 + logger.error(
  51 + "数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
50 elif "pymysql" in missing: 52 elif "pymysql" in missing:
51 logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql") 53 logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
52 else: 54 else:
53 logger.error(f"数据库连接失败(缺少驱动): {e}") 55 logger.error(f"数据库连接失败(缺少驱动): {e}")
54 raise 56 raise
55 except Exception as e: 57 except Exception as e:
56 - logger.error(f"数据库连接失败: {e}") 58 + logger.exception(f"数据库连接失败: {e}")
57 raise 59 raise
58 60
59 def close(self): 61 def close(self):
@@ -97,7 +99,13 @@ class DatabaseManager: @@ -97,7 +99,13 @@ class DatabaseManager:
97 # 逐条插入,单条失败不影响后续(每条独立事务) 99 # 逐条插入,单条失败不影响后续(每条独立事务)
98 for news_item in news_data: 100 for news_item in news_data:
99 try: 101 try:
100 - news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}" 102 + # news_item.get('id') 已经是完整的 news_id(格式:source_item_id)
  103 + # 为了支持同一条新闻在不同日期出现,将 crawl_date 加入到 news_id 中
  104 + base_news_id = news_item.get(
  105 + 'id') or f"{news_item.get('source', 'unknown')}_rank_{news_item.get('rank', 0)}"
  106 + # 将日期格式化为字符串并加入到 news_id 中,确保全局唯一性
  107 + news_id = f"{base_news_id}_{crawl_date.strftime('%Y%m%d')}"
  108 +
101 title_val = (news_item.get("title", "") or "") 109 title_val = (news_item.get("title", "") or "")
102 if len(title_val) > 500: 110 if len(title_val) > 500:
103 title_val = title_val[:500] 111 title_val = title_val[:500]
@@ -124,7 +132,7 @@ class DatabaseManager: @@ -124,7 +132,7 @@ class DatabaseManager:
124 ) 132 )
125 saved_count += 1 133 saved_count += 1
126 except Exception as e: 134 except Exception as e:
127 - logger.warning(f"保存单条新闻失败: {e}") 135 + logger.exception(f"保存单条新闻失败: {e}")
128 continue 136 continue
129 logger.info(f"成功保存 {saved_count} 条新闻记录") 137 logger.info(f"成功保存 {saved_count} 条新闻记录")
130 return saved_count 138 return saved_count
@@ -174,17 +182,21 @@ class DatabaseManager: @@ -174,17 +182,21 @@ class DatabaseManager:
174 182
175 try: 183 try:
176 keywords_json = json.dumps(keywords, ensure_ascii=False) 184 keywords_json = json.dumps(keywords, ensure_ascii=False)
  185 + # 为了支持外键引用,topic_id 需要全局唯一,所以将日期加入到 topic_id 中
  186 + topic_id = f"summary_{extract_date.strftime('%Y%m%d')}"
  187 +
177 with self.engine.begin() as conn: 188 with self.engine.begin() as conn:
178 check = conn.execute( 189 check = conn.execute(
179 text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"), 190 text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"),
180 - {"d": extract_date, "tid": "summary"}, 191 + {"d": extract_date, "tid": topic_id},
181 ).first() 192 ).first()
182 if check: 193 if check:
183 conn.execute( 194 conn.execute(
184 text( 195 text(
185 "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid" 196 "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid"
186 ), 197 ),
187 - {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"}, 198 + {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp,
  199 + "d": extract_date, "tid": topic_id, "tn": "每日新闻分析"},
188 ) 200 )
189 logger.info(f"更新了 {extract_date} 的话题分析") 201 logger.info(f"更新了 {extract_date} 的话题分析")
190 else: 202 else:
@@ -192,7 +204,8 @@ class DatabaseManager: @@ -192,7 +204,8 @@ class DatabaseManager:
192 text( 204 text(
193 "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)" 205 "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)"
194 ), 206 ),
195 - {"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp}, 207 + {"d": extract_date, "tid": topic_id, "tn": "每日新闻分析", "k": keywords_json, "s": summary,
  208 + "ts": current_timestamp, "lmt": current_timestamp},
196 ) 209 )
197 logger.info(f"保存了 {extract_date} 的话题分析") 210 logger.info(f"保存了 {extract_date} 的话题分析")
198 return True 211 return True
@@ -215,7 +228,8 @@ class DatabaseManager: @@ -215,7 +228,8 @@ class DatabaseManager:
215 228
216 try: 229 try:
217 with self.engine.connect() as conn: 230 with self.engine.connect() as conn:
218 - result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first() 231 + result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"),
  232 + {"d": extract_date}).mappings().first()
219 if result: 233 if result:
220 result = dict(result) # 转为可变dict以支持item赋值 234 result = dict(result) # 转为可变dict以支持item赋值
221 result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else [] 235 result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else []
@@ -290,6 +304,7 @@ class DatabaseManager: @@ -290,6 +304,7 @@ class DatabaseManager:
290 logger.exception(f"获取统计摘要失败: {e}") 304 logger.exception(f"获取统计摘要失败: {e}")
291 return {"news_stats": [], "topics_stats": []} 305 return {"news_stats": [], "topics_stats": []}
292 306
  307 +
293 if __name__ == "__main__": 308 if __name__ == "__main__":
294 # 测试数据库管理器 309 # 测试数据库管理器
295 with DatabaseManager() as db: 310 with DatabaseManager() as db: