Committed by GitHub
Merge pull request #116 from DoiiarX/fix-news-id
修复news-id冲突问题
Showing 1 changed file with 23 additions and 8 deletions
| @@ -25,6 +25,7 @@ except ImportError: | @@ -25,6 +25,7 @@ except ImportError: | ||
| 25 | 25 | ||
| 26 | from config import settings | 26 | from config import settings |
| 27 | 27 | ||
| 28 | + | ||
| 28 | class DatabaseManager: | 29 | class DatabaseManager: |
| 29 | """数据库管理器""" | 30 | """数据库管理器""" |
| 30 | 31 | ||
| @@ -46,14 +47,15 @@ class DatabaseManager: | @@ -46,14 +47,15 @@ class DatabaseManager: | ||
| 46 | except ModuleNotFoundError as e: | 47 | except ModuleNotFoundError as e: |
| 47 | missing: str = str(e) | 48 | missing: str = str(e) |
| 48 | if "psycopg" in missing: | 49 | if "psycopg" in missing: |
| 49 | - logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]") | 50 | + logger.error( |
| 51 | + "数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]") | ||
| 50 | elif "pymysql" in missing: | 52 | elif "pymysql" in missing: |
| 51 | logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql") | 53 | logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql") |
| 52 | else: | 54 | else: |
| 53 | logger.error(f"数据库连接失败(缺少驱动): {e}") | 55 | logger.error(f"数据库连接失败(缺少驱动): {e}") |
| 54 | raise | 56 | raise |
| 55 | except Exception as e: | 57 | except Exception as e: |
| 56 | - logger.error(f"数据库连接失败: {e}") | 58 | + logger.exception(f"数据库连接失败: {e}") |
| 57 | raise | 59 | raise |
| 58 | 60 | ||
| 59 | def close(self): | 61 | def close(self): |
| @@ -97,7 +99,13 @@ class DatabaseManager: | @@ -97,7 +99,13 @@ class DatabaseManager: | ||
| 97 | # 逐条插入,单条失败不影响后续(每条独立事务) | 99 | # 逐条插入,单条失败不影响后续(每条独立事务) |
| 98 | for news_item in news_data: | 100 | for news_item in news_data: |
| 99 | try: | 101 | try: |
| 100 | - news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}" | 102 | + # news_item.get('id') 已经是完整的 news_id(格式:source_item_id) |
| 103 | + # 为了支持同一条新闻在不同日期出现,将 crawl_date 加入到 news_id 中 | ||
| 104 | + base_news_id = news_item.get( | ||
| 105 | + 'id') or f"{news_item.get('source', 'unknown')}_rank_{news_item.get('rank', 0)}" | ||
| 106 | + # 将日期格式化为字符串并加入到 news_id 中,确保全局唯一性 | ||
| 107 | + news_id = f"{base_news_id}_{crawl_date.strftime('%Y%m%d')}" | ||
| 108 | + | ||
| 101 | title_val = (news_item.get("title", "") or "") | 109 | title_val = (news_item.get("title", "") or "") |
| 102 | if len(title_val) > 500: | 110 | if len(title_val) > 500: |
| 103 | title_val = title_val[:500] | 111 | title_val = title_val[:500] |
| @@ -124,7 +132,7 @@ class DatabaseManager: | @@ -124,7 +132,7 @@ class DatabaseManager: | ||
| 124 | ) | 132 | ) |
| 125 | saved_count += 1 | 133 | saved_count += 1 |
| 126 | except Exception as e: | 134 | except Exception as e: |
| 127 | - logger.warning(f"保存单条新闻失败: {e}") | 135 | + logger.exception(f"保存单条新闻失败: {e}") |
| 128 | continue | 136 | continue |
| 129 | logger.info(f"成功保存 {saved_count} 条新闻记录") | 137 | logger.info(f"成功保存 {saved_count} 条新闻记录") |
| 130 | return saved_count | 138 | return saved_count |
| @@ -174,17 +182,21 @@ class DatabaseManager: | @@ -174,17 +182,21 @@ class DatabaseManager: | ||
| 174 | 182 | ||
| 175 | try: | 183 | try: |
| 176 | keywords_json = json.dumps(keywords, ensure_ascii=False) | 184 | keywords_json = json.dumps(keywords, ensure_ascii=False) |
| 185 | + # 为了支持外键引用,topic_id 需要全局唯一,所以将日期加入到 topic_id 中 | ||
| 186 | + topic_id = f"summary_{extract_date.strftime('%Y%m%d')}" | ||
| 187 | + | ||
| 177 | with self.engine.begin() as conn: | 188 | with self.engine.begin() as conn: |
| 178 | check = conn.execute( | 189 | check = conn.execute( |
| 179 | text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"), | 190 | text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"), |
| 180 | - {"d": extract_date, "tid": "summary"}, | 191 | + {"d": extract_date, "tid": topic_id}, |
| 181 | ).first() | 192 | ).first() |
| 182 | if check: | 193 | if check: |
| 183 | conn.execute( | 194 | conn.execute( |
| 184 | text( | 195 | text( |
| 185 | "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid" | 196 | "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid" |
| 186 | ), | 197 | ), |
| 187 | - {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"}, | 198 | + {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, |
| 199 | + "d": extract_date, "tid": topic_id, "tn": "每日新闻分析"}, | ||
| 188 | ) | 200 | ) |
| 189 | logger.info(f"更新了 {extract_date} 的话题分析") | 201 | logger.info(f"更新了 {extract_date} 的话题分析") |
| 190 | else: | 202 | else: |
| @@ -192,7 +204,8 @@ class DatabaseManager: | @@ -192,7 +204,8 @@ class DatabaseManager: | ||
| 192 | text( | 204 | text( |
| 193 | "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)" | 205 | "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)" |
| 194 | ), | 206 | ), |
| 195 | - {"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp}, | 207 | + {"d": extract_date, "tid": topic_id, "tn": "每日新闻分析", "k": keywords_json, "s": summary, |
| 208 | + "ts": current_timestamp, "lmt": current_timestamp}, | ||
| 196 | ) | 209 | ) |
| 197 | logger.info(f"保存了 {extract_date} 的话题分析") | 210 | logger.info(f"保存了 {extract_date} 的话题分析") |
| 198 | return True | 211 | return True |
| @@ -215,7 +228,8 @@ class DatabaseManager: | @@ -215,7 +228,8 @@ class DatabaseManager: | ||
| 215 | 228 | ||
| 216 | try: | 229 | try: |
| 217 | with self.engine.connect() as conn: | 230 | with self.engine.connect() as conn: |
| 218 | - result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first() | 231 | + result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), |
| 232 | + {"d": extract_date}).mappings().first() | ||
| 219 | if result: | 233 | if result: |
| 220 | result = dict(result) # 转为可变dict以支持item赋值 | 234 | result = dict(result) # 转为可变dict以支持item赋值 |
| 221 | result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else [] | 235 | result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else [] |
| @@ -290,6 +304,7 @@ class DatabaseManager: | @@ -290,6 +304,7 @@ class DatabaseManager: | ||
| 290 | logger.exception(f"获取统计摘要失败: {e}") | 304 | logger.exception(f"获取统计摘要失败: {e}") |
| 291 | return {"news_stats": [], "topics_stats": []} | 305 | return {"news_stats": [], "topics_stats": []} |
| 292 | 306 | ||
| 307 | + | ||
| 293 | if __name__ == "__main__": | 308 | if __name__ == "__main__": |
| 294 | # 测试数据库管理器 | 309 | # 测试数据库管理器 |
| 295 | with DatabaseManager() as db: | 310 | with DatabaseManager() as db: |
-
Please register or login to post a comment