Committed by
GitHub
Merge pull request #116 from DoiiarX/fix-news-id
修复news-id冲突问题
Showing
1 changed file
with
55 additions
and
40 deletions
| @@ -25,14 +25,15 @@ except ImportError: | @@ -25,14 +25,15 @@ except ImportError: | ||
| 25 | 25 | ||
| 26 | from config import settings | 26 | from config import settings |
| 27 | 27 | ||
| 28 | + | ||
| 28 | class DatabaseManager: | 29 | class DatabaseManager: |
| 29 | """数据库管理器""" | 30 | """数据库管理器""" |
| 30 | - | 31 | + |
| 31 | def __init__(self): | 32 | def __init__(self): |
| 32 | """初始化数据库管理器""" | 33 | """初始化数据库管理器""" |
| 33 | self.engine: Engine = None | 34 | self.engine: Engine = None |
| 34 | self.connect() | 35 | self.connect() |
| 35 | - | 36 | + |
| 36 | def connect(self): | 37 | def connect(self): |
| 37 | """连接数据库""" | 38 | """连接数据库""" |
| 38 | try: | 39 | try: |
| @@ -46,46 +47,47 @@ class DatabaseManager: | @@ -46,46 +47,47 @@ class DatabaseManager: | ||
| 46 | except ModuleNotFoundError as e: | 47 | except ModuleNotFoundError as e: |
| 47 | missing: str = str(e) | 48 | missing: str = str(e) |
| 48 | if "psycopg" in missing: | 49 | if "psycopg" in missing: |
| 49 | - logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]") | 50 | + logger.error( |
| 51 | + "数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]") | ||
| 50 | elif "pymysql" in missing: | 52 | elif "pymysql" in missing: |
| 51 | logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql") | 53 | logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql") |
| 52 | else: | 54 | else: |
| 53 | logger.error(f"数据库连接失败(缺少驱动): {e}") | 55 | logger.error(f"数据库连接失败(缺少驱动): {e}") |
| 54 | raise | 56 | raise |
| 55 | except Exception as e: | 57 | except Exception as e: |
| 56 | - logger.error(f"数据库连接失败: {e}") | 58 | + logger.exception(f"数据库连接失败: {e}") |
| 57 | raise | 59 | raise |
| 58 | - | 60 | + |
| 59 | def close(self): | 61 | def close(self): |
| 60 | """关闭数据库连接""" | 62 | """关闭数据库连接""" |
| 61 | if self.engine: | 63 | if self.engine: |
| 62 | self.engine.dispose() | 64 | self.engine.dispose() |
| 63 | logger.info("数据库连接已关闭") | 65 | logger.info("数据库连接已关闭") |
| 64 | - | 66 | + |
| 65 | def __enter__(self): | 67 | def __enter__(self): |
| 66 | return self | 68 | return self |
| 67 | - | 69 | + |
| 68 | def __exit__(self, exc_type, exc_val, exc_tb): | 70 | def __exit__(self, exc_type, exc_val, exc_tb): |
| 69 | self.close() | 71 | self.close() |
| 70 | - | 72 | + |
| 71 | # ==================== 新闻数据操作 ==================== | 73 | # ==================== 新闻数据操作 ==================== |
| 72 | - | 74 | + |
| 73 | def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int: | 75 | def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int: |
| 74 | """ | 76 | """ |
| 75 | 保存每日新闻数据,如果当天已有数据则覆盖 | 77 | 保存每日新闻数据,如果当天已有数据则覆盖 |
| 76 | - | 78 | + |
| 77 | Args: | 79 | Args: |
| 78 | news_data: 新闻数据列表 | 80 | news_data: 新闻数据列表 |
| 79 | crawl_date: 爬取日期,默认为今天 | 81 | crawl_date: 爬取日期,默认为今天 |
| 80 | - | 82 | + |
| 81 | Returns: | 83 | Returns: |
| 82 | 保存的新闻数量 | 84 | 保存的新闻数量 |
| 83 | """ | 85 | """ |
| 84 | if not crawl_date: | 86 | if not crawl_date: |
| 85 | crawl_date = date.today() | 87 | crawl_date = date.today() |
| 86 | - | 88 | + |
| 87 | current_timestamp = int(datetime.now().timestamp()) | 89 | current_timestamp = int(datetime.now().timestamp()) |
| 88 | - | 90 | + |
| 89 | try: | 91 | try: |
| 90 | saved_count = 0 | 92 | saved_count = 0 |
| 91 | # 先独立事务执行删除,防止后续插入失败导致无法清理 | 93 | # 先独立事务执行删除,防止后续插入失败导致无法清理 |
| @@ -97,7 +99,13 @@ class DatabaseManager: | @@ -97,7 +99,13 @@ class DatabaseManager: | ||
| 97 | # 逐条插入,单条失败不影响后续(每条独立事务) | 99 | # 逐条插入,单条失败不影响后续(每条独立事务) |
| 98 | for news_item in news_data: | 100 | for news_item in news_data: |
| 99 | try: | 101 | try: |
| 100 | - news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}" | 102 | + # news_item.get('id') 已经是完整的 news_id(格式:source_item_id) |
| 103 | + # 为了支持同一条新闻在不同日期出现,将 crawl_date 加入到 news_id 中 | ||
| 104 | + base_news_id = news_item.get( | ||
| 105 | + 'id') or f"{news_item.get('source', 'unknown')}_rank_{news_item.get('rank', 0)}" | ||
| 106 | + # 将日期格式化为字符串并加入到 news_id 中,确保全局唯一性 | ||
| 107 | + news_id = f"{base_news_id}_{crawl_date.strftime('%Y%m%d')}" | ||
| 108 | + | ||
| 101 | title_val = (news_item.get("title", "") or "") | 109 | title_val = (news_item.get("title", "") or "") |
| 102 | if len(title_val) > 500: | 110 | if len(title_val) > 500: |
| 103 | title_val = title_val[:500] | 111 | title_val = title_val[:500] |
| @@ -124,27 +132,27 @@ class DatabaseManager: | @@ -124,27 +132,27 @@ class DatabaseManager: | ||
| 124 | ) | 132 | ) |
| 125 | saved_count += 1 | 133 | saved_count += 1 |
| 126 | except Exception as e: | 134 | except Exception as e: |
| 127 | - logger.warning(f"保存单条新闻失败: {e}") | 135 | + logger.exception(f"保存单条新闻失败: {e}") |
| 128 | continue | 136 | continue |
| 129 | logger.info(f"成功保存 {saved_count} 条新闻记录") | 137 | logger.info(f"成功保存 {saved_count} 条新闻记录") |
| 130 | return saved_count | 138 | return saved_count |
| 131 | except Exception as e: | 139 | except Exception as e: |
| 132 | logger.exception(f"保存新闻数据失败: {e}") | 140 | logger.exception(f"保存新闻数据失败: {e}") |
| 133 | return 0 | 141 | return 0 |
| 134 | - | 142 | + |
| 135 | def get_daily_news(self, crawl_date: date = None) -> List[Dict]: | 143 | def get_daily_news(self, crawl_date: date = None) -> List[Dict]: |
| 136 | """ | 144 | """ |
| 137 | 获取每日新闻数据 | 145 | 获取每日新闻数据 |
| 138 | - | 146 | + |
| 139 | Args: | 147 | Args: |
| 140 | crawl_date: 爬取日期,默认为今天 | 148 | crawl_date: 爬取日期,默认为今天 |
| 141 | - | 149 | + |
| 142 | Returns: | 150 | Returns: |
| 143 | 新闻列表 | 151 | 新闻列表 |
| 144 | """ | 152 | """ |
| 145 | if not crawl_date: | 153 | if not crawl_date: |
| 146 | crawl_date = date.today() | 154 | crawl_date = date.today() |
| 147 | - | 155 | + |
| 148 | query = ( | 156 | query = ( |
| 149 | "SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC" | 157 | "SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC" |
| 150 | ) | 158 | ) |
| @@ -152,39 +160,43 @@ class DatabaseManager: | @@ -152,39 +160,43 @@ class DatabaseManager: | ||
| 152 | result = conn.execute(text(query), {"d": crawl_date}) | 160 | result = conn.execute(text(query), {"d": crawl_date}) |
| 153 | rows = result.mappings().all() | 161 | rows = result.mappings().all() |
| 154 | return rows | 162 | return rows |
| 155 | - | 163 | + |
| 156 | # ==================== 话题数据操作 ==================== | 164 | # ==================== 话题数据操作 ==================== |
| 157 | - | 165 | + |
| 158 | def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool: | 166 | def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool: |
| 159 | """ | 167 | """ |
| 160 | 保存每日话题分析 | 168 | 保存每日话题分析 |
| 161 | - | 169 | + |
| 162 | Args: | 170 | Args: |
| 163 | keywords: 话题关键词列表 | 171 | keywords: 话题关键词列表 |
| 164 | summary: 新闻分析总结 | 172 | summary: 新闻分析总结 |
| 165 | extract_date: 提取日期,默认为今天 | 173 | extract_date: 提取日期,默认为今天 |
| 166 | - | 174 | + |
| 167 | Returns: | 175 | Returns: |
| 168 | 是否保存成功 | 176 | 是否保存成功 |
| 169 | """ | 177 | """ |
| 170 | if not extract_date: | 178 | if not extract_date: |
| 171 | extract_date = date.today() | 179 | extract_date = date.today() |
| 172 | - | 180 | + |
| 173 | current_timestamp = int(datetime.now().timestamp()) | 181 | current_timestamp = int(datetime.now().timestamp()) |
| 174 | - | 182 | + |
| 175 | try: | 183 | try: |
| 176 | keywords_json = json.dumps(keywords, ensure_ascii=False) | 184 | keywords_json = json.dumps(keywords, ensure_ascii=False) |
| 185 | + # 为了支持外键引用,topic_id 需要全局唯一,所以将日期加入到 topic_id 中 | ||
| 186 | + topic_id = f"summary_{extract_date.strftime('%Y%m%d')}" | ||
| 187 | + | ||
| 177 | with self.engine.begin() as conn: | 188 | with self.engine.begin() as conn: |
| 178 | check = conn.execute( | 189 | check = conn.execute( |
| 179 | text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"), | 190 | text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"), |
| 180 | - {"d": extract_date, "tid": "summary"}, | 191 | + {"d": extract_date, "tid": topic_id}, |
| 181 | ).first() | 192 | ).first() |
| 182 | if check: | 193 | if check: |
| 183 | conn.execute( | 194 | conn.execute( |
| 184 | text( | 195 | text( |
| 185 | "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid" | 196 | "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid" |
| 186 | ), | 197 | ), |
| 187 | - {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"}, | 198 | + {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, |
| 199 | + "d": extract_date, "tid": topic_id, "tn": "每日新闻分析"}, | ||
| 188 | ) | 200 | ) |
| 189 | logger.info(f"更新了 {extract_date} 的话题分析") | 201 | logger.info(f"更新了 {extract_date} 的话题分析") |
| 190 | else: | 202 | else: |
| @@ -192,30 +204,32 @@ class DatabaseManager: | @@ -192,30 +204,32 @@ class DatabaseManager: | ||
| 192 | text( | 204 | text( |
| 193 | "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)" | 205 | "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)" |
| 194 | ), | 206 | ), |
| 195 | - {"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp}, | 207 | + {"d": extract_date, "tid": topic_id, "tn": "每日新闻分析", "k": keywords_json, "s": summary, |
| 208 | + "ts": current_timestamp, "lmt": current_timestamp}, | ||
| 196 | ) | 209 | ) |
| 197 | logger.info(f"保存了 {extract_date} 的话题分析") | 210 | logger.info(f"保存了 {extract_date} 的话题分析") |
| 198 | return True | 211 | return True |
| 199 | except Exception as e: | 212 | except Exception as e: |
| 200 | logger.exception(f"保存话题分析失败: {e}") | 213 | logger.exception(f"保存话题分析失败: {e}") |
| 201 | return False | 214 | return False |
| 202 | - | 215 | + |
| 203 | def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]: | 216 | def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]: |
| 204 | """ | 217 | """ |
| 205 | 获取每日话题分析 | 218 | 获取每日话题分析 |
| 206 | - | 219 | + |
| 207 | Args: | 220 | Args: |
| 208 | extract_date: 提取日期,默认为今天 | 221 | extract_date: 提取日期,默认为今天 |
| 209 | - | 222 | + |
| 210 | Returns: | 223 | Returns: |
| 211 | 话题分析数据,如果不存在返回None | 224 | 话题分析数据,如果不存在返回None |
| 212 | """ | 225 | """ |
| 213 | if not extract_date: | 226 | if not extract_date: |
| 214 | extract_date = date.today() | 227 | extract_date = date.today() |
| 215 | - | 228 | + |
| 216 | try: | 229 | try: |
| 217 | with self.engine.connect() as conn: | 230 | with self.engine.connect() as conn: |
| 218 | - result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first() | 231 | + result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), |
| 232 | + {"d": extract_date}).mappings().first() | ||
| 219 | if result: | 233 | if result: |
| 220 | result = dict(result) # 转为可变dict以支持item赋值 | 234 | result = dict(result) # 转为可变dict以支持item赋值 |
| 221 | result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else [] | 235 | result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else [] |
| @@ -224,14 +238,14 @@ class DatabaseManager: | @@ -224,14 +238,14 @@ class DatabaseManager: | ||
| 224 | except Exception as e: | 238 | except Exception as e: |
| 225 | logger.exception(f"获取话题分析失败: {e}") | 239 | logger.exception(f"获取话题分析失败: {e}") |
| 226 | return None | 240 | return None |
| 227 | - | 241 | + |
| 228 | def get_recent_topics(self, days: int = 7) -> List[Dict]: | 242 | def get_recent_topics(self, days: int = 7) -> List[Dict]: |
| 229 | """ | 243 | """ |
| 230 | 获取最近几天的话题分析 | 244 | 获取最近几天的话题分析 |
| 231 | - | 245 | + |
| 232 | Args: | 246 | Args: |
| 233 | days: 天数 | 247 | days: 天数 |
| 234 | - | 248 | + |
| 235 | Returns: | 249 | Returns: |
| 236 | 话题分析列表 | 250 | 话题分析列表 |
| 237 | """ | 251 | """ |
| @@ -254,9 +268,9 @@ class DatabaseManager: | @@ -254,9 +268,9 @@ class DatabaseManager: | ||
| 254 | except Exception as e: | 268 | except Exception as e: |
| 255 | logger.exception(f"获取最近话题分析失败: {e}") | 269 | logger.exception(f"获取最近话题分析失败: {e}") |
| 256 | return [] | 270 | return [] |
| 257 | - | 271 | + |
| 258 | # ==================== 统计查询 ==================== | 272 | # ==================== 统计查询 ==================== |
| 259 | - | 273 | + |
| 260 | def get_summary_stats(self, days: int = 7) -> Dict: | 274 | def get_summary_stats(self, days: int = 7) -> Dict: |
| 261 | """获取统计摘要""" | 275 | """获取统计摘要""" |
| 262 | try: | 276 | try: |
| @@ -290,18 +304,19 @@ class DatabaseManager: | @@ -290,18 +304,19 @@ class DatabaseManager: | ||
| 290 | logger.exception(f"获取统计摘要失败: {e}") | 304 | logger.exception(f"获取统计摘要失败: {e}") |
| 291 | return {"news_stats": [], "topics_stats": []} | 305 | return {"news_stats": [], "topics_stats": []} |
| 292 | 306 | ||
| 307 | + | ||
| 293 | if __name__ == "__main__": | 308 | if __name__ == "__main__": |
| 294 | # 测试数据库管理器 | 309 | # 测试数据库管理器 |
| 295 | with DatabaseManager() as db: | 310 | with DatabaseManager() as db: |
| 296 | # 测试获取新闻 | 311 | # 测试获取新闻 |
| 297 | news = db.get_daily_news() | 312 | news = db.get_daily_news() |
| 298 | logger.info(f"今日新闻数量: {len(news)}") | 313 | logger.info(f"今日新闻数量: {len(news)}") |
| 299 | - | 314 | + |
| 300 | # 测试获取话题 | 315 | # 测试获取话题 |
| 301 | topics = db.get_daily_topics() | 316 | topics = db.get_daily_topics() |
| 302 | if topics: | 317 | if topics: |
| 303 | logger.info(f"今日话题关键词: {topics['keywords']}") | 318 | logger.info(f"今日话题关键词: {topics['keywords']}") |
| 304 | else: | 319 | else: |
| 305 | logger.info("今日暂无话题分析") | 320 | logger.info("今日暂无话题分析") |
| 306 | - | 321 | + |
| 307 | logger.info("简化数据库管理器测试完成!") | 322 | logger.info("简化数据库管理器测试完成!") |
-
Please register or login to post a comment