Showing 2 changed files with 44 additions and 35 deletions
| @@ -12,6 +12,7 @@ import json | @@ -12,6 +12,7 @@ import json | ||
| 12 | from datetime import datetime, date | 12 | from datetime import datetime, date |
| 13 | from pathlib import Path | 13 | from pathlib import Path |
| 14 | from typing import List, Dict, Optional | 14 | from typing import List, Dict, Optional |
| 15 | +from loguru import logger | ||
| 15 | 16 | ||
| 16 | # 添加项目根目录到路径 | 17 | # 添加项目根目录到路径 |
| 17 | project_root = Path(__file__).parent.parent | 18 | project_root = Path(__file__).parent.parent |
| @@ -38,8 +39,7 @@ SOURCE_NAMES = { | @@ -38,8 +39,7 @@ SOURCE_NAMES = { | ||
| 38 | "wallstreetcn": "华尔街见闻", | 39 | "wallstreetcn": "华尔街见闻", |
| 39 | "thepaper": "澎湃新闻", | 40 | "thepaper": "澎湃新闻", |
| 40 | "cls-hot": "财联社", | 41 | "cls-hot": "财联社", |
| 41 | - "xueqiu": "雪球热榜", | ||
| 42 | - "kuaishou": "快手热榜" | 42 | + "xueqiu": "雪球热榜" |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | class NewsCollector: | 45 | class NewsCollector: |
| @@ -72,15 +72,25 @@ class NewsCollector: | @@ -72,15 +72,25 @@ class NewsCollector: | ||
| 72 | async def fetch_news(self, source: str) -> dict: | 72 | async def fetch_news(self, source: str) -> dict: |
| 73 | """从指定源获取最新新闻""" | 73 | """从指定源获取最新新闻""" |
| 74 | url = f"{BASE_URL}/api/s?id={source}&latest" | 74 | url = f"{BASE_URL}/api/s?id={source}&latest" |
| 75 | - headers = {"Accept": "application/json"} | 75 | + headers = { |
| 76 | + "Accept": "application/json, text/plain, */*", | ||
| 77 | + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", | ||
| 78 | + "User-Agent": ( | ||
| 79 | + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " | ||
| 80 | + "AppleWebKit/537.36 (KHTML, like Gecko) " | ||
| 81 | + "Chrome/124.0.0.0 Safari/537.36" | ||
| 82 | + ), | ||
| 83 | + "Referer": BASE_URL, | ||
| 84 | + "Connection": "keep-alive", | ||
| 85 | + } | ||
| 76 | 86 | ||
| 77 | try: | 87 | try: |
| 78 | - async with httpx.AsyncClient(timeout=30.0) as client: | 88 | + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: |
| 79 | response = await client.get(url, headers=headers) | 89 | response = await client.get(url, headers=headers) |
| 80 | response.raise_for_status() | 90 | response.raise_for_status() |
| 81 | 91 | ||
| 82 | # 解析JSON响应 | 92 | # 解析JSON响应 |
| 83 | - data = json.loads(response.text) | 93 | + data = response.json() |
| 84 | return { | 94 | return { |
| 85 | "source": source, | 95 | "source": source, |
| 86 | "status": "success", | 96 | "status": "success", |
| @@ -91,21 +101,21 @@ class NewsCollector: | @@ -91,21 +101,21 @@ class NewsCollector: | ||
| 91 | return { | 101 | return { |
| 92 | "source": source, | 102 | "source": source, |
| 93 | "status": "timeout", | 103 | "status": "timeout", |
| 94 | - "error": "请求超时", | 104 | + "error": f"请求超时: {source}({url})", |
| 95 | "timestamp": datetime.now().isoformat() | 105 | "timestamp": datetime.now().isoformat() |
| 96 | } | 106 | } |
| 97 | except httpx.HTTPStatusError as e: | 107 | except httpx.HTTPStatusError as e: |
| 98 | return { | 108 | return { |
| 99 | "source": source, | 109 | "source": source, |
| 100 | "status": "http_error", | 110 | "status": "http_error", |
| 101 | - "error": f"HTTP错误: {e.response.status_code}", | 111 | + "error": f"HTTP错误: {source}({url}) - {e.response.status_code}", |
| 102 | "timestamp": datetime.now().isoformat() | 112 | "timestamp": datetime.now().isoformat() |
| 103 | } | 113 | } |
| 104 | except Exception as e: | 114 | except Exception as e: |
| 105 | return { | 115 | return { |
| 106 | "source": source, | 116 | "source": source, |
| 107 | "status": "error", | 117 | "status": "error", |
| 108 | - "error": f"未知错误: {str(e)}", | 118 | + "error": f"未知错误: {source}({url}) - {str(e)}", |
| 109 | "timestamp": datetime.now().isoformat() | 119 | "timestamp": datetime.now().isoformat() |
| 110 | } | 120 | } |
| 111 | 121 | ||
| @@ -114,13 +124,13 @@ class NewsCollector: | @@ -114,13 +124,13 @@ class NewsCollector: | ||
| 114 | if sources is None: | 124 | if sources is None: |
| 115 | sources = list(SOURCE_NAMES.keys()) | 125 | sources = list(SOURCE_NAMES.keys()) |
| 116 | 126 | ||
| 117 | - print(f"正在获取 {len(sources)} 个新闻源的最新内容...") | ||
| 118 | - print("=" * 80) | 127 | + logger.info(f"正在获取 {len(sources)} 个新闻源的最新内容...") |
| 128 | + logger.info("=" * 80) | ||
| 119 | 129 | ||
| 120 | results = [] | 130 | results = [] |
| 121 | for source in sources: | 131 | for source in sources: |
| 122 | source_name = SOURCE_NAMES.get(source, source) | 132 | source_name = SOURCE_NAMES.get(source, source) |
| 123 | - print(f"正在获取 {source_name} 的新闻...") | 133 | + logger.info(f"正在获取 {source_name} 的新闻...") |
| 124 | result = await self.fetch_news(source) | 134 | result = await self.fetch_news(source) |
| 125 | results.append(result) | 135 | results.append(result) |
| 126 | 136 | ||
| @@ -128,11 +138,11 @@ class NewsCollector: | @@ -128,11 +138,11 @@ class NewsCollector: | ||
| 128 | data = result["data"] | 138 | data = result["data"] |
| 129 | if 'items' in data and isinstance(data['items'], list): | 139 | if 'items' in data and isinstance(data['items'], list): |
| 130 | count = len(data['items']) | 140 | count = len(data['items']) |
| 131 | - print(f"✓ {source_name}: 获取成功,共 {count} 条新闻") | 141 | + logger.info(f"✓ {source_name}: 获取成功,共 {count} 条新闻") |
| 132 | else: | 142 | else: |
| 133 | - print(f"✓ {source_name}: 获取成功") | 143 | + logger.info(f"✓ {source_name}: 获取成功") |
| 134 | else: | 144 | else: |
| 135 | - print(f"✗ {source_name}: {result.get('error', '获取失败')}") | 145 | + logger.error(f"✗ {source_name}: {result.get('error', '获取失败')}") |
| 136 | 146 | ||
| 137 | # 避免请求过快 | 147 | # 避免请求过快 |
| 138 | await asyncio.sleep(0.5) | 148 | await asyncio.sleep(0.5) |
| @@ -151,18 +161,21 @@ class NewsCollector: | @@ -151,18 +161,21 @@ class NewsCollector: | ||
| 151 | Returns: | 161 | Returns: |
| 152 | 包含收集结果的字典 | 162 | 包含收集结果的字典 |
| 153 | """ | 163 | """ |
| 154 | - print(f"开始收集每日热点新闻...") | ||
| 155 | - print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") | 164 | + collection_summary_message = "" |
| 165 | + collection_summary_message += "\n开始收集每日热点新闻...\n" | ||
| 166 | + collection_summary_message += f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n" | ||
| 156 | 167 | ||
| 157 | # 选择新闻源 | 168 | # 选择新闻源 |
| 158 | if sources is None: | 169 | if sources is None: |
| 159 | # 使用所有支持的新闻源 | 170 | # 使用所有支持的新闻源 |
| 160 | sources = list(SOURCE_NAMES.keys()) | 171 | sources = list(SOURCE_NAMES.keys()) |
| 161 | 172 | ||
| 162 | - print(f"将从 {len(sources)} 个新闻源收集数据:") | 173 | + collection_summary_message += f"将从 {len(sources)} 个新闻源收集数据:\n" |
| 163 | for source in sources: | 174 | for source in sources: |
| 164 | source_name = SOURCE_NAMES.get(source, source) | 175 | source_name = SOURCE_NAMES.get(source, source) |
| 165 | - print(f" - {source_name}") | 176 | + collection_summary_message += f" - {source_name}\n" |
| 177 | + | ||
| 178 | + logger.info(collection_summary_message) | ||
| 166 | 179 | ||
| 167 | try: | 180 | try: |
| 168 | # 获取新闻数据 | 181 | # 获取新闻数据 |
| @@ -185,7 +198,7 @@ class NewsCollector: | @@ -185,7 +198,7 @@ class NewsCollector: | ||
| 185 | return processed_data | 198 | return processed_data |
| 186 | 199 | ||
| 187 | except Exception as e: | 200 | except Exception as e: |
| 188 | - print(f"收集新闻失败: {e}") | 201 | + logger.exception(f"收集新闻失败: {e}") |
| 189 | return { | 202 | return { |
| 190 | 'success': False, | 203 | 'success': False, |
| 191 | 'error': str(e), | 204 | 'error': str(e), |
| @@ -255,35 +268,30 @@ class NewsCollector: | @@ -255,35 +268,30 @@ class NewsCollector: | ||
| 255 | } | 268 | } |
| 256 | 269 | ||
| 257 | except Exception as e: | 270 | except Exception as e: |
| 258 | - print(f"处理新闻项失败: {e}") | 271 | + logger.exception(f"处理新闻项失败: {e}") |
| 259 | return None | 272 | return None |
| 260 | 273 | ||
| 261 | def _print_collection_summary(self, data: Dict): | 274 | def _print_collection_summary(self, data: Dict): |
| 262 | """打印收集摘要""" | 275 | """打印收集摘要""" |
| 263 | - print("\n" + "=" * 50) | ||
| 264 | - print("新闻收集摘要") | ||
| 265 | - print("=" * 50) | ||
| 266 | - | ||
| 267 | - print(f"总新闻源: {data['total_sources']}") | ||
| 268 | - print(f"成功源数: {data['successful_sources']}") | ||
| 269 | - print(f"总新闻数: {data['total_news']}") | ||
| 270 | - | 276 | + collection_summary_message = "" |
| 277 | + collection_summary_message += f"\n总新闻源: {data['total_sources']}\n" | ||
| 278 | + collection_summary_message += f"成功源数: {data['successful_sources']}\n" | ||
| 279 | + collection_summary_message += f"总新闻数: {data['total_news']}\n" | ||
| 271 | if 'saved_count' in data: | 280 | if 'saved_count' in data: |
| 272 | - print(f"已保存数: {data['saved_count']}") | ||
| 273 | - | ||
| 274 | - print("=" * 50) | 281 | + collection_summary_message += f"已保存数: {data['saved_count']}\n" |
| 282 | + logger.info(collection_summary_message) | ||
| 275 | 283 | ||
| 276 | def get_today_news(self) -> List[Dict]: | 284 | def get_today_news(self) -> List[Dict]: |
| 277 | """获取今天的新闻""" | 285 | """获取今天的新闻""" |
| 278 | try: | 286 | try: |
| 279 | return self.db_manager.get_daily_news(date.today()) | 287 | return self.db_manager.get_daily_news(date.today()) |
| 280 | except Exception as e: | 288 | except Exception as e: |
| 281 | - print(f"获取今日新闻失败: {e}") | 289 | + logger.exception(f"获取今日新闻失败: {e}") |
| 282 | return [] | 290 | return [] |
| 283 | 291 | ||
| 284 | async def main(): | 292 | async def main(): |
| 285 | """测试新闻收集器""" | 293 | """测试新闻收集器""" |
| 286 | - print("测试新闻收集器...") | 294 | + logger.info("测试新闻收集器...") |
| 287 | 295 | ||
| 288 | async with NewsCollector() as collector: | 296 | async with NewsCollector() as collector: |
| 289 | # 收集新闻 | 297 | # 收集新闻 |
| @@ -292,9 +300,9 @@ async def main(): | @@ -292,9 +300,9 @@ async def main(): | ||
| 292 | ) | 300 | ) |
| 293 | 301 | ||
| 294 | if result['success']: | 302 | if result['success']: |
| 295 | - print(f"收集成功!共获取 {result['total_news']} 条新闻") | 303 | + logger.info(f"收集成功!共获取 {result['total_news']} 条新闻") |
| 296 | else: | 304 | else: |
| 297 | - print(f"收集失败: {result.get('error', '未知错误')}") | 305 | + logger.error(f"收集失败: {result.get('error', '未知错误')}") |
| 298 | 306 | ||
| 299 | if __name__ == "__main__": | 307 | if __name__ == "__main__": |
| 300 | asyncio.run(main()) | 308 | asyncio.run(main()) |
-
Please register or login to post a comment