Merge branch 'main' of https://github.com/666ghj/Weibo_PublicOpinion_AnalysisSystem
Showing
8 changed files
with
65 additions
and
49 deletions
| @@ -12,6 +12,7 @@ import json | @@ -12,6 +12,7 @@ import json | ||
| 12 | from datetime import datetime, date | 12 | from datetime import datetime, date |
| 13 | from pathlib import Path | 13 | from pathlib import Path |
| 14 | from typing import List, Dict, Optional | 14 | from typing import List, Dict, Optional |
| 15 | +from loguru import logger | ||
| 15 | 16 | ||
| 16 | # 添加项目根目录到路径 | 17 | # 添加项目根目录到路径 |
| 17 | project_root = Path(__file__).parent.parent | 18 | project_root = Path(__file__).parent.parent |
| @@ -38,8 +39,7 @@ SOURCE_NAMES = { | @@ -38,8 +39,7 @@ SOURCE_NAMES = { | ||
| 38 | "wallstreetcn": "华尔街见闻", | 39 | "wallstreetcn": "华尔街见闻", |
| 39 | "thepaper": "澎湃新闻", | 40 | "thepaper": "澎湃新闻", |
| 40 | "cls-hot": "财联社", | 41 | "cls-hot": "财联社", |
| 41 | - "xueqiu": "雪球热榜", | ||
| 42 | - "kuaishou": "快手热榜" | 42 | + "xueqiu": "雪球热榜" |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | class NewsCollector: | 45 | class NewsCollector: |
| @@ -72,15 +72,25 @@ class NewsCollector: | @@ -72,15 +72,25 @@ class NewsCollector: | ||
| 72 | async def fetch_news(self, source: str) -> dict: | 72 | async def fetch_news(self, source: str) -> dict: |
| 73 | """从指定源获取最新新闻""" | 73 | """从指定源获取最新新闻""" |
| 74 | url = f"{BASE_URL}/api/s?id={source}&latest" | 74 | url = f"{BASE_URL}/api/s?id={source}&latest" |
| 75 | - headers = {"Accept": "application/json"} | 75 | + headers = { |
| 76 | + "Accept": "application/json, text/plain, */*", | ||
| 77 | + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", | ||
| 78 | + "User-Agent": ( | ||
| 79 | + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " | ||
| 80 | + "AppleWebKit/537.36 (KHTML, like Gecko) " | ||
| 81 | + "Chrome/124.0.0.0 Safari/537.36" | ||
| 82 | + ), | ||
| 83 | + "Referer": BASE_URL, | ||
| 84 | + "Connection": "keep-alive", | ||
| 85 | + } | ||
| 76 | 86 | ||
| 77 | try: | 87 | try: |
| 78 | - async with httpx.AsyncClient(timeout=30.0) as client: | 88 | + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: |
| 79 | response = await client.get(url, headers=headers) | 89 | response = await client.get(url, headers=headers) |
| 80 | response.raise_for_status() | 90 | response.raise_for_status() |
| 81 | 91 | ||
| 82 | # 解析JSON响应 | 92 | # 解析JSON响应 |
| 83 | - data = json.loads(response.text) | 93 | + data = response.json() |
| 84 | return { | 94 | return { |
| 85 | "source": source, | 95 | "source": source, |
| 86 | "status": "success", | 96 | "status": "success", |
| @@ -91,21 +101,21 @@ class NewsCollector: | @@ -91,21 +101,21 @@ class NewsCollector: | ||
| 91 | return { | 101 | return { |
| 92 | "source": source, | 102 | "source": source, |
| 93 | "status": "timeout", | 103 | "status": "timeout", |
| 94 | - "error": "请求超时", | 104 | + "error": f"请求超时: {source}({url})", |
| 95 | "timestamp": datetime.now().isoformat() | 105 | "timestamp": datetime.now().isoformat() |
| 96 | } | 106 | } |
| 97 | except httpx.HTTPStatusError as e: | 107 | except httpx.HTTPStatusError as e: |
| 98 | return { | 108 | return { |
| 99 | "source": source, | 109 | "source": source, |
| 100 | "status": "http_error", | 110 | "status": "http_error", |
| 101 | - "error": f"HTTP错误: {e.response.status_code}", | 111 | + "error": f"HTTP错误: {source}({url}) - {e.response.status_code}", |
| 102 | "timestamp": datetime.now().isoformat() | 112 | "timestamp": datetime.now().isoformat() |
| 103 | } | 113 | } |
| 104 | except Exception as e: | 114 | except Exception as e: |
| 105 | return { | 115 | return { |
| 106 | "source": source, | 116 | "source": source, |
| 107 | "status": "error", | 117 | "status": "error", |
| 108 | - "error": f"未知错误: {str(e)}", | 118 | + "error": f"未知错误: {source}({url}) - {str(e)}", |
| 109 | "timestamp": datetime.now().isoformat() | 119 | "timestamp": datetime.now().isoformat() |
| 110 | } | 120 | } |
| 111 | 121 | ||
| @@ -114,13 +124,13 @@ class NewsCollector: | @@ -114,13 +124,13 @@ class NewsCollector: | ||
| 114 | if sources is None: | 124 | if sources is None: |
| 115 | sources = list(SOURCE_NAMES.keys()) | 125 | sources = list(SOURCE_NAMES.keys()) |
| 116 | 126 | ||
| 117 | - print(f"正在获取 {len(sources)} 个新闻源的最新内容...") | ||
| 118 | - print("=" * 80) | 127 | + logger.info(f"正在获取 {len(sources)} 个新闻源的最新内容...") |
| 128 | + logger.info("=" * 80) | ||
| 119 | 129 | ||
| 120 | results = [] | 130 | results = [] |
| 121 | for source in sources: | 131 | for source in sources: |
| 122 | source_name = SOURCE_NAMES.get(source, source) | 132 | source_name = SOURCE_NAMES.get(source, source) |
| 123 | - print(f"正在获取 {source_name} 的新闻...") | 133 | + logger.info(f"正在获取 {source_name} 的新闻...") |
| 124 | result = await self.fetch_news(source) | 134 | result = await self.fetch_news(source) |
| 125 | results.append(result) | 135 | results.append(result) |
| 126 | 136 | ||
| @@ -128,11 +138,11 @@ class NewsCollector: | @@ -128,11 +138,11 @@ class NewsCollector: | ||
| 128 | data = result["data"] | 138 | data = result["data"] |
| 129 | if 'items' in data and isinstance(data['items'], list): | 139 | if 'items' in data and isinstance(data['items'], list): |
| 130 | count = len(data['items']) | 140 | count = len(data['items']) |
| 131 | - print(f"✓ {source_name}: 获取成功,共 {count} 条新闻") | 141 | + logger.info(f"✓ {source_name}: 获取成功,共 {count} 条新闻") |
| 132 | else: | 142 | else: |
| 133 | - print(f"✓ {source_name}: 获取成功") | 143 | + logger.info(f"✓ {source_name}: 获取成功") |
| 134 | else: | 144 | else: |
| 135 | - print(f"✗ {source_name}: {result.get('error', '获取失败')}") | 145 | + logger.error(f"✗ {source_name}: {result.get('error', '获取失败')}") |
| 136 | 146 | ||
| 137 | # 避免请求过快 | 147 | # 避免请求过快 |
| 138 | await asyncio.sleep(0.5) | 148 | await asyncio.sleep(0.5) |
| @@ -151,18 +161,21 @@ class NewsCollector: | @@ -151,18 +161,21 @@ class NewsCollector: | ||
| 151 | Returns: | 161 | Returns: |
| 152 | 包含收集结果的字典 | 162 | 包含收集结果的字典 |
| 153 | """ | 163 | """ |
| 154 | - print(f"开始收集每日热点新闻...") | ||
| 155 | - print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") | 164 | + collection_summary_message = "" |
| 165 | + collection_summary_message += "\n开始收集每日热点新闻...\n" | ||
| 166 | + collection_summary_message += f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n" | ||
| 156 | 167 | ||
| 157 | # 选择新闻源 | 168 | # 选择新闻源 |
| 158 | if sources is None: | 169 | if sources is None: |
| 159 | # 使用所有支持的新闻源 | 170 | # 使用所有支持的新闻源 |
| 160 | sources = list(SOURCE_NAMES.keys()) | 171 | sources = list(SOURCE_NAMES.keys()) |
| 161 | 172 | ||
| 162 | - print(f"将从 {len(sources)} 个新闻源收集数据:") | 173 | + collection_summary_message += f"将从 {len(sources)} 个新闻源收集数据:\n" |
| 163 | for source in sources: | 174 | for source in sources: |
| 164 | source_name = SOURCE_NAMES.get(source, source) | 175 | source_name = SOURCE_NAMES.get(source, source) |
| 165 | - print(f" - {source_name}") | 176 | + collection_summary_message += f" - {source_name}\n" |
| 177 | + | ||
| 178 | + logger.info(collection_summary_message) | ||
| 166 | 179 | ||
| 167 | try: | 180 | try: |
| 168 | # 获取新闻数据 | 181 | # 获取新闻数据 |
| @@ -185,7 +198,7 @@ class NewsCollector: | @@ -185,7 +198,7 @@ class NewsCollector: | ||
| 185 | return processed_data | 198 | return processed_data |
| 186 | 199 | ||
| 187 | except Exception as e: | 200 | except Exception as e: |
| 188 | - print(f"收集新闻失败: {e}") | 201 | + logger.exception(f"收集新闻失败: {e}") |
| 189 | return { | 202 | return { |
| 190 | 'success': False, | 203 | 'success': False, |
| 191 | 'error': str(e), | 204 | 'error': str(e), |
| @@ -255,35 +268,30 @@ class NewsCollector: | @@ -255,35 +268,30 @@ class NewsCollector: | ||
| 255 | } | 268 | } |
| 256 | 269 | ||
| 257 | except Exception as e: | 270 | except Exception as e: |
| 258 | - print(f"处理新闻项失败: {e}") | 271 | + logger.exception(f"处理新闻项失败: {e}") |
| 259 | return None | 272 | return None |
| 260 | 273 | ||
| 261 | def _print_collection_summary(self, data: Dict): | 274 | def _print_collection_summary(self, data: Dict): |
| 262 | """打印收集摘要""" | 275 | """打印收集摘要""" |
| 263 | - print("\n" + "=" * 50) | ||
| 264 | - print("新闻收集摘要") | ||
| 265 | - print("=" * 50) | ||
| 266 | - | ||
| 267 | - print(f"总新闻源: {data['total_sources']}") | ||
| 268 | - print(f"成功源数: {data['successful_sources']}") | ||
| 269 | - print(f"总新闻数: {data['total_news']}") | ||
| 270 | - | 276 | + collection_summary_message = "" |
| 277 | + collection_summary_message += f"\n总新闻源: {data['total_sources']}\n" | ||
| 278 | + collection_summary_message += f"成功源数: {data['successful_sources']}\n" | ||
| 279 | + collection_summary_message += f"总新闻数: {data['total_news']}\n" | ||
| 271 | if 'saved_count' in data: | 280 | if 'saved_count' in data: |
| 272 | - print(f"已保存数: {data['saved_count']}") | ||
| 273 | - | ||
| 274 | - print("=" * 50) | 281 | + collection_summary_message += f"已保存数: {data['saved_count']}\n" |
| 282 | + logger.info(collection_summary_message) | ||
| 275 | 283 | ||
| 276 | def get_today_news(self) -> List[Dict]: | 284 | def get_today_news(self) -> List[Dict]: |
| 277 | """获取今天的新闻""" | 285 | """获取今天的新闻""" |
| 278 | try: | 286 | try: |
| 279 | return self.db_manager.get_daily_news(date.today()) | 287 | return self.db_manager.get_daily_news(date.today()) |
| 280 | except Exception as e: | 288 | except Exception as e: |
| 281 | - print(f"获取今日新闻失败: {e}") | 289 | + logger.exception(f"获取今日新闻失败: {e}") |
| 282 | return [] | 290 | return [] |
| 283 | 291 | ||
| 284 | async def main(): | 292 | async def main(): |
| 285 | """测试新闻收集器""" | 293 | """测试新闻收集器""" |
| 286 | - print("测试新闻收集器...") | 294 | + logger.info("测试新闻收集器...") |
| 287 | 295 | ||
| 288 | async with NewsCollector() as collector: | 296 | async with NewsCollector() as collector: |
| 289 | # 收集新闻 | 297 | # 收集新闻 |
| @@ -292,9 +300,9 @@ async def main(): | @@ -292,9 +300,9 @@ async def main(): | ||
| 292 | ) | 300 | ) |
| 293 | 301 | ||
| 294 | if result['success']: | 302 | if result['success']: |
| 295 | - print(f"收集成功!共获取 {result['total_news']} 条新闻") | 303 | + logger.info(f"收集成功!共获取 {result['total_news']} 条新闻") |
| 296 | else: | 304 | else: |
| 297 | - print(f"收集失败: {result.get('error', '未知错误')}") | 305 | + logger.error(f"收集失败: {result.get('error', '未知错误')}") |
| 298 | 306 | ||
| 299 | if __name__ == "__main__": | 307 | if __name__ == "__main__": |
| 300 | asyncio.run(main()) | 308 | asyncio.run(main()) |
| @@ -455,19 +455,12 @@ CREATE TABLE tieba_comment | @@ -455,19 +455,12 @@ CREATE TABLE tieba_comment | ||
| 455 | KEY `idx_tieba_comment_publish_time` (`publish_time`) | 455 | KEY `idx_tieba_comment_publish_time` (`publish_time`) |
| 456 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; | 456 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; |
| 457 | 457 | ||
| 458 | --- 增加搜索来源关键字字段 | ||
| 459 | -alter table bilibili_video | ||
| 460 | - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 461 | -alter table douyin_aweme | ||
| 462 | - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 463 | -alter table kuaishou_video | ||
| 464 | - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 465 | -alter table weibo_note | ||
| 466 | - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 467 | -alter table xhs_note | ||
| 468 | - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 469 | -alter table tieba_note | ||
| 470 | - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | 458 | +alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; |
| 459 | +alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 460 | +alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 461 | +alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 462 | +alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 463 | +alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; | ||
| 471 | 464 | ||
| 472 | 465 | ||
| 473 | DROP TABLE IF EXISTS `weibo_creator`; | 466 | DROP TABLE IF EXISTS `weibo_creator`; |
| @@ -218,6 +218,8 @@ playwright install chromium | @@ -218,6 +218,8 @@ playwright install chromium | ||
| 218 | 218 | ||
| 219 | #### 4.1 Configure API Keys | 219 | #### 4.1 Configure API Keys |
| 220 | 220 | ||
| 221 | +Copy the `config.py.example` file and rename the copy to `config.py`. | ||
| 222 | + | ||
| 221 | Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details): | 223 | Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details): |
| 222 | 224 | ||
| 223 | ```python | 225 | ```python |
| @@ -243,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" | @@ -243,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" | ||
| 243 | #### 4.2 Database Initialization | 245 | #### 4.2 Database Initialization |
| 244 | 246 | ||
| 245 | **Option 1: Use Local Database** | 247 | **Option 1: Use Local Database** |
| 248 | + | ||
| 249 | +Use `MindSpider\config.py.example` as the configuration template: copy that file and rename the copy to `config.py`. | ||
| 250 | + | ||
| 246 | ```bash | 251 | ```bash |
| 247 | # Local MySQL database initialization | 252 | # Local MySQL database initialization |
| 248 | cd MindSpider | 253 | cd MindSpider |
| @@ -21,6 +21,9 @@ | @@ -21,6 +21,9 @@ | ||
| 21 | 21 | ||
| 22 | </div> | 22 | </div> |
| 23 | 23 | ||
| 24 | +> [!IMPORTANT] | ||
| 25 | +> 周一(11.3)会上**在线一键部署体验**,欢迎持续关注! | ||
| 26 | + | ||
| 24 | ## ⚡ 项目概述 | 27 | ## ⚡ 项目概述 |
| 25 | 28 | ||
| 26 | “**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。 | 29 | “**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。 |
| @@ -220,6 +223,8 @@ playwright install chromium | @@ -220,6 +223,8 @@ playwright install chromium | ||
| 220 | 223 | ||
| 221 | #### 4.1 配置API密钥 | 224 | #### 4.1 配置API密钥 |
| 222 | 225 | ||
| 226 | +复制一份 `config.py.example` 文件,命名为 `config.py` | ||
| 227 | + | ||
| 223 | 编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内): | 228 | 编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内): |
| 224 | 229 | ||
| 225 | ```python | 230 | ```python |
| @@ -248,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" | @@ -248,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" | ||
| 248 | 253 | ||
| 249 | > MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下 | 254 | > MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下 |
| 250 | 255 | ||
| 256 | +配置模板可以参考 `MindSpider\config.py.example`,复制该文件并重命名为 `config.py` 即可。 | ||
| 257 | + | ||
| 251 | ```bash | 258 | ```bash |
| 252 | # 本地MySQL数据库初始化 | 259 | # 本地MySQL数据库初始化 |
| 253 | cd MindSpider | 260 | cd MindSpider |
-
Please register or login to post a comment