666ghj
@@ -295,6 +295,8 @@ secrets.json @@ -295,6 +295,8 @@ secrets.json
295 *.key 295 *.key
296 *.pem 296 *.pem
297 *.crt 297 *.crt
  298 +config.py
  299 +MindSpider/config.py
298 300
299 # API 密钥 301 # API 密钥
300 api_keys.txt 302 api_keys.txt
@@ -12,6 +12,7 @@ import json @@ -12,6 +12,7 @@ import json
12 from datetime import datetime, date 12 from datetime import datetime, date
13 from pathlib import Path 13 from pathlib import Path
14 from typing import List, Dict, Optional 14 from typing import List, Dict, Optional
  15 +from loguru import logger
15 16
16 # 添加项目根目录到路径 17 # 添加项目根目录到路径
17 project_root = Path(__file__).parent.parent 18 project_root = Path(__file__).parent.parent
@@ -38,8 +39,7 @@ SOURCE_NAMES = { @@ -38,8 +39,7 @@ SOURCE_NAMES = {
38 "wallstreetcn": "华尔街见闻", 39 "wallstreetcn": "华尔街见闻",
39 "thepaper": "澎湃新闻", 40 "thepaper": "澎湃新闻",
40 "cls-hot": "财联社", 41 "cls-hot": "财联社",
41 - "xueqiu": "雪球热榜",  
42 - "kuaishou": "快手热榜" 42 + "xueqiu": "雪球热榜"
43 } 43 }
44 44
45 class NewsCollector: 45 class NewsCollector:
@@ -72,15 +72,25 @@ class NewsCollector: @@ -72,15 +72,25 @@ class NewsCollector:
72 async def fetch_news(self, source: str) -> dict: 72 async def fetch_news(self, source: str) -> dict:
73 """从指定源获取最新新闻""" 73 """从指定源获取最新新闻"""
74 url = f"{BASE_URL}/api/s?id={source}&latest" 74 url = f"{BASE_URL}/api/s?id={source}&latest"
75 - headers = {"Accept": "application/json"} 75 + headers = {
  76 + "Accept": "application/json, text/plain, */*",
  77 + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  78 + "User-Agent": (
  79 + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
  80 + "AppleWebKit/537.36 (KHTML, like Gecko) "
  81 + "Chrome/124.0.0.0 Safari/537.36"
  82 + ),
  83 + "Referer": BASE_URL,
  84 + "Connection": "keep-alive",
  85 + }
76 86
77 try: 87 try:
78 - async with httpx.AsyncClient(timeout=30.0) as client: 88 + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
79 response = await client.get(url, headers=headers) 89 response = await client.get(url, headers=headers)
80 response.raise_for_status() 90 response.raise_for_status()
81 91
82 # 解析JSON响应 92 # 解析JSON响应
83 - data = json.loads(response.text) 93 + data = response.json()
84 return { 94 return {
85 "source": source, 95 "source": source,
86 "status": "success", 96 "status": "success",
@@ -91,21 +101,21 @@ class NewsCollector: @@ -91,21 +101,21 @@ class NewsCollector:
91 return { 101 return {
92 "source": source, 102 "source": source,
93 "status": "timeout", 103 "status": "timeout",
94 - "error": "请求超时", 104 + "error": f"请求超时: {source}({url})",
95 "timestamp": datetime.now().isoformat() 105 "timestamp": datetime.now().isoformat()
96 } 106 }
97 except httpx.HTTPStatusError as e: 107 except httpx.HTTPStatusError as e:
98 return { 108 return {
99 "source": source, 109 "source": source,
100 "status": "http_error", 110 "status": "http_error",
101 - "error": f"HTTP错误: {e.response.status_code}", 111 + "error": f"HTTP错误: {source}({url}) - {e.response.status_code}",
102 "timestamp": datetime.now().isoformat() 112 "timestamp": datetime.now().isoformat()
103 } 113 }
104 except Exception as e: 114 except Exception as e:
105 return { 115 return {
106 "source": source, 116 "source": source,
107 "status": "error", 117 "status": "error",
108 - "error": f"未知错误: {str(e)}", 118 + "error": f"未知错误: {source}({url}) - {str(e)}",
109 "timestamp": datetime.now().isoformat() 119 "timestamp": datetime.now().isoformat()
110 } 120 }
111 121
@@ -114,13 +124,13 @@ class NewsCollector: @@ -114,13 +124,13 @@ class NewsCollector:
114 if sources is None: 124 if sources is None:
115 sources = list(SOURCE_NAMES.keys()) 125 sources = list(SOURCE_NAMES.keys())
116 126
117 - print(f"正在获取 {len(sources)} 个新闻源的最新内容...")  
118 - print("=" * 80) 127 + logger.info(f"正在获取 {len(sources)} 个新闻源的最新内容...")
  128 + logger.info("=" * 80)
119 129
120 results = [] 130 results = []
121 for source in sources: 131 for source in sources:
122 source_name = SOURCE_NAMES.get(source, source) 132 source_name = SOURCE_NAMES.get(source, source)
123 - print(f"正在获取 {source_name} 的新闻...") 133 + logger.info(f"正在获取 {source_name} 的新闻...")
124 result = await self.fetch_news(source) 134 result = await self.fetch_news(source)
125 results.append(result) 135 results.append(result)
126 136
@@ -128,11 +138,11 @@ class NewsCollector: @@ -128,11 +138,11 @@ class NewsCollector:
128 data = result["data"] 138 data = result["data"]
129 if 'items' in data and isinstance(data['items'], list): 139 if 'items' in data and isinstance(data['items'], list):
130 count = len(data['items']) 140 count = len(data['items'])
131 - print(f"✓ {source_name}: 获取成功,共 {count} 条新闻") 141 + logger.info(f"✓ {source_name}: 获取成功,共 {count} 条新闻")
132 else: 142 else:
133 - print(f"✓ {source_name}: 获取成功") 143 + logger.info(f"✓ {source_name}: 获取成功")
134 else: 144 else:
135 - print(f"✗ {source_name}: {result.get('error', '获取失败')}") 145 + logger.error(f"✗ {source_name}: {result.get('error', '获取失败')}")
136 146
137 # 避免请求过快 147 # 避免请求过快
138 await asyncio.sleep(0.5) 148 await asyncio.sleep(0.5)
@@ -151,18 +161,21 @@ class NewsCollector: @@ -151,18 +161,21 @@ class NewsCollector:
151 Returns: 161 Returns:
152 包含收集结果的字典 162 包含收集结果的字典
153 """ 163 """
154 - print(f"开始收集每日热点新闻...")  
155 - print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") 164 + collection_summary_message = ""
  165 + collection_summary_message += "\n开始收集每日热点新闻...\n"
  166 + collection_summary_message += f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
156 167
157 # 选择新闻源 168 # 选择新闻源
158 if sources is None: 169 if sources is None:
159 # 使用所有支持的新闻源 170 # 使用所有支持的新闻源
160 sources = list(SOURCE_NAMES.keys()) 171 sources = list(SOURCE_NAMES.keys())
161 172
162 - print(f"将从 {len(sources)} 个新闻源收集数据:") 173 + collection_summary_message += f"将从 {len(sources)} 个新闻源收集数据:\n"
163 for source in sources: 174 for source in sources:
164 source_name = SOURCE_NAMES.get(source, source) 175 source_name = SOURCE_NAMES.get(source, source)
165 - print(f" - {source_name}") 176 + collection_summary_message += f" - {source_name}\n"
  177 +
  178 + logger.info(collection_summary_message)
166 179
167 try: 180 try:
168 # 获取新闻数据 181 # 获取新闻数据
@@ -185,7 +198,7 @@ class NewsCollector: @@ -185,7 +198,7 @@ class NewsCollector:
185 return processed_data 198 return processed_data
186 199
187 except Exception as e: 200 except Exception as e:
188 - print(f"收集新闻失败: {e}") 201 + logger.exception(f"收集新闻失败: {e}")
189 return { 202 return {
190 'success': False, 203 'success': False,
191 'error': str(e), 204 'error': str(e),
@@ -255,35 +268,30 @@ class NewsCollector: @@ -255,35 +268,30 @@ class NewsCollector:
255 } 268 }
256 269
257 except Exception as e: 270 except Exception as e:
258 - print(f"处理新闻项失败: {e}") 271 + logger.exception(f"处理新闻项失败: {e}")
259 return None 272 return None
260 273
261 def _print_collection_summary(self, data: Dict): 274 def _print_collection_summary(self, data: Dict):
262 """打印收集摘要""" 275 """打印收集摘要"""
263 - print("\n" + "=" * 50)  
264 - print("新闻收集摘要")  
265 - print("=" * 50)  
266 -  
267 - print(f"总新闻源: {data['total_sources']}")  
268 - print(f"成功源数: {data['successful_sources']}")  
269 - print(f"总新闻数: {data['total_news']}")  
270 - 276 + collection_summary_message = ""
  277 + collection_summary_message += f"\n总新闻源: {data['total_sources']}\n"
  278 + collection_summary_message += f"成功源数: {data['successful_sources']}\n"
  279 + collection_summary_message += f"总新闻数: {data['total_news']}\n"
271 if 'saved_count' in data: 280 if 'saved_count' in data:
272 - print(f"已保存数: {data['saved_count']}")  
273 -  
274 - print("=" * 50) 281 + collection_summary_message += f"已保存数: {data['saved_count']}\n"
  282 + logger.info(collection_summary_message)
275 283
276 def get_today_news(self) -> List[Dict]: 284 def get_today_news(self) -> List[Dict]:
277 """获取今天的新闻""" 285 """获取今天的新闻"""
278 try: 286 try:
279 return self.db_manager.get_daily_news(date.today()) 287 return self.db_manager.get_daily_news(date.today())
280 except Exception as e: 288 except Exception as e:
281 - print(f"获取今日新闻失败: {e}") 289 + logger.exception(f"获取今日新闻失败: {e}")
282 return [] 290 return []
283 291
284 async def main(): 292 async def main():
285 """测试新闻收集器""" 293 """测试新闻收集器"""
286 - print("测试新闻收集器...") 294 + logger.info("测试新闻收集器...")
287 295
288 async with NewsCollector() as collector: 296 async with NewsCollector() as collector:
289 # 收集新闻 297 # 收集新闻
@@ -292,9 +300,9 @@ async def main(): @@ -292,9 +300,9 @@ async def main():
292 ) 300 )
293 301
294 if result['success']: 302 if result['success']:
295 - print(f"收集成功!共获取 {result['total_news']} 条新闻") 303 + logger.info(f"收集成功!共获取 {result['total_news']} 条新闻")
296 else: 304 else:
297 - print(f"收集失败: {result.get('error', '未知错误')}") 305 + logger.error(f"收集失败: {result.get('error', '未知错误')}")
298 306
299 if __name__ == "__main__": 307 if __name__ == "__main__":
300 asyncio.run(main()) 308 asyncio.run(main())
@@ -455,19 +455,12 @@ CREATE TABLE tieba_comment @@ -455,19 +455,12 @@ CREATE TABLE tieba_comment
455 KEY `idx_tieba_comment_publish_time` (`publish_time`) 455 KEY `idx_tieba_comment_publish_time` (`publish_time`)
456 ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; 456 ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
457 457
458 --- 增加搜索来源关键字字段  
459 -alter table bilibili_video  
460 - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';  
461 -alter table douyin_aweme  
462 - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';  
463 -alter table kuaishou_video  
464 - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';  
465 -alter table weibo_note  
466 - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';  
467 -alter table xhs_note  
468 - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';  
469 -alter table tieba_note  
470 - add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; 458 +alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
  459 +alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
  460 +alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
  461 +alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
  462 +alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
  463 +alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
471 464
472 465
473 DROP TABLE IF EXISTS `weibo_creator`; 466 DROP TABLE IF EXISTS `weibo_creator`;
@@ -218,6 +218,8 @@ playwright install chromium @@ -218,6 +218,8 @@ playwright install chromium
218 218
219 #### 4.1 Configure API Keys 219 #### 4.1 Configure API Keys
220 220
  221 +Copy the `config.py.example` file to `config.py`
  222 +
221 Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details): 223 Edit the `config.py` file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details):
222 224
223 ```python 225 ```python
@@ -243,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" @@ -243,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
243 #### 4.2 Database Initialization 245 #### 4.2 Database Initialization
244 246
245 **Option 1: Use Local Database** 247 **Option 1: Use Local Database**
  248 +
  249 +You can refer to `MindSpider\config.py.example` for the configuration template, and you can copy this file and rename it to `config.py`.
  250 +
246 ```bash 251 ```bash
247 # Local MySQL database initialization 252 # Local MySQL database initialization
248 cd MindSpider 253 cd MindSpider
@@ -21,6 +21,9 @@ @@ -21,6 +21,9 @@
21 21
22 </div> 22 </div>
23 23
  24 +> [!IMPORTANT]
  25 +> 周一(11.3)会上**在线一键部署体验**,欢迎持续关注!
  26 +
24 ## ⚡ 项目概述 27 ## ⚡ 项目概述
25 28
26 **“微舆”** 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。 29 **“微舆”** 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。
@@ -220,6 +223,8 @@ playwright install chromium @@ -220,6 +223,8 @@ playwright install chromium
220 223
221 #### 4.1 配置API密钥 224 #### 4.1 配置API密钥
222 225
  226 +复制一份 `config.py.example` 文件,命名为 `config.py`
  227 +
223 编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内): 228 编辑 `config.py` 文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内):
224 229
225 ```python 230 ```python
@@ -248,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview" @@ -248,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
248 253
249 > MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下 254 > MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下
250 255
  256 +配置模板可以参考`MindSpider\config.py.example`,可以复制该文件并命名为`config.py`
  257 +
251 ```bash 258 ```bash
252 # 本地MySQL数据库初始化 259 # 本地MySQL数据库初始化
253 cd MindSpider 260 cd MindSpider
@@ -72,4 +72,5 @@ flake8>=6.0.0 @@ -72,4 +72,5 @@ flake8>=6.0.0
72 72
73 # ===== Web服务器 ===== 73 # ===== Web服务器 =====
74 fastapi==0.110.2 74 fastapi==0.110.2
75 -uvicorn==0.29.0  
  75 +uvicorn==0.29.0
  76 +loguru