Merge pull request #350 from 666ghj/feature/insight_agent_cluster

feat(insight_agent): search results cluster

Merge pull request #350 from 666ghj/feature/insight_agent_cluster
feat(insight_agent): search results cluster
马一丁 · GitHub
Commit 160d1e52afbbc9e828984d165ec0be5ef8a9409e 160d1e52 2 parents 5d96bd0b 46cf1fd7
Showing 2 changed files with 305 additions and 108 deletions
InsightEngine/agent.py
requirements.txt
--- a/InsightEngine/agent.py
View file @160d1e5
+++ b/InsightEngine/agent.py
View file @160d1e5
@@ -7,22 +7,35 @@ import json
 import os
 import re
 from datetime import datetime
- from typing import Optional, Dict, Any, List, Union
+ from typing import Any, Dict, List, Optional, Union
+ 
+ import numpy as np
 from loguru import logger
+ from sentence_transformers import SentenceTransformer
+ from sklearn.cluster import KMeans
 
 from .llms import LLMClient
 from .nodes import (
-     ReportStructureNode,
     FirstSearchNode,
-     ReflectionNode,
     FirstSummaryNode,
+     ReflectionNode,
     ReflectionSummaryNode,
-     ReportFormattingNode
+     ReportFormattingNode,
+     ReportStructureNode,
 )
 from .state import State
- from .tools import MediaCrawlerDB, DBResponse, keyword_optimizer, multilingual_sentiment_analyzer
- from .utils.config import settings, Settings
+ from .tools import (
+     DBResponse,
+     MediaCrawlerDB,
+     keyword_optimizer,
+     multilingual_sentiment_analyzer,
+ )
 from .utils import format_search_results_for_prompt
+ from .utils.config import Settings, settings
+ 
+ ENABLE_CLUSTERING: bool = True  # 是否启用聚类采样
+ MAX_CLUSTERED_RESULTS: int = 50  # 聚类后最大返回结果数
+ RESULTS_PER_CLUSTER: int = 5  # 每个聚类返回的结果数
 
 
 class DeepSearchAgent:
@@ -40,10 +53,12 @@ class DeepSearchAgent:
         # 初始化LLM客户端
         self.llm_client = self._initialize_llm()
 
-         
         # 初始化搜索工具集
         self.search_agency = MediaCrawlerDB()
 
+         # 初始化聚类小模型（懒加载）
+         self._clustering_model = None
+ 
         # 初始化情感分析器
         self.sentiment_analyzer = multilingual_sentiment_analyzer
 
@@ -77,6 +92,15 @@ class DeepSearchAgent:
         self.reflection_summary_node = ReflectionSummaryNode(self.llm_client)
         self.report_formatting_node = ReportFormattingNode(self.llm_client)
 
+     def _get_clustering_model(self):
+         """懒加载聚类模型"""
+         if self._clustering_model is None:
+             logger.info("  加载聚类模型 (paraphrase-multilingual-MiniLM-L12-v2)...")
+             self._clustering_model = SentenceTransformer(
+                 "paraphrase-multilingual-MiniLM-L12-v2"
+             )
+         return self._clustering_model
+ 
     def _validate_date_format(self, date_str: str) -> bool:
         """
         验证日期格式是否为YYYY-MM-DD
@@ -91,17 +115,78 @@ class DeepSearchAgent:
             return False
 
         # 检查格式
-         pattern = r'^\d{4}-\d{2}-\d{2}$'
+         pattern = r"^\d{4}-\d{2}-\d{2}$"
         if not re.match(pattern, date_str):
             return False
 
         # 检查日期是否有效
         try:
-             datetime.strptime(date_str, '%Y-%m-%d')
+             datetime.strptime(date_str, "%Y-%m-%d")
             return True
         except ValueError:
             return False
 
+     def _cluster_and_sample_results(
+         self,
+         results: List,
+         max_results: int = MAX_CLUSTERED_RESULTS,
+         results_per_cluster: int = RESULTS_PER_CLUSTER,
+     ) -> List:
+         """
+         对搜索结果进行聚类并采样
+ 
+         Args:
+             results: 搜索结果列表
+             max_results: 最大返回结果数
+             results_per_cluster: 每个聚类返回的结果数
+ 
+         Returns:
+             采样后的结果列表
+         """
+         if len(results) <= max_results:
+             return results
+ 
+         try:
+             # 提取文本
+             texts = [r.title_or_content[:500] for r in results]
+ 
+             # 获取模型并编码
+             model = self._get_clustering_model()
+             embeddings = model.encode(texts, show_progress_bar=False)
+ 
+             # 计算聚类数
+             n_clusters = min(max(2, max_results // results_per_cluster), len(results))
+ 
+             # KMeans聚类
+             kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
+             labels = kmeans.fit_predict(embeddings)
+ 
+             # 从每个聚类采样
+             sampled_results = []
+             for cluster_id in range(n_clusters):
+                 cluster_indices = np.flatnonzero(labels == cluster_id)
+                 cluster_results = [(results[i], i) for i in cluster_indices]
+                 cluster_results.sort(
+                     key=lambda x: x[0].hotness_score or 0, reverse=True
+                 )
+ 
+                 for result, _ in cluster_results[:results_per_cluster]:
+                     sampled_results.append(result)
+                     if len(sampled_results) >= max_results:
+                         break
+ 
+                 if len(sampled_results) >= max_results:
+                     break
+ 
+             logger.info(
+                 f"  聚类完成: {len(results)} 条 -> {n_clusters} 个主题 -> {len(sampled_results)} 条代表性结果"
+             )
+             return sampled_results
+ 
+         except Exception as e:
+             logger.warning(f"  聚类失败，返回前{max_results}条: {str(e)}")
+             return results[:max_results]
+ 
     def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> DBResponse:
         """
         执行指定的数据库查询工具（集成关键词优化中间件和情感分析）
@@ -127,7 +212,9 @@ class DeepSearchAgent:
         if tool_name == "search_hot_content":
             time_period = kwargs.get("time_period", "week")
             limit = kwargs.get("limit", 100)
-             response = self.search_agency.search_hot_content(time_period=time_period, limit=limit)
+             response = self.search_agency.search_hot_content(
+                 time_period=time_period, limit=limit
+             )
 
             # 检查是否需要进行情感分析
             enable_sentiment = kwargs.get("enable_sentiment", True)
@@ -151,17 +238,16 @@ class DeepSearchAgent:
                 tool_name="analyze_sentiment",
                 parameters={
                     "texts": texts if isinstance(texts, list) else [texts],
-                     **kwargs
+                     **kwargs,
                 },
                 results=[],  # 情感分析不返回搜索结果
                 results_count=0,
-                 metadata=sentiment_result
+                 metadata=sentiment_result,
             )
 
         # 对于需要搜索词的工具，使用关键词优化中间件
         optimized_response = keyword_optimizer.optimize_keywords(
-             original_query=query,
-             context=f"使用{tool_name}工具进行查询"
+             original_query=query, context=f"使用{tool_name}工具进行查询"
         )
 
         logger.info(f"  🔍 原始查询: '{query}'")
@@ -177,34 +263,62 @@ class DeepSearchAgent:
             try:
                 if tool_name == "search_topic_globally":
                     # 使用配置文件中的默认值，忽略agent提供的limit_per_table参数
-                     limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
-                     response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=limit_per_table)
+                     limit_per_table = (
+                         self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                     )
+                     response = self.search_agency.search_topic_globally(
+                         topic=keyword, limit_per_table=limit_per_table
+                     )
                 elif tool_name == "search_topic_by_date":
                     start_date = kwargs.get("start_date")
                     end_date = kwargs.get("end_date")
                     # 使用配置文件中的默认值，忽略agent提供的limit_per_table参数
-                     limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                     limit_per_table = (
+                         self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                     )
                     if not start_date or not end_date:
-                         raise ValueError("search_topic_by_date工具需要start_date和end_date参数")
-                     response = self.search_agency.search_topic_by_date(topic=keyword, start_date=start_date, end_date=end_date, limit_per_table=limit_per_table)
+                         raise ValueError(
+                             "search_topic_by_date工具需要start_date和end_date参数"
+                         )
+                     response = self.search_agency.search_topic_by_date(
+                         topic=keyword,
+                         start_date=start_date,
+                         end_date=end_date,
+                         limit_per_table=limit_per_table,
+                     )
                 elif tool_name == "get_comments_for_topic":
                     # 使用配置文件中的默认值，按关键词数量分配，但保证最小值
-                     limit = self.config.DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT // len(optimized_response.optimized_keywords)
+                     limit = self.config.DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT // len(
+                         optimized_response.optimized_keywords
+                     )
                     limit = max(limit, 50)
-                     response = self.search_agency.get_comments_for_topic(topic=keyword, limit=limit)
+                     response = self.search_agency.get_comments_for_topic(
+                         topic=keyword, limit=limit
+                     )
                 elif tool_name == "search_topic_on_platform":
                     platform = kwargs.get("platform")
                     start_date = kwargs.get("start_date")
                     end_date = kwargs.get("end_date")
                     # 使用配置文件中的默认值，按关键词数量分配，但保证最小值
-                     limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT // len(optimized_response.optimized_keywords)
+                     limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT // len(
+                         optimized_response.optimized_keywords
+                     )
                     limit = max(limit, 30)
                     if not platform:
                         raise ValueError("search_topic_on_platform工具需要platform参数")
-                     response = self.search_agency.search_topic_on_platform(platform=platform, topic=keyword, start_date=start_date, end_date=end_date, limit=limit)
+                     response = self.search_agency.search_topic_on_platform(
+                         platform=platform,
+                         topic=keyword,
+                         start_date=start_date,
+                         end_date=end_date,
+                         limit=limit,
+                     )
                 else:
                     logger.info(f"    未知的搜索工具: {tool_name}，使用默认全局搜索")
-                     response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE)
+                     response = self.search_agency.search_topic_globally(
+                         topic=keyword,
+                         limit_per_table=self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE,
+                     )
 
                 # 收集结果
                 if response.results:
@@ -222,6 +336,13 @@ class DeepSearchAgent:
         unique_results = self._deduplicate_results(all_results)
         logger.info(f"  总计找到 {total_count} 条结果，去重后 {len(unique_results)} 条")
 
+         if ENABLE_CLUSTERING:
+             unique_results = self._cluster_and_sample_results(
+                 unique_results,
+                 max_results=MAX_CLUSTERED_RESULTS,
+                 results_per_cluster=RESULTS_PER_CLUSTER,
+             )
+ 
         # 构建整合后的响应
         integrated_response = DBResponse(
             tool_name=f"{tool_name}_optimized",
@@ -229,10 +350,10 @@ class DeepSearchAgent:
                 "original_query": query,
                 "optimized_keywords": optimized_response.optimized_keywords,
                 "optimization_reasoning": optimized_response.reasoning,
-                 **kwargs
+                 **kwargs,
             },
             results=unique_results,
-             results_count=len(unique_results)
+             results_count=len(unique_results),
         )
 
         # 检查是否需要进行情感分析
@@ -242,7 +363,9 @@ class DeepSearchAgent:
             sentiment_analysis = self._perform_sentiment_analysis(unique_results)
             if sentiment_analysis:
                 # 将情感分析结果添加到响应的parameters中
-                 integrated_response.parameters["sentiment_analysis"] = sentiment_analysis
+                 integrated_response.parameters["sentiment_analysis"] = (
+                     sentiment_analysis
+                 )
                 logger.info(f"  ✅ 情感分析完成")
 
         return integrated_response
@@ -275,7 +398,10 @@ class DeepSearchAgent:
         """
         try:
             # 初始化情感分析器（如果尚未初始化且未被禁用）
-             if not self.sentiment_analyzer.is_initialized and not self.sentiment_analyzer.is_disabled:
+             if (
+                 not self.sentiment_analyzer.is_initialized
+                 and not self.sentiment_analyzer.is_disabled
+             ):
                 logger.info("    初始化情感分析模型...")
                 if not self.sentiment_analyzer.initialize():
                     logger.info("     情感分析模型初始化失败，将直接透传原始文本")
@@ -290,15 +416,15 @@ class DeepSearchAgent:
                     "platform": result.platform,
                     "author": result.author_nickname,
                     "url": result.url,
-                     "publish_time": str(result.publish_time) if result.publish_time else None
+                     "publish_time": str(result.publish_time)
+                     if result.publish_time
+                     else None,
                 }
                 results_dict.append(result_dict)
 
             # 执行情感分析
             sentiment_analysis = self.sentiment_analyzer.analyze_query_results(
-                 query_results=results_dict,
-                 text_field="content",
-                 min_confidence=0.5
+                 query_results=results_dict, text_field="content", min_confidence=0.5
             )
 
             return sentiment_analysis.get("sentiment_analysis")
@@ -321,7 +447,10 @@ class DeepSearchAgent:
 
         try:
             # 初始化情感分析器（如果尚未初始化且未被禁用）
-             if not self.sentiment_analyzer.is_initialized and not self.sentiment_analyzer.is_disabled:
+             if (
+                 not self.sentiment_analyzer.is_initialized
+                 and not self.sentiment_analyzer.is_disabled
+             ):
                 logger.info("    初始化情感分析模型...")
                 if not self.sentiment_analyzer.initialize():
                     logger.info("     情感分析模型初始化失败，将直接透传原始文本")
@@ -334,28 +463,43 @@ class DeepSearchAgent:
                 result_dict = result.__dict__
                 response = {
                     "success": result.success and result.analysis_performed,
-                     "total_analyzed": 1 if result.analysis_performed and result.success else 0,
-                     "results": [result_dict]
+                     "total_analyzed": 1
+                     if result.analysis_performed and result.success
+                     else 0,
+                     "results": [result_dict],
                 }
                 if not result.analysis_performed:
                     response["success"] = False
-                     response["warning"] = result.error_message or "情感分析功能不可用，已直接返回原始文本"
+                     response["warning"] = (
+                         result.error_message or "情感分析功能不可用，已直接返回原始文本"
+                     )
                 return response
             else:
                 texts_list = list(texts)
-                 batch_result = self.sentiment_analyzer.analyze_batch(texts_list, show_progress=True)
+                 batch_result = self.sentiment_analyzer.analyze_batch(
+                     texts_list, show_progress=True
+                 )
                 response = {
-                     "success": batch_result.analysis_performed and batch_result.success_count > 0,
-                     "total_analyzed": batch_result.total_processed if batch_result.analysis_performed else 0,
+                     "success": batch_result.analysis_performed
+                     and batch_result.success_count > 0,
+                     "total_analyzed": batch_result.total_processed
+                     if batch_result.analysis_performed
+                     else 0,
                     "success_count": batch_result.success_count,
                     "failed_count": batch_result.failed_count,
-                     "average_confidence": batch_result.average_confidence if batch_result.analysis_performed else 0.0,
-                     "results": [result.__dict__ for result in batch_result.results]
+                     "average_confidence": batch_result.average_confidence
+                     if batch_result.analysis_performed
+                     else 0.0,
+                     "results": [result.__dict__ for result in batch_result.results],
                 }
                 if not batch_result.analysis_performed:
                     warning = next(
-                         (r.error_message for r in batch_result.results if r.error_message),
-                         "情感分析功能不可用，已直接返回原始文本"
+                         (
+                             r.error_message
+                             for r in batch_result.results
+                             if r.error_message
+                         ),
+                         "情感分析功能不可用，已直接返回原始文本",
                     )
                     response["success"] = False
                     response["warning"] = warning
@@ -363,11 +507,7 @@ class DeepSearchAgent:
 
         except Exception as e:
             logger.exception(f"    ❌ 情感分析过程中发生错误: {str(e)}")
-             return {
-                 "success": False,
-                 "error": str(e),
-                 "results": []
-             }
+             return {"success": False, "error": str(e), "results": []}
 
     def research(self, query: str, save_report: bool = True) -> str:
         """
@@ -380,9 +520,9 @@ class DeepSearchAgent:
         Returns:
             最终报告内容
         """
-         logger.info(f"\n{'='*60}")
+         logger.info(f"\n{'=' * 60}")
         logger.info(f"开始深度研究: {query}")
-         logger.info(f"{'='*60}")
+         logger.info(f"{'=' * 60}")
 
         try:
             # Step 1: 生成报告结构
@@ -426,7 +566,9 @@ class DeepSearchAgent:
         total_paragraphs = len(self.state.paragraphs)
 
         for i in range(total_paragraphs):
-             logger.info(f"\n[步骤 2.{i+1}] 处理段落: {self.state.paragraphs[i].title}")
+             logger.info(
+                 f"\n[步骤 2.{i + 1}] 处理段落: {self.state.paragraphs[i].title}"
+             )
             logger.info("-" * 50)
 
             # 初始搜索和总结
@@ -446,16 +588,15 @@ class DeepSearchAgent:
         paragraph = self.state.paragraphs[paragraph_index]
 
         # 准备搜索输入
-         search_input = {
-             "title": paragraph.title,
-             "content": paragraph.content
-         }
+         search_input = {"title": paragraph.title, "content": paragraph.content}
 
         # 生成搜索查询和工具选择
         logger.info("  - 生成搜索查询...")
         search_output = self.first_search_node.run(search_input)
         search_query = search_output["search_query"]
-         search_tool = search_output.get("search_tool", "search_topic_globally")  # 默认工具
+         search_tool = search_output.get(
+             "search_tool", "search_topic_globally"
+         )  # 默认工具
         reasoning = search_output["reasoning"]
 
         logger.info(f"  - 搜索查询: {search_query}")
@@ -475,13 +616,17 @@ class DeepSearchAgent:
 
             if start_date and end_date:
                 # 验证日期格式
-                 if self._validate_date_format(start_date) and self._validate_date_format(end_date):
+                 if self._validate_date_format(
+                     start_date
+                 ) and self._validate_date_format(end_date):
                     search_kwargs["start_date"] = start_date
                     search_kwargs["end_date"] = end_date
                     logger.info(f"  - 时间范围: {start_date} 到 {end_date}")
                 else:
                     logger.info(f"    日期格式错误（应为YYYY-MM-DD），改用全局搜索")
-                     logger.info(f"      提供的日期: start_date={start_date}, end_date={end_date}")
+                     logger.info(
+                         f"      提供的日期: start_date={start_date}, end_date={end_date}"
+                     )
                     search_tool = "search_topic_globally"
             elif search_tool == "search_topic_by_date":
                 logger.info(f"    search_topic_by_date工具缺少时间参数，改用全局搜索")
@@ -494,7 +639,9 @@ class DeepSearchAgent:
                 search_kwargs["platform"] = platform
                 logger.info(f"  - 指定平台: {platform}")
             else:
-                 logger.warning(f"    search_topic_on_platform工具缺少平台参数，改用全局搜索")
+                 logger.warning(
+                     f"    search_topic_on_platform工具缺少平台参数，改用全局搜索"
+                 )
                 search_tool = "search_topic_globally"
 
         # 处理限制参数，使用配置文件中的默认值而不是agent提供的参数
@@ -505,9 +652,13 @@ class DeepSearchAgent:
             search_kwargs["limit"] = limit
         elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
             if search_tool == "search_topic_globally":
-                 limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                 limit_per_table = (
+                     self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                 )
             else:  # search_topic_by_date
-                 limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                 limit_per_table = (
+                     self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                 )
             search_kwargs["limit_per_table"] = limit_per_table
         elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
             if search_tool == "get_comments_for_topic":
@@ -516,34 +667,46 @@ class DeepSearchAgent:
                 limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
             search_kwargs["limit"] = limit
 
-         search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
+         search_response = self.execute_search_tool(
+             search_tool, search_query, **search_kwargs
+         )
 
         # 转换为兼容格式
         search_results = []
         if search_response and search_response.results:
             # 使用配置文件控制传递给LLM的结果数量，0表示不限制
             if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0:
-                 max_results = min(len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM)
+                 max_results = min(
+                     len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM
+                 )
             else:
                 max_results = len(search_response.results)  # 不限制，传递所有结果
             for result in search_response.results[:max_results]:
-                 search_results.append({
-                     'title': result.title_or_content,
-                     'url': result.url or "",
-                     'content': result.title_or_content,
-                     'score': result.hotness_score,
-                     'raw_content': result.title_or_content,
-                     'published_date': result.publish_time.isoformat() if result.publish_time else None,
-                     'platform': result.platform,
-                     'content_type': result.content_type,
-                     'author': result.author_nickname,
-                     'engagement': result.engagement
-                 })
+                 search_results.append(
+                     {
+                         "title": result.title_or_content,
+                         "url": result.url or "",
+                         "content": result.title_or_content,
+                         "score": result.hotness_score,
+                         "raw_content": result.title_or_content,
+                         "published_date": result.publish_time.isoformat()
+                         if result.publish_time
+                         else None,
+                         "platform": result.platform,
+                         "content_type": result.content_type,
+                         "author": result.author_nickname,
+                         "engagement": result.engagement,
+                     }
+                 )
 
         if search_results:
             _message = f"  - 找到 {len(search_results)} 个搜索结果"
             for j, result in enumerate(search_results, 1):
-                 date_info = f" (发布于: {result.get('published_date', 'N/A')})" if result.get('published_date') else ""
+                 date_info = (
+                     f" (发布于: {result.get('published_date', 'N/A')})"
+                     if result.get("published_date")
+                     else ""
+                 )
                 _message += f"\n    {j}. {result['title'][:50]}...{date_info}"
             logger.info(_message)
         else:
@@ -560,7 +723,7 @@ class DeepSearchAgent:
             "search_query": search_query,
             "search_results": format_search_results_for_prompt(
                 search_results, self.config.MAX_CONTENT_LENGTH
-             )
+             ),
         }
 
         # 更新状态
@@ -581,13 +744,15 @@ class DeepSearchAgent:
             reflection_input = {
                 "title": paragraph.title,
                 "content": paragraph.content,
-                 "paragraph_latest_state": paragraph.research.latest_summary
+                 "paragraph_latest_state": paragraph.research.latest_summary,
             }
 
             # 生成反思搜索查询
             reflection_output = self.reflection_node.run(reflection_input)
             search_query = reflection_output["search_query"]
-             search_tool = reflection_output.get("search_tool", "search_topic_globally")  # 默认工具
+             search_tool = reflection_output.get(
+                 "search_tool", "search_topic_globally"
+             )  # 默认工具
             reasoning = reflection_output["reasoning"]
 
             logger.info(f"    反思查询: {search_query}")
@@ -605,16 +770,24 @@ class DeepSearchAgent:
 
                 if start_date and end_date:
                     # 验证日期格式
-                     if self._validate_date_format(start_date) and self._validate_date_format(end_date):
+                     if self._validate_date_format(
+                         start_date
+                     ) and self._validate_date_format(end_date):
                         search_kwargs["start_date"] = start_date
                         search_kwargs["end_date"] = end_date
                         logger.info(f"    时间范围: {start_date} 到 {end_date}")
                     else:
-                         logger.info(f"      日期格式错误（应为YYYY-MM-DD），改用全局搜索")
-                         logger.info(f"        提供的日期: start_date={start_date}, end_date={end_date}")
+                         logger.info(
+                             f"      日期格式错误（应为YYYY-MM-DD），改用全局搜索"
+                         )
+                         logger.info(
+                             f"        提供的日期: start_date={start_date}, end_date={end_date}"
+                         )
                         search_tool = "search_topic_globally"
                 elif search_tool == "search_topic_by_date":
-                     logger.warning(f"      search_topic_by_date工具缺少时间参数，改用全局搜索")
+                     logger.warning(
+                         f"      search_topic_by_date工具缺少时间参数，改用全局搜索"
+                     )
                     search_tool = "search_topic_globally"
 
             # 处理需要平台参数的工具
@@ -624,7 +797,9 @@ class DeepSearchAgent:
                     search_kwargs["platform"] = platform
                     logger.info(f"    指定平台: {platform}")
                 else:
-                     logger.warning(f"      search_topic_on_platform工具缺少平台参数，改用全局搜索")
+                     logger.warning(
+                         f"      search_topic_on_platform工具缺少平台参数，改用全局搜索"
+                     )
                     search_tool = "search_topic_globally"
 
             # 处理限制参数
@@ -637,9 +812,13 @@ class DeepSearchAgent:
             elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
                 # 使用配置文件中的默认值，不允许agent控制limit_per_table参数
                 if search_tool == "search_topic_globally":
-                     limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                     limit_per_table = (
+                         self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                     )
                 else:  # search_topic_by_date
-                     limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                     limit_per_table = (
+                         self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                     )
                 search_kwargs["limit_per_table"] = limit_per_table
             elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
                 # 使用配置文件中的默认值，不允许agent控制limit参数
@@ -649,34 +828,47 @@ class DeepSearchAgent:
                     limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
                 search_kwargs["limit"] = limit
 
-             search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
+             search_response = self.execute_search_tool(
+                 search_tool, search_query, **search_kwargs
+             )
 
             # 转换为兼容格式
             search_results = []
             if search_response and search_response.results:
                 # 使用配置文件控制传递给LLM的结果数量，0表示不限制
                 if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0:
-                     max_results = min(len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM)
+                     max_results = min(
+                         len(search_response.results),
+                         self.config.MAX_SEARCH_RESULTS_FOR_LLM,
+                     )
                 else:
                     max_results = len(search_response.results)  # 不限制，传递所有结果
                 for result in search_response.results[:max_results]:
-                     search_results.append({
-                         'title': result.title_or_content,
-                         'url': result.url or "",
-                         'content': result.title_or_content,
-                         'score': result.hotness_score,
-                         'raw_content': result.title_or_content,
-                         'published_date': result.publish_time.isoformat() if result.publish_time else None,
-                         'platform': result.platform,
-                         'content_type': result.content_type,
-                         'author': result.author_nickname,
-                         'engagement': result.engagement
-                     })
+                     search_results.append(
+                         {
+                             "title": result.title_or_content,
+                             "url": result.url or "",
+                             "content": result.title_or_content,
+                             "score": result.hotness_score,
+                             "raw_content": result.title_or_content,
+                             "published_date": result.publish_time.isoformat()
+                             if result.publish_time
+                             else None,
+                             "platform": result.platform,
+                             "content_type": result.content_type,
+                             "author": result.author_nickname,
+                             "engagement": result.engagement,
+                         }
+                     )
 
             if search_results:
                 _message = f"    找到 {len(search_results)} 个反思搜索结果"
                 for j, result in enumerate(search_results, 1):
-                     date_info = f" (发布于: {result.get('published_date', 'N/A')})" if result.get('published_date') else ""
+                     date_info = (
+                         f" (发布于: {result.get('published_date', 'N/A')})"
+                         if result.get("published_date")
+                         else ""
+                     )
                     _message += f"\n      {j}. {result['title'][:50]}...{date_info}"
                 logger.info(_message)
             else:
@@ -693,7 +885,7 @@ class DeepSearchAgent:
                 "search_results": format_search_results_for_prompt(
                     search_results, self.config.MAX_CONTENT_LENGTH
                 ),
-                 "paragraph_latest_state": paragraph.research.latest_summary
+                 "paragraph_latest_state": paragraph.research.latest_summary,
             }
 
             # 更新状态
@@ -710,10 +902,12 @@ class DeepSearchAgent:
         # 准备报告数据
         report_data = []
         for paragraph in self.state.paragraphs:
-             report_data.append({
+             report_data.append(
+                 {
                     "title": paragraph.title,
-                 "paragraph_latest_state": paragraph.research.latest_summary
-             })
+                     "paragraph_latest_state": paragraph.research.latest_summary,
+                 }
+             )
 
         # 格式化报告
         try:
@@ -735,14 +929,16 @@ class DeepSearchAgent:
         """保存报告到文件"""
         # 生成文件名
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-         query_safe = "".join(c for c in self.state.query if c.isalnum() or c in (' ', '-', '_')).rstrip()
-         query_safe = query_safe.replace(' ', '_')[:30]
+         query_safe = "".join(
+             c for c in self.state.query if c.isalnum() or c in (" ", "-", "_")
+         ).rstrip()
+         query_safe = query_safe.replace(" ", "_")[:30]
 
         filename = f"deep_search_report_{query_safe}_{timestamp}.md"
         filepath = os.path.join(self.config.OUTPUT_DIR, filename)
 
         # 保存报告
-         with open(filepath, 'w', encoding='utf-8') as f:
+         with open(filepath, "w", encoding="utf-8") as f:
             f.write(report_content)
 
         logger.info(f"报告已保存到: {filepath}")
--- a/requirements.txt
View file @160d1e5
+++ b/requirements.txt
View file @160d1e5
@@ -61,6 +61,7 @@ weasyprint>=60.0  # PDF导出，支持Python 3.9-3.13
 # ===== 机器学习（可选，用于情感分析，不安装也没事写了容错程序） =====
 torch>=2.0.0 # CPU版本
 transformers>=4.30.0
+ sentence-transformers>=2.2.2
 scikit-learn>=1.3.0
 xgboost>=2.0.0
 # NOTE：如果要安装GPU版本的torch，指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126