Merge pull request #350 from 666ghj/feature/insight_agent_cluster

feat(insight_agent): search results cluster

Merge pull request #350 from 666ghj/feature/insight_agent_cluster
feat(insight_agent): search results cluster
马一丁 · GitHub
Commit 160d1e52afbbc9e828984d165ec0be5ef8a9409e (160d1e52) · 2 parents: 5d96bd0b, 46cf1fd7
Showing 2 changed files with 426 additions and 229 deletions
InsightEngine/agent.py
requirements.txt
--- a/InsightEngine/agent.py
View file @160d1e5
+++ b/InsightEngine/agent.py
View file @160d1e5
@@ -7,60 +7,75 @@ import json
 import os
 import re
 from datetime import datetime
-from typing import Optional, Dict, Any, List, Union
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
 from loguru import logger
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
 from .llms import LLMClient
 from .nodes import (
-    ReportStructureNode,
-    FirstSearchNode, 
-    ReflectionNode,
+    FirstSearchNode,
     FirstSummaryNode,
+    ReflectionNode,
     ReflectionSummaryNode,
-    ReportFormattingNode
+    ReportFormattingNode,
+    ReportStructureNode,
 )
 from .state import State
-from .tools import MediaCrawlerDB, DBResponse, keyword_optimizer, multilingual_sentiment_analyzer
-from .utils.config import settings, Settings
+from .tools import (
+    DBResponse,
+    MediaCrawlerDB,
+    keyword_optimizer,
+    multilingual_sentiment_analyzer,
+)
 from .utils import format_search_results_for_prompt
+from .utils.config import Settings, settings
+
+ENABLE_CLUSTERING: bool = True  # 是否启用聚类采样
+MAX_CLUSTERED_RESULTS: int = 50  # 聚类后最大返回结果数
+RESULTS_PER_CLUSTER: int = 5  # 每个聚类返回的结果数
 class DeepSearchAgent:
     """Deep Search Agent主类"""
-    
+
     def __init__(self, config: Optional[Settings] = None):
         """
         初始化Deep Search Agent
-        
+
         Args:
             config: 可选配置对象（不填则用全局settings）
         """
         self.config = config or settings
-        
+
         # 初始化LLM客户端
         self.llm_client = self._initialize_llm()
-        
-        
+
         # 初始化搜索工具集
         self.search_agency = MediaCrawlerDB()
-        
+
+        # 初始化聚类小模型（懒加载）
+        self._clustering_model = None
+
         # 初始化情感分析器
         self.sentiment_analyzer = multilingual_sentiment_analyzer
-        
+
         # 初始化节点
         self._initialize_nodes()
-        
+
         # 状态
         self.state = State()
-        
+
         # 确保输出目录存在
         os.makedirs(self.config.OUTPUT_DIR, exist_ok=True)
-        
+
         logger.info(f"Insight Agent已初始化")
         logger.info(f"使用LLM: {self.llm_client.get_model_info()}")
         logger.info(f"搜索工具集: MediaCrawlerDB (支持5种本地数据库查询工具)")
         logger.info(f"情感分析: WeiboMultilingualSentiment (支持22种语言的情感分析)")
-    
+
     def _initialize_llm(self) -> LLMClient:
         """初始化LLM客户端"""
         return LLMClient(
@@ -68,7 +83,7 @@ class DeepSearchAgent:
             model_name=self.config.INSIGHT_ENGINE_MODEL_NAME,
             base_url=self.config.INSIGHT_ENGINE_BASE_URL,
         )
-    
+
     def _initialize_nodes(self):
         """初始化处理节点"""
         self.first_search_node = FirstSearchNode(self.llm_client)
@@ -76,36 +91,106 @@ class DeepSearchAgent:
         self.first_summary_node = FirstSummaryNode(self.llm_client)
         self.reflection_summary_node = ReflectionSummaryNode(self.llm_client)
         self.report_formatting_node = ReportFormattingNode(self.llm_client)
-    
+
+    def _get_clustering_model(self):
+        """懒加载聚类模型"""
+        if self._clustering_model is None:
+            logger.info("  加载聚类模型 (paraphrase-multilingual-MiniLM-L12-v2)...")
+            self._clustering_model = SentenceTransformer(
+                "paraphrase-multilingual-MiniLM-L12-v2"
+            )
+        return self._clustering_model
+
     def _validate_date_format(self, date_str: str) -> bool:
         """
         验证日期格式是否为YYYY-MM-DD
-        
+
         Args:
             date_str: 日期字符串
-            
+
         Returns:
             是否为有效格式
         """
         if not date_str:
             return False
-        
+
         # 检查格式
-        pattern = r'^\d{4}-\d{2}-\d{2}$'
+        pattern = r"^\d{4}-\d{2}-\d{2}$"
         if not re.match(pattern, date_str):
             return False
-        
+
         # 检查日期是否有效
         try:
-            datetime.strptime(date_str, '%Y-%m-%d')
+            datetime.strptime(date_str, "%Y-%m-%d")
             return True
         except ValueError:
             return False
-    
+
+    def _cluster_and_sample_results(
+        self,
+        results: List,
+        max_results: int = MAX_CLUSTERED_RESULTS,
+        results_per_cluster: int = RESULTS_PER_CLUSTER,
+    ) -> List:
+        """
+        对搜索结果进行聚类并采样
+
+        Args:
+            results: 搜索结果列表
+            max_results: 最大返回结果数
+            results_per_cluster: 每个聚类返回的结果数
+
+        Returns:
+            采样后的结果列表
+        """
+        if len(results) <= max_results:
+            return results
+
+        try:
+            # 提取文本
+            texts = [r.title_or_content[:500] for r in results]
+
+            # 获取模型并编码
+            model = self._get_clustering_model()
+            embeddings = model.encode(texts, show_progress_bar=False)
+
+            # 计算聚类数
+            n_clusters = min(max(2, max_results // results_per_cluster), len(results))
+
+            # KMeans聚类
+            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
+            labels = kmeans.fit_predict(embeddings)
+
+            # 从每个聚类采样
+            sampled_results = []
+            for cluster_id in range(n_clusters):
+                cluster_indices = np.flatnonzero(labels == cluster_id)
+                cluster_results = [(results[i], i) for i in cluster_indices]
+                cluster_results.sort(
+                    key=lambda x: x[0].hotness_score or 0, reverse=True
+                )
+
+                for result, _ in cluster_results[:results_per_cluster]:
+                    sampled_results.append(result)
+                    if len(sampled_results) >= max_results:
+                        break
+
+                if len(sampled_results) >= max_results:
+                    break
+
+            logger.info(
+                f"  聚类完成: {len(results)} 条 -> {n_clusters} 个主题 -> {len(sampled_results)} 条代表性结果"
+            )
+            return sampled_results
+
+        except Exception as e:
+            logger.warning(f"  聚类失败，返回前{max_results}条: {str(e)}")
+            return results[:max_results]
+
     def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> DBResponse:
         """
         执行指定的数据库查询工具（集成关键词优化中间件和情感分析）
-        
+
         Args:
             tool_name: 工具名称，可选值：
                 - "search_hot_content": 查找热点内容
@@ -117,18 +202,20 @@ class DeepSearchAgent:
             query: 搜索关键词/话题
             **kwargs: 额外参数（如start_date, end_date, platform, limit, enable_sentiment等）
                      enable_sentiment: 是否自动对搜索结果进行情感分析（默认True）
-            
+
         Returns:
             DBResponse对象（可能包含情感分析结果）
         """
         logger.info(f"  → 执行数据库查询工具: {tool_name}")
-        
+
         # 对于热点内容搜索，不需要关键词优化（因为不需要query参数）
         if tool_name == "search_hot_content":
             time_period = kwargs.get("time_period", "week")
             limit = kwargs.get("limit", 100)
-            response = self.search_agency.search_hot_content(time_period=time_period, limit=limit)
-            
+            response = self.search_agency.search_hot_content(
+                time_period=time_period, limit=limit
+            )
+
             # 检查是否需要进行情感分析
             enable_sentiment = kwargs.get("enable_sentiment", True)
             if enable_sentiment and response.results and len(response.results) > 0:
@@ -138,74 +225,101 @@ class DeepSearchAgent:
                     # 将情感分析结果添加到响应的parameters中
                     response.parameters["sentiment_analysis"] = sentiment_analysis
                     logger.info(f"  ✅ 情感分析完成")
-            
+
             return response
-        
+
         # 独立情感分析工具
         if tool_name == "analyze_sentiment":
             texts = kwargs.get("texts", query)  # 可以通过texts参数传递，或使用query
             sentiment_result = self.analyze_sentiment_only(texts)
-            
+
             # 构建DBResponse格式的响应
             return DBResponse(
                 tool_name="analyze_sentiment",
                 parameters={
                     "texts": texts if isinstance(texts, list) else [texts],
-                    **kwargs
+                    **kwargs,
                 },
                 results=[],  # 情感分析不返回搜索结果
                 results_count=0,
-                metadata=sentiment_result
+                metadata=sentiment_result,
             )
-        
+
         # 对于需要搜索词的工具，使用关键词优化中间件
         optimized_response = keyword_optimizer.optimize_keywords(
-            original_query=query,
-            context=f"使用{tool_name}工具进行查询"
+            original_query=query, context=f"使用{tool_name}工具进行查询"
         )
-        
+
         logger.info(f"  🔍 原始查询: '{query}'")
         logger.info(f"  ✨ 优化后关键词: {optimized_response.optimized_keywords}")
-        
+
         # 使用优化后的关键词进行多次查询并整合结果
         all_results = []
         total_count = 0
-        
+
         for keyword in optimized_response.optimized_keywords:
             logger.info(f"    查询关键词: '{keyword}'")
-            
+
             try:
                 if tool_name == "search_topic_globally":
                     # 使用配置文件中的默认值，忽略agent提供的limit_per_table参数
-                    limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
-                    response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=limit_per_table)
+                    limit_per_table = (
+                        self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                    )
+                    response = self.search_agency.search_topic_globally(
+                        topic=keyword, limit_per_table=limit_per_table
+                    )
                 elif tool_name == "search_topic_by_date":
                     start_date = kwargs.get("start_date")
                     end_date = kwargs.get("end_date")
                     # 使用配置文件中的默认值，忽略agent提供的limit_per_table参数
-                    limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                    limit_per_table = (
+                        self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                    )
                     if not start_date or not end_date:
-                        raise ValueError("search_topic_by_date工具需要start_date和end_date参数")
-                    response = self.search_agency.search_topic_by_date(topic=keyword, start_date=start_date, end_date=end_date, limit_per_table=limit_per_table)
+                        raise ValueError(
+                            "search_topic_by_date工具需要start_date和end_date参数"
+                        )
+                    response = self.search_agency.search_topic_by_date(
+                        topic=keyword,
+                        start_date=start_date,
+                        end_date=end_date,
+                        limit_per_table=limit_per_table,
+                    )
                 elif tool_name == "get_comments_for_topic":
                     # 使用配置文件中的默认值，按关键词数量分配，但保证最小值
-                    limit = self.config.DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT // len(optimized_response.optimized_keywords)
+                    limit = self.config.DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT // len(
+                        optimized_response.optimized_keywords
+                    )
                     limit = max(limit, 50)
-                    response = self.search_agency.get_comments_for_topic(topic=keyword, limit=limit)
+                    response = self.search_agency.get_comments_for_topic(
+                        topic=keyword, limit=limit
+                    )
                 elif tool_name == "search_topic_on_platform":
                     platform = kwargs.get("platform")
                     start_date = kwargs.get("start_date")
                     end_date = kwargs.get("end_date")
                     # 使用配置文件中的默认值，按关键词数量分配，但保证最小值
-                    limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT // len(optimized_response.optimized_keywords)
+                    limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT // len(
+                        optimized_response.optimized_keywords
+                    )
                     limit = max(limit, 30)
                     if not platform:
                         raise ValueError("search_topic_on_platform工具需要platform参数")
-                    response = self.search_agency.search_topic_on_platform(platform=platform, topic=keyword, start_date=start_date, end_date=end_date, limit=limit)
+                    response = self.search_agency.search_topic_on_platform(
+                        platform=platform,
+                        topic=keyword,
+                        start_date=start_date,
+                        end_date=end_date,
+                        limit=limit,
+                    )
                 else:
                     logger.info(f"    未知的搜索工具: {tool_name}，使用默认全局搜索")
-                    response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE)
-                
+                    response = self.search_agency.search_topic_globally(
+                        topic=keyword,
+                        limit_per_table=self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE,
+                    )
+
                 # 收集结果
                 if response.results:
                     logger.info(f"     找到 {len(response.results)} 条结果")
@@ -213,15 +327,22 @@ class DeepSearchAgent:
                     total_count += len(response.results)
                 else:
                     logger.info(f"     未找到结果")
-                    
+
             except Exception as e:
                 logger.error(f"      查询'{keyword}'时出错: {str(e)}")
                 continue
-        
+
         # 去重和整合结果
         unique_results = self._deduplicate_results(all_results)
         logger.info(f"  总计找到 {total_count} 条结果，去重后 {len(unique_results)} 条")
-        
+
+        if ENABLE_CLUSTERING:
+            unique_results = self._cluster_and_sample_results(
+                unique_results,
+                max_results=MAX_CLUSTERED_RESULTS,
+                results_per_cluster=RESULTS_PER_CLUSTER,
+            )
+
         # 构建整合后的响应
         integrated_response = DBResponse(
             tool_name=f"{tool_name}_optimized",
@@ -229,12 +350,12 @@ class DeepSearchAgent:
                 "original_query": query,
                 "optimized_keywords": optimized_response.optimized_keywords,
                 "optimization_reasoning": optimized_response.reasoning,
-                **kwargs
+                **kwargs,
             },
             results=unique_results,
-            results_count=len(unique_results)
+            results_count=len(unique_results),
         )
-        
+
         # 检查是否需要进行情感分析
         enable_sentiment = kwargs.get("enable_sentiment", True)
         if enable_sentiment and unique_results and len(unique_results) > 0:
@@ -242,40 +363,45 @@ class DeepSearchAgent:
             sentiment_analysis = self._perform_sentiment_analysis(unique_results)
             if sentiment_analysis:
                 # 将情感分析结果添加到响应的parameters中
-                integrated_response.parameters["sentiment_analysis"] = sentiment_analysis
+                integrated_response.parameters["sentiment_analysis"] = (
+                    sentiment_analysis
+                )
                 logger.info(f"  ✅ 情感分析完成")
-        
+
         return integrated_response
-    
+
     def _deduplicate_results(self, results: List) -> List:
         """
         去重搜索结果
         """
         seen = set()
         unique_results = []
-        
+
         for result in results:
             # 使用URL或内容作为去重标识
             identifier = result.url if result.url else result.title_or_content[:100]
             if identifier not in seen:
                 seen.add(identifier)
                 unique_results.append(result)
-        
+
         return unique_results
-    
+
     def _perform_sentiment_analysis(self, results: List) -> Optional[Dict[str, Any]]:
         """
         对搜索结果执行情感分析
-        
+
         Args:
             results: 搜索结果列表
-            
+
         Returns:
             情感分析结果字典，如果失败则返回None
         """
         try:
             # 初始化情感分析器（如果尚未初始化且未被禁用）
-            if not self.sentiment_analyzer.is_initialized and not self.sentiment_analyzer.is_disabled:
+            if (
+                not self.sentiment_analyzer.is_initialized
+                and not self.sentiment_analyzer.is_disabled
+            ):
                 logger.info("    初始化情感分析模型...")
                 if not self.sentiment_analyzer.initialize():
                     logger.info("     情感分析模型初始化失败，将直接透传原始文本")
@@ -290,203 +416,222 @@ class DeepSearchAgent:
                     "platform": result.platform,
                     "author": result.author_nickname,
                     "url": result.url,
-                    "publish_time": str(result.publish_time) if result.publish_time else None
+                    "publish_time": str(result.publish_time)
+                    if result.publish_time
+                    else None,
                 }
                 results_dict.append(result_dict)
-            
+
             # 执行情感分析
             sentiment_analysis = self.sentiment_analyzer.analyze_query_results(
-                query_results=results_dict,
-                text_field="content",
-                min_confidence=0.5
+                query_results=results_dict, text_field="content", min_confidence=0.5
             )
-            
+
             return sentiment_analysis.get("sentiment_analysis")
-            
+
         except Exception as e:
             logger.exception(f"    ❌ 情感分析过程中发生错误: {str(e)}")
             return None
-    
+
     def analyze_sentiment_only(self, texts: Union[str, List[str]]) -> Dict[str, Any]:
         """
         独立的情感分析工具
-        
+
         Args:
             texts: 单个文本或文本列表
-            
+
         Returns:
             情感分析结果
         """
         logger.info(f"  → 执行独立情感分析")
-        
+
         try:
             # 初始化情感分析器（如果尚未初始化且未被禁用）
-            if not self.sentiment_analyzer.is_initialized and not self.sentiment_analyzer.is_disabled:
+            if (
+                not self.sentiment_analyzer.is_initialized
+                and not self.sentiment_analyzer.is_disabled
+            ):
                 logger.info("    初始化情感分析模型...")
                 if not self.sentiment_analyzer.initialize():
                     logger.info("     情感分析模型初始化失败，将直接透传原始文本")
             elif self.sentiment_analyzer.is_disabled:
                 logger.warning("     情感分析功能已禁用，直接透传原始文本")
-            
+
             # 执行分析
             if isinstance(texts, str):
                 result = self.sentiment_analyzer.analyze_single_text(texts)
                 result_dict = result.__dict__
                 response = {
                     "success": result.success and result.analysis_performed,
-                    "total_analyzed": 1 if result.analysis_performed and result.success else 0,
-                    "results": [result_dict]
+                    "total_analyzed": 1
+                    if result.analysis_performed and result.success
+                    else 0,
+                    "results": [result_dict],
                 }
                 if not result.analysis_performed:
                     response["success"] = False
-                    response["warning"] = result.error_message or "情感分析功能不可用，已直接返回原始文本"
+                    response["warning"] = (
+                        result.error_message or "情感分析功能不可用，已直接返回原始文本"
+                    )
                 return response
             else:
                 texts_list = list(texts)
-                batch_result = self.sentiment_analyzer.analyze_batch(texts_list, show_progress=True)
+                batch_result = self.sentiment_analyzer.analyze_batch(
+                    texts_list, show_progress=True
+                )
                 response = {
-                    "success": batch_result.analysis_performed and batch_result.success_count > 0,
-                    "total_analyzed": batch_result.total_processed if batch_result.analysis_performed else 0,
+                    "success": batch_result.analysis_performed
+                    and batch_result.success_count > 0,
+                    "total_analyzed": batch_result.total_processed
+                    if batch_result.analysis_performed
+                    else 0,
                     "success_count": batch_result.success_count,
                     "failed_count": batch_result.failed_count,
-                    "average_confidence": batch_result.average_confidence if batch_result.analysis_performed else 0.0,
-                    "results": [result.__dict__ for result in batch_result.results]
+                    "average_confidence": batch_result.average_confidence
+                    if batch_result.analysis_performed
+                    else 0.0,
+                    "results": [result.__dict__ for result in batch_result.results],
                 }
                 if not batch_result.analysis_performed:
                     warning = next(
-                        (r.error_message for r in batch_result.results if r.error_message),
-                        "情感分析功能不可用，已直接返回原始文本"
+                        (
+                            r.error_message
+                            for r in batch_result.results
+                            if r.error_message
+                        ),
+                        "情感分析功能不可用，已直接返回原始文本",
                     )
                     response["success"] = False
                     response["warning"] = warning
                 return response
-                
+
         except Exception as e:
             logger.exception(f"    ❌ 情感分析过程中发生错误: {str(e)}")
-            return {
-                "success": False,
-                "error": str(e),
-                "results": []
-            }
-    
+            return {"success": False, "error": str(e), "results": []}
+
     def research(self, query: str, save_report: bool = True) -> str:
         """
         执行深度研究
-        
+
         Args:
             query: 研究查询
             save_report: 是否保存报告到文件
-            
+
         Returns:
             最终报告内容
         """
-        logger.info(f"\n{'='*60}")
+        logger.info(f"\n{'=' * 60}")
         logger.info(f"开始深度研究: {query}")
-        logger.info(f"{'='*60}")
-        
+        logger.info(f"{'=' * 60}")
+
         try:
             # Step 1: 生成报告结构
             self._generate_report_structure(query)
-            
+
             # Step 2: 处理每个段落
             self._process_paragraphs()
-            
+
             # Step 3: 生成最终报告
             final_report = self._generate_final_report()
-            
+
             # Step 4: 保存报告
             if save_report:
                 self._save_report(final_report)
             logger.info("深度研究完成！")
-            
+
             return final_report
-            
+
         except Exception as e:
             logger.exception(f"研究过程中发生错误: {str(e)}")
             raise e
-    
+
     def _generate_report_structure(self, query: str):
         """生成报告结构"""
         logger.info(f"\n[步骤 1] 生成报告结构...")
-        
+
         # 创建报告结构节点
         report_structure_node = ReportStructureNode(self.llm_client, query)
-        
+
         # 生成结构并更新状态
         self.state = report_structure_node.mutate_state(state=self.state)
-        
+
         _message = f"报告结构已生成，共 {len(self.state.paragraphs)} 个段落:"
         for i, paragraph in enumerate(self.state.paragraphs, 1):
             _message += f"\n  {i}. {paragraph.title}"
         logger.info(_message)
-    
+
     def _process_paragraphs(self):
         """处理所有段落"""
         total_paragraphs = len(self.state.paragraphs)
-        
+
         for i in range(total_paragraphs):
-            logger.info(f"\n[步骤 2.{i+1}] 处理段落: {self.state.paragraphs[i].title}")
+            logger.info(
+                f"\n[步骤 2.{i + 1}] 处理段落: {self.state.paragraphs[i].title}"
+            )
             logger.info("-" * 50)
-            
+
             # 初始搜索和总结
             self._initial_search_and_summary(i)
-            
+
             # 反思循环
             self._reflection_loop(i)
-            
+
             # 标记段落完成
             self.state.paragraphs[i].research.mark_completed()
-            
+
             progress = (i + 1) / total_paragraphs * 100
             logger.info(f"段落处理完成 ({progress:.1f}%)")
-    
+
     def _initial_search_and_summary(self, paragraph_index: int):
         """执行初始搜索和总结"""
         paragraph = self.state.paragraphs[paragraph_index]
-        
+
         # 准备搜索输入
-        search_input = {
-            "title": paragraph.title,
-            "content": paragraph.content
-        }
-        
+        search_input = {"title": paragraph.title, "content": paragraph.content}
+
         # 生成搜索查询和工具选择
         logger.info("  - 生成搜索查询...")
         search_output = self.first_search_node.run(search_input)
         search_query = search_output["search_query"]
-        search_tool = search_output.get("search_tool", "search_topic_globally")  # 默认工具
+        search_tool = search_output.get(
+            "search_tool", "search_topic_globally"
+        )  # 默认工具
         reasoning = search_output["reasoning"]
-        
+
         logger.info(f"  - 搜索查询: {search_query}")
         logger.info(f"  - 选择的工具: {search_tool}")
         logger.info(f"  - 推理: {reasoning}")
-        
+
         # 执行搜索
         logger.info("  - 执行数据库查询...")
-        
+
         # 处理特殊参数
         search_kwargs = {}
-        
+
         # 处理需要日期的工具
         if search_tool in ["search_topic_by_date", "search_topic_on_platform"]:
             start_date = search_output.get("start_date")
             end_date = search_output.get("end_date")
-            
+
             if start_date and end_date:
                 # 验证日期格式
-                if self._validate_date_format(start_date) and self._validate_date_format(end_date):
+                if self._validate_date_format(
+                    start_date
+                ) and self._validate_date_format(end_date):
                     search_kwargs["start_date"] = start_date
                     search_kwargs["end_date"] = end_date
                     logger.info(f"  - 时间范围: {start_date} 到 {end_date}")
                 else:
                     logger.info(f"    日期格式错误（应为YYYY-MM-DD），改用全局搜索")
-                    logger.info(f"      提供的日期: start_date={start_date}, end_date={end_date}")
+                    logger.info(
+                        f"      提供的日期: start_date={start_date}, end_date={end_date}"
+                    )
                     search_tool = "search_topic_globally"
             elif search_tool == "search_topic_by_date":
                 logger.info(f"    search_topic_by_date工具缺少时间参数，改用全局搜索")
                 search_tool = "search_topic_globally"
-        
+
         # 处理需要平台参数的工具
         if search_tool == "search_topic_on_platform":
             platform = search_output.get("platform")
@@ -494,9 +639,11 @@ class DeepSearchAgent:
                 search_kwargs["platform"] = platform
                 logger.info(f"  - 指定平台: {platform}")
             else:
-                logger.warning(f"    search_topic_on_platform工具缺少平台参数，改用全局搜索")
+                logger.warning(
+                    f"    search_topic_on_platform工具缺少平台参数，改用全局搜索"
+                )
                 search_tool = "search_topic_globally"
-        
+
         # 处理限制参数，使用配置文件中的默认值而不是agent提供的参数
         if search_tool == "search_hot_content":
             time_period = search_output.get("time_period", "week")
@@ -505,9 +652,13 @@ class DeepSearchAgent:
             search_kwargs["limit"] = limit
         elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
             if search_tool == "search_topic_globally":
-                limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                limit_per_table = (
+                    self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                )
             else:  # search_topic_by_date
-                limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                limit_per_table = (
+                    self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                )
             search_kwargs["limit_per_table"] = limit_per_table
         elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
             if search_tool == "get_comments_for_topic":
@@ -515,43 +666,55 @@ class DeepSearchAgent:
             else:  # search_topic_on_platform
                 limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
             search_kwargs["limit"] = limit
-        
-        search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
-        
+
+        search_response = self.execute_search_tool(
+            search_tool, search_query, **search_kwargs
+        )
+
         # 转换为兼容格式
         search_results = []
         if search_response and search_response.results:
             # 使用配置文件控制传递给LLM的结果数量，0表示不限制
             if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0:
-                max_results = min(len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM)
+                max_results = min(
+                    len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM
+                )
             else:
                 max_results = len(search_response.results)  # 不限制，传递所有结果
             for result in search_response.results[:max_results]:
-                search_results.append({
-                    'title': result.title_or_content,
-                    'url': result.url or "",
-                    'content': result.title_or_content,
-                    'score': result.hotness_score,
-                    'raw_content': result.title_or_content,
-                    'published_date': result.publish_time.isoformat() if result.publish_time else None,
-                    'platform': result.platform,
-                    'content_type': result.content_type,
-                    'author': result.author_nickname,
-                    'engagement': result.engagement
-                })
-        
+                search_results.append(
+                    {
+                        "title": result.title_or_content,
+                        "url": result.url or "",
+                        "content": result.title_or_content,
+                        "score": result.hotness_score,
+                        "raw_content": result.title_or_content,
+                        "published_date": result.publish_time.isoformat()
+                        if result.publish_time
+                        else None,
+                        "platform": result.platform,
+                        "content_type": result.content_type,
+                        "author": result.author_nickname,
+                        "engagement": result.engagement,
+                    }
+                )
+
         if search_results:
             _message = f"  - 找到 {len(search_results)} 个搜索结果"
             for j, result in enumerate(search_results, 1):
-                date_info = f" (发布于: {result.get('published_date', 'N/A')})" if result.get('published_date') else ""
+                date_info = (
+                    f" (发布于: {result.get('published_date', 'N/A')})"
+                    if result.get("published_date")
+                    else ""
+                )
                 _message += f"\n    {j}. {result['title'][:50]}...{date_info}"
             logger.info(_message)
         else:
             logger.info("  - 未找到搜索结果")
-        
+
         # 更新状态中的搜索历史
         paragraph.research.add_search_results(search_query, search_results)
-        
+
         # 生成初始总结
         logger.info("  - 生成初始总结...")
         summary_input = {
@@ -560,63 +723,73 @@ class DeepSearchAgent:
             "search_query": search_query,
             "search_results": format_search_results_for_prompt(
                 search_results, self.config.MAX_CONTENT_LENGTH
-            )
+            ),
         }
-        
+
         # 更新状态
         self.state = self.first_summary_node.mutate_state(
             summary_input, self.state, paragraph_index
         )
-        
+
         logger.info("  - 初始总结完成")
-    
+
     def _reflection_loop(self, paragraph_index: int):
         """执行反思循环"""
         paragraph = self.state.paragraphs[paragraph_index]
-        
+
         for reflection_i in range(self.config.MAX_REFLECTIONS):
             logger.info(f"  - 反思 {reflection_i + 1}/{self.config.MAX_REFLECTIONS}...")
-            
+
             # 准备反思输入
             reflection_input = {
                 "title": paragraph.title,
                 "content": paragraph.content,
-                "paragraph_latest_state": paragraph.research.latest_summary
+                "paragraph_latest_state": paragraph.research.latest_summary,
             }
-            
+
             # 生成反思搜索查询
             reflection_output = self.reflection_node.run(reflection_input)
             search_query = reflection_output["search_query"]
-            search_tool = reflection_output.get("search_tool", "search_topic_globally")  # 默认工具
+            search_tool = reflection_output.get(
+                "search_tool", "search_topic_globally"
+            )  # 默认工具
             reasoning = reflection_output["reasoning"]
-            
+
             logger.info(f"    反思查询: {search_query}")
             logger.info(f"    选择的工具: {search_tool}")
             logger.info(f"    反思推理: {reasoning}")
-            
+
             # 执行反思搜索
             # 处理特殊参数
             search_kwargs = {}
-            
+
             # 处理需要日期的工具
             if search_tool in ["search_topic_by_date", "search_topic_on_platform"]:
                 start_date = reflection_output.get("start_date")
                 end_date = reflection_output.get("end_date")
-                
+
                 if start_date and end_date:
                     # 验证日期格式
-                    if self._validate_date_format(start_date) and self._validate_date_format(end_date):
+                    if self._validate_date_format(
+                        start_date
+                    ) and self._validate_date_format(end_date):
                         search_kwargs["start_date"] = start_date
                         search_kwargs["end_date"] = end_date
                         logger.info(f"    时间范围: {start_date} 到 {end_date}")
                     else:
-                        logger.info(f"      日期格式错误（应为YYYY-MM-DD），改用全局搜索")
-                        logger.info(f"        提供的日期: start_date={start_date}, end_date={end_date}")
+                        logger.info(
+                            f"      日期格式错误（应为YYYY-MM-DD），改用全局搜索"
+                        )
+                        logger.info(
+                            f"        提供的日期: start_date={start_date}, end_date={end_date}"
+                        )
                         search_tool = "search_topic_globally"
                 elif search_tool == "search_topic_by_date":
-                    logger.warning(f"      search_topic_by_date工具缺少时间参数，改用全局搜索")
+                    logger.warning(
+                        f"      search_topic_by_date工具缺少时间参数，改用全局搜索"
+                    )
                     search_tool = "search_topic_globally"
-            
+
             # 处理需要平台参数的工具
             if search_tool == "search_topic_on_platform":
                 platform = reflection_output.get("platform")
@@ -624,9 +797,11 @@ class DeepSearchAgent:
                     search_kwargs["platform"] = platform
                     logger.info(f"    指定平台: {platform}")
                 else:
-                    logger.warning(f"      search_topic_on_platform工具缺少平台参数，改用全局搜索")
+                    logger.warning(
+                        f"      search_topic_on_platform工具缺少平台参数，改用全局搜索"
+                    )
                     search_tool = "search_topic_globally"
-            
+
             # 处理限制参数
             if search_tool == "search_hot_content":
                 time_period = reflection_output.get("time_period", "week")
@@ -637,9 +812,13 @@ class DeepSearchAgent:
             elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
                 # 使用配置文件中的默认值，不允许agent控制limit_per_table参数
                 if search_tool == "search_topic_globally":
-                    limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                    limit_per_table = (
+                        self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
+                    )
                 else:  # search_topic_by_date
-                    limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                    limit_per_table = (
+                        self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
+                    )
                 search_kwargs["limit_per_table"] = limit_per_table
             elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
                 # 使用配置文件中的默认值，不允许agent控制limit参数
@@ -648,43 +827,56 @@ class DeepSearchAgent:
                 else:  # search_topic_on_platform
                     limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
                 search_kwargs["limit"] = limit
-            
-            search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
-            
+
+            search_response = self.execute_search_tool(
+                search_tool, search_query, **search_kwargs
+            )
+
             # 转换为兼容格式
             search_results = []
             if search_response and search_response.results:
                 # 使用配置文件控制传递给LLM的结果数量，0表示不限制
                 if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0:
-                    max_results = min(len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM)
+                    max_results = min(
+                        len(search_response.results),
+                        self.config.MAX_SEARCH_RESULTS_FOR_LLM,
+                    )
                 else:
                     max_results = len(search_response.results)  # 不限制，传递所有结果
                 for result in search_response.results[:max_results]:
-                    search_results.append({
-                        'title': result.title_or_content,
-                        'url': result.url or "",
-                        'content': result.title_or_content,
-                        'score': result.hotness_score,
-                        'raw_content': result.title_or_content,
-                        'published_date': result.publish_time.isoformat() if result.publish_time else None,
-                        'platform': result.platform,
-                        'content_type': result.content_type,
-                        'author': result.author_nickname,
-                        'engagement': result.engagement
-                    })
-            
+                    search_results.append(
+                        {
+                            "title": result.title_or_content,
+                            "url": result.url or "",
+                            "content": result.title_or_content,
+                            "score": result.hotness_score,
+                            "raw_content": result.title_or_content,
+                            "published_date": result.publish_time.isoformat()
+                            if result.publish_time
+                            else None,
+                            "platform": result.platform,
+                            "content_type": result.content_type,
+                            "author": result.author_nickname,
+                            "engagement": result.engagement,
+                        }
+                    )
+
             if search_results:
                 _message = f"    找到 {len(search_results)} 个反思搜索结果"
                 for j, result in enumerate(search_results, 1):
-                    date_info = f" (发布于: {result.get('published_date', 'N/A')})" if result.get('published_date') else ""
+                    date_info = (
+                        f" (发布于: {result.get('published_date', 'N/A')})"
+                        if result.get("published_date")
+                        else ""
+                    )
                     _message += f"\n      {j}. {result['title'][:50]}...{date_info}"
                 logger.info(_message)
             else:
                 logger.info("    未找到反思搜索结果")
-            
+
             # 更新搜索历史
             paragraph.research.add_search_results(search_query, search_results)
-            
+
             # 生成反思总结
             reflection_summary_input = {
                 "title": paragraph.title,
@@ -693,28 +885,30 @@ class DeepSearchAgent:
                 "search_results": format_search_results_for_prompt(
                     search_results, self.config.MAX_CONTENT_LENGTH
                 ),
-                "paragraph_latest_state": paragraph.research.latest_summary
+                "paragraph_latest_state": paragraph.research.latest_summary,
             }
-            
+
             # 更新状态
             self.state = self.reflection_summary_node.mutate_state(
                 reflection_summary_input, self.state, paragraph_index
             )
-            
+
             logger.info(f"    反思 {reflection_i + 1} 完成")
-    
+
     def _generate_final_report(self) -> str:
         """生成最终报告"""
         logger.info(f"\n[步骤 3] 生成最终报告...")
-        
+
         # 准备报告数据
         report_data = []
         for paragraph in self.state.paragraphs:
-            report_data.append({
-                "title": paragraph.title,
-                "paragraph_latest_state": paragraph.research.latest_summary
-            })
-        
+            report_data.append(
+                {
+                    "title": paragraph.title,
+                    "paragraph_latest_state": paragraph.research.latest_summary,
+                }
+            )
+
         # 格式化报告
         try:
             final_report = self.report_formatting_node.run(report_data)
@@ -723,46 +917,48 @@ class DeepSearchAgent:
             final_report = self.report_formatting_node.format_report_manually(
                 report_data, self.state.report_title
             )
-        
+
         # 更新状态
         self.state.final_report = final_report
         self.state.mark_completed()
-        
+
         logger.info("最终报告生成完成")
         return final_report
-    
+
     def _save_report(self, report_content: str):
         """保存报告到文件"""
         # 生成文件名
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        query_safe = "".join(c for c in self.state.query if c.isalnum() or c in (' ', '-', '_')).rstrip()
-        query_safe = query_safe.replace(' ', '_')[:30]
-        
+        query_safe = "".join(
+            c for c in self.state.query if c.isalnum() or c in (" ", "-", "_")
+        ).rstrip()
+        query_safe = query_safe.replace(" ", "_")[:30]
+
         filename = f"deep_search_report_{query_safe}_{timestamp}.md"
         filepath = os.path.join(self.config.OUTPUT_DIR, filename)
-        
+
         # 保存报告
-        with open(filepath, 'w', encoding='utf-8') as f:
+        with open(filepath, "w", encoding="utf-8") as f:
             f.write(report_content)
-        
+
         logger.info(f"报告已保存到: {filepath}")
-        
+
         # 保存状态（如果配置允许）
         if self.config.SAVE_INTERMEDIATE_STATES:
             state_filename = f"state_{query_safe}_{timestamp}.json"
             state_filepath = os.path.join(self.config.OUTPUT_DIR, state_filename)
             self.state.save_to_file(state_filepath)
             logger.info(f"状态已保存到: {state_filepath}")
-    
+
     def get_progress_summary(self) -> Dict[str, Any]:
         """获取进度摘要"""
         return self.state.get_progress_summary()
-    
+
     def load_state(self, filepath: str):
         """从文件加载状态"""
         self.state = State.load_from_file(filepath)
         logger.info(f"状态已从 {filepath} 加载")
-    
+
     def save_state(self, filepath: str):
         """保存状态到文件"""
         self.state.save_to_file(filepath)
@@ -772,12 +968,12 @@ class DeepSearchAgent:
 def create_agent(config_file: Optional[str] = None) -> DeepSearchAgent:
     """
     创建Deep Search Agent实例的便捷函数
-    
+
     Args:
         config_file: 配置文件路径
-        
+
     Returns:
         DeepSearchAgent实例
     """
-    config = Settings() # 以空配置初始化，而从环境变量初始化
+    config = Settings()  # 以空配置初始化，而从环境变量初始化
     return DeepSearchAgent(config)
--- a/requirements.txt
View file @160d1e5
+++ b/requirements.txt
View file @160d1e5
@@ -61,6 +61,7 @@ weasyprint>=60.0  # PDF导出，支持Python 3.9-3.13
 # ===== 机器学习（可选，用于情感分析，不安装也没事写了容错程序） =====
 torch>=2.0.0 # CPU版本
 transformers>=4.30.0
+sentence-transformers>=2.2.2
 scikit-learn>=1.3.0
 xgboost>=2.0.0
 # NOTE：如果要安装GPU版本的torch，指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126