马一丁
Committed by GitHub

Merge pull request #350 from 666ghj/feature/insight_agent_cluster

feat(insight_agent): search results cluster
@@ -7,22 +7,35 @@ import json
7 import os 7 import os
8 import re 8 import re
9 from datetime import datetime 9 from datetime import datetime
10 -from typing import Optional, Dict, Any, List, Union 10 +from typing import Any, Dict, List, Optional, Union
  11 +
  12 +import numpy as np
11 from loguru import logger 13 from loguru import logger
  14 +from sentence_transformers import SentenceTransformer
  15 +from sklearn.cluster import KMeans
12 16
13 from .llms import LLMClient 17 from .llms import LLMClient
14 from .nodes import ( 18 from .nodes import (
15 - ReportStructureNode,  
16 FirstSearchNode, 19 FirstSearchNode,
17 - ReflectionNode,  
18 FirstSummaryNode, 20 FirstSummaryNode,
  21 + ReflectionNode,
19 ReflectionSummaryNode, 22 ReflectionSummaryNode,
20 - ReportFormattingNode 23 + ReportFormattingNode,
  24 + ReportStructureNode,
21 ) 25 )
22 from .state import State 26 from .state import State
23 -from .tools import MediaCrawlerDB, DBResponse, keyword_optimizer, multilingual_sentiment_analyzer  
24 -from .utils.config import settings, Settings 27 +from .tools import (
  28 + DBResponse,
  29 + MediaCrawlerDB,
  30 + keyword_optimizer,
  31 + multilingual_sentiment_analyzer,
  32 +)
25 from .utils import format_search_results_for_prompt 33 from .utils import format_search_results_for_prompt
  34 +from .utils.config import Settings, settings
  35 +
  36 +ENABLE_CLUSTERING: bool = True # 是否启用聚类采样
  37 +MAX_CLUSTERED_RESULTS: int = 50 # 聚类后最大返回结果数
  38 +RESULTS_PER_CLUSTER: int = 5 # 每个聚类返回的结果数
26 39
27 40
28 class DeepSearchAgent: 41 class DeepSearchAgent:
@@ -40,10 +53,12 @@ class DeepSearchAgent:
40 # 初始化LLM客户端 53 # 初始化LLM客户端
41 self.llm_client = self._initialize_llm() 54 self.llm_client = self._initialize_llm()
42 55
43 -  
44 # 初始化搜索工具集 56 # 初始化搜索工具集
45 self.search_agency = MediaCrawlerDB() 57 self.search_agency = MediaCrawlerDB()
46 58
  59 + # 初始化聚类小模型(懒加载)
  60 + self._clustering_model = None
  61 +
47 # 初始化情感分析器 62 # 初始化情感分析器
48 self.sentiment_analyzer = multilingual_sentiment_analyzer 63 self.sentiment_analyzer = multilingual_sentiment_analyzer
49 64
@@ -77,6 +92,15 @@ class DeepSearchAgent:
77 self.reflection_summary_node = ReflectionSummaryNode(self.llm_client) 92 self.reflection_summary_node = ReflectionSummaryNode(self.llm_client)
78 self.report_formatting_node = ReportFormattingNode(self.llm_client) 93 self.report_formatting_node = ReportFormattingNode(self.llm_client)
79 94
  95 + def _get_clustering_model(self):
  96 + """懒加载聚类模型"""
  97 + if self._clustering_model is None:
  98 + logger.info(" 加载聚类模型 (paraphrase-multilingual-MiniLM-L12-v2)...")
  99 + self._clustering_model = SentenceTransformer(
  100 + "paraphrase-multilingual-MiniLM-L12-v2"
  101 + )
  102 + return self._clustering_model
  103 +
80 def _validate_date_format(self, date_str: str) -> bool: 104 def _validate_date_format(self, date_str: str) -> bool:
81 """ 105 """
82 验证日期格式是否为YYYY-MM-DD 106 验证日期格式是否为YYYY-MM-DD
@@ -91,17 +115,78 @@ class DeepSearchAgent:
91 return False 115 return False
92 116
93 # 检查格式 117 # 检查格式
94 - pattern = r'^\d{4}-\d{2}-\d{2}$' 118 + pattern = r"^\d{4}-\d{2}-\d{2}$"
95 if not re.match(pattern, date_str): 119 if not re.match(pattern, date_str):
96 return False 120 return False
97 121
98 # 检查日期是否有效 122 # 检查日期是否有效
99 try: 123 try:
100 - datetime.strptime(date_str, '%Y-%m-%d') 124 + datetime.strptime(date_str, "%Y-%m-%d")
101 return True 125 return True
102 except ValueError: 126 except ValueError:
103 return False 127 return False
104 128
  129 + def _cluster_and_sample_results(
  130 + self,
  131 + results: List,
  132 + max_results: int = MAX_CLUSTERED_RESULTS,
  133 + results_per_cluster: int = RESULTS_PER_CLUSTER,
  134 + ) -> List:
  135 + """
  136 + 对搜索结果进行聚类并采样
  137 +
  138 + Args:
  139 + results: 搜索结果列表
  140 + max_results: 最大返回结果数
  141 + results_per_cluster: 每个聚类返回的结果数
  142 +
  143 + Returns:
  144 + 采样后的结果列表
  145 + """
  146 + if len(results) <= max_results:
  147 + return results
  148 +
  149 + try:
  150 + # 提取文本
  151 + texts = [r.title_or_content[:500] for r in results]
  152 +
  153 + # 获取模型并编码
  154 + model = self._get_clustering_model()
  155 + embeddings = model.encode(texts, show_progress_bar=False)
  156 +
  157 + # 计算聚类数
  158 + n_clusters = min(max(2, max_results // results_per_cluster), len(results))
  159 +
  160 + # KMeans聚类
  161 + kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
  162 + labels = kmeans.fit_predict(embeddings)
  163 +
  164 + # 从每个聚类采样
  165 + sampled_results = []
  166 + for cluster_id in range(n_clusters):
  167 + cluster_indices = np.flatnonzero(labels == cluster_id)
  168 + cluster_results = [(results[i], i) for i in cluster_indices]
  169 + cluster_results.sort(
  170 + key=lambda x: x[0].hotness_score or 0, reverse=True
  171 + )
  172 +
  173 + for result, _ in cluster_results[:results_per_cluster]:
  174 + sampled_results.append(result)
  175 + if len(sampled_results) >= max_results:
  176 + break
  177 +
  178 + if len(sampled_results) >= max_results:
  179 + break
  180 +
  181 + logger.info(
  182 + f" 聚类完成: {len(results)} 条 -> {n_clusters} 个主题 -> {len(sampled_results)} 条代表性结果"
  183 + )
  184 + return sampled_results
  185 +
  186 + except Exception as e:
  187 + logger.warning(f" 聚类失败,返回前{max_results}条: {str(e)}")
  188 + return results[:max_results]
  189 +
105 def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> DBResponse: 190 def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> DBResponse:
106 """ 191 """
107 执行指定的数据库查询工具(集成关键词优化中间件和情感分析) 192 执行指定的数据库查询工具(集成关键词优化中间件和情感分析)
@@ -127,7 +212,9 @@ class DeepSearchAgent:
127 if tool_name == "search_hot_content": 212 if tool_name == "search_hot_content":
128 time_period = kwargs.get("time_period", "week") 213 time_period = kwargs.get("time_period", "week")
129 limit = kwargs.get("limit", 100) 214 limit = kwargs.get("limit", 100)
130 - response = self.search_agency.search_hot_content(time_period=time_period, limit=limit) 215 + response = self.search_agency.search_hot_content(
  216 + time_period=time_period, limit=limit
  217 + )
131 218
132 # 检查是否需要进行情感分析 219 # 检查是否需要进行情感分析
133 enable_sentiment = kwargs.get("enable_sentiment", True) 220 enable_sentiment = kwargs.get("enable_sentiment", True)
@@ -151,17 +238,16 @@ class DeepSearchAgent:
151 tool_name="analyze_sentiment", 238 tool_name="analyze_sentiment",
152 parameters={ 239 parameters={
153 "texts": texts if isinstance(texts, list) else [texts], 240 "texts": texts if isinstance(texts, list) else [texts],
154 - **kwargs 241 + **kwargs,
155 }, 242 },
156 results=[], # 情感分析不返回搜索结果 243 results=[], # 情感分析不返回搜索结果
157 results_count=0, 244 results_count=0,
158 - metadata=sentiment_result 245 + metadata=sentiment_result,
159 ) 246 )
160 247
161 # 对于需要搜索词的工具,使用关键词优化中间件 248 # 对于需要搜索词的工具,使用关键词优化中间件
162 optimized_response = keyword_optimizer.optimize_keywords( 249 optimized_response = keyword_optimizer.optimize_keywords(
163 - original_query=query,  
164 - context=f"使用{tool_name}工具进行查询" 250 + original_query=query, context=f"使用{tool_name}工具进行查询"
165 ) 251 )
166 252
167 logger.info(f" 🔍 原始查询: '{query}'") 253 logger.info(f" 🔍 原始查询: '{query}'")
@@ -177,34 +263,62 @@ class DeepSearchAgent:
177 try: 263 try:
178 if tool_name == "search_topic_globally": 264 if tool_name == "search_topic_globally":
179 # 使用配置文件中的默认值,忽略agent提供的limit_per_table参数 265 # 使用配置文件中的默认值,忽略agent提供的limit_per_table参数
180 - limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE  
181 - response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=limit_per_table) 266 + limit_per_table = (
  267 + self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
  268 + )
  269 + response = self.search_agency.search_topic_globally(
  270 + topic=keyword, limit_per_table=limit_per_table
  271 + )
182 elif tool_name == "search_topic_by_date": 272 elif tool_name == "search_topic_by_date":
183 start_date = kwargs.get("start_date") 273 start_date = kwargs.get("start_date")
184 end_date = kwargs.get("end_date") 274 end_date = kwargs.get("end_date")
185 # 使用配置文件中的默认值,忽略agent提供的limit_per_table参数 275 # 使用配置文件中的默认值,忽略agent提供的limit_per_table参数
186 - limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE 276 + limit_per_table = (
  277 + self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
  278 + )
187 if not start_date or not end_date: 279 if not start_date or not end_date:
188 - raise ValueError("search_topic_by_date工具需要start_date和end_date参数")  
189 - response = self.search_agency.search_topic_by_date(topic=keyword, start_date=start_date, end_date=end_date, limit_per_table=limit_per_table) 280 + raise ValueError(
  281 + "search_topic_by_date工具需要start_date和end_date参数"
  282 + )
  283 + response = self.search_agency.search_topic_by_date(
  284 + topic=keyword,
  285 + start_date=start_date,
  286 + end_date=end_date,
  287 + limit_per_table=limit_per_table,
  288 + )
190 elif tool_name == "get_comments_for_topic": 289 elif tool_name == "get_comments_for_topic":
191 # 使用配置文件中的默认值,按关键词数量分配,但保证最小值 290 # 使用配置文件中的默认值,按关键词数量分配,但保证最小值
192 - limit = self.config.DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT // len(optimized_response.optimized_keywords) 291 + limit = self.config.DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT // len(
  292 + optimized_response.optimized_keywords
  293 + )
193 limit = max(limit, 50) 294 limit = max(limit, 50)
194 - response = self.search_agency.get_comments_for_topic(topic=keyword, limit=limit) 295 + response = self.search_agency.get_comments_for_topic(
  296 + topic=keyword, limit=limit
  297 + )
195 elif tool_name == "search_topic_on_platform": 298 elif tool_name == "search_topic_on_platform":
196 platform = kwargs.get("platform") 299 platform = kwargs.get("platform")
197 start_date = kwargs.get("start_date") 300 start_date = kwargs.get("start_date")
198 end_date = kwargs.get("end_date") 301 end_date = kwargs.get("end_date")
199 # 使用配置文件中的默认值,按关键词数量分配,但保证最小值 302 # 使用配置文件中的默认值,按关键词数量分配,但保证最小值
200 - limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT // len(optimized_response.optimized_keywords) 303 + limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT // len(
  304 + optimized_response.optimized_keywords
  305 + )
201 limit = max(limit, 30) 306 limit = max(limit, 30)
202 if not platform: 307 if not platform:
203 raise ValueError("search_topic_on_platform工具需要platform参数") 308 raise ValueError("search_topic_on_platform工具需要platform参数")
204 - response = self.search_agency.search_topic_on_platform(platform=platform, topic=keyword, start_date=start_date, end_date=end_date, limit=limit) 309 + response = self.search_agency.search_topic_on_platform(
  310 + platform=platform,
  311 + topic=keyword,
  312 + start_date=start_date,
  313 + end_date=end_date,
  314 + limit=limit,
  315 + )
205 else: 316 else:
206 logger.info(f" 未知的搜索工具: {tool_name},使用默认全局搜索") 317 logger.info(f" 未知的搜索工具: {tool_name},使用默认全局搜索")
207 - response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE) 318 + response = self.search_agency.search_topic_globally(
  319 + topic=keyword,
  320 + limit_per_table=self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE,
  321 + )
208 322
209 # 收集结果 323 # 收集结果
210 if response.results: 324 if response.results:
@@ -222,6 +336,13 @@ class DeepSearchAgent:
222 unique_results = self._deduplicate_results(all_results) 336 unique_results = self._deduplicate_results(all_results)
223 logger.info(f" 总计找到 {total_count} 条结果,去重后 {len(unique_results)} 条") 337 logger.info(f" 总计找到 {total_count} 条结果,去重后 {len(unique_results)} 条")
224 338
  339 + if ENABLE_CLUSTERING:
  340 + unique_results = self._cluster_and_sample_results(
  341 + unique_results,
  342 + max_results=MAX_CLUSTERED_RESULTS,
  343 + results_per_cluster=RESULTS_PER_CLUSTER,
  344 + )
  345 +
225 # 构建整合后的响应 346 # 构建整合后的响应
226 integrated_response = DBResponse( 347 integrated_response = DBResponse(
227 tool_name=f"{tool_name}_optimized", 348 tool_name=f"{tool_name}_optimized",
@@ -229,10 +350,10 @@ class DeepSearchAgent:
229 "original_query": query, 350 "original_query": query,
230 "optimized_keywords": optimized_response.optimized_keywords, 351 "optimized_keywords": optimized_response.optimized_keywords,
231 "optimization_reasoning": optimized_response.reasoning, 352 "optimization_reasoning": optimized_response.reasoning,
232 - **kwargs 353 + **kwargs,
233 }, 354 },
234 results=unique_results, 355 results=unique_results,
235 - results_count=len(unique_results) 356 + results_count=len(unique_results),
236 ) 357 )
237 358
238 # 检查是否需要进行情感分析 359 # 检查是否需要进行情感分析
@@ -242,7 +363,9 @@ class DeepSearchAgent:
242 sentiment_analysis = self._perform_sentiment_analysis(unique_results) 363 sentiment_analysis = self._perform_sentiment_analysis(unique_results)
243 if sentiment_analysis: 364 if sentiment_analysis:
244 # 将情感分析结果添加到响应的parameters中 365 # 将情感分析结果添加到响应的parameters中
245 - integrated_response.parameters["sentiment_analysis"] = sentiment_analysis 366 + integrated_response.parameters["sentiment_analysis"] = (
  367 + sentiment_analysis
  368 + )
246 logger.info(f" ✅ 情感分析完成") 369 logger.info(f" ✅ 情感分析完成")
247 370
248 return integrated_response 371 return integrated_response
@@ -275,7 +398,10 @@ class DeepSearchAgent:
275 """ 398 """
276 try: 399 try:
277 # 初始化情感分析器(如果尚未初始化且未被禁用) 400 # 初始化情感分析器(如果尚未初始化且未被禁用)
278 - if not self.sentiment_analyzer.is_initialized and not self.sentiment_analyzer.is_disabled: 401 + if (
  402 + not self.sentiment_analyzer.is_initialized
  403 + and not self.sentiment_analyzer.is_disabled
  404 + ):
279 logger.info(" 初始化情感分析模型...") 405 logger.info(" 初始化情感分析模型...")
280 if not self.sentiment_analyzer.initialize(): 406 if not self.sentiment_analyzer.initialize():
281 logger.info(" 情感分析模型初始化失败,将直接透传原始文本") 407 logger.info(" 情感分析模型初始化失败,将直接透传原始文本")
@@ -290,15 +416,15 @@ class DeepSearchAgent:
290 "platform": result.platform, 416 "platform": result.platform,
291 "author": result.author_nickname, 417 "author": result.author_nickname,
292 "url": result.url, 418 "url": result.url,
293 - "publish_time": str(result.publish_time) if result.publish_time else None 419 + "publish_time": str(result.publish_time)
  420 + if result.publish_time
  421 + else None,
294 } 422 }
295 results_dict.append(result_dict) 423 results_dict.append(result_dict)
296 424
297 # 执行情感分析 425 # 执行情感分析
298 sentiment_analysis = self.sentiment_analyzer.analyze_query_results( 426 sentiment_analysis = self.sentiment_analyzer.analyze_query_results(
299 - query_results=results_dict,  
300 - text_field="content",  
301 - min_confidence=0.5 427 + query_results=results_dict, text_field="content", min_confidence=0.5
302 ) 428 )
303 429
304 return sentiment_analysis.get("sentiment_analysis") 430 return sentiment_analysis.get("sentiment_analysis")
@@ -321,7 +447,10 @@ class DeepSearchAgent:
321 447
322 try: 448 try:
323 # 初始化情感分析器(如果尚未初始化且未被禁用) 449 # 初始化情感分析器(如果尚未初始化且未被禁用)
324 - if not self.sentiment_analyzer.is_initialized and not self.sentiment_analyzer.is_disabled: 450 + if (
  451 + not self.sentiment_analyzer.is_initialized
  452 + and not self.sentiment_analyzer.is_disabled
  453 + ):
325 logger.info(" 初始化情感分析模型...") 454 logger.info(" 初始化情感分析模型...")
326 if not self.sentiment_analyzer.initialize(): 455 if not self.sentiment_analyzer.initialize():
327 logger.info(" 情感分析模型初始化失败,将直接透传原始文本") 456 logger.info(" 情感分析模型初始化失败,将直接透传原始文本")
@@ -334,28 +463,43 @@ class DeepSearchAgent:
334 result_dict = result.__dict__ 463 result_dict = result.__dict__
335 response = { 464 response = {
336 "success": result.success and result.analysis_performed, 465 "success": result.success and result.analysis_performed,
337 - "total_analyzed": 1 if result.analysis_performed and result.success else 0,  
338 - "results": [result_dict] 466 + "total_analyzed": 1
  467 + if result.analysis_performed and result.success
  468 + else 0,
  469 + "results": [result_dict],
339 } 470 }
340 if not result.analysis_performed: 471 if not result.analysis_performed:
341 response["success"] = False 472 response["success"] = False
342 - response["warning"] = result.error_message or "情感分析功能不可用,已直接返回原始文本" 473 + response["warning"] = (
  474 + result.error_message or "情感分析功能不可用,已直接返回原始文本"
  475 + )
343 return response 476 return response
344 else: 477 else:
345 texts_list = list(texts) 478 texts_list = list(texts)
346 - batch_result = self.sentiment_analyzer.analyze_batch(texts_list, show_progress=True) 479 + batch_result = self.sentiment_analyzer.analyze_batch(
  480 + texts_list, show_progress=True
  481 + )
347 response = { 482 response = {
348 - "success": batch_result.analysis_performed and batch_result.success_count > 0,  
349 - "total_analyzed": batch_result.total_processed if batch_result.analysis_performed else 0, 483 + "success": batch_result.analysis_performed
  484 + and batch_result.success_count > 0,
  485 + "total_analyzed": batch_result.total_processed
  486 + if batch_result.analysis_performed
  487 + else 0,
350 "success_count": batch_result.success_count, 488 "success_count": batch_result.success_count,
351 "failed_count": batch_result.failed_count, 489 "failed_count": batch_result.failed_count,
352 - "average_confidence": batch_result.average_confidence if batch_result.analysis_performed else 0.0,  
353 - "results": [result.__dict__ for result in batch_result.results] 490 + "average_confidence": batch_result.average_confidence
  491 + if batch_result.analysis_performed
  492 + else 0.0,
  493 + "results": [result.__dict__ for result in batch_result.results],
354 } 494 }
355 if not batch_result.analysis_performed: 495 if not batch_result.analysis_performed:
356 warning = next( 496 warning = next(
357 - (r.error_message for r in batch_result.results if r.error_message),  
358 - "情感分析功能不可用,已直接返回原始文本" 497 + (
  498 + r.error_message
  499 + for r in batch_result.results
  500 + if r.error_message
  501 + ),
  502 + "情感分析功能不可用,已直接返回原始文本",
359 ) 503 )
360 response["success"] = False 504 response["success"] = False
361 response["warning"] = warning 505 response["warning"] = warning
@@ -363,11 +507,7 @@ class DeepSearchAgent:
363 507
364 except Exception as e: 508 except Exception as e:
365 logger.exception(f" ❌ 情感分析过程中发生错误: {str(e)}") 509 logger.exception(f" ❌ 情感分析过程中发生错误: {str(e)}")
366 - return {  
367 - "success": False,  
368 - "error": str(e),  
369 - "results": []  
370 - } 510 + return {"success": False, "error": str(e), "results": []}
371 511
372 def research(self, query: str, save_report: bool = True) -> str: 512 def research(self, query: str, save_report: bool = True) -> str:
373 """ 513 """
@@ -380,9 +520,9 @@ class DeepSearchAgent:
380 Returns: 520 Returns:
381 最终报告内容 521 最终报告内容
382 """ 522 """
383 - logger.info(f"\n{'='*60}") 523 + logger.info(f"\n{'=' * 60}")
384 logger.info(f"开始深度研究: {query}") 524 logger.info(f"开始深度研究: {query}")
385 - logger.info(f"{'='*60}") 525 + logger.info(f"{'=' * 60}")
386 526
387 try: 527 try:
388 # Step 1: 生成报告结构 528 # Step 1: 生成报告结构
@@ -426,7 +566,9 @@ class DeepSearchAgent:
426 total_paragraphs = len(self.state.paragraphs) 566 total_paragraphs = len(self.state.paragraphs)
427 567
428 for i in range(total_paragraphs): 568 for i in range(total_paragraphs):
429 - logger.info(f"\n[步骤 2.{i+1}] 处理段落: {self.state.paragraphs[i].title}") 569 + logger.info(
  570 + f"\n[步骤 2.{i + 1}] 处理段落: {self.state.paragraphs[i].title}"
  571 + )
430 logger.info("-" * 50) 572 logger.info("-" * 50)
431 573
432 # 初始搜索和总结 574 # 初始搜索和总结
@@ -446,16 +588,15 @@ class DeepSearchAgent:
446 paragraph = self.state.paragraphs[paragraph_index] 588 paragraph = self.state.paragraphs[paragraph_index]
447 589
448 # 准备搜索输入 590 # 准备搜索输入
449 - search_input = {  
450 - "title": paragraph.title,  
451 - "content": paragraph.content  
452 - } 591 + search_input = {"title": paragraph.title, "content": paragraph.content}
453 592
454 # 生成搜索查询和工具选择 593 # 生成搜索查询和工具选择
455 logger.info(" - 生成搜索查询...") 594 logger.info(" - 生成搜索查询...")
456 search_output = self.first_search_node.run(search_input) 595 search_output = self.first_search_node.run(search_input)
457 search_query = search_output["search_query"] 596 search_query = search_output["search_query"]
458 - search_tool = search_output.get("search_tool", "search_topic_globally") # 默认工具 597 + search_tool = search_output.get(
  598 + "search_tool", "search_topic_globally"
  599 + ) # 默认工具
459 reasoning = search_output["reasoning"] 600 reasoning = search_output["reasoning"]
460 601
461 logger.info(f" - 搜索查询: {search_query}") 602 logger.info(f" - 搜索查询: {search_query}")
@@ -475,13 +616,17 @@ class DeepSearchAgent:
475 616
476 if start_date and end_date: 617 if start_date and end_date:
477 # 验证日期格式 618 # 验证日期格式
478 - if self._validate_date_format(start_date) and self._validate_date_format(end_date): 619 + if self._validate_date_format(
  620 + start_date
  621 + ) and self._validate_date_format(end_date):
479 search_kwargs["start_date"] = start_date 622 search_kwargs["start_date"] = start_date
480 search_kwargs["end_date"] = end_date 623 search_kwargs["end_date"] = end_date
481 logger.info(f" - 时间范围: {start_date} 到 {end_date}") 624 logger.info(f" - 时间范围: {start_date} 到 {end_date}")
482 else: 625 else:
483 logger.info(f" 日期格式错误(应为YYYY-MM-DD),改用全局搜索") 626 logger.info(f" 日期格式错误(应为YYYY-MM-DD),改用全局搜索")
484 - logger.info(f" 提供的日期: start_date={start_date}, end_date={end_date}") 627 + logger.info(
  628 + f" 提供的日期: start_date={start_date}, end_date={end_date}"
  629 + )
485 search_tool = "search_topic_globally" 630 search_tool = "search_topic_globally"
486 elif search_tool == "search_topic_by_date": 631 elif search_tool == "search_topic_by_date":
487 logger.info(f" search_topic_by_date工具缺少时间参数,改用全局搜索") 632 logger.info(f" search_topic_by_date工具缺少时间参数,改用全局搜索")
@@ -494,7 +639,9 @@ class DeepSearchAgent:
494 search_kwargs["platform"] = platform 639 search_kwargs["platform"] = platform
495 logger.info(f" - 指定平台: {platform}") 640 logger.info(f" - 指定平台: {platform}")
496 else: 641 else:
497 - logger.warning(f" search_topic_on_platform工具缺少平台参数,改用全局搜索") 642 + logger.warning(
  643 + f" search_topic_on_platform工具缺少平台参数,改用全局搜索"
  644 + )
498 search_tool = "search_topic_globally" 645 search_tool = "search_topic_globally"
499 646
500 # 处理限制参数,使用配置文件中的默认值而不是agent提供的参数 647 # 处理限制参数,使用配置文件中的默认值而不是agent提供的参数
@@ -505,9 +652,13 @@ class DeepSearchAgent:
505 search_kwargs["limit"] = limit 652 search_kwargs["limit"] = limit
506 elif search_tool in ["search_topic_globally", "search_topic_by_date"]: 653 elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
507 if search_tool == "search_topic_globally": 654 if search_tool == "search_topic_globally":
508 - limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE 655 + limit_per_table = (
  656 + self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
  657 + )
509 else: # search_topic_by_date 658 else: # search_topic_by_date
510 - limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE 659 + limit_per_table = (
  660 + self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
  661 + )
511 search_kwargs["limit_per_table"] = limit_per_table 662 search_kwargs["limit_per_table"] = limit_per_table
512 elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]: 663 elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
513 if search_tool == "get_comments_for_topic": 664 if search_tool == "get_comments_for_topic":
@@ -516,34 +667,46 @@ class DeepSearchAgent:
516 limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT 667 limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
517 search_kwargs["limit"] = limit 668 search_kwargs["limit"] = limit
518 669
519 - search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs) 670 + search_response = self.execute_search_tool(
  671 + search_tool, search_query, **search_kwargs
  672 + )
520 673
521 # 转换为兼容格式 674 # 转换为兼容格式
522 search_results = [] 675 search_results = []
523 if search_response and search_response.results: 676 if search_response and search_response.results:
524 # 使用配置文件控制传递给LLM的结果数量,0表示不限制 677 # 使用配置文件控制传递给LLM的结果数量,0表示不限制
525 if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0: 678 if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0:
526 - max_results = min(len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM) 679 + max_results = min(
  680 + len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM
  681 + )
527 else: 682 else:
528 max_results = len(search_response.results) # 不限制,传递所有结果 683 max_results = len(search_response.results) # 不限制,传递所有结果
529 for result in search_response.results[:max_results]: 684 for result in search_response.results[:max_results]:
530 - search_results.append({  
531 - 'title': result.title_or_content,  
532 - 'url': result.url or "",  
533 - 'content': result.title_or_content,  
534 - 'score': result.hotness_score,  
535 - 'raw_content': result.title_or_content,  
536 - 'published_date': result.publish_time.isoformat() if result.publish_time else None,  
537 - 'platform': result.platform,  
538 - 'content_type': result.content_type,  
539 - 'author': result.author_nickname,  
540 - 'engagement': result.engagement  
541 - }) 685 + search_results.append(
  686 + {
  687 + "title": result.title_or_content,
  688 + "url": result.url or "",
  689 + "content": result.title_or_content,
  690 + "score": result.hotness_score,
  691 + "raw_content": result.title_or_content,
  692 + "published_date": result.publish_time.isoformat()
  693 + if result.publish_time
  694 + else None,
  695 + "platform": result.platform,
  696 + "content_type": result.content_type,
  697 + "author": result.author_nickname,
  698 + "engagement": result.engagement,
  699 + }
  700 + )
542 701
543 if search_results: 702 if search_results:
544 _message = f" - 找到 {len(search_results)} 个搜索结果" 703 _message = f" - 找到 {len(search_results)} 个搜索结果"
545 for j, result in enumerate(search_results, 1): 704 for j, result in enumerate(search_results, 1):
546 - date_info = f" (发布于: {result.get('published_date', 'N/A')})" if result.get('published_date') else "" 705 + date_info = (
  706 + f" (发布于: {result.get('published_date', 'N/A')})"
  707 + if result.get("published_date")
  708 + else ""
  709 + )
547 _message += f"\n {j}. {result['title'][:50]}...{date_info}" 710 _message += f"\n {j}. {result['title'][:50]}...{date_info}"
548 logger.info(_message) 711 logger.info(_message)
549 else: 712 else:
@@ -560,7 +723,7 @@ class DeepSearchAgent:
560 "search_query": search_query, 723 "search_query": search_query,
561 "search_results": format_search_results_for_prompt( 724 "search_results": format_search_results_for_prompt(
562 search_results, self.config.MAX_CONTENT_LENGTH 725 search_results, self.config.MAX_CONTENT_LENGTH
563 - ) 726 + ),
564 } 727 }
565 728
566 # 更新状态 729 # 更新状态
@@ -581,13 +744,15 @@ class DeepSearchAgent:
581 reflection_input = { 744 reflection_input = {
582 "title": paragraph.title, 745 "title": paragraph.title,
583 "content": paragraph.content, 746 "content": paragraph.content,
584 - "paragraph_latest_state": paragraph.research.latest_summary 747 + "paragraph_latest_state": paragraph.research.latest_summary,
585 } 748 }
586 749
587 # 生成反思搜索查询 750 # 生成反思搜索查询
588 reflection_output = self.reflection_node.run(reflection_input) 751 reflection_output = self.reflection_node.run(reflection_input)
589 search_query = reflection_output["search_query"] 752 search_query = reflection_output["search_query"]
590 - search_tool = reflection_output.get("search_tool", "search_topic_globally") # 默认工具 753 + search_tool = reflection_output.get(
  754 + "search_tool", "search_topic_globally"
  755 + ) # 默认工具
591 reasoning = reflection_output["reasoning"] 756 reasoning = reflection_output["reasoning"]
592 757
593 logger.info(f" 反思查询: {search_query}") 758 logger.info(f" 反思查询: {search_query}")
@@ -605,16 +770,24 @@ class DeepSearchAgent:
605 770
606 if start_date and end_date: 771 if start_date and end_date:
607 # 验证日期格式 772 # 验证日期格式
608 - if self._validate_date_format(start_date) and self._validate_date_format(end_date): 773 + if self._validate_date_format(
  774 + start_date
  775 + ) and self._validate_date_format(end_date):
609 search_kwargs["start_date"] = start_date 776 search_kwargs["start_date"] = start_date
610 search_kwargs["end_date"] = end_date 777 search_kwargs["end_date"] = end_date
611 logger.info(f" 时间范围: {start_date} 到 {end_date}") 778 logger.info(f" 时间范围: {start_date} 到 {end_date}")
612 else: 779 else:
613 - logger.info(f" 日期格式错误(应为YYYY-MM-DD),改用全局搜索")  
614 - logger.info(f" 提供的日期: start_date={start_date}, end_date={end_date}") 780 + logger.info(
  781 + f" 日期格式错误(应为YYYY-MM-DD),改用全局搜索"
  782 + )
  783 + logger.info(
  784 + f" 提供的日期: start_date={start_date}, end_date={end_date}"
  785 + )
615 search_tool = "search_topic_globally" 786 search_tool = "search_topic_globally"
616 elif search_tool == "search_topic_by_date": 787 elif search_tool == "search_topic_by_date":
617 - logger.warning(f" search_topic_by_date工具缺少时间参数,改用全局搜索") 788 + logger.warning(
  789 + f" search_topic_by_date工具缺少时间参数,改用全局搜索"
  790 + )
618 search_tool = "search_topic_globally" 791 search_tool = "search_topic_globally"
619 792
620 # 处理需要平台参数的工具 793 # 处理需要平台参数的工具
@@ -624,7 +797,9 @@ class DeepSearchAgent:
624 search_kwargs["platform"] = platform 797 search_kwargs["platform"] = platform
625 logger.info(f" 指定平台: {platform}") 798 logger.info(f" 指定平台: {platform}")
626 else: 799 else:
627 - logger.warning(f" search_topic_on_platform工具缺少平台参数,改用全局搜索") 800 + logger.warning(
  801 + f" search_topic_on_platform工具缺少平台参数,改用全局搜索"
  802 + )
628 search_tool = "search_topic_globally" 803 search_tool = "search_topic_globally"
629 804
630 # 处理限制参数 805 # 处理限制参数
@@ -637,9 +812,13 @@ class DeepSearchAgent:
637 elif search_tool in ["search_topic_globally", "search_topic_by_date"]: 812 elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
638 # 使用配置文件中的默认值,不允许agent控制limit_per_table参数 813 # 使用配置文件中的默认值,不允许agent控制limit_per_table参数
639 if search_tool == "search_topic_globally": 814 if search_tool == "search_topic_globally":
640 - limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE 815 + limit_per_table = (
  816 + self.config.DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
  817 + )
641 else: # search_topic_by_date 818 else: # search_topic_by_date
642 - limit_per_table = self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE 819 + limit_per_table = (
  820 + self.config.DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
  821 + )
643 search_kwargs["limit_per_table"] = limit_per_table 822 search_kwargs["limit_per_table"] = limit_per_table
644 elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]: 823 elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
645 # 使用配置文件中的默认值,不允许agent控制limit参数 824 # 使用配置文件中的默认值,不允许agent控制limit参数
@@ -649,34 +828,47 @@ class DeepSearchAgent: @@ -649,34 +828,47 @@ class DeepSearchAgent:
649 limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT 828 limit = self.config.DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
650 search_kwargs["limit"] = limit 829 search_kwargs["limit"] = limit
651 830
652 - search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs) 831 + search_response = self.execute_search_tool(
  832 + search_tool, search_query, **search_kwargs
  833 + )
653 834
654 # 转换为兼容格式 835 # 转换为兼容格式
655 search_results = [] 836 search_results = []
656 if search_response and search_response.results: 837 if search_response and search_response.results:
657 # 使用配置文件控制传递给LLM的结果数量,0表示不限制 838 # 使用配置文件控制传递给LLM的结果数量,0表示不限制
658 if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0: 839 if self.config.MAX_SEARCH_RESULTS_FOR_LLM > 0:
659 - max_results = min(len(search_response.results), self.config.MAX_SEARCH_RESULTS_FOR_LLM) 840 + max_results = min(
  841 + len(search_response.results),
  842 + self.config.MAX_SEARCH_RESULTS_FOR_LLM,
  843 + )
660 else: 844 else:
661 max_results = len(search_response.results) # 不限制,传递所有结果 845 max_results = len(search_response.results) # 不限制,传递所有结果
662 for result in search_response.results[:max_results]: 846 for result in search_response.results[:max_results]:
663 - search_results.append({  
664 - 'title': result.title_or_content,  
665 - 'url': result.url or "",  
666 - 'content': result.title_or_content,  
667 - 'score': result.hotness_score,  
668 - 'raw_content': result.title_or_content,  
669 - 'published_date': result.publish_time.isoformat() if result.publish_time else None,  
670 - 'platform': result.platform,  
671 - 'content_type': result.content_type,  
672 - 'author': result.author_nickname,  
673 - 'engagement': result.engagement  
674 - }) 847 + search_results.append(
  848 + {
  849 + "title": result.title_or_content,
  850 + "url": result.url or "",
  851 + "content": result.title_or_content,
  852 + "score": result.hotness_score,
  853 + "raw_content": result.title_or_content,
  854 + "published_date": result.publish_time.isoformat()
  855 + if result.publish_time
  856 + else None,
  857 + "platform": result.platform,
  858 + "content_type": result.content_type,
  859 + "author": result.author_nickname,
  860 + "engagement": result.engagement,
  861 + }
  862 + )
675 863
676 if search_results: 864 if search_results:
677 _message = f" 找到 {len(search_results)} 个反思搜索结果" 865 _message = f" 找到 {len(search_results)} 个反思搜索结果"
678 for j, result in enumerate(search_results, 1): 866 for j, result in enumerate(search_results, 1):
679 - date_info = f" (发布于: {result.get('published_date', 'N/A')})" if result.get('published_date') else "" 867 + date_info = (
  868 + f" (发布于: {result.get('published_date', 'N/A')})"
  869 + if result.get("published_date")
  870 + else ""
  871 + )
680 _message += f"\n {j}. {result['title'][:50]}...{date_info}" 872 _message += f"\n {j}. {result['title'][:50]}...{date_info}"
681 logger.info(_message) 873 logger.info(_message)
682 else: 874 else:
@@ -693,7 +885,7 @@ class DeepSearchAgent: @@ -693,7 +885,7 @@ class DeepSearchAgent:
693 "search_results": format_search_results_for_prompt( 885 "search_results": format_search_results_for_prompt(
694 search_results, self.config.MAX_CONTENT_LENGTH 886 search_results, self.config.MAX_CONTENT_LENGTH
695 ), 887 ),
696 - "paragraph_latest_state": paragraph.research.latest_summary 888 + "paragraph_latest_state": paragraph.research.latest_summary,
697 } 889 }
698 890
699 # 更新状态 891 # 更新状态
@@ -710,10 +902,12 @@ class DeepSearchAgent: @@ -710,10 +902,12 @@ class DeepSearchAgent:
710 # 准备报告数据 902 # 准备报告数据
711 report_data = [] 903 report_data = []
712 for paragraph in self.state.paragraphs: 904 for paragraph in self.state.paragraphs:
713 - report_data.append({ 905 + report_data.append(
  906 + {
714 "title": paragraph.title, 907 "title": paragraph.title,
715 - "paragraph_latest_state": paragraph.research.latest_summary  
716 - }) 908 + "paragraph_latest_state": paragraph.research.latest_summary,
  909 + }
  910 + )
717 911
718 # 格式化报告 912 # 格式化报告
719 try: 913 try:
@@ -735,14 +929,16 @@ class DeepSearchAgent: @@ -735,14 +929,16 @@ class DeepSearchAgent:
735 """保存报告到文件""" 929 """保存报告到文件"""
736 # 生成文件名 930 # 生成文件名
737 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 931 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
738 - query_safe = "".join(c for c in self.state.query if c.isalnum() or c in (' ', '-', '_')).rstrip()  
739 - query_safe = query_safe.replace(' ', '_')[:30] 932 + query_safe = "".join(
  933 + c for c in self.state.query if c.isalnum() or c in (" ", "-", "_")
  934 + ).rstrip()
  935 + query_safe = query_safe.replace(" ", "_")[:30]
740 936
741 filename = f"deep_search_report_{query_safe}_{timestamp}.md" 937 filename = f"deep_search_report_{query_safe}_{timestamp}.md"
742 filepath = os.path.join(self.config.OUTPUT_DIR, filename) 938 filepath = os.path.join(self.config.OUTPUT_DIR, filename)
743 939
744 # 保存报告 940 # 保存报告
745 - with open(filepath, 'w', encoding='utf-8') as f: 941 + with open(filepath, "w", encoding="utf-8") as f:
746 f.write(report_content) 942 f.write(report_content)
747 943
748 logger.info(f"报告已保存到: {filepath}") 944 logger.info(f"报告已保存到: {filepath}")
@@ -61,6 +61,7 @@ weasyprint>=60.0 # PDF导出,支持Python 3.9-3.13 @@ -61,6 +61,7 @@ weasyprint>=60.0 # PDF导出,支持Python 3.9-3.13
61 # ===== 机器学习(可选,用于情感分析,不安装也没事写了容错程序) ===== 61 # ===== 机器学习(可选,用于情感分析,不安装也没事写了容错程序) =====
62 torch>=2.0.0 # CPU版本 62 torch>=2.0.0 # CPU版本
63 transformers>=4.30.0 63 transformers>=4.30.0
  64 +sentence-transformers>=2.2.2
64 scikit-learn>=1.3.0 65 scikit-learn>=1.3.0
65 xgboost>=2.0.0 66 xgboost>=2.0.0
66 # NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126 67 # NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126