The private database analysis agent is now essentially complete.

戒酒的李白
Commit c35a6baf0516d1524e4504306f83dd535e328060 c35a6baf 1 parent a33e4b3a
Showing 5 changed files with 368 additions and 143 deletions
InsightEngine/agent.py
InsightEngine/prompts/prompts.py
InsightEngine/tools/__init__.py
InsightEngine/utils/config.py
insight_engine_streamlit_app.py
--- a/InsightEngine/agent.py
View file @c35a6ba
+++ b/InsightEngine/agent.py
View file @c35a6ba
@@ -19,7 +19,7 @@ from .nodes import (
     ReportFormattingNode
 )
 from .state import State
- from .tools import TavilyNewsAgency, TavilyResponse
+ from .tools import MediaCrawlerDB, DBResponse
 from .utils import Config, load_config, format_search_results_for_prompt
 
 
@@ -39,8 +39,16 @@ class DeepSearchAgent:
         # 初始化LLM客户端
         self.llm_client = self._initialize_llm()
         
+         # 设置数据库环境变量
+         os.environ["DB_HOST"] = self.config.db_host or ""
+         os.environ["DB_USER"] = self.config.db_user or ""
+         os.environ["DB_PASSWORD"] = self.config.db_password or ""
+         os.environ["DB_NAME"] = self.config.db_name or ""
+         os.environ["DB_PORT"] = str(self.config.db_port)
+         os.environ["DB_CHARSET"] = self.config.db_charset
+         
         # 初始化搜索工具集
-         self.search_agency = TavilyNewsAgency(api_key=self.config.tavily_api_key)
+         self.search_agency = MediaCrawlerDB()
         
         # 初始化节点
         self._initialize_nodes()
@@ -53,7 +61,7 @@ class DeepSearchAgent:
         
         print(f"Deep Search Agent 已初始化")
         print(f"使用LLM: {self.llm_client.get_model_info()}")
-         print(f"搜索工具集: TavilyNewsAgency (支持6种搜索工具)")
+         print(f"搜索工具集: MediaCrawlerDB (支持5种本地数据库查询工具)")
     
     def _initialize_llm(self) -> BaseLLM:
         """初始化LLM客户端"""
@@ -103,46 +111,53 @@ class DeepSearchAgent:
         except ValueError:
             return False
     
-     def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> TavilyResponse:
+     def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> DBResponse:
         """
-         执行指定的搜索工具
+         执行指定的数据库查询工具
         
         Args:
             tool_name: 工具名称，可选值：
-                 - "basic_search_news": 基础新闻搜索（快速、通用）
-                 - "deep_search_news": 深度新闻分析
-                 - "search_news_last_24_hours": 24小时内最新新闻
-                 - "search_news_last_week": 本周新闻
-                 - "search_images_for_news": 新闻图片搜索
-                 - "search_news_by_date": 按日期范围搜索新闻
-             query: 搜索查询
-             **kwargs: 额外参数（如start_date, end_date, max_results）
+                 - "search_hot_content": 查找热点内容
+                 - "search_topic_globally": 全局话题搜索
+                 - "search_topic_by_date": 按日期搜索话题
+                 - "get_comments_for_topic": 获取话题评论
+                 - "search_topic_on_platform": 平台定向搜索
+             query: 搜索关键词/话题
+             **kwargs: 额外参数（如start_date, end_date, platform, limit等）
             
         Returns:
-             TavilyResponse对象
+             DBResponse对象
         """
-         print(f"  → 执行搜索工具: {tool_name}")
-         
-         if tool_name == "basic_search_news":
-             max_results = kwargs.get("max_results", 7)
-             return self.search_agency.basic_search_news(query, max_results)
-         elif tool_name == "deep_search_news":
-             return self.search_agency.deep_search_news(query)
-         elif tool_name == "search_news_last_24_hours":
-             return self.search_agency.search_news_last_24_hours(query)
-         elif tool_name == "search_news_last_week":
-             return self.search_agency.search_news_last_week(query)
-         elif tool_name == "search_images_for_news":
-             return self.search_agency.search_images_for_news(query)
-         elif tool_name == "search_news_by_date":
+         print(f"  → 执行数据库查询工具: {tool_name}")
+         
+         if tool_name == "search_hot_content":
+             time_period = kwargs.get("time_period", "week")
+             limit = kwargs.get("limit", 10)
+             return self.search_agency.search_hot_content(time_period=time_period, limit=limit)
+         elif tool_name == "search_topic_globally":
+             limit_per_table = kwargs.get("limit_per_table", 5)
+             return self.search_agency.search_topic_globally(topic=query, limit_per_table=limit_per_table)
+         elif tool_name == "search_topic_by_date":
             start_date = kwargs.get("start_date")
             end_date = kwargs.get("end_date")
+             limit_per_table = kwargs.get("limit_per_table", 10)
             if not start_date or not end_date:
-                 raise ValueError("search_news_by_date工具需要start_date和end_date参数")
-             return self.search_agency.search_news_by_date(query, start_date, end_date)
+                 raise ValueError("search_topic_by_date工具需要start_date和end_date参数")
+             return self.search_agency.search_topic_by_date(topic=query, start_date=start_date, end_date=end_date, limit_per_table=limit_per_table)
+         elif tool_name == "get_comments_for_topic":
+             limit = kwargs.get("limit", 50)
+             return self.search_agency.get_comments_for_topic(topic=query, limit=limit)
+         elif tool_name == "search_topic_on_platform":
+             platform = kwargs.get("platform")
+             start_date = kwargs.get("start_date")
+             end_date = kwargs.get("end_date")
+             limit = kwargs.get("limit", 20)
+             if not platform:
+                 raise ValueError("search_topic_on_platform工具需要platform参数")
+             return self.search_agency.search_topic_on_platform(platform=platform, topic=query, start_date=start_date, end_date=end_date, limit=limit)
         else:
-             print(f"  ⚠️  未知的搜索工具: {tool_name}，使用默认基础搜索")
-             return self.search_agency.basic_search_news(query)
+             print(f"  ⚠️  未知的搜索工具: {tool_name}，使用默认全局搜索")
+             return self.search_agency.search_topic_globally(topic=query)
     
     def research(self, query: str, save_report: bool = True) -> str:
         """
@@ -231,7 +246,7 @@ class DeepSearchAgent:
         print("  - 生成搜索查询...")
         search_output = self.first_search_node.run(search_input)
         search_query = search_output["search_query"]
-         search_tool = search_output.get("search_tool", "basic_search_news")  # 默认工具
+         search_tool = search_output.get("search_tool", "search_topic_globally")  # 默认工具
         reasoning = search_output["reasoning"]
         
         print(f"  - 搜索查询: {search_query}")
@@ -239,11 +254,13 @@ class DeepSearchAgent:
         print(f"  - 推理: {reasoning}")
         
         # 执行搜索
-         print("  - 执行网络搜索...")
+         print("  - 执行数据库查询...")
         
-         # 处理search_news_by_date的特殊参数
+         # 处理特殊参数
         search_kwargs = {}
-         if search_tool == "search_news_by_date":
+         
+         # 处理需要日期的工具
+         if search_tool in ["search_topic_by_date", "search_topic_on_platform"]:
             start_date = search_output.get("start_date")
             end_date = search_output.get("end_date")
             
@@ -254,12 +271,35 @@ class DeepSearchAgent:
                     search_kwargs["end_date"] = end_date
                     print(f"  - 时间范围: {start_date} 到 {end_date}")
                 else:
-                     print(f"  ⚠️  日期格式错误（应为YYYY-MM-DD），改用基础搜索")
+                     print(f"  ⚠️  日期格式错误（应为YYYY-MM-DD），改用全局搜索")
                     print(f"      提供的日期: start_date={start_date}, end_date={end_date}")
-                     search_tool = "basic_search_news"
+                     search_tool = "search_topic_globally"
+             elif search_tool == "search_topic_by_date":
+                 print(f"  ⚠️  search_topic_by_date工具缺少时间参数，改用全局搜索")
+                 search_tool = "search_topic_globally"
+         
+         # 处理需要平台参数的工具
+         if search_tool == "search_topic_on_platform":
+             platform = search_output.get("platform")
+             if platform:
+                 search_kwargs["platform"] = platform
+                 print(f"  - 指定平台: {platform}")
             else:
-                 print(f"  ⚠️  search_news_by_date工具缺少时间参数，改用基础搜索")
-                 search_tool = "basic_search_news"
+                 print(f"  ⚠️  search_topic_on_platform工具缺少平台参数，改用全局搜索")
+                 search_tool = "search_topic_globally"
+         
+         # 处理限制参数
+         if search_tool == "search_hot_content":
+             time_period = search_output.get("time_period", "week")
+             limit = search_output.get("limit", 10)
+             search_kwargs["time_period"] = time_period
+             search_kwargs["limit"] = limit
+         elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
+             limit_per_table = search_output.get("limit_per_table", 5)
+             search_kwargs["limit_per_table"] = limit_per_table
+         elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
+             limit = search_output.get("limit", 20)
+             search_kwargs["limit"] = limit
         
         search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
         
@@ -270,12 +310,16 @@ class DeepSearchAgent:
             max_results = min(len(search_response.results), 10)
             for result in search_response.results[:max_results]:
                 search_results.append({
-                     'title': result.title,
-                     'url': result.url,
-                     'content': result.content,
-                     'score': result.score,
-                     'raw_content': result.raw_content,
-                     'published_date': result.published_date  # 新增字段
+                     'title': result.title_or_content,
+                     'url': result.url or "",
+                     'content': result.title_or_content,
+                     'score': result.hotness_score,
+                     'raw_content': result.title_or_content,
+                     'published_date': result.publish_time.isoformat() if result.publish_time else None,
+                     'platform': result.platform,
+                     'content_type': result.content_type,
+                     'author': result.author_nickname,
+                     'engagement': result.engagement
                 })
         
         if search_results:
@@ -324,7 +368,7 @@ class DeepSearchAgent:
             # 生成反思搜索查询
             reflection_output = self.reflection_node.run(reflection_input)
             search_query = reflection_output["search_query"]
-             search_tool = reflection_output.get("search_tool", "basic_search_news")  # 默认工具
+             search_tool = reflection_output.get("search_tool", "search_topic_globally")  # 默认工具
             reasoning = reflection_output["reasoning"]
             
             print(f"    反思查询: {search_query}")
@@ -332,9 +376,11 @@ class DeepSearchAgent:
             print(f"    反思推理: {reasoning}")
             
             # 执行反思搜索
-             # 处理search_news_by_date的特殊参数
+             # 处理特殊参数
             search_kwargs = {}
-             if search_tool == "search_news_by_date":
+             
+             # 处理需要日期的工具
+             if search_tool in ["search_topic_by_date", "search_topic_on_platform"]:
                 start_date = reflection_output.get("start_date")
                 end_date = reflection_output.get("end_date")
                 
@@ -345,12 +391,35 @@ class DeepSearchAgent:
                         search_kwargs["end_date"] = end_date
                         print(f"    时间范围: {start_date} 到 {end_date}")
                     else:
-                         print(f"    ⚠️  日期格式错误（应为YYYY-MM-DD），改用基础搜索")
+                         print(f"    ⚠️  日期格式错误（应为YYYY-MM-DD），改用全局搜索")
                         print(f"        提供的日期: start_date={start_date}, end_date={end_date}")
-                         search_tool = "basic_search_news"
+                         search_tool = "search_topic_globally"
+                 elif search_tool == "search_topic_by_date":
+                     print(f"    ⚠️  search_topic_by_date工具缺少时间参数，改用全局搜索")
+                     search_tool = "search_topic_globally"
+             
+             # 处理需要平台参数的工具
+             if search_tool == "search_topic_on_platform":
+                 platform = reflection_output.get("platform")
+                 if platform:
+                     search_kwargs["platform"] = platform
+                     print(f"    指定平台: {platform}")
                 else:
-                     print(f"    ⚠️  search_news_by_date工具缺少时间参数，改用基础搜索")
-                     search_tool = "basic_search_news"
+                     print(f"    ⚠️  search_topic_on_platform工具缺少平台参数，改用全局搜索")
+                     search_tool = "search_topic_globally"
+             
+             # 处理限制参数
+             if search_tool == "search_hot_content":
+                 time_period = reflection_output.get("time_period", "week")
+                 limit = reflection_output.get("limit", 10)
+                 search_kwargs["time_period"] = time_period
+                 search_kwargs["limit"] = limit
+             elif search_tool in ["search_topic_globally", "search_topic_by_date"]:
+                 limit_per_table = reflection_output.get("limit_per_table", 5)
+                 search_kwargs["limit_per_table"] = limit_per_table
+             elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]:
+                 limit = reflection_output.get("limit", 20)
+                 search_kwargs["limit"] = limit
             
             search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs)
             
@@ -361,12 +430,16 @@ class DeepSearchAgent:
                 max_results = min(len(search_response.results), 10)
                 for result in search_response.results[:max_results]:
                     search_results.append({
-                         'title': result.title,
-                         'url': result.url,
-                         'content': result.content,
-                         'score': result.score,
-                         'raw_content': result.raw_content,
-                         'published_date': result.published_date
+                         'title': result.title_or_content,
+                         'url': result.url or "",
+                         'content': result.title_or_content,
+                         'score': result.hotness_score,
+                         'raw_content': result.title_or_content,
+                         'published_date': result.publish_time.isoformat() if result.publish_time else None,
+                         'platform': result.platform,
+                         'content_type': result.content_type,
+                         'author': result.author_nickname,
+                         'engagement': result.engagement
                     })
             
             if search_results:
--- a/InsightEngine/prompts/prompts.py
View file @c35a6ba
+++ b/InsightEngine/prompts/prompts.py
View file @c35a6ba
@@ -35,8 +35,12 @@ output_schema_first_search = {
         "search_query": {"type": "string"},
         "search_tool": {"type": "string"},
         "reasoning": {"type": "string"},
-         "start_date": {"type": "string", "description": "开始日期，格式YYYY-MM-DD，仅search_news_by_date工具需要"},
-         "end_date": {"type": "string", "description": "结束日期，格式YYYY-MM-DD，仅search_news_by_date工具需要"}
+         "start_date": {"type": "string", "description": "开始日期，格式YYYY-MM-DD，search_topic_by_date和search_topic_on_platform工具可能需要"},
+         "end_date": {"type": "string", "description": "结束日期，格式YYYY-MM-DD，search_topic_by_date和search_topic_on_platform工具可能需要"},
+         "platform": {"type": "string", "description": "平台名称，search_topic_on_platform工具必需，可选值：bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba"},
+         "time_period": {"type": "string", "description": "时间周期，search_hot_content工具可选，可选值：24h, week, year"},
+         "limit": {"type": "integer", "description": "结果数量限制，各工具可选参数"},
+         "limit_per_table": {"type": "integer", "description": "每表结果数量限制，search_topic_globally和search_topic_by_date工具可选"}
     },
     "required": ["search_query", "search_tool", "reasoning"]
 }
@@ -80,8 +84,12 @@ output_schema_reflection = {
         "search_query": {"type": "string"},
         "search_tool": {"type": "string"},
         "reasoning": {"type": "string"},
-         "start_date": {"type": "string", "description": "开始日期，格式YYYY-MM-DD，仅search_news_by_date工具需要"},
-         "end_date": {"type": "string", "description": "结束日期，格式YYYY-MM-DD，仅search_news_by_date工具需要"}
+         "start_date": {"type": "string", "description": "开始日期，格式YYYY-MM-DD，search_topic_by_date和search_topic_on_platform工具可能需要"},
+         "end_date": {"type": "string", "description": "结束日期，格式YYYY-MM-DD，search_topic_by_date和search_topic_on_platform工具可能需要"},
+         "platform": {"type": "string", "description": "平台名称，search_topic_on_platform工具必需，可选值：bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba"},
+         "time_period": {"type": "string", "description": "时间周期，search_hot_content工具可选，可选值：24h, week, year"},
+         "limit": {"type": "integer", "description": "结果数量限制，各工具可选参数"},
+         "limit_per_table": {"type": "integer", "description": "每表结果数量限制，search_topic_globally和search_topic_by_date工具可选"}
     },
     "required": ["search_query", "search_tool", "reasoning"]
 }
@@ -141,47 +149,83 @@ SYSTEM_PROMPT_REPORT_STRUCTURE = f"""
 
 # 每个段落第一次搜索的系统提示词
 SYSTEM_PROMPT_FIRST_SEARCH = f"""
- 你是一位深度研究助手。你将获得报告中的一个段落，其标题和预期内容将按照以下JSON模式定义提供：
+ 你是一位专业的舆情分析师。你将获得报告中的一个段落，其标题和预期内容将按照以下JSON模式定义提供：
 
 <INPUT JSON SCHEMA>
 {json.dumps(input_schema_first_search, indent=2, ensure_ascii=False)}
 </INPUT JSON SCHEMA>
 
- 你可以使用以下6种专业的新闻搜索工具：
+ 你可以使用以下5种专业的本地舆情数据库查询工具来挖掘真实的民意和公众观点：
 
- 1. **basic_search_news** - 基础新闻搜索工具
-    - 适用于：一般性的新闻搜索，不确定需要何种特定搜索时
-    - 特点：快速、标准的通用搜索，是最常用的基础工具
+ 1. **search_hot_content** - 查找热点内容工具
+    - 适用于：挖掘当前最受关注的舆情事件和话题
+    - 特点：基于真实的点赞、评论、分享数据发现热门话题
+    - 参数：time_period ('24h', 'week', 'year')，limit（数量限制）
 
- 2. **deep_search_news** - 深度新闻分析工具
-    - 适用于：需要全面深入了解某个主题时
-    - 特点：提供最详细的分析结果，包含高级AI摘要
+ 2. **search_topic_globally** - 全局话题搜索工具
+    - 适用于：全面了解公众对特定话题的讨论和观点
+    - 特点：覆盖B站、微博、抖音、快手、小红书、知乎、贴吧等主流平台的真实用户声音
+    - 参数：limit_per_table（每个表的结果数量限制）
 
- 3. **search_news_last_24_hours** - 24小时最新新闻工具
-    - 适用于：需要了解最新动态、突发事件时
-    - 特点：只搜索过去24小时的新闻
+ 3. **search_topic_by_date** - 按日期搜索话题工具
+    - 适用于：追踪舆情事件的时间线发展和公众情绪变化
+    - 特点：精确的时间范围控制，适合分析舆情演变过程
+    - 特殊要求：需要提供start_date和end_date参数，格式为'YYYY-MM-DD'
+    - 参数：limit_per_table（每个表的结果数量限制）
 
- 4. **search_news_last_week** - 本周新闻工具
-    - 适用于：需要了解近期发展趋势时
-    - 特点：搜索过去一周的新闻报道
+ 4. **get_comments_for_topic** - 获取话题评论工具
+    - 适用于：深度挖掘网民的真实态度、情感和观点
+    - 特点：直接获取用户评论，了解民意走向和情感倾向
+    - 参数：limit（评论总数量限制）
 
- 5. **search_images_for_news** - 图片搜索工具
-    - 适用于：需要可视化信息、图片资料时
-    - 特点：提供相关图片和图片描述
+ 5. **search_topic_on_platform** - 平台定向搜索工具
+    - 适用于：分析特定社交平台用户群体的观点特征
+    - 特点：针对不同平台用户群体的观点差异进行精准分析
+    - 特殊要求：需要提供platform参数，可选start_date和end_date
+    - 参数：platform（必须），start_date, end_date（可选），limit（数量限制）
 
- 6. **search_news_by_date** - 按日期范围搜索工具
-    - 适用于：需要研究特定历史时期时
-    - 特点：可以指定开始和结束日期进行搜索
-    - 特殊要求：需要提供start_date和end_date参数，格式为'YYYY-MM-DD'
-    - 注意：只有这个工具需要额外的时间参数
+ **你的核心使命：挖掘真实的民意和人情味**
 
 你的任务是：
- 1. 根据段落主题选择最合适的搜索工具
- 2. 制定最佳的搜索查询
- 3. 如果选择search_news_by_date工具，必须同时提供start_date和end_date参数（格式：YYYY-MM-DD）
- 4. 解释你的选择理由
- 
- 注意：除了search_news_by_date工具外，其他工具都不需要额外参数。
+ 1. **深度理解段落需求**：根据段落主题，思考需要了解哪些具体的公众观点和情感
+ 2. **精准选择查询工具**：选择最能获取真实民意数据的工具
+ 3. **设计接地气的搜索词**：**这是最关键的环节！**
+    - **避免官方术语**：不要用"舆情传播"、"公众反应"、"情绪倾向"等书面语
+    - **使用网民真实表达**：模拟普通网友会怎么谈论这个话题
+    - **贴近生活语言**：用简单、直接、口语化的词汇
+    - **包含情感词汇**：网民常用的褒贬词、情绪词
+    - **考虑话题热词**：相关的网络流行语、缩写、昵称
+ 4. **参数优化配置**：
+    - search_topic_by_date: 必须提供start_date和end_date参数（格式：YYYY-MM-DD）
+    - search_topic_on_platform: 必须提供platform参数（bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba之一）
+    - 其他工具：合理配置limit参数以获取足够的样本
+ 5. **阐述选择理由**：说明为什么这样的查询能够获得最真实的民意反馈
+ 
+ **搜索词设计核心原则**：
+ - **想象网友怎么说**：如果你是个普通网友，你会怎么讨论这个话题？
+ - **避免学术词汇**：杜绝"舆情"、"传播"、"倾向"等专业术语
+ - **使用具体词汇**：用具体的事件、人名、地名、现象描述
+ - **包含情感表达**：如"支持"、"反对"、"担心"、"愤怒"、"点赞"等
+ - **考虑网络文化**：网民的表达习惯、缩写、俚语、表情符号文字描述
+ 
+ **举例说明**：
+ - ❌ 错误："武汉大学舆情 公众反应"
+ - ✅ 正确："武大" 或 "武汉大学怎么了" 或 "武大学生"
+ - ❌ 错误："校园事件 学生反应"  
+ - ✅ 正确："学校出事" 或 "同学们都在说" 或 "校友群炸了"
+ 
+ **不同平台语言特色参考**：
+ - **微博**：热搜词汇、话题标签，如 "武大又上热搜"、"心疼武大学子"
+ - **知乎**：问答式表达，如 "如何看待武汉大学"、"武大是什么体验"
+ - **B站**：弹幕文化，如 "武大yyds"、"武大人路过"、"我武最强"
+ - **贴吧**：直接称呼，如 "武大吧"、"武大的兄弟们"
+ - **抖音/快手**：短视频描述，如 "武大日常"、"武大vlog"
+ - **小红书**：分享式，如 "武大真的很美"、"武大攻略"
+ 
+ **情感表达词汇库**：
+ - 正面："太棒了"、"牛逼"、"绝了"、"爱了"、"yyds"、"666"
+ - 负面："无语"、"离谱"、"绝了"、"服了"、"麻了"、"破防"
+ - 中性："围观"、"吃瓜"、"路过"、"有一说一"、"实名"
 请按照以下JSON模式定义格式化输出（文字请使用中文）：
 
 <OUTPUT JSON SCHEMA>
@@ -194,13 +238,27 @@ SYSTEM_PROMPT_FIRST_SEARCH = f"""
 
 # 每个段落第一次总结的系统提示词
 SYSTEM_PROMPT_FIRST_SUMMARY = f"""
- 你是一位深度研究助手。你将获得搜索查询、搜索结果以及你正在研究的报告段落，数据将按照以下JSON模式定义提供：
+ 你是一位专业的舆情分析师和报告撰写专家。你将获得搜索查询、真实的社交媒体数据以及你正在研究的舆情报告段落：
 
 <INPUT JSON SCHEMA>
 {json.dumps(input_schema_first_summary, indent=2, ensure_ascii=False)}
 </INPUT JSON SCHEMA>
 
- 你的任务是作为研究者，使用搜索结果撰写与段落主题一致的内容，并适当地组织结构以便纳入报告中。
+ **你的核心任务：将真实的民意数据转化为有温度的舆情分析**
+ 
+ 撰写要求：
+ 1. **突出真实民意**：优先引用具体的用户评论、真实案例和情感表达
+ 2. **展现多元观点**：呈现不同平台、不同群体的观点差异和讨论重点
+ 3. **数据支撑分析**：用具体的点赞数、评论数、转发数等数据说明舆情热度
+ 4. **情感色彩描述**：准确描述公众的情感倾向（愤怒、支持、担忧、期待等）
+ 5. **避免套话官话**：使用贴近民众的语言，避免过度官方化的表述
+ 
+ 撰写风格：
+ - 语言生动，有感染力
+ - 引用真实的网民声音和具体案例
+ - 体现舆情的复杂性和多面性
+ - 突出社会情绪和价值观念的碰撞
+ - 让读者感受到真实的民意脉搏
 请按照以下JSON模式定义格式化输出：
 
 <OUTPUT JSON SCHEMA>
@@ -213,29 +271,67 @@ SYSTEM_PROMPT_FIRST_SUMMARY = f"""
 
 # 反思(Reflect)的系统提示词
 SYSTEM_PROMPT_REFLECTION = f"""
- 你是一位深度研究助手。你负责为研究报告构建全面的段落。你将获得段落标题、计划内容摘要，以及你已经创建的段落最新状态，所有这些都将按照以下JSON模式定义提供：
+ 你是一位资深的舆情分析师。你负责深化舆情报告的内容，让其更贴近真实的民意和社会情感。你将获得段落标题、计划内容摘要，以及你已经创建的段落最新状态：
 
 <INPUT JSON SCHEMA>
 {json.dumps(input_schema_reflection, indent=2, ensure_ascii=False)}
 </INPUT JSON SCHEMA>
 
- 你可以使用以下6种专业的新闻搜索工具：
+ 你可以使用以下5种专业的本地舆情数据库查询工具来深度挖掘民意：
 
- 1. **basic_search_news** - 基础新闻搜索工具
- 2. **deep_search_news** - 深度新闻分析工具
- 3. **search_news_last_24_hours** - 24小时最新新闻工具  
- 4. **search_news_last_week** - 本周新闻工具
- 5. **search_images_for_news** - 图片搜索工具
- 6. **search_news_by_date** - 按日期范围搜索工具（需要时间参数）
+ 1. **search_hot_content** - 查找热点内容工具
+ 2. **search_topic_globally** - 全局话题搜索工具  
+ 3. **search_topic_by_date** - 按日期搜索话题工具
+ 4. **get_comments_for_topic** - 获取话题评论工具
+ 5. **search_topic_on_platform** - 平台定向搜索工具
 
- 你的任务是：
- 1. 反思段落文本的当前状态，思考是否遗漏了主题的某些关键方面
- 2. 选择最合适的搜索工具来补充缺失信息
- 3. 制定精确的搜索查询
- 4. 如果选择search_news_by_date工具，必须同时提供start_date和end_date参数（格式：YYYY-MM-DD）
- 5. 解释你的选择和推理
+ **反思的核心目标：让报告更有人情味和真实感**
 
- 注意：除了search_news_by_date工具外，其他工具都不需要额外参数。
+ 你的任务是：
+ 1. **深度反思内容质量**：
+    - 当前段落是否过于官方化、套路化？
+    - 是否缺乏真实的民众声音和情感表达？
+    - 是否遗漏了重要的公众观点和争议焦点？
+    - 是否需要补充具体的网民评论和真实案例？
+ 
+ 2. **识别信息缺口**：
+    - 缺少哪个平台的用户观点？（如B站年轻人、微博话题讨论、知乎深度分析等）
+    - 缺少哪个时间段的舆情变化？
+    - 缺少哪些具体的民意表达和情感倾向？
+ 
+ 3. **精准补充查询**：
+    - 选择最能填补信息缺口的查询工具
+    - **设计接地气的搜索关键词**：
+      * 避免继续使用官方化、书面化的词汇
+      * 思考网民会用什么词来表达这个观点
+      * 使用具体的、有情感色彩的词汇
+      * 考虑不同平台的语言特色（如B站弹幕文化、微博热搜词汇等）
+    - 重点关注评论区和用户原创内容
+ 
+ 4. **参数配置要求**：
+    - search_topic_by_date: 必须提供start_date和end_date参数（格式：YYYY-MM-DD）
+    - search_topic_on_platform: 必须提供platform参数（bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba之一）
+    - 其他工具：合理配置参数以获取多样化的民意样本
+ 
+ 5. **阐述补充理由**：明确说明为什么需要这些额外的民意数据
+ 
+ **反思重点**：
+ - 报告是否反映了真实的社会情绪？
+ - 是否包含了不同群体的观点和声音？
+ - 是否有具体的用户评论和真实案例支撑？
+ - 是否体现了舆情的复杂性和多面性？
+ - 语言表达是否贴近民众，避免过度官方化？
+ 
+ **搜索词优化示例（重要！）**：
+ - 如果需要了解"武汉大学"相关内容：
+   * ❌ 不要用："武汉大学舆情"、"校园事件"、"学生反应"
+   * ✅ 应该用："武大"、"武汉大学"、"珞珈山"、"樱花大道"
+ - 如果需要了解争议话题：
+   * ❌ 不要用："争议事件"、"公众争议"
+   * ✅ 应该用："出事了"、"怎么回事"、"翻车"、"炸了"
+ - 如果需要了解情感态度：
+   * ❌ 不要用："情感倾向"、"态度分析"
+   * ✅ 应该用："支持"、"反对"、"心疼"、"气死"、"666"、"绝了"
 请按照以下JSON模式定义格式化输出：
 
 <OUTPUT JSON SCHEMA>
@@ -248,18 +344,28 @@ SYSTEM_PROMPT_REFLECTION = f"""
 
 # 总结反思的系统提示词
 SYSTEM_PROMPT_REFLECTION_SUMMARY = f"""
- 你是一位深度研究助手。
- 你将获得搜索查询、搜索结果、段落标题以及你正在研究的报告段落的预期内容。
- 你正在迭代完善这个段落，并且段落的最新状态也会提供给你。
+ 你是一位资深的舆情分析师和内容优化专家。
+ 你正在深化和完善舆情报告段落，让其更贴近真实民意、更有说服力和感染力。
 数据将按照以下JSON模式定义提供：
 
 <INPUT JSON SCHEMA>
 {json.dumps(input_schema_reflection_summary, indent=2, ensure_ascii=False)}
 </INPUT JSON SCHEMA>
 
- 你的任务是根据搜索结果和预期内容丰富段落的当前最新状态。
- 不要删除最新状态中的关键信息，尽量丰富它，只添加缺失的信息。
- 适当地组织段落结构以便纳入报告中。
+ **你的任务：让段落更有人情味和真实感**
+ 
+ 优化策略：
+ 1. **融入新的民意数据**：将补充搜索到的真实用户声音整合到段落中
+ 2. **丰富情感表达**：增加具体的情感描述和社会情绪分析
+ 3. **补充遗漏观点**：添加之前缺失的不同群体、平台的观点
+ 4. **强化数据支撑**：用具体数字和案例让分析更有说服力
+ 5. **优化语言表达**：让文字更生动、更贴近民众，减少官方套话
+ 
+ 注意事项：
+ - 保留段落的核心观点和重要信息
+ - 增强内容的真实性和可信度
+ - 体现舆情的复杂性和多样性
+ - 让读者能感受到真实的社会脉搏
 请按照以下JSON模式定义格式化输出：
 
 <OUTPUT JSON SCHEMA>
@@ -272,14 +378,28 @@ SYSTEM_PROMPT_REFLECTION_SUMMARY = f"""
 
 # 最终研究报告格式化的系统提示词
 SYSTEM_PROMPT_REPORT_FORMATTING = f"""
- 你是一位深度研究助手。你已经完成了研究并构建了报告中所有段落的最终版本。
+ 你是一位专业的舆情报告编辑和格式化专家。你已经完成了深度的舆情分析并构建了报告中所有段落的最终版本。
 你将获得以下JSON格式的数据：
 
 <INPUT JSON SCHEMA>
 {json.dumps(input_schema_report_formatting, indent=2, ensure_ascii=False)}
 </INPUT JSON SCHEMA>
 
- 你的任务是将报告格式化为美观的形式，并以Markdown格式返回。
- 如果没有结论段落，请根据其他段落的最新状态在报告末尾添加一个结论。
- 使用段落标题来创建报告的标题。
+ **你的任务：将舆情分析格式化为专业、有感染力的报告**
+ 
+ 格式化要求：
+ 1. **标题设计**：创建吸引人、有概括性的报告标题
+ 2. **结构优化**：确保段落逻辑清晰，层次分明
+ 3. **突出重点**：用**粗体**、*斜体*等格式突出关键观点和数据
+ 4. **数据可视**：用表格或列表呈现重要的舆情数据
+ 5. **增强可读性**：合理使用分段、标题层级和格式化元素
+ 
+ 结论撰写（如果需要）：
+ - 总结主要的舆情发现和民意倾向
+ - 突出不同平台和群体的观点特征
+ - 提炼深层的社会情绪和价值观念
+ - 用数据和具体案例支撑结论
+ - 语言简洁有力，避免空洞套话
+ 
+ 最终输出：专业的Markdown格式舆情分析报告
 """
--- a/InsightEngine/tools/__init__.py
View file @c35a6ba
+++ b/InsightEngine/tools/__init__.py
View file @c35a6ba
 """
 工具调用模块
- 提供外部工具接口，如网络搜索等
+ 提供外部工具接口，如本地数据库查询等
 """
 
 from .search import (
-     TavilyNewsAgency, 
-     SearchResult, 
-     TavilyResponse, 
-     ImageResult,
+     MediaCrawlerDB,
+     QueryResult,
+     DBResponse,
     print_response_summary
 )
 
 __all__ = [
-     "TavilyNewsAgency", 
-     "SearchResult", 
-     "TavilyResponse", 
-     "ImageResult",
+     "MediaCrawlerDB",
+     "QueryResult",
+     "DBResponse",
     "print_response_summary"
 ]
--- a/InsightEngine/utils/config.py
View file @c35a6ba
+++ b/InsightEngine/utils/config.py
View file @c35a6ba
@@ -14,7 +14,14 @@ class Config:
     # API密钥
     deepseek_api_key: Optional[str] = None
     openai_api_key: Optional[str] = None
-     tavily_api_key: Optional[str] = None
+     
+     # 数据库配置
+     db_host: Optional[str] = None
+     db_user: Optional[str] = None
+     db_password: Optional[str] = None
+     db_name: Optional[str] = None
+     db_port: int = 3306
+     db_charset: str = "utf8mb4"
     
     # 模型配置
     default_llm_provider: str = "deepseek"  # deepseek 或 openai
@@ -44,8 +51,8 @@ class Config:
             print("错误: OpenAI API Key未设置")
             return False
         
-         if not self.tavily_api_key:
-             print("错误: Tavily API Key未设置")
+         if not all([self.db_host, self.db_user, self.db_password, self.db_name]):
+             print("错误: 数据库连接信息不完整")
             return False
         
         return True
@@ -65,7 +72,14 @@ class Config:
             return cls(
                 deepseek_api_key=getattr(config_module, "DEEPSEEK_API_KEY", None),
                 openai_api_key=getattr(config_module, "OPENAI_API_KEY", None),
-                 tavily_api_key=getattr(config_module, "TAVILY_API_KEY", None),
+                 
+                 db_host=getattr(config_module, "DB_HOST", None),
+                 db_user=getattr(config_module, "DB_USER", None),
+                 db_password=getattr(config_module, "DB_PASSWORD", None),
+                 db_name=getattr(config_module, "DB_NAME", None),
+                 db_port=getattr(config_module, "DB_PORT", 3306),
+                 db_charset=getattr(config_module, "DB_CHARSET", "utf8mb4"),
+                 
                 default_llm_provider=getattr(config_module, "DEFAULT_LLM_PROVIDER", "deepseek"),
                 deepseek_model=getattr(config_module, "DEEPSEEK_MODEL", "deepseek-chat"),
                 openai_model=getattr(config_module, "OPENAI_MODEL", "gpt-4o-mini"),
@@ -92,7 +106,14 @@ class Config:
             return cls(
                 deepseek_api_key=config_dict.get("DEEPSEEK_API_KEY"),
                 openai_api_key=config_dict.get("OPENAI_API_KEY"),
-                 tavily_api_key=config_dict.get("TAVILY_API_KEY"),
+                 
+                 db_host=config_dict.get("DB_HOST"),
+                 db_user=config_dict.get("DB_USER"),
+                 db_password=config_dict.get("DB_PASSWORD"),
+                 db_name=config_dict.get("DB_NAME"),
+                 db_port=int(config_dict.get("DB_PORT", "3306")),
+                 db_charset=config_dict.get("DB_CHARSET", "utf8mb4"),
+                 
                 default_llm_provider=config_dict.get("DEFAULT_LLM_PROVIDER", "deepseek"),
                 deepseek_model=config_dict.get("DEEPSEEK_MODEL", "deepseek-chat"),
                 openai_model=config_dict.get("OPENAI_MODEL", "gpt-4o-mini"),
@@ -147,7 +168,7 @@ def print_config(config: Config):
     print(f"LLM提供商: {config.default_llm_provider}")
     print(f"DeepSeek模型: {config.deepseek_model}")
     print(f"OpenAI模型: {config.openai_model}")
-     print(f"最大搜索结果数: {config.max_search_results}")
+ 
     print(f"搜索超时: {config.search_timeout}秒")
     print(f"最大内容长度: {config.max_content_length}")
     print(f"最大反思次数: {config.max_reflections}")
@@ -155,8 +176,11 @@ def print_config(config: Config):
     print(f"输出目录: {config.output_dir}")
     print(f"保存中间状态: {config.save_intermediate_states}")
     
-     # 显示API密钥状态（不显示实际密钥）
+     # 显示API密钥和数据库状态（不显示实际密钥）
     print(f"DeepSeek API Key: {'已设置' if config.deepseek_api_key else '未设置'}")
     print(f"OpenAI API Key: {'已设置' if config.openai_api_key else '未设置'}")
-     print(f"Tavily API Key: {'已设置' if config.tavily_api_key else '未设置'}")
+     print(f"数据库连接: {'已配置' if all([config.db_host, config.db_user, config.db_password, config.db_name]) else '未配置'}")
+     print(f"数据库主机: {config.db_host}")
+     print(f"数据库端口: {config.db_port}")
+     print(f"数据库名称: {config.db_name}")
     print("==================\n")
--- a/insight_engine_streamlit_app.py
View file @c35a6ba
+++ b/insight_engine_streamlit_app.py
View file @c35a6ba
@@ -12,8 +12,8 @@ import json
 # Add this script's own directory to the Python import path
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '.'))
 
- from QueryEngine import DeepSearchAgent, Config
- from config import DEEPSEEK_API_KEY, TAVILY_API_KEY
+ from InsightEngine import DeepSearchAgent, Config
+ from config import DEEPSEEK_API_KEY, DB_HOST, DB_USER, DB_PASSWORD, DB_NAME, DB_PORT, DB_CHARSET
 
 
 def main():
@@ -24,8 +24,8 @@ def main():
         layout="wide"
     )
     
-     st.title("Deep Search Agent")
-     st.markdown("基于DeepSeek的无框架深度搜索AI代理")
+     st.title("Insight Engine Agent")
+     st.markdown("基于DeepSeek的本地舆情数据库深度分析AI代理")
     
     # 侧边栏配置
     with st.sidebar:
@@ -96,21 +96,31 @@ def main():
             st.error("请提供OpenAI API Key")
             return
         
-         # 自动使用配置文件中的API密钥
+         # 自动使用配置文件中的API密钥和数据库配置
         deepseek_key = DEEPSEEK_API_KEY
-         tavily_key = TAVILY_API_KEY
+         db_host = DB_HOST
+         db_user = DB_USER
+         db_password = DB_PASSWORD
+         db_name = DB_NAME
+         db_port = DB_PORT
+         db_charset = DB_CHARSET
         
         # 创建配置
         config = Config(
             deepseek_api_key=deepseek_key if llm_provider == "deepseek" else None,
             openai_api_key=openai_key if llm_provider == "openai" else None,
-             tavily_api_key=tavily_key,
+             db_host=db_host,
+             db_user=db_user,
+             db_password=db_password,
+             db_name=db_name,
+             db_port=db_port,
+             db_charset=db_charset,
             default_llm_provider=llm_provider,
             deepseek_model=model_name if llm_provider == "deepseek" else "deepseek-chat",
             openai_model=model_name if llm_provider == "openai" else "gpt-4o-mini",
             max_reflections=max_reflections,
             max_content_length=max_content_length,
-             output_dir="query_engine_streamlit_reports"
+             output_dir="insight_engine_streamlit_reports"
         )
         
         # 执行研究