Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
戒酒的李白
2025-08-23 15:11:51 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
4e332246332e471daad1498753b20da911325245
4e332246
1 parent
c35a6baf
The Insight Engine agent has been basically completed.
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
424 additions
and
41 deletions
InsightEngine/agent.py
InsightEngine/prompts/prompts.py
InsightEngine/tools/__init__.py
InsightEngine/tools/keyword_optimizer.py
InsightEngine/tools/search.py
InsightEngine/utils/config.py
insight_engine_streamlit_app.py
InsightEngine/agent.py
View file @
4e33224
...
...
@@ -19,7 +19,7 @@ from .nodes import (
ReportFormattingNode
)
from
.state
import
State
from
.tools
import
MediaCrawlerDB
,
DBResponse
from
.tools
import
MediaCrawlerDB
,
DBResponse
,
keyword_optimizer
from
.utils
import
Config
,
load_config
,
format_search_results_for_prompt
...
...
@@ -113,7 +113,7 @@ class DeepSearchAgent:
def
execute_search_tool
(
self
,
tool_name
:
str
,
query
:
str
,
**
kwargs
)
->
DBResponse
:
"""
执行指定的数据库查询工具
执行指定的数据库查询工具
(集成关键词优化中间件)
Args:
tool_name: 工具名称,可选值:
...
...
@@ -130,34 +130,102 @@ class DeepSearchAgent:
"""
print
(
f
" → 执行数据库查询工具: {tool_name}"
)
# 对于热点内容搜索,不需要关键词优化(因为不需要query参数)
if
tool_name
==
"search_hot_content"
:
time_period
=
kwargs
.
get
(
"time_period"
,
"week"
)
limit
=
kwargs
.
get
(
"limit"
,
10
)
limit
=
kwargs
.
get
(
"limit"
,
10
0
)
return
self
.
search_agency
.
search_hot_content
(
time_period
=
time_period
,
limit
=
limit
)
elif
tool_name
==
"search_topic_globally"
:
limit_per_table
=
kwargs
.
get
(
"limit_per_table"
,
5
)
return
self
.
search_agency
.
search_topic_globally
(
topic
=
query
,
limit_per_table
=
limit_per_table
)
# 对于需要搜索词的工具,使用关键词优化中间件
optimized_response
=
keyword_optimizer
.
optimize_keywords
(
original_query
=
query
,
context
=
f
"使用{tool_name}工具进行查询"
)
print
(
f
" 🔍 原始查询: '{query}'"
)
print
(
f
" ✨ 优化后关键词: {optimized_response.optimized_keywords}"
)
# 使用优化后的关键词进行多次查询并整合结果
all_results
=
[]
total_count
=
0
for
keyword
in
optimized_response
.
optimized_keywords
:
print
(
f
" 查询关键词: '{keyword}'"
)
try
:
if
tool_name
==
"search_topic_globally"
:
limit_per_table
=
kwargs
.
get
(
"limit_per_table"
,
100
)
response
=
self
.
search_agency
.
search_topic_globally
(
topic
=
keyword
,
limit_per_table
=
limit_per_table
)
elif
tool_name
==
"search_topic_by_date"
:
start_date
=
kwargs
.
get
(
"start_date"
)
end_date
=
kwargs
.
get
(
"end_date"
)
limit_per_table
=
kwargs
.
get
(
"limit_per_table"
,
1
0
)
limit_per_table
=
kwargs
.
get
(
"limit_per_table"
,
10
0
)
if
not
start_date
or
not
end_date
:
raise
ValueError
(
"search_topic_by_date工具需要start_date和end_date参数"
)
return
self
.
search_agency
.
search_topic_by_date
(
topic
=
query
,
start_date
=
start_date
,
end_date
=
end_date
,
limit_per_table
=
limit_per_table
)
response
=
self
.
search_agency
.
search_topic_by_date
(
topic
=
keyword
,
start_date
=
start_date
,
end_date
=
end_date
,
limit_per_table
=
limit_per_table
)
elif
tool_name
==
"get_comments_for_topic"
:
limit
=
kwargs
.
get
(
"limit"
,
50
)
return
self
.
search_agency
.
get_comments_for_topic
(
topic
=
query
,
limit
=
limit
)
limit
=
kwargs
.
get
(
"limit"
,
500
)
//
len
(
optimized_response
.
optimized_keywords
)
limit
=
max
(
limit
,
50
)
response
=
self
.
search_agency
.
get_comments_for_topic
(
topic
=
keyword
,
limit
=
limit
)
elif
tool_name
==
"search_topic_on_platform"
:
platform
=
kwargs
.
get
(
"platform"
)
start_date
=
kwargs
.
get
(
"start_date"
)
end_date
=
kwargs
.
get
(
"end_date"
)
limit
=
kwargs
.
get
(
"limit"
,
20
)
limit
=
kwargs
.
get
(
"limit"
,
200
)
//
len
(
optimized_response
.
optimized_keywords
)
limit
=
max
(
limit
,
30
)
if
not
platform
:
raise
ValueError
(
"search_topic_on_platform工具需要platform参数"
)
return
self
.
search_agency
.
search_topic_on_platform
(
platform
=
platform
,
topic
=
query
,
start_date
=
start_date
,
end_date
=
end_date
,
limit
=
limit
)
response
=
self
.
search_agency
.
search_topic_on_platform
(
platform
=
platform
,
topic
=
keyword
,
start_date
=
start_date
,
end_date
=
end_date
,
limit
=
limit
)
else
:
print
(
f
" ⚠️ 未知的搜索工具: {tool_name},使用默认全局搜索"
)
return
self
.
search_agency
.
search_topic_globally
(
topic
=
query
)
print
(
f
" 未知的搜索工具: {tool_name},使用默认全局搜索"
)
response
=
self
.
search_agency
.
search_topic_globally
(
topic
=
keyword
,
limit_per_table
=
100
)
# 收集结果
if
response
.
results
:
print
(
f
" 找到 {len(response.results)} 条结果"
)
all_results
.
extend
(
response
.
results
)
total_count
+=
len
(
response
.
results
)
else
:
print
(
f
" 未找到结果"
)
except
Exception
as
e
:
print
(
f
" 查询'{keyword}'时出错: {str(e)}"
)
continue
# 去重和整合结果
unique_results
=
self
.
_deduplicate_results
(
all_results
)
print
(
f
" 总计找到 {total_count} 条结果,去重后 {len(unique_results)} 条"
)
# 构建整合后的响应
integrated_response
=
DBResponse
(
tool_name
=
f
"{tool_name}_optimized"
,
parameters
=
{
"original_query"
:
query
,
"optimized_keywords"
:
optimized_response
.
optimized_keywords
,
"optimization_reasoning"
:
optimized_response
.
reasoning
,
**
kwargs
},
results
=
unique_results
,
results_count
=
len
(
unique_results
)
)
return
integrated_response
def
_deduplicate_results
(
self
,
results
:
List
)
->
List
:
"""
去重搜索结果
"""
seen
=
set
()
unique_results
=
[]
for
result
in
results
:
# 使用URL或内容作为去重标识
identifier
=
result
.
url
if
result
.
url
else
result
.
title_or_content
[:
100
]
if
identifier
not
in
seen
:
seen
.
add
(
identifier
)
unique_results
.
append
(
result
)
return
unique_results
def
research
(
self
,
query
:
str
,
save_report
:
bool
=
True
)
->
str
:
"""
...
...
@@ -291,14 +359,14 @@ class DeepSearchAgent:
# 处理限制参数
if
search_tool
==
"search_hot_content"
:
time_period
=
search_output
.
get
(
"time_period"
,
"week"
)
limit
=
search_output
.
get
(
"limit"
,
10
)
limit
=
search_output
.
get
(
"limit"
,
10
0
)
search_kwargs
[
"time_period"
]
=
time_period
search_kwargs
[
"limit"
]
=
limit
elif
search_tool
in
[
"search_topic_globally"
,
"search_topic_by_date"
]:
limit_per_table
=
search_output
.
get
(
"limit_per_table"
,
5
)
limit_per_table
=
search_output
.
get
(
"limit_per_table"
,
100
)
search_kwargs
[
"limit_per_table"
]
=
limit_per_table
elif
search_tool
in
[
"get_comments_for_topic"
,
"search_topic_on_platform"
]:
limit
=
search_output
.
get
(
"limit"
,
20
)
limit
=
search_output
.
get
(
"limit"
,
20
0
)
search_kwargs
[
"limit"
]
=
limit
search_response
=
self
.
execute_search_tool
(
search_tool
,
search_query
,
**
search_kwargs
)
...
...
@@ -306,8 +374,8 @@ class DeepSearchAgent:
# 转换为兼容格式
search_results
=
[]
if
search_response
and
search_response
.
results
:
# 每种搜索工具都有其特定的结果数量,这里取前10个作为上限
max_results
=
min
(
len
(
search_response
.
results
),
10
)
# 每种搜索工具都有其特定的结果数量,这里取前100个作为上限
max_results
=
min
(
len
(
search_response
.
results
),
100
)
for
result
in
search_response
.
results
[:
max_results
]:
search_results
.
append
({
'title'
:
result
.
title_or_content
,
...
...
@@ -426,8 +494,8 @@ class DeepSearchAgent:
# 转换为兼容格式
search_results
=
[]
if
search_response
and
search_response
.
results
:
# 每种搜索工具都有其特定的结果数量,这里取前10个作为上限
max_results
=
min
(
len
(
search_response
.
results
),
10
)
# 每种搜索工具都有其特定的结果数量,这里取前100个作为上限
max_results
=
min
(
len
(
search_response
.
results
),
100
)
for
result
in
search_response
.
results
[:
max_results
]:
search_results
.
append
({
'title'
:
result
.
title_or_content
,
...
...
InsightEngine/prompts/prompts.py
View file @
4e33224
...
...
@@ -198,7 +198,7 @@ SYSTEM_PROMPT_FIRST_SEARCH = f"""
4. **参数优化配置**:
- search_topic_by_date: 必须提供start_date和end_date参数(格式:YYYY-MM-DD)
- search_topic_on_platform: 必须提供platform参数(bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba之一)
- 其他工具:合理配置limit参数以获取足够的样本
- 其他工具:合理配置limit参数以获取足够的样本
(建议:search_hot_content limit>=100,search_topic_globally limit_per_table>=50,search_topic_by_date limit_per_table>=100,get_comments_for_topic limit>=500,search_topic_on_platform limit>=200)
5. **阐述选择理由**:说明为什么这样的查询能够获得最真实的民意反馈
**搜索词设计核心原则**:
...
...
@@ -311,7 +311,7 @@ SYSTEM_PROMPT_REFLECTION = f"""
4. **参数配置要求**:
- search_topic_by_date: 必须提供start_date和end_date参数(格式:YYYY-MM-DD)
- search_topic_on_platform: 必须提供platform参数(bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba之一)
- 其他工具:合理配置参数以获取多样化的民意样本
- 其他工具:合理配置参数以获取多样化的民意样本
(建议:search_hot_content limit>=100,search_topic_globally limit_per_table>=50,search_topic_by_date limit_per_table>=100,get_comments_for_topic limit>=500,search_topic_on_platform limit>=200)
5. **阐述补充理由**:明确说明为什么需要这些额外的民意数据
...
...
InsightEngine/tools/__init__.py
View file @
4e33224
...
...
@@ -9,10 +9,18 @@ from .search import (
DBResponse
,
print_response_summary
)
from
.keyword_optimizer
import
(
KeywordOptimizer
,
KeywordOptimizationResponse
,
keyword_optimizer
)
__all__
=
[
"MediaCrawlerDB"
,
"QueryResult"
,
"DBResponse"
,
"print_response_summary"
"print_response_summary"
,
"KeywordOptimizer"
,
"KeywordOptimizationResponse"
,
"keyword_optimizer"
]
...
...
InsightEngine/tools/keyword_optimizer.py
0 → 100644
View file @
4e33224
"""
关键词优化中间件
使用Qwen AI将Agent生成的搜索词优化为更适合舆情数据库查询的关键词
"""
import
requests
import
json
import
sys
import
os
from
typing
import
List
,
Dict
,
Any
from
dataclasses
import
dataclass
# 添加项目根目录到Python路径以导入config
sys
.
path
.
append
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
__file__
))))
from
config
import
GUIJI_QWEN3_API_KEY
@dataclass
class KeywordOptimizationResponse:
    """Result of one keyword-optimization request."""
    # The query exactly as it was passed to the optimizer.
    original_query: str
    # Keywords to search with (API-derived, or produced by the local fallback).
    optimized_keywords: List[str]
    # Model-provided rationale, or a fixed message describing which fallback ran.
    reasoning: str
    # Set by the optimizer; False only when an unexpected exception forced the
    # last-resort fallback, True on every other path (including API failure).
    success: bool
    # Error text from the API call or the caught exception; empty otherwise.
    error_message: str = ""
class KeywordOptimizer:
    """Rewrite agent-generated search queries into database-friendly keywords.

    The query is sent to a Qwen3 model through the SiliconFlow
    chat-completions endpoint and the reply is parsed into a keyword list.
    Whenever the API call or the parsing fails, a simple local tokenisation
    of the original query is used instead, so callers always receive at
    least one keyword.
    """

    # Upper bound on the number of keywords any method returns.
    _MAX_KEYWORDS = 20
    # Longest keyword (in characters) that is still accepted.
    _MAX_KEYWORD_LENGTH = 20
    # ASCII and curly quote characters stripped from both ends of a keyword.
    _QUOTE_CHARS = '"\'\u201c\u201d\u2018\u2019'
    # Phrases considered too formal/official; keywords containing one are dropped.
    _BAD_KEYWORDS = frozenset({
        '态度分析', '公众反应', '情绪倾向',
        '未来展望', '发展趋势', '战略规划', '政策导向', '管理机制',
    })

    def __init__(self, api_key: str = None):
        """Initialise the optimizer.

        Args:
            api_key: SiliconFlow API key. When omitted (or falsy) the value
                of ``GUIJI_QWEN3_API_KEY`` from the project config is used.

        Raises:
            ValueError: If no API key is available from either source.
        """
        self.api_key = api_key or GUIJI_QWEN3_API_KEY
        self.base_url = "https://api.siliconflow.cn/v1/chat/completions"
        self.model = "Qwen/Qwen3-30B-A3B-Instruct-2507"

        if not self.api_key:
            raise ValueError("未找到硅基流动API密钥,请在config.py中设置GUIJI_QWEN3_API_KEY")

    def optimize_keywords(self, original_query: str, context: str = "") -> "KeywordOptimizationResponse":
        """Turn a search query into a list of optimized keywords.

        Args:
            original_query: The raw search query generated by the agent.
            context: Optional extra context appended to the user prompt.

        Returns:
            A ``KeywordOptimizationResponse``. ``optimized_keywords`` comes
            from the model when its reply can be parsed into at least one
            valid keyword, otherwise from ``_fallback_keyword_extraction``.
            ``success`` is False only when an unexpected exception escaped
            the normal flow.
        """
        print(f"🔍 关键词优化中间件: 处理查询 '{original_query}'")

        try:
            system_prompt = self._build_system_prompt()
            user_prompt = self._build_user_prompt(original_query, context)
            response = self._call_qwen_api(system_prompt, user_prompt)

            if not response["success"]:
                print(f"❌ API调用失败: {response['error']}")
                fallback_keywords = self._fallback_keyword_extraction(original_query)
                return KeywordOptimizationResponse(
                    original_query=original_query,
                    optimized_keywords=fallback_keywords,
                    reasoning="API调用失败,使用备用关键词提取",
                    success=True,
                    error_message=response['error']
                )

            content = response["content"]
            try:
                # Models frequently wrap JSON in a Markdown code fence;
                # unwrap it before deciding whether the reply is JSON.
                payload = self._strip_code_fence(content)
                if payload.startswith('{'):
                    parsed = json.loads(payload)
                    keywords = parsed.get("keywords", [])
                    reasoning = parsed.get("reasoning", "")
                    if isinstance(keywords, str):
                        # A bare string would otherwise be iterated
                        # character by character during validation.
                        keywords = [keywords]
                else:
                    # Not JSON: scrape keywords out of the free text.
                    keywords = self._extract_keywords_from_text(content)
                    reasoning = content

                validated_keywords = self._validate_keywords(keywords)
                if not validated_keywords:
                    # An empty list would leave the caller with nothing to
                    # search for; route to the fallback handler below.
                    raise ValueError("响应中没有可用的关键词")

                print(f"✅ 优化成功: {len(validated_keywords)}个关键词")
                for i, keyword in enumerate(validated_keywords, 1):
                    print(f" {i}. '{keyword}'")

                return KeywordOptimizationResponse(
                    original_query=original_query,
                    optimized_keywords=validated_keywords,
                    reasoning=reasoning,
                    success=True
                )
            except Exception as e:
                print(f"⚠️ 解析响应失败,使用备用方案: {str(e)}")
                fallback_keywords = self._fallback_keyword_extraction(original_query)
                return KeywordOptimizationResponse(
                    original_query=original_query,
                    optimized_keywords=fallback_keywords,
                    reasoning="API响应解析失败,使用备用关键词提取",
                    success=True
                )

        except Exception as e:
            print(f"❌ 关键词优化失败: {str(e)}")
            # Last-resort fallback for anything not handled above.
            fallback_keywords = self._fallback_keyword_extraction(original_query)
            return KeywordOptimizationResponse(
                original_query=original_query,
                optimized_keywords=fallback_keywords,
                reasoning="系统错误,使用备用关键词提取",
                success=False,
                error_message=str(e)
            )

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Return *content* stripped of whitespace and of a surrounding ``` fence."""
        payload = content.strip()
        if payload.startswith("```"):
            # Drop the opening fence line (which may carry a language tag)
            # and everything from the closing fence onwards.
            _, _, payload = payload.partition("\n")
            payload = payload.rsplit("```", 1)[0]
        return payload.strip()

    def _build_system_prompt(self) -> str:
        """Return the fixed system prompt sent with every request."""
        # NOTE(review): rule 5 asks for 10-20 keywords while the worked
        # example shows 5, and the heading of rule 6 ("avoid repetition")
        # does not match its text ("stay on topic") - confirm the intent
        # before editing the prompt.
        return """你是一位专业的舆情数据挖掘专家。你的任务是将用户提供的搜索查询优化为更适合在社交媒体舆情数据库中查找的关键词。
**核心原则**:
1. **贴近网民语言**:使用普通网友在社交媒体上会使用的词汇
2. **避免专业术语**:不使用"舆情"、"传播"、"倾向"、"展望"等官方词汇
3. **简洁具体**:每个关键词要非常简洁明了,便于数据库匹配
4. **情感丰富**:包含网民常用的情感表达词汇
5. **数量控制**:最少提供10个关键词,最多提供20个关键词
6. **避免重复**:不要脱离初始查询的主题
**输出格式**:
请以JSON格式返回结果:
{
"keywords": ["关键词1", "关键词2", "关键词3"],
"reasoning": "选择这些关键词的理由"
}
**示例**:
输入:"武汉大学舆情管理 未来展望 发展趋势"
输出:
{
"keywords": ["武大", "武汉大学", "学校管理", "大学", "教育"],
"reasoning": "选择'武大'和'武汉大学'作为核心词汇,这是网民最常使用的称呼;'学校管理'比'舆情管理'更贴近日常表达;避免使用'未来展望'、'发展趋势'等网民很少使用的专业术语"
}"""

    def _build_user_prompt(self, original_query: str, context: str) -> str:
        """Compose the user prompt from the query and optional context."""
        prompt = f"请将以下搜索查询优化为适合舆情数据库查询的关键词:\n\n原始查询:{original_query}"

        if context:
            prompt += f"\n\n上下文信息:{context}"

        prompt += "\n\n请记住:要使用网民在社交媒体上真实使用的词汇,避免官方术语和专业词汇。"
        return prompt

    def _call_qwen_api(self, system_prompt: str, user_prompt: str) -> Dict[str, Any]:
        """POST one chat-completion request and normalise the outcome.

        Returns:
            ``{"success": True, "content": <str>}`` when the reply contains
            at least one choice, otherwise ``{"success": False, "error":
            <str>}``. Exceptions are converted into the error form rather
            than propagated.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        data = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "max_tokens": 10000,
            "temperature": 0.7
        }

        try:
            response = requests.post(self.base_url, headers=headers, json=data, timeout=30)
            response.raise_for_status()
            result = response.json()

            if "choices" in result and len(result["choices"]) > 0:
                content = result["choices"][0]["message"]["content"]
                return {"success": True, "content": content}
            return {"success": False, "error": "API返回格式异常"}
        except requests.exceptions.RequestException as e:
            return {"success": False, "error": f"网络请求错误: {str(e)}"}
        except Exception as e:
            # Deliberately broad: callers rely on the dict contract and
            # switch to the local fallback on any failure.
            return {"success": False, "error": f"API调用异常: {str(e)}"}

    def _extract_keywords_from_text(self, text: str) -> List[str]:
        """Scrape keywords from a free-text (non-JSON) model reply.

        For each line containing a colon (full-width preferred, then ASCII)
        the text between the first and second colon is split on the
        enumeration comma, full-width comma or ASCII comma. If that yields
        nothing, every quoted fragment in the text is used instead.

        Returns:
            Up to 20 non-empty keywords of at most 20 characters, with
            surrounding quotes and square brackets removed.
        """
        import re

        def clean(raw):
            # Remove surrounding whitespace, quotes and list brackets.
            return raw.strip().strip(self._QUOTE_CHARS + '[]').strip()

        def acceptable(keyword):
            return bool(keyword) and len(keyword) <= self._MAX_KEYWORD_LENGTH

        keywords = []
        for line in text.split('\n'):
            line = line.strip()
            if ':' in line:
                parts = line.split(':')
            elif ':' in line:
                parts = line.split(':')
            else:
                continue
            for piece in re.split('[、,,]', parts[1]):
                keyword = clean(piece)
                if acceptable(keyword):
                    keywords.append(keyword)

        if not keywords:
            # Fall back to anything that appears between quote characters.
            quote_class = '["\'\u201c\u201d\u2018\u2019]'
            for quoted in re.findall(quote_class + '(.*?)' + quote_class, text):
                keyword = clean(quoted)
                if acceptable(keyword):
                    keywords.append(keyword)

        return keywords[:self._MAX_KEYWORDS]

    def _validate_keywords(self, keywords: List[str]) -> List[str]:
        """Clean, filter and deduplicate candidate keywords.

        Non-string items are ignored. Each remaining keyword is stripped of
        whitespace and surrounding quotes, then kept only if it is
        non-empty, at most 20 characters long, contains none of the
        blacklisted formal phrases and has not been seen already.

        Returns:
            At most 20 keywords in first-seen order.
        """
        validated = []
        seen = set()

        for keyword in keywords:
            if not isinstance(keyword, str):
                continue
            keyword = keyword.strip().strip(self._QUOTE_CHARS).strip()
            if not keyword or len(keyword) > self._MAX_KEYWORD_LENGTH:
                continue
            if any(bad_word in keyword for bad_word in self._BAD_KEYWORDS):
                continue
            if keyword in seen:
                # Each keyword triggers its own database query downstream,
                # so a duplicate would only repeat work.
                continue
            seen.add(keyword)
            validated.append(keyword)

        return validated[:self._MAX_KEYWORDS]

    def _fallback_keyword_extraction(self, original_query: str) -> List[str]:
        """Derive keywords locally when the API result cannot be used.

        The query is split on whitespace and common punctuation (full-width
        and ASCII); tokens of at least two characters are kept without
        duplicates. If none qualify, the first whitespace-separated word is
        returned, or "热门" when the query is empty or blank.

        Returns:
            A non-empty list of at most 20 keywords.
        """
        import re

        tokens = re.split(r'[\s,。!?;:、,;!?]+', original_query)

        keywords = []
        for token in tokens:
            token = token.strip()
            if len(token) >= 2 and token not in keywords:
                keywords.append(token)

        if not keywords:
            # No token was long enough; a blank query must not yield a
            # whitespace-only keyword.
            words = original_query.split()
            keywords = [words[0]] if words else ["热门"]

        return keywords[:self._MAX_KEYWORDS]
# Shared module-level instance.
# NOTE: this is constructed at import time, and KeywordOptimizer.__init__
# raises ValueError when no API key is configured, so importing this module
# fails in that case.
keyword_optimizer = KeywordOptimizer()
...
...
InsightEngine/tools/search.py
View file @
4e33224
...
...
@@ -2,7 +2,7 @@
专为 AI Agent 设计的本地舆情数据库查询工具集 (MediaCrawlerDB)
版本: 3.0
最后更新: 2025-08-2
2
最后更新: 2025-08-2
3
此脚本将复杂的本地MySQL数据库查询功能封装成一系列目标明确、参数清晰的独立工具,
专为AI Agent调用而设计。Agent只需根据任务意图(如搜索热点、全局搜索话题、
...
...
@@ -44,7 +44,7 @@ class QueryResult:
publish_time
:
Optional
[
datetime
]
=
None
engagement
:
Dict
[
str
,
int
]
=
field
(
default_factory
=
dict
)
source_keyword
:
Optional
[
str
]
=
None
hotness_score
:
float
=
0.0
# 新增:综合热度分
hotness_score
:
float
=
0.0
source_table
:
str
=
""
@dataclass
...
...
@@ -136,14 +136,14 @@ class MediaCrawlerDB:
def
search_hot_content
(
self
,
time_period
:
Literal
[
'24h'
,
'week'
,
'year'
]
=
'week'
,
limit
:
int
=
1
0
limit
:
int
=
5
0
)
->
DBResponse
:
"""
【工具】查找热点内容:
(已简化)
获取最近一段时间内综合热度最高的内容。
【工具】查找热点内容: 获取最近一段时间内综合热度最高的内容。
Args:
time_period (Literal['24h', 'week', 'year']): 时间范围,默认为 'week'。
limit (int): 返回结果的最大数量,默认为
1
0。
limit (int): 返回结果的最大数量,默认为
5
0。
Returns:
DBResponse: 包含按综合热度排序后的内容列表。
...
...
@@ -190,13 +190,13 @@ class MediaCrawlerDB:
formatted_results
=
[
QueryResult
(
platform
=
r
[
'p'
],
content_type
=
r
[
't'
],
title_or_content
=
r
[
'title'
],
author_nickname
=
r
.
get
(
'author'
),
url
=
r
[
'url'
],
publish_time
=
self
.
_to_datetime
(
r
[
'ts'
]),
engagement
=
self
.
_extract_engagement
(
r
),
hotness_score
=
r
.
get
(
'hotness_score'
,
0.0
),
source_keyword
=
r
.
get
(
'source_keyword'
),
source_table
=
r
[
'tbl'
])
for
r
in
raw_results
]
return
DBResponse
(
"search_hot_content"
,
params_for_log
,
results
=
formatted_results
,
results_count
=
len
(
formatted_results
))
def
search_topic_globally
(
self
,
topic
:
str
,
limit_per_table
:
int
=
5
)
->
DBResponse
:
def
search_topic_globally
(
self
,
topic
:
str
,
limit_per_table
:
int
=
100
)
->
DBResponse
:
"""
【工具】全局话题搜索: 在数据库中(内容、评论、标签、来源关键字)全面搜索指定话题。
Args:
topic (str): 要搜索的话题关键词。
limit_per_table (int): 从每个相关表中返回的最大记录数,默认为
5
。
limit_per_table (int): 从每个相关表中返回的最大记录数,默认为
100
。
Returns:
DBResponse: 包含所有匹配结果的聚合列表。
...
...
@@ -227,7 +227,7 @@ class MediaCrawlerDB:
))
return
DBResponse
(
"search_topic_globally"
,
params_for_log
,
results
=
all_results
,
results_count
=
len
(
all_results
))
def
search_topic_by_date
(
self
,
topic
:
str
,
start_date
:
str
,
end_date
:
str
,
limit_per_table
:
int
=
10
)
->
DBResponse
:
def
search_topic_by_date
(
self
,
topic
:
str
,
start_date
:
str
,
end_date
:
str
,
limit_per_table
:
int
=
10
0
)
->
DBResponse
:
"""
【工具】按日期搜索话题: 在明确的历史时间段内,搜索与特定话题相关的内容。
...
...
@@ -235,7 +235,7 @@ class MediaCrawlerDB:
topic (str): 要搜索的话题关键词。
start_date (str): 开始日期,格式 'YYYY-MM-DD'。
end_date (str): 结束日期,格式 'YYYY-MM-DD'。
limit_per_table (int): 从每个相关表中返回的最大记录数,默认为 10。
limit_per_table (int): 从每个相关表中返回的最大记录数,默认为 10
0
。
Returns:
DBResponse: 包含在指定日期范围内找到的结果的聚合列表。
...
...
@@ -282,13 +282,13 @@ class MediaCrawlerDB:
))
return
DBResponse
(
"search_topic_by_date"
,
params_for_log
,
results
=
all_results
,
results_count
=
len
(
all_results
))
def
get_comments_for_topic
(
self
,
topic
:
str
,
limit
:
int
=
50
)
->
DBResponse
:
def
get_comments_for_topic
(
self
,
topic
:
str
,
limit
:
int
=
50
0
)
->
DBResponse
:
"""
【工具】获取话题评论: 专门搜索并返回所有平台中与特定话题相关的公众评论数据。
Args:
topic (str): 要搜索的话题关键词。
limit (int): 返回评论的总数量上限,默认为 50。
limit (int): 返回评论的总数量上限,默认为 50
0
。
Returns:
DBResponse: 包含匹配的评论列表。
...
...
InsightEngine/utils/config.py
View file @
4e33224
...
...
@@ -30,11 +30,18 @@ class Config:
# 搜索配置
search_timeout
:
int
=
240
max_content_length
:
int
=
20000
max_content_length
:
int
=
100000
# 数据库查询限制
default_search_hot_content_limit
:
int
=
100
default_search_topic_globally_limit_per_table
:
int
=
50
default_search_topic_by_date_limit_per_table
:
int
=
100
default_get_comments_for_topic_limit
:
int
=
500
default_search_topic_on_platform_limit
:
int
=
200
# Agent配置
max_reflections
:
int
=
2
max_paragraphs
:
int
=
5
max_reflections
:
int
=
3
max_paragraphs
:
int
=
6
# 输出配置
output_dir
:
str
=
"reports"
...
...
@@ -85,7 +92,14 @@ class Config:
openai_model
=
getattr
(
config_module
,
"OPENAI_MODEL"
,
"gpt-4o-mini"
),
search_timeout
=
getattr
(
config_module
,
"SEARCH_TIMEOUT"
,
240
),
max_content_length
=
getattr
(
config_module
,
"SEARCH_CONTENT_MAX_LENGTH"
,
20000
),
max_content_length
=
getattr
(
config_module
,
"SEARCH_CONTENT_MAX_LENGTH"
,
200000
),
default_search_hot_content_limit
=
getattr
(
config_module
,
"DEFAULT_SEARCH_HOT_CONTENT_LIMIT"
,
100
),
default_search_topic_globally_limit_per_table
=
getattr
(
config_module
,
"DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE"
,
50
),
default_search_topic_by_date_limit_per_table
=
getattr
(
config_module
,
"DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE"
,
100
),
default_get_comments_for_topic_limit
=
getattr
(
config_module
,
"DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT"
,
500
),
default_search_topic_on_platform_limit
=
getattr
(
config_module
,
"DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT"
,
200
),
max_reflections
=
getattr
(
config_module
,
"MAX_REFLECTIONS"
,
2
),
max_paragraphs
=
getattr
(
config_module
,
"MAX_PARAGRAPHS"
,
5
),
output_dir
=
getattr
(
config_module
,
"OUTPUT_DIR"
,
"reports"
),
...
...
@@ -119,7 +133,14 @@ class Config:
openai_model
=
config_dict
.
get
(
"OPENAI_MODEL"
,
"gpt-4o-mini"
),
search_timeout
=
int
(
config_dict
.
get
(
"SEARCH_TIMEOUT"
,
"240"
)),
max_content_length
=
int
(
config_dict
.
get
(
"SEARCH_CONTENT_MAX_LENGTH"
,
"20000"
)),
max_content_length
=
int
(
config_dict
.
get
(
"SEARCH_CONTENT_MAX_LENGTH"
,
"200000"
)),
default_search_hot_content_limit
=
int
(
config_dict
.
get
(
"DEFAULT_SEARCH_HOT_CONTENT_LIMIT"
,
"100"
)),
default_search_topic_globally_limit_per_table
=
int
(
config_dict
.
get
(
"DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE"
,
"50"
)),
default_search_topic_by_date_limit_per_table
=
int
(
config_dict
.
get
(
"DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE"
,
"100"
)),
default_get_comments_for_topic_limit
=
int
(
config_dict
.
get
(
"DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT"
,
"500"
)),
default_search_topic_on_platform_limit
=
int
(
config_dict
.
get
(
"DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT"
,
"200"
)),
max_reflections
=
int
(
config_dict
.
get
(
"MAX_REFLECTIONS"
,
"2"
)),
max_paragraphs
=
int
(
config_dict
.
get
(
"MAX_PARAGRAPHS"
,
"5"
)),
output_dir
=
config_dict
.
get
(
"OUTPUT_DIR"
,
"reports"
),
...
...
insight_engine_streamlit_app.py
View file @
4e33224
...
...
@@ -34,7 +34,7 @@ def main():
# 高级配置
st
.
subheader
(
"高级配置"
)
max_reflections
=
st
.
slider
(
"反思次数"
,
1
,
5
,
2
)
max_content_length
=
st
.
number_input
(
"最大内容长度"
,
1000
,
50000
,
20000
)
max_content_length
=
st
.
number_input
(
"最大内容长度"
,
1000
0
,
500000
,
200000
)
# 提高10倍:1000-50000-20000 → 10000-500000-200000
# 模型选择
llm_provider
=
st
.
selectbox
(
"LLM提供商"
,
[
"deepseek"
,
"openai"
])
...
...
Please
register
or
login
to post a comment