Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
ghmark675
2025-11-15 15:34:47 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
3084df6d218b38316efc479036c59c8f7ef07614
3084df6d
1 parent
aa3b9130
feat(insight_agent): search results cluster
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
301 additions
and
108 deletions
InsightEngine/agent.py
InsightEngine/agent.py
View file @
3084df6
...
...
@@ -7,22 +7,35 @@ import json
import
os
import
re
from
datetime
import
datetime
from
typing
import
Optional
,
Dict
,
Any
,
List
,
Union
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Union
import
numpy
as
np
from
loguru
import
logger
from
sentence_transformers
import
SentenceTransformer
from
sklearn.cluster
import
KMeans
from
.llms
import
LLMClient
from
.nodes
import
(
ReportStructureNode
,
FirstSearchNode
,
ReflectionNode
,
FirstSummaryNode
,
ReflectionNode
,
ReflectionSummaryNode
,
ReportFormattingNode
ReportFormattingNode
,
ReportStructureNode
,
)
from
.state
import
State
from
.tools
import
MediaCrawlerDB
,
DBResponse
,
keyword_optimizer
,
multilingual_sentiment_analyzer
from
.utils.config
import
settings
,
Settings
from
.tools
import
(
DBResponse
,
MediaCrawlerDB
,
keyword_optimizer
,
multilingual_sentiment_analyzer
,
)
from
.utils
import
format_search_results_for_prompt
from
.utils.config
import
Settings
,
settings
ENABLE_CLUSTERING: bool = True  # Whether cluster-based sampling of search results is enabled
MAX_CLUSTERED_RESULTS: int = 50  # Maximum number of results returned after clustering
RESULTS_PER_CLUSTER: int = 5  # Number of results returned from each cluster
class
DeepSearchAgent
:
...
...
@@ -40,10 +53,12 @@ class DeepSearchAgent:
# 初始化LLM客户端
self
.
llm_client
=
self
.
_initialize_llm
()
# 初始化搜索工具集
self
.
search_agency
=
MediaCrawlerDB
()
# 初始化聚类小模型(懒加载)
self
.
_clustering_model
=
None
# 初始化情感分析器
self
.
sentiment_analyzer
=
multilingual_sentiment_analyzer
...
...
@@ -77,6 +92,15 @@ class DeepSearchAgent:
self
.
reflection_summary_node
=
ReflectionSummaryNode
(
self
.
llm_client
)
self
.
report_formatting_node
=
ReportFormattingNode
(
self
.
llm_client
)
def
_get_clustering_model
(
self
):
"""懒加载聚类模型"""
if
self
.
_clustering_model
is
None
:
logger
.
info
(
" 加载聚类模型 (paraphrase-multilingual-MiniLM-L12-v2)..."
)
self
.
_clustering_model
=
SentenceTransformer
(
"paraphrase-multilingual-MiniLM-L12-v2"
)
return
self
.
_clustering_model
def
_validate_date_format
(
self
,
date_str
:
str
)
->
bool
:
"""
验证日期格式是否为YYYY-MM-DD
...
...
@@ -91,17 +115,75 @@ class DeepSearchAgent:
return
False
# 检查格式
pattern
=
r
'^
\
d{4}-
\
d{2}-
\
d{2}$'
pattern
=
r
"^
\
d{4}-
\
d{2}-
\
d{2}$"
if
not
re
.
match
(
pattern
,
date_str
):
return
False
# 检查日期是否有效
try
:
datetime
.
strptime
(
date_str
,
'
%
Y-
%
m-
%
d'
)
datetime
.
strptime
(
date_str
,
"
%
Y-
%
m-
%
d"
)
return
True
except
ValueError
:
return
False
def
_cluster_and_sample_results
(
self
,
results
:
List
,
max_results
:
int
=
50
,
results_per_cluster
:
int
=
5
)
->
List
:
"""
对搜索结果进行聚类并采样
Args:
results: 搜索结果列表
max_results: 最大返回结果数
results_per_cluster: 每个聚类返回的结果数
Returns:
采样后的结果列表
"""
if
len
(
results
)
<=
max_results
:
return
results
try
:
# 提取文本
texts
=
[
r
.
title_or_content
[:
500
]
for
r
in
results
]
# 获取模型并编码
model
=
self
.
_get_clustering_model
()
embeddings
=
model
.
encode
(
texts
,
show_progress_bar
=
False
)
# 计算聚类数
n_clusters
=
min
(
max
(
2
,
max_results
//
results_per_cluster
),
len
(
results
))
# KMeans聚类
kmeans
=
KMeans
(
n_clusters
=
n_clusters
,
random_state
=
42
,
n_init
=
10
)
labels
=
kmeans
.
fit_predict
(
embeddings
)
# 从每个聚类采样
sampled_results
=
[]
for
cluster_id
in
range
(
n_clusters
):
cluster_indices
=
np
.
where
(
labels
==
cluster_id
)[
0
]
cluster_results
=
[(
results
[
i
],
i
)
for
i
in
cluster_indices
]
cluster_results
.
sort
(
key
=
lambda
x
:
x
[
0
]
.
hotness_score
or
0
,
reverse
=
True
)
for
result
,
_
in
cluster_results
[:
results_per_cluster
]:
sampled_results
.
append
(
result
)
if
len
(
sampled_results
)
>=
max_results
:
break
if
len
(
sampled_results
)
>=
max_results
:
break
logger
.
info
(
f
" 聚类完成: {len(results)} 条 -> {n_clusters} 个主题 -> {len(sampled_results)} 条代表性结果"
)
return
sampled_results
except
Exception
as
e
:
logger
.
warning
(
f
" 聚类失败,返回前{max_results}条: {str(e)}"
)
return
results
[:
max_results
]
def
execute_search_tool
(
self
,
tool_name
:
str
,
query
:
str
,
**
kwargs
)
->
DBResponse
:
"""
执行指定的数据库查询工具(集成关键词优化中间件和情感分析)
...
...
@@ -127,7 +209,9 @@ class DeepSearchAgent:
if
tool_name
==
"search_hot_content"
:
time_period
=
kwargs
.
get
(
"time_period"
,
"week"
)
limit
=
kwargs
.
get
(
"limit"
,
100
)
response
=
self
.
search_agency
.
search_hot_content
(
time_period
=
time_period
,
limit
=
limit
)
response
=
self
.
search_agency
.
search_hot_content
(
time_period
=
time_period
,
limit
=
limit
)
# 检查是否需要进行情感分析
enable_sentiment
=
kwargs
.
get
(
"enable_sentiment"
,
True
)
...
...
@@ -151,17 +235,16 @@ class DeepSearchAgent:
tool_name
=
"analyze_sentiment"
,
parameters
=
{
"texts"
:
texts
if
isinstance
(
texts
,
list
)
else
[
texts
],
**
kwargs
**
kwargs
,
},
results
=
[],
# 情感分析不返回搜索结果
results_count
=
0
,
metadata
=
sentiment_result
metadata
=
sentiment_result
,
)
# 对于需要搜索词的工具,使用关键词优化中间件
optimized_response
=
keyword_optimizer
.
optimize_keywords
(
original_query
=
query
,
context
=
f
"使用{tool_name}工具进行查询"
original_query
=
query
,
context
=
f
"使用{tool_name}工具进行查询"
)
logger
.
info
(
f
" 🔍 原始查询: '{query}'"
)
...
...
@@ -177,34 +260,62 @@ class DeepSearchAgent:
try
:
if
tool_name
==
"search_topic_globally"
:
# 使用配置文件中的默认值,忽略agent提供的limit_per_table参数
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
response
=
self
.
search_agency
.
search_topic_globally
(
topic
=
keyword
,
limit_per_table
=
limit_per_table
)
limit_per_table
=
(
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
)
response
=
self
.
search_agency
.
search_topic_globally
(
topic
=
keyword
,
limit_per_table
=
limit_per_table
)
elif
tool_name
==
"search_topic_by_date"
:
start_date
=
kwargs
.
get
(
"start_date"
)
end_date
=
kwargs
.
get
(
"end_date"
)
# 使用配置文件中的默认值,忽略agent提供的limit_per_table参数
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
limit_per_table
=
(
self
.
config
.
DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
)
if
not
start_date
or
not
end_date
:
raise
ValueError
(
"search_topic_by_date工具需要start_date和end_date参数"
)
response
=
self
.
search_agency
.
search_topic_by_date
(
topic
=
keyword
,
start_date
=
start_date
,
end_date
=
end_date
,
limit_per_table
=
limit_per_table
)
raise
ValueError
(
"search_topic_by_date工具需要start_date和end_date参数"
)
response
=
self
.
search_agency
.
search_topic_by_date
(
topic
=
keyword
,
start_date
=
start_date
,
end_date
=
end_date
,
limit_per_table
=
limit_per_table
,
)
elif
tool_name
==
"get_comments_for_topic"
:
# 使用配置文件中的默认值,按关键词数量分配,但保证最小值
limit
=
self
.
config
.
DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT
//
len
(
optimized_response
.
optimized_keywords
)
limit
=
self
.
config
.
DEFAULT_GET_COMMENTS_FOR_TOPIC_LIMIT
//
len
(
optimized_response
.
optimized_keywords
)
limit
=
max
(
limit
,
50
)
response
=
self
.
search_agency
.
get_comments_for_topic
(
topic
=
keyword
,
limit
=
limit
)
response
=
self
.
search_agency
.
get_comments_for_topic
(
topic
=
keyword
,
limit
=
limit
)
elif
tool_name
==
"search_topic_on_platform"
:
platform
=
kwargs
.
get
(
"platform"
)
start_date
=
kwargs
.
get
(
"start_date"
)
end_date
=
kwargs
.
get
(
"end_date"
)
# 使用配置文件中的默认值,按关键词数量分配,但保证最小值
limit
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
//
len
(
optimized_response
.
optimized_keywords
)
limit
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
//
len
(
optimized_response
.
optimized_keywords
)
limit
=
max
(
limit
,
30
)
if
not
platform
:
raise
ValueError
(
"search_topic_on_platform工具需要platform参数"
)
response
=
self
.
search_agency
.
search_topic_on_platform
(
platform
=
platform
,
topic
=
keyword
,
start_date
=
start_date
,
end_date
=
end_date
,
limit
=
limit
)
response
=
self
.
search_agency
.
search_topic_on_platform
(
platform
=
platform
,
topic
=
keyword
,
start_date
=
start_date
,
end_date
=
end_date
,
limit
=
limit
,
)
else
:
logger
.
info
(
f
" 未知的搜索工具: {tool_name},使用默认全局搜索"
)
response
=
self
.
search_agency
.
search_topic_globally
(
topic
=
keyword
,
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
)
response
=
self
.
search_agency
.
search_topic_globally
(
topic
=
keyword
,
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
,
)
# 收集结果
if
response
.
results
:
...
...
@@ -222,6 +333,13 @@ class DeepSearchAgent:
unique_results
=
self
.
_deduplicate_results
(
all_results
)
logger
.
info
(
f
" 总计找到 {total_count} 条结果,去重后 {len(unique_results)} 条"
)
if
ENABLE_CLUSTERING
:
unique_results
=
self
.
_cluster_and_sample_results
(
unique_results
,
max_results
=
MAX_CLUSTERED_RESULTS
,
results_per_cluster
=
RESULTS_PER_CLUSTER
,
)
# 构建整合后的响应
integrated_response
=
DBResponse
(
tool_name
=
f
"{tool_name}_optimized"
,
...
...
@@ -229,10 +347,10 @@ class DeepSearchAgent:
"original_query"
:
query
,
"optimized_keywords"
:
optimized_response
.
optimized_keywords
,
"optimization_reasoning"
:
optimized_response
.
reasoning
,
**
kwargs
**
kwargs
,
},
results
=
unique_results
,
results_count
=
len
(
unique_results
)
results_count
=
len
(
unique_results
)
,
)
# 检查是否需要进行情感分析
...
...
@@ -242,7 +360,9 @@ class DeepSearchAgent:
sentiment_analysis
=
self
.
_perform_sentiment_analysis
(
unique_results
)
if
sentiment_analysis
:
# 将情感分析结果添加到响应的parameters中
integrated_response
.
parameters
[
"sentiment_analysis"
]
=
sentiment_analysis
integrated_response
.
parameters
[
"sentiment_analysis"
]
=
(
sentiment_analysis
)
logger
.
info
(
f
" ✅ 情感分析完成"
)
return
integrated_response
...
...
@@ -275,7 +395,10 @@ class DeepSearchAgent:
"""
try
:
# 初始化情感分析器(如果尚未初始化且未被禁用)
if
not
self
.
sentiment_analyzer
.
is_initialized
and
not
self
.
sentiment_analyzer
.
is_disabled
:
if
(
not
self
.
sentiment_analyzer
.
is_initialized
and
not
self
.
sentiment_analyzer
.
is_disabled
):
logger
.
info
(
" 初始化情感分析模型..."
)
if
not
self
.
sentiment_analyzer
.
initialize
():
logger
.
info
(
" 情感分析模型初始化失败,将直接透传原始文本"
)
...
...
@@ -290,15 +413,15 @@ class DeepSearchAgent:
"platform"
:
result
.
platform
,
"author"
:
result
.
author_nickname
,
"url"
:
result
.
url
,
"publish_time"
:
str
(
result
.
publish_time
)
if
result
.
publish_time
else
None
"publish_time"
:
str
(
result
.
publish_time
)
if
result
.
publish_time
else
None
,
}
results_dict
.
append
(
result_dict
)
# 执行情感分析
sentiment_analysis
=
self
.
sentiment_analyzer
.
analyze_query_results
(
query_results
=
results_dict
,
text_field
=
"content"
,
min_confidence
=
0.5
query_results
=
results_dict
,
text_field
=
"content"
,
min_confidence
=
0.5
)
return
sentiment_analysis
.
get
(
"sentiment_analysis"
)
...
...
@@ -321,7 +444,10 @@ class DeepSearchAgent:
try
:
# 初始化情感分析器(如果尚未初始化且未被禁用)
if
not
self
.
sentiment_analyzer
.
is_initialized
and
not
self
.
sentiment_analyzer
.
is_disabled
:
if
(
not
self
.
sentiment_analyzer
.
is_initialized
and
not
self
.
sentiment_analyzer
.
is_disabled
):
logger
.
info
(
" 初始化情感分析模型..."
)
if
not
self
.
sentiment_analyzer
.
initialize
():
logger
.
info
(
" 情感分析模型初始化失败,将直接透传原始文本"
)
...
...
@@ -334,28 +460,43 @@ class DeepSearchAgent:
result_dict
=
result
.
__dict__
response
=
{
"success"
:
result
.
success
and
result
.
analysis_performed
,
"total_analyzed"
:
1
if
result
.
analysis_performed
and
result
.
success
else
0
,
"results"
:
[
result_dict
]
"total_analyzed"
:
1
if
result
.
analysis_performed
and
result
.
success
else
0
,
"results"
:
[
result_dict
],
}
if
not
result
.
analysis_performed
:
response
[
"success"
]
=
False
response
[
"warning"
]
=
result
.
error_message
or
"情感分析功能不可用,已直接返回原始文本"
response
[
"warning"
]
=
(
result
.
error_message
or
"情感分析功能不可用,已直接返回原始文本"
)
return
response
else
:
texts_list
=
list
(
texts
)
batch_result
=
self
.
sentiment_analyzer
.
analyze_batch
(
texts_list
,
show_progress
=
True
)
batch_result
=
self
.
sentiment_analyzer
.
analyze_batch
(
texts_list
,
show_progress
=
True
)
response
=
{
"success"
:
batch_result
.
analysis_performed
and
batch_result
.
success_count
>
0
,
"total_analyzed"
:
batch_result
.
total_processed
if
batch_result
.
analysis_performed
else
0
,
"success"
:
batch_result
.
analysis_performed
and
batch_result
.
success_count
>
0
,
"total_analyzed"
:
batch_result
.
total_processed
if
batch_result
.
analysis_performed
else
0
,
"success_count"
:
batch_result
.
success_count
,
"failed_count"
:
batch_result
.
failed_count
,
"average_confidence"
:
batch_result
.
average_confidence
if
batch_result
.
analysis_performed
else
0.0
,
"results"
:
[
result
.
__dict__
for
result
in
batch_result
.
results
]
"average_confidence"
:
batch_result
.
average_confidence
if
batch_result
.
analysis_performed
else
0.0
,
"results"
:
[
result
.
__dict__
for
result
in
batch_result
.
results
],
}
if
not
batch_result
.
analysis_performed
:
warning
=
next
(
(
r
.
error_message
for
r
in
batch_result
.
results
if
r
.
error_message
),
"情感分析功能不可用,已直接返回原始文本"
(
r
.
error_message
for
r
in
batch_result
.
results
if
r
.
error_message
),
"情感分析功能不可用,已直接返回原始文本"
,
)
response
[
"success"
]
=
False
response
[
"warning"
]
=
warning
...
...
@@ -363,11 +504,7 @@ class DeepSearchAgent:
except
Exception
as
e
:
logger
.
exception
(
f
" ❌ 情感分析过程中发生错误: {str(e)}"
)
return
{
"success"
:
False
,
"error"
:
str
(
e
),
"results"
:
[]
}
return
{
"success"
:
False
,
"error"
:
str
(
e
),
"results"
:
[]}
def
research
(
self
,
query
:
str
,
save_report
:
bool
=
True
)
->
str
:
"""
...
...
@@ -380,9 +517,9 @@ class DeepSearchAgent:
Returns:
最终报告内容
"""
logger
.
info
(
f
"
\n
{'='
*
60}"
)
logger
.
info
(
f
"
\n
{'='
*
60}"
)
logger
.
info
(
f
"开始深度研究: {query}"
)
logger
.
info
(
f
"{'='
*
60}"
)
logger
.
info
(
f
"{'='
*
60}"
)
try
:
# Step 1: 生成报告结构
...
...
@@ -426,7 +563,9 @@ class DeepSearchAgent:
total_paragraphs
=
len
(
self
.
state
.
paragraphs
)
for
i
in
range
(
total_paragraphs
):
logger
.
info
(
f
"
\n
[步骤 2.{i+1}] 处理段落: {self.state.paragraphs[i].title}"
)
logger
.
info
(
f
"
\n
[步骤 2.{i + 1}] 处理段落: {self.state.paragraphs[i].title}"
)
logger
.
info
(
"-"
*
50
)
# 初始搜索和总结
...
...
@@ -446,16 +585,15 @@ class DeepSearchAgent:
paragraph
=
self
.
state
.
paragraphs
[
paragraph_index
]
# 准备搜索输入
search_input
=
{
"title"
:
paragraph
.
title
,
"content"
:
paragraph
.
content
}
search_input
=
{
"title"
:
paragraph
.
title
,
"content"
:
paragraph
.
content
}
# 生成搜索查询和工具选择
logger
.
info
(
" - 生成搜索查询..."
)
search_output
=
self
.
first_search_node
.
run
(
search_input
)
search_query
=
search_output
[
"search_query"
]
search_tool
=
search_output
.
get
(
"search_tool"
,
"search_topic_globally"
)
# 默认工具
search_tool
=
search_output
.
get
(
"search_tool"
,
"search_topic_globally"
)
# 默认工具
reasoning
=
search_output
[
"reasoning"
]
logger
.
info
(
f
" - 搜索查询: {search_query}"
)
...
...
@@ -475,13 +613,17 @@ class DeepSearchAgent:
if
start_date
and
end_date
:
# 验证日期格式
if
self
.
_validate_date_format
(
start_date
)
and
self
.
_validate_date_format
(
end_date
):
if
self
.
_validate_date_format
(
start_date
)
and
self
.
_validate_date_format
(
end_date
):
search_kwargs
[
"start_date"
]
=
start_date
search_kwargs
[
"end_date"
]
=
end_date
logger
.
info
(
f
" - 时间范围: {start_date} 到 {end_date}"
)
else
:
logger
.
info
(
f
" 日期格式错误(应为YYYY-MM-DD),改用全局搜索"
)
logger
.
info
(
f
" 提供的日期: start_date={start_date}, end_date={end_date}"
)
logger
.
info
(
f
" 提供的日期: start_date={start_date}, end_date={end_date}"
)
search_tool
=
"search_topic_globally"
elif
search_tool
==
"search_topic_by_date"
:
logger
.
info
(
f
" search_topic_by_date工具缺少时间参数,改用全局搜索"
)
...
...
@@ -494,7 +636,9 @@ class DeepSearchAgent:
search_kwargs
[
"platform"
]
=
platform
logger
.
info
(
f
" - 指定平台: {platform}"
)
else
:
logger
.
warning
(
f
" search_topic_on_platform工具缺少平台参数,改用全局搜索"
)
logger
.
warning
(
f
" search_topic_on_platform工具缺少平台参数,改用全局搜索"
)
search_tool
=
"search_topic_globally"
# 处理限制参数,使用配置文件中的默认值而不是agent提供的参数
...
...
@@ -505,9 +649,13 @@ class DeepSearchAgent:
search_kwargs
[
"limit"
]
=
limit
elif
search_tool
in
[
"search_topic_globally"
,
"search_topic_by_date"
]:
if
search_tool
==
"search_topic_globally"
:
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
limit_per_table
=
(
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
)
else
:
# search_topic_by_date
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
limit_per_table
=
(
self
.
config
.
DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
)
search_kwargs
[
"limit_per_table"
]
=
limit_per_table
elif
search_tool
in
[
"get_comments_for_topic"
,
"search_topic_on_platform"
]:
if
search_tool
==
"get_comments_for_topic"
:
...
...
@@ -516,34 +664,46 @@ class DeepSearchAgent:
limit
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
search_kwargs
[
"limit"
]
=
limit
search_response
=
self
.
execute_search_tool
(
search_tool
,
search_query
,
**
search_kwargs
)
search_response
=
self
.
execute_search_tool
(
search_tool
,
search_query
,
**
search_kwargs
)
# 转换为兼容格式
search_results
=
[]
if
search_response
and
search_response
.
results
:
# 使用配置文件控制传递给LLM的结果数量,0表示不限制
if
self
.
config
.
MAX_SEARCH_RESULTS_FOR_LLM
>
0
:
max_results
=
min
(
len
(
search_response
.
results
),
self
.
config
.
MAX_SEARCH_RESULTS_FOR_LLM
)
max_results
=
min
(
len
(
search_response
.
results
),
self
.
config
.
MAX_SEARCH_RESULTS_FOR_LLM
)
else
:
max_results
=
len
(
search_response
.
results
)
# 不限制,传递所有结果
for
result
in
search_response
.
results
[:
max_results
]:
search_results
.
append
({
'title'
:
result
.
title_or_content
,
'url'
:
result
.
url
or
""
,
'content'
:
result
.
title_or_content
,
'score'
:
result
.
hotness_score
,
'raw_content'
:
result
.
title_or_content
,
'published_date'
:
result
.
publish_time
.
isoformat
()
if
result
.
publish_time
else
None
,
'platform'
:
result
.
platform
,
'content_type'
:
result
.
content_type
,
'author'
:
result
.
author_nickname
,
'engagement'
:
result
.
engagement
})
search_results
.
append
(
{
"title"
:
result
.
title_or_content
,
"url"
:
result
.
url
or
""
,
"content"
:
result
.
title_or_content
,
"score"
:
result
.
hotness_score
,
"raw_content"
:
result
.
title_or_content
,
"published_date"
:
result
.
publish_time
.
isoformat
()
if
result
.
publish_time
else
None
,
"platform"
:
result
.
platform
,
"content_type"
:
result
.
content_type
,
"author"
:
result
.
author_nickname
,
"engagement"
:
result
.
engagement
,
}
)
if
search_results
:
_message
=
f
" - 找到 {len(search_results)} 个搜索结果"
for
j
,
result
in
enumerate
(
search_results
,
1
):
date_info
=
f
" (发布于: {result.get('published_date', 'N/A')})"
if
result
.
get
(
'published_date'
)
else
""
date_info
=
(
f
" (发布于: {result.get('published_date', 'N/A')})"
if
result
.
get
(
"published_date"
)
else
""
)
_message
+=
f
"
\n
{j}. {result['title'][:50]}...{date_info}"
logger
.
info
(
_message
)
else
:
...
...
@@ -560,7 +720,7 @@ class DeepSearchAgent:
"search_query"
:
search_query
,
"search_results"
:
format_search_results_for_prompt
(
search_results
,
self
.
config
.
MAX_CONTENT_LENGTH
)
)
,
}
# 更新状态
...
...
@@ -581,13 +741,15 @@ class DeepSearchAgent:
reflection_input
=
{
"title"
:
paragraph
.
title
,
"content"
:
paragraph
.
content
,
"paragraph_latest_state"
:
paragraph
.
research
.
latest_summary
"paragraph_latest_state"
:
paragraph
.
research
.
latest_summary
,
}
# 生成反思搜索查询
reflection_output
=
self
.
reflection_node
.
run
(
reflection_input
)
search_query
=
reflection_output
[
"search_query"
]
search_tool
=
reflection_output
.
get
(
"search_tool"
,
"search_topic_globally"
)
# 默认工具
search_tool
=
reflection_output
.
get
(
"search_tool"
,
"search_topic_globally"
)
# 默认工具
reasoning
=
reflection_output
[
"reasoning"
]
logger
.
info
(
f
" 反思查询: {search_query}"
)
...
...
@@ -605,16 +767,24 @@ class DeepSearchAgent:
if
start_date
and
end_date
:
# 验证日期格式
if
self
.
_validate_date_format
(
start_date
)
and
self
.
_validate_date_format
(
end_date
):
if
self
.
_validate_date_format
(
start_date
)
and
self
.
_validate_date_format
(
end_date
):
search_kwargs
[
"start_date"
]
=
start_date
search_kwargs
[
"end_date"
]
=
end_date
logger
.
info
(
f
" 时间范围: {start_date} 到 {end_date}"
)
else
:
logger
.
info
(
f
" 日期格式错误(应为YYYY-MM-DD),改用全局搜索"
)
logger
.
info
(
f
" 提供的日期: start_date={start_date}, end_date={end_date}"
)
logger
.
info
(
f
" 日期格式错误(应为YYYY-MM-DD),改用全局搜索"
)
logger
.
info
(
f
" 提供的日期: start_date={start_date}, end_date={end_date}"
)
search_tool
=
"search_topic_globally"
elif
search_tool
==
"search_topic_by_date"
:
logger
.
warning
(
f
" search_topic_by_date工具缺少时间参数,改用全局搜索"
)
logger
.
warning
(
f
" search_topic_by_date工具缺少时间参数,改用全局搜索"
)
search_tool
=
"search_topic_globally"
# 处理需要平台参数的工具
...
...
@@ -624,7 +794,9 @@ class DeepSearchAgent:
search_kwargs
[
"platform"
]
=
platform
logger
.
info
(
f
" 指定平台: {platform}"
)
else
:
logger
.
warning
(
f
" search_topic_on_platform工具缺少平台参数,改用全局搜索"
)
logger
.
warning
(
f
" search_topic_on_platform工具缺少平台参数,改用全局搜索"
)
search_tool
=
"search_topic_globally"
# 处理限制参数
...
...
@@ -637,9 +809,13 @@ class DeepSearchAgent:
elif
search_tool
in
[
"search_topic_globally"
,
"search_topic_by_date"
]:
# 使用配置文件中的默认值,不允许agent控制limit_per_table参数
if
search_tool
==
"search_topic_globally"
:
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
limit_per_table
=
(
self
.
config
.
DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE
)
else
:
# search_topic_by_date
limit_per_table
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
limit_per_table
=
(
self
.
config
.
DEFAULT_SEARCH_TOPIC_BY_DATE_LIMIT_PER_TABLE
)
search_kwargs
[
"limit_per_table"
]
=
limit_per_table
elif
search_tool
in
[
"get_comments_for_topic"
,
"search_topic_on_platform"
]:
# 使用配置文件中的默认值,不允许agent控制limit参数
...
...
@@ -649,34 +825,47 @@ class DeepSearchAgent:
limit
=
self
.
config
.
DEFAULT_SEARCH_TOPIC_ON_PLATFORM_LIMIT
search_kwargs
[
"limit"
]
=
limit
search_response
=
self
.
execute_search_tool
(
search_tool
,
search_query
,
**
search_kwargs
)
search_response
=
self
.
execute_search_tool
(
search_tool
,
search_query
,
**
search_kwargs
)
# 转换为兼容格式
search_results
=
[]
if
search_response
and
search_response
.
results
:
# 使用配置文件控制传递给LLM的结果数量,0表示不限制
if
self
.
config
.
MAX_SEARCH_RESULTS_FOR_LLM
>
0
:
max_results
=
min
(
len
(
search_response
.
results
),
self
.
config
.
MAX_SEARCH_RESULTS_FOR_LLM
)
max_results
=
min
(
len
(
search_response
.
results
),
self
.
config
.
MAX_SEARCH_RESULTS_FOR_LLM
,
)
else
:
max_results
=
len
(
search_response
.
results
)
# 不限制,传递所有结果
for
result
in
search_response
.
results
[:
max_results
]:
search_results
.
append
({
'title'
:
result
.
title_or_content
,
'url'
:
result
.
url
or
""
,
'content'
:
result
.
title_or_content
,
'score'
:
result
.
hotness_score
,
'raw_content'
:
result
.
title_or_content
,
'published_date'
:
result
.
publish_time
.
isoformat
()
if
result
.
publish_time
else
None
,
'platform'
:
result
.
platform
,
'content_type'
:
result
.
content_type
,
'author'
:
result
.
author_nickname
,
'engagement'
:
result
.
engagement
})
search_results
.
append
(
{
"title"
:
result
.
title_or_content
,
"url"
:
result
.
url
or
""
,
"content"
:
result
.
title_or_content
,
"score"
:
result
.
hotness_score
,
"raw_content"
:
result
.
title_or_content
,
"published_date"
:
result
.
publish_time
.
isoformat
()
if
result
.
publish_time
else
None
,
"platform"
:
result
.
platform
,
"content_type"
:
result
.
content_type
,
"author"
:
result
.
author_nickname
,
"engagement"
:
result
.
engagement
,
}
)
if
search_results
:
_message
=
f
" 找到 {len(search_results)} 个反思搜索结果"
for
j
,
result
in
enumerate
(
search_results
,
1
):
date_info
=
f
" (发布于: {result.get('published_date', 'N/A')})"
if
result
.
get
(
'published_date'
)
else
""
date_info
=
(
f
" (发布于: {result.get('published_date', 'N/A')})"
if
result
.
get
(
"published_date"
)
else
""
)
_message
+=
f
"
\n
{j}. {result['title'][:50]}...{date_info}"
logger
.
info
(
_message
)
else
:
...
...
@@ -693,7 +882,7 @@ class DeepSearchAgent:
"search_results"
:
format_search_results_for_prompt
(
search_results
,
self
.
config
.
MAX_CONTENT_LENGTH
),
"paragraph_latest_state"
:
paragraph
.
research
.
latest_summary
"paragraph_latest_state"
:
paragraph
.
research
.
latest_summary
,
}
# 更新状态
...
...
@@ -710,10 +899,12 @@ class DeepSearchAgent:
# 准备报告数据
report_data
=
[]
for
paragraph
in
self
.
state
.
paragraphs
:
report_data
.
append
({
report_data
.
append
(
{
"title"
:
paragraph
.
title
,
"paragraph_latest_state"
:
paragraph
.
research
.
latest_summary
})
"paragraph_latest_state"
:
paragraph
.
research
.
latest_summary
,
}
)
# 格式化报告
try
:
...
...
@@ -735,14 +926,16 @@ class DeepSearchAgent:
"""保存报告到文件"""
# 生成文件名
timestamp
=
datetime
.
now
()
.
strftime
(
"
%
Y
%
m
%
d_
%
H
%
M
%
S"
)
query_safe
=
""
.
join
(
c
for
c
in
self
.
state
.
query
if
c
.
isalnum
()
or
c
in
(
' '
,
'-'
,
'_'
))
.
rstrip
()
query_safe
=
query_safe
.
replace
(
' '
,
'_'
)[:
30
]
query_safe
=
""
.
join
(
c
for
c
in
self
.
state
.
query
if
c
.
isalnum
()
or
c
in
(
" "
,
"-"
,
"_"
)
)
.
rstrip
()
query_safe
=
query_safe
.
replace
(
" "
,
"_"
)[:
30
]
filename
=
f
"deep_search_report_{query_safe}_{timestamp}.md"
filepath
=
os
.
path
.
join
(
self
.
config
.
OUTPUT_DIR
,
filename
)
# 保存报告
with
open
(
filepath
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
with
open
(
filepath
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
report_content
)
logger
.
info
(
f
"报告已保存到: {filepath}"
)
...
...
Please
register
or
login
to post a comment