Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
ghmark675
2025-11-10 19:00:06 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
666ghj
2025-11-11 01:11:32 +0800
Commit
aa11c529c8707615a2f40d8a301cd740f345dbad
aa11c529
1 parent
71f4b3ad
style(sentiment_analyzer): format file
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
178 additions
and
114 deletions
InsightEngine/tools/sentiment_analyzer.py
InsightEngine/tools/sentiment_analyzer.py
View file @
aa11c52
...
...
@@ -11,6 +11,7 @@ import re
try
:
import
torch
TORCH_AVAILABLE
=
True
except
ImportError
:
torch
=
None
# type: ignore
...
...
@@ -18,6 +19,7 @@ except ImportError:
try
:
from
transformers
import
AutoTokenizer
,
AutoModelForSequenceClassification
TRANSFORMERS_AVAILABLE
=
True
except
ImportError
:
AutoTokenizer
=
None
# type: ignore
...
...
@@ -28,6 +30,7 @@ except ImportError:
# INFO:若想跳过情感分析,可手动切换此开关为False
SENTIMENT_ANALYSIS_ENABLED
=
True
def
_describe_missing_dependencies
()
->
str
:
missing
=
[]
if
not
TORCH_AVAILABLE
:
...
...
@@ -36,14 +39,21 @@ def _describe_missing_dependencies() -> str:
missing
.
append
(
"Transformers"
)
return
" / "
.
join
(
missing
)
# 添加项目根目录到路径,以便导入WeiboMultilingualSentiment
project_root
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))))
weibo_sentiment_path
=
os
.
path
.
join
(
project_root
,
"SentimentAnalysisModel"
,
"WeiboMultilingualSentiment"
)
project_root
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)))
)
weibo_sentiment_path
=
os
.
path
.
join
(
project_root
,
"SentimentAnalysisModel"
,
"WeiboMultilingualSentiment"
)
sys
.
path
.
append
(
weibo_sentiment_path
)
@dataclass
class
SentimentResult
:
"""情感分析结果数据类"""
text
:
str
sentiment_label
:
str
confidence
:
float
...
...
@@ -53,9 +63,10 @@ class SentimentResult:
analysis_performed
:
bool
=
True
@dataclass
@dataclass
class
BatchSentimentResult
:
"""批量情感分析结果数据类"""
results
:
List
[
SentimentResult
]
total_processed
:
int
success_count
:
int
...
...
@@ -69,7 +80,7 @@ class WeiboMultilingualSentimentAnalyzer:
多语言情感分析器
封装WeiboMultilingualSentiment模型,为AI Agent提供情感分析功能
"""
def
__init__
(
self
):
"""初始化情感分析器"""
self
.
model
=
None
...
...
@@ -78,14 +89,14 @@ class WeiboMultilingualSentimentAnalyzer:
self
.
is_initialized
=
False
self
.
is_disabled
=
False
self
.
disable_reason
:
Optional
[
str
]
=
None
# 情感标签映射(5级分类)
self
.
sentiment_map
=
{
0
:
"非常负面"
,
1
:
"负面"
,
2
:
"中性"
,
3
:
"正面"
,
4
:
"非常正面"
0
:
"非常负面"
,
1
:
"负面"
,
2
:
"中性"
,
3
:
"正面"
,
4
:
"非常正面"
,
}
if
not
SENTIMENT_ANALYSIS_ENABLED
:
...
...
@@ -96,9 +107,13 @@ class WeiboMultilingualSentimentAnalyzer:
if
self
.
is_disabled
:
reason
=
self
.
disable_reason
or
"Sentiment analysis disabled."
print
(
f
"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}"
)
print
(
f
"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}"
)
else
:
print
(
"WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型"
)
print
(
"WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型"
)
def
disable
(
self
,
reason
:
Optional
[
str
]
=
None
,
drop_state
:
bool
=
False
)
->
None
:
"""Disable sentiment analysis, optionally clearing loaded resources."""
...
...
@@ -130,14 +145,18 @@ class WeiboMultilingualSentimentAnalyzer:
if
torch
.
cuda
.
is_available
():
return
torch
.
device
(
"cuda"
)
mps_backend
=
getattr
(
torch
.
backends
,
"mps"
,
None
)
if
mps_backend
and
getattr
(
mps_backend
,
"is_available"
,
lambda
:
False
)()
and
getattr
(
mps_backend
,
"is_built"
,
lambda
:
False
)():
if
(
mps_backend
and
getattr
(
mps_backend
,
"is_available"
,
lambda
:
False
)()
and
getattr
(
mps_backend
,
"is_built"
,
lambda
:
False
)()
):
return
torch
.
device
(
"mps"
)
return
torch
.
device
(
"cpu"
)
def
initialize
(
self
)
->
bool
:
"""
初始化模型和分词器
Returns:
是否初始化成功
"""
...
...
@@ -155,31 +174,35 @@ class WeiboMultilingualSentimentAnalyzer:
if
self
.
is_initialized
:
print
(
"模型已经初始化,无需重复加载"
)
return
True
try
:
print
(
"正在加载多语言情感分析模型..."
)
# 使用多语言情感分析模型
model_name
=
"tabularisai/multilingual-sentiment-analysis"
local_model_path
=
os
.
path
.
join
(
weibo_sentiment_path
,
"model"
)
# 检查本地是否已有模型
if
os
.
path
.
exists
(
local_model_path
):
print
(
"从本地加载模型..."
)
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
local_model_path
)
self
.
model
=
AutoModelForSequenceClassification
.
from_pretrained
(
local_model_path
)
self
.
model
=
AutoModelForSequenceClassification
.
from_pretrained
(
local_model_path
)
else
:
print
(
"首次使用,正在下载模型到本地..."
)
# 下载并保存到本地
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
self
.
model
=
AutoModelForSequenceClassification
.
from_pretrained
(
model_name
)
self
.
model
=
AutoModelForSequenceClassification
.
from_pretrained
(
model_name
)
# 保存到本地
os
.
makedirs
(
local_model_path
,
exist_ok
=
True
)
self
.
tokenizer
.
save_pretrained
(
local_model_path
)
self
.
model
.
save_pretrained
(
local_model_path
)
print
(
f
"模型已保存到: {local_model_path}"
)
# 设置设备
device
=
self
.
_select_device
()
if
device
is
None
:
...
...
@@ -198,46 +221,46 @@ class WeiboMultilingualSentimentAnalyzer:
print
(
"检测到 Apple MPS 设备,已使用 MPS 进行推理。"
)
else
:
print
(
"未检测到 GPU,自动使用 CPU 进行推理。"
)
print
(
f
"模型加载成功! 使用设备: {self.device}"
)
print
(
"支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言"
)
print
(
"情感等级: 非常负面、负面、中性、正面、非常正面"
)
return
True
except
Exception
as
e
:
error_message
=
f
"模型加载失败: {e}"
print
(
error_message
)
print
(
"请检查网络连接或模型文件"
)
self
.
disable
(
error_message
,
drop_state
=
True
)
return
False
def
_preprocess_text
(
self
,
text
:
str
)
->
str
:
"""
文本预处理
Args:
text: 输入文本
Returns:
处理后的文本
"""
# 基本文本清理
if
not
text
or
not
text
.
strip
():
return
""
# 去除多余空格
text
=
re
.
sub
(
r'
\
s+'
,
' '
,
text
.
strip
())
text
=
re
.
sub
(
r"
\
s+"
,
" "
,
text
.
strip
())
return
text
def
analyze_single_text
(
self
,
text
:
str
)
->
SentimentResult
:
"""
对单个文本进行情感分析
Args:
text: 要分析的文本
Returns:
SentimentResult对象
"""
...
...
@@ -249,7 +272,7 @@ class WeiboMultilingualSentimentAnalyzer:
probability_distribution
=
{},
success
=
False
,
error_message
=
self
.
disable_reason
or
"情感分析功能已禁用"
,
analysis_performed
=
False
analysis_performed
=
False
,
)
if
not
self
.
is_initialized
:
...
...
@@ -260,7 +283,7 @@ class WeiboMultilingualSentimentAnalyzer:
probability_distribution
=
{},
success
=
False
,
error_message
=
"模型未初始化,请先调用initialize() 方法"
,
analysis_performed
=
False
analysis_performed
=
False
,
)
try
:
...
...
@@ -275,7 +298,7 @@ class WeiboMultilingualSentimentAnalyzer:
probability_distribution
=
{},
success
=
False
,
error_message
=
"输入文本为空或无效内容"
,
analysis_performed
=
False
analysis_performed
=
False
,
)
# 分词编码
...
...
@@ -284,7 +307,7 @@ class WeiboMultilingualSentimentAnalyzer:
max_length
=
512
,
padding
=
True
,
truncation
=
True
,
return_tensors
=
'pt'
return_tensors
=
"pt"
,
)
# 转移到设备
...
...
@@ -311,7 +334,7 @@ class WeiboMultilingualSentimentAnalyzer:
sentiment_label
=
label
,
confidence
=
confidence
,
probability_distribution
=
prob_dist
,
success
=
True
success
=
True
,
)
except
Exception
as
e
:
...
...
@@ -322,17 +345,19 @@ class WeiboMultilingualSentimentAnalyzer:
probability_distribution
=
{},
success
=
False
,
error_message
=
f
"预测时发生错误: {str(e)}"
,
analysis_performed
=
False
analysis_performed
=
False
,
)
def
analyze_batch
(
self
,
texts
:
List
[
str
],
show_progress
:
bool
=
True
)
->
BatchSentimentResult
:
def
analyze_batch
(
self
,
texts
:
List
[
str
],
show_progress
:
bool
=
True
)
->
BatchSentimentResult
:
"""
批量情感分析
Args:
texts: 文本列表
show_progress: 是否显示进度
Returns:
BatchSentimentResult对象
"""
...
...
@@ -343,9 +368,9 @@ class WeiboMultilingualSentimentAnalyzer:
success_count
=
0
,
failed_count
=
0
,
average_confidence
=
0.0
,
analysis_performed
=
not
self
.
is_disabled
and
self
.
is_initialized
analysis_performed
=
not
self
.
is_disabled
and
self
.
is_initialized
,
)
if
self
.
is_disabled
or
not
self
.
is_initialized
:
passthrough_results
=
[
SentimentResult
(
...
...
@@ -355,7 +380,7 @@ class WeiboMultilingualSentimentAnalyzer:
probability_distribution
=
{},
success
=
False
,
error_message
=
self
.
disable_reason
or
"情感分析功能不可用"
,
analysis_performed
=
False
analysis_performed
=
False
,
)
for
text
in
texts
]
...
...
@@ -365,42 +390,44 @@ class WeiboMultilingualSentimentAnalyzer:
success_count
=
0
,
failed_count
=
len
(
texts
),
average_confidence
=
0.0
,
analysis_performed
=
False
analysis_performed
=
False
,
)
results
=
[]
success_count
=
0
total_confidence
=
0.0
for
i
,
text
in
enumerate
(
texts
):
if
show_progress
and
len
(
texts
)
>
1
:
print
(
f
"处理进度: {i+1}/{len(texts)}"
)
print
(
f
"处理进度: {i + 1}/{len(texts)}"
)
result
=
self
.
analyze_single_text
(
text
)
results
.
append
(
result
)
if
result
.
success
:
success_count
+=
1
total_confidence
+=
result
.
confidence
average_confidence
=
total_confidence
/
success_count
if
success_count
>
0
else
0.0
average_confidence
=
(
total_confidence
/
success_count
if
success_count
>
0
else
0.0
)
failed_count
=
len
(
texts
)
-
success_count
return
BatchSentimentResult
(
results
=
results
,
total_processed
=
len
(
texts
),
success_count
=
success_count
,
failed_count
=
failed_count
,
average_confidence
=
average_confidence
,
analysis_performed
=
True
analysis_performed
=
True
,
)
def
_build_passthrough_analysis
(
self
,
original_data
:
List
[
Dict
[
str
,
Any
]],
reason
:
str
,
texts
:
Optional
[
List
[
str
]]
=
None
,
results
:
Optional
[
List
[
SentimentResult
]]
=
None
results
:
Optional
[
List
[
SentimentResult
]]
=
None
,
)
->
Dict
[
str
,
Any
]:
"""
构建在情感分析不可用时的透传结果
...
...
@@ -416,33 +443,36 @@ class WeiboMultilingualSentimentAnalyzer:
"sentiment_distribution"
:
{},
"high_confidence_results"
:
[],
"summary"
:
f
"情感分析未执行:{reason}"
,
"original_texts"
:
original_data
"original_texts"
:
original_data
,
}
}
if
texts
is
not
None
:
response
[
"sentiment_analysis"
][
"passthrough_texts"
]
=
texts
if
results
is
not
None
:
response
[
"sentiment_analysis"
][
"results"
]
=
[
result
.
__dict__
if
isinstance
(
result
,
SentimentResult
)
else
result
for
result
in
results
]
return
response
def
analyze_query_results
(
self
,
query_results
:
List
[
Dict
[
str
,
Any
]],
text_field
:
str
=
"content"
,
min_confidence
:
float
=
0.5
)
->
Dict
[
str
,
Any
]:
def
analyze_query_results
(
self
,
query_results
:
List
[
Dict
[
str
,
Any
]],
text_field
:
str
=
"content"
,
min_confidence
:
float
=
0.5
,
)
->
Dict
[
str
,
Any
]:
"""
对查询结果进行情感分析
专门用于分析从MediaCrawlerDB返回的查询结果
Args:
query_results: 查询结果列表,每个元素包含文本内容
text_field: 文本内容字段名,默认为"content"
min_confidence: 最小置信度阈值
Returns:
包含情感分析结果的字典
"""
...
...
@@ -452,14 +482,14 @@ class WeiboMultilingualSentimentAnalyzer:
"total_analyzed"
:
0
,
"sentiment_distribution"
:
{},
"high_confidence_results"
:
[],
"summary"
:
"没有内容需要分析"
"summary"
:
"没有内容需要分析"
,
}
}
# 提取文本内容
texts_to_analyze
=
[]
original_data
=
[]
for
item
in
query_results
:
# 尝试多个可能的文本字段
text_content
=
""
...
...
@@ -467,49 +497,52 @@ class WeiboMultilingualSentimentAnalyzer:
if
field
in
item
and
item
[
field
]:
text_content
=
str
(
item
[
field
])
break
if
text_content
.
strip
():
texts_to_analyze
.
append
(
text_content
)
original_data
.
append
(
item
)
if
not
texts_to_analyze
:
return
{
"sentiment_analysis"
:
{
"total_analyzed"
:
0
,
"sentiment_distribution"
:
{},
"high_confidence_results"
:
[],
"summary"
:
"查询结果中没有找到可分析的文本内容"
"summary"
:
"查询结果中没有找到可分析的文本内容"
,
}
}
if
self
.
is_disabled
:
return
self
.
_build_passthrough_analysis
(
original_data
=
original_data
,
reason
=
self
.
disable_reason
or
"情感分析模型不可用"
,
texts
=
texts_to_analyze
texts
=
texts_to_analyze
,
)
# 执行批量情感分析
print
(
f
"正在对{len(texts_to_analyze)}条内容进行情感分析..."
)
batch_result
=
self
.
analyze_batch
(
texts_to_analyze
,
show_progress
=
True
)
if
not
batch_result
.
analysis_performed
:
reason
=
self
.
disable_reason
or
"情感分析功能不可用"
if
batch_result
.
results
:
candidate_error
=
next
((
r
.
error_message
for
r
in
batch_result
.
results
if
r
.
error_message
),
None
)
candidate_error
=
next
(
(
r
.
error_message
for
r
in
batch_result
.
results
if
r
.
error_message
),
None
,
)
if
candidate_error
:
reason
=
candidate_error
return
self
.
_build_passthrough_analysis
(
original_data
=
original_data
,
reason
=
reason
,
texts
=
texts_to_analyze
,
results
=
batch_result
.
results
results
=
batch_result
.
results
,
)
# 统计情感分布
sentiment_distribution
=
{}
high_confidence_results
=
[]
for
result
,
original_item
in
zip
(
batch_result
.
results
,
original_data
):
if
result
.
success
:
# 统计情感分布
...
...
@@ -517,24 +550,28 @@ class WeiboMultilingualSentimentAnalyzer:
if
sentiment
not
in
sentiment_distribution
:
sentiment_distribution
[
sentiment
]
=
0
sentiment_distribution
[
sentiment
]
+=
1
# 收集高置信度结果
if
result
.
confidence
>=
min_confidence
:
high_confidence_results
.
append
({
"original_data"
:
original_item
,
"sentiment"
:
result
.
sentiment_label
,
"confidence"
:
result
.
confidence
,
"text_preview"
:
result
.
text
[:
100
]
+
"..."
if
len
(
result
.
text
)
>
100
else
result
.
text
})
high_confidence_results
.
append
(
{
"original_data"
:
original_item
,
"sentiment"
:
result
.
sentiment_label
,
"confidence"
:
result
.
confidence
,
"text_preview"
:
result
.
text
[:
100
]
+
"..."
if
len
(
result
.
text
)
>
100
else
result
.
text
,
}
)
# 生成情感分析摘要
total_analyzed
=
batch_result
.
success_count
if
total_analyzed
>
0
:
dominant_sentiment
=
max
(
sentiment_distribution
.
items
(),
key
=
lambda
x
:
x
[
1
])
sentiment_summary
=
f
"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1]
/total_analyzed*
100:.1f}
%
)"
sentiment_summary
=
f
"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1]
/ total_analyzed *
100:.1f}
%
)"
else
:
sentiment_summary
=
"情感分析失败"
return
{
"sentiment_analysis"
:
{
"total_analyzed"
:
total_analyzed
,
...
...
@@ -542,28 +579,46 @@ class WeiboMultilingualSentimentAnalyzer:
"average_confidence"
:
round
(
batch_result
.
average_confidence
,
4
),
"sentiment_distribution"
:
sentiment_distribution
,
"high_confidence_results"
:
high_confidence_results
,
# 返回所有高置信度结果,不做限制
"summary"
:
sentiment_summary
"summary"
:
sentiment_summary
,
}
}
def
get_model_info
(
self
)
->
Dict
[
str
,
Any
]:
"""
获取模型信息
Returns:
模型信息字典
"""
return
{
"model_name"
:
"tabularisai/multilingual-sentiment-analysis"
,
"supported_languages"
:
[
"中文"
,
"英文"
,
"西班牙文"
,
"阿拉伯文"
,
"日文"
,
"韩文"
,
"德文"
,
"法文"
,
"意大利文"
,
"葡萄牙文"
,
"俄文"
,
"荷兰文"
,
"波兰文"
,
"土耳其文"
,
"丹麦文"
,
"希腊文"
,
"芬兰文"
,
"瑞典文"
,
"挪威文"
,
"匈牙利文"
,
"捷克文"
,
"保加利亚文"
"中文"
,
"英文"
,
"西班牙文"
,
"阿拉伯文"
,
"日文"
,
"韩文"
,
"德文"
,
"法文"
,
"意大利文"
,
"葡萄牙文"
,
"俄文"
,
"荷兰文"
,
"波兰文"
,
"土耳其文"
,
"丹麦文"
,
"希腊文"
,
"芬兰文"
,
"瑞典文"
,
"挪威文"
,
"匈牙利文"
,
"捷克文"
,
"保加利亚文"
,
],
"sentiment_levels"
:
list
(
self
.
sentiment_map
.
values
()),
"is_initialized"
:
self
.
is_initialized
,
"device"
:
str
(
self
.
device
)
if
self
.
device
else
"未设置"
"device"
:
str
(
self
.
device
)
if
self
.
device
else
"未设置"
,
}
...
...
@@ -576,20 +631,23 @@ def enable_sentiment_analysis() -> bool:
return
multilingual_sentiment_analyzer
.
enable
()
def
disable_sentiment_analysis
(
reason
:
Optional
[
str
]
=
None
,
drop_state
:
bool
=
False
)
->
None
:
def
disable_sentiment_analysis
(
reason
:
Optional
[
str
]
=
None
,
drop_state
:
bool
=
False
)
->
None
:
"""Public helper to disable sentiment analysis at runtime."""
multilingual_sentiment_analyzer
.
disable
(
reason
=
reason
,
drop_state
=
drop_state
)
def
analyze_sentiment
(
text_or_texts
:
Union
[
str
,
List
[
str
]],
initialize_if_needed
:
bool
=
True
)
->
Union
[
SentimentResult
,
BatchSentimentResult
]:
def
analyze_sentiment
(
text_or_texts
:
Union
[
str
,
List
[
str
]],
initialize_if_needed
:
bool
=
True
)
->
Union
[
SentimentResult
,
BatchSentimentResult
]:
"""
便捷的情感分析函数
Args:
text_or_texts: 单个文本或文本列表
initialize_if_needed: 如果模型未初始化,是否自动初始化
Returns:
SentimentResult或BatchSentimentResult
"""
...
...
@@ -599,7 +657,7 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
and
not
multilingual_sentiment_analyzer
.
is_disabled
):
multilingual_sentiment_analyzer
.
initialize
()
if
isinstance
(
text_or_texts
,
str
):
return
multilingual_sentiment_analyzer
.
analyze_single_text
(
text_or_texts
)
else
:
...
...
@@ -610,24 +668,30 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
if
__name__
==
"__main__"
:
# 测试代码
analyzer
=
WeiboMultilingualSentimentAnalyzer
()
if
analyzer
.
initialize
():
# 测试单个文本
result
=
analyzer
.
analyze_single_text
(
"今天天气真好,心情特别棒!"
)
print
(
f
"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})"
)
print
(
f
"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})"
)
# 测试批量文本
test_texts
=
[
"这家餐厅的菜味道非常棒!"
,
"服务态度太差了,很失望"
,
"I absolutely love this product!"
,
"The customer service was disappointing."
"The customer service was disappointing."
,
]
batch_result
=
analyzer
.
analyze_batch
(
test_texts
)
print
(
f
"
\n
批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}"
)
print
(
f
"
\n
批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}"
)
for
result
in
batch_result
.
results
:
print
(
f
"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})"
)
print
(
f
"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})"
)
else
:
print
(
"模型初始化失败,无法进行测试"
)
...
...
Please
register
or
login
to post a comment