Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
redhongx
2024-07-03 09:56:18 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
d59cf9aaa7cef4c937354e1a56d1d1aa4c7fc786
d59cf9aa
1 parent
deefba1a
【yuqing.py】更新代码,能在文件夹下直接运行
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
55 additions
and
48 deletions
model/yuqing.py
model/yuqing.py
View file @
d59cf9a
import
pandas
as
pd
# 用于数据处理
import
numpy
as
np
# 用于科学计算
import
csv
# 用于读取CSV文件
from
snownlp
import
SnowNLP
# 用于中文自然语言处理(此处未实际使用)
from
sklearn.feature_extraction.text
import
TfidfVectorizer
# 用于文本特征提取
from
sklearn.naive_bayes
import
MultinomialNB
# 用于多项式朴素贝叶斯分类
from
sklearn.model_selection
import
train_test_split
# 用于划分训练集和测试集
from
sklearn.metrics
import
accuracy_score
# 用于计算模型准确度
def getSentiment_data(path='./target.csv'):
    """Load the labelled sentiment rows from a CSV file.

    Args:
        path: CSV file to read. Defaults to ``./target.csv``, which is
            resolved against the current working directory.

    Returns:
        A list with one entry per CSV record, in file order; each entry is
        the list of string fields of that record.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks come back unchanged.
    with open(path, 'r', encoding='utf8', newline='') as readerFile:
        return list(csv.reader(readerFile))
def model_train():
    """Train a TF-IDF + multinomial naive Bayes sentiment classifier.

    Rows are loaded with ``getSentiment_data()`` and treated as
    ``(text, sentiment)`` pairs. 20% of the rows are held out as a test set
    (fixed ``random_state=42`` so the split is reproducible), the vectorizer
    is fitted on the training text only, and the classifier is scored on the
    held-out rows.

    Returns:
        The accuracy of the classifier on the held-out test set, as computed
        by ``accuracy_score``.
    """
    sentiment_data = getSentiment_data()
    df = pd.DataFrame(sentiment_data, columns=['text', 'sentiment'])

    # Hold out 20% of the rows for evaluation.
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

    # Fit the vocabulary on the training text only, then reuse it for the
    # test text so no test information leaks into the features.
    # NOTE(review): the default TfidfVectorizer tokenizer splits on word
    # boundaries; unsegmented Chinese text may end up as very coarse tokens.
    # Confirm whether word segmentation should be applied first.
    vectorize = TfidfVectorizer()
    X_train = vectorize.fit_transform(train_data['text'])
    y_train = train_data['sentiment']
    X_test = vectorize.transform(test_data['text'])
    y_test = test_data['sentiment']

    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    # Return the score instead of discarding it so callers can use it.
    return accuracy_score(y_test, y_pred)
# Script entry point: train the model and compute its accuracy.
if __name__ == "__main__":
    model_train()
from
snownlp
import
SnowNLP
# 引入SnowNLP库,用于中文情感分析
import
csv
# 用于处理CSV文件的读写操作
import
os
# 用于操作系统相关功能
import
sys
import
os
# Make the project root importable so this file can be run directly from
# inside its own folder.

# Absolute path of this file.
current_file_path = os.path.abspath(__file__)

# Directory that contains this file.
parent_dir = os.path.split(current_file_path)[0]

# One level further up: the project root directory.
project_root_dir = os.path.split(parent_dir)[0]

# Register the project root on the Python import path.
sys.path.append(project_root_dir)
# 现在可以导入 utils 目录中的模块了
from
utils.getPublicData
import
getAllCommentsData
# 自定义函数,用于获取评论数据
def targetFile():
    """Label every comment with SnowNLP and append the results to target.csv.

    Comments are fetched with ``getAllCommentsData()``. The element at
    index 4 of each comment record is scored with ``SnowNLP(...).sentiments``
    and labelled:

    * score > 0.5  -> '正面' (positive)
    * score == 0.5 -> '中性' (neutral)
    * score < 0.5  -> '负面' (negative)

    Each ``[content, label]`` pair is appended as one row to ``target.csv``
    in the current working directory. Existing rows in that file are kept.
    """
    target_path = 'target.csv'  # output file, relative to the working directory
    commentsList = getAllCommentsData()

    rateData = []  # [content, label] rows, in the order the comments arrive
    for comment in commentsList:
        # Record layout noted by the original author:
        # articleId | created_at | likes_counts | region | content |
        # authorName | authorGender | authorAddress | authorAvatar
        content = comment[4]
        value = SnowNLP(content).sentiments

        if value > 0.5:
            label = '正面'
        elif value == 0.5:
            label = '中性'
        elif value < 0.5:
            label = '负面'
        else:
            # Not comparable to 0.5 (e.g. NaN): skip the row.
            continue
        rateData.append([content, label])

    # Nothing to write: leave the file system untouched.
    if not rateData:
        return

    # Open the file once and write all rows, instead of reopening it for
    # every single row.
    with open(target_path, 'a+', encoding='utf8', newline='') as f:
        csv.writer(f).writerows(rateData)
def main():
    """Run the data-processing step by calling ``targetFile()``."""
    targetFile()
# Run the main function when this file is executed as a script.
if __name__ == '__main__':
    main()
...
...
Please
register
or
login
to post a comment