Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
戒酒的李白
2025-01-09 23:08:55 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
a30773715e9946b41ae9c935fc3fd89ddf80d070
a3077371
1 parent
ba56f3b0
Replace the hard-coded database credentials with an interactive command-line database connection prompt.
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
304 additions
and
66 deletions
.gitignore
spider/saveData.py
utils/query.py
wordCloudPicture.py
.gitignore
View file @
a307737
...
...
@@ -12,3 +12,4 @@ model2/*
*.pyz
*.pywz
.vscode
.VSCodeCounter
\ No newline at end of file
...
...
spider/saveData.py
View file @
a307737
import
os
from
sqlalchemy
import
create_engine
import
pandas
as
pd
from
spiderDataPackage.settings
import
articleAddr
,
commentsAddr
# from ..model.topicDefine import *
from
sqlalchemy
import
create_engine
from
getpass
import
getpass
import
logging
# 配置日志
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
'
%(asctime)
s [
%(levelname)
s]
%(message)
s'
,
handlers
=
[
logging
.
FileHandler
(
"save_data.log"
),
logging
.
StreamHandler
()
]
)
# NOTE(review): module-level engine with the user name, password and host
# embedded in the URL. saveData(engine) and main() in this file obtain their
# engine from get_db_connection_interactive(), so this looks like a leftover —
# confirm nothing else imports it, then remove it and rotate the password.
engine
=
create_engine
(
'mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4'
)
# 假设 articleAddr 和 commentsAddr 是绝对路径或相对于脚本的路径
from
spiderDataPackage.settings
import
articleAddr
,
commentsAddr
def get_db_connection_interactive():
    """Prompt on the terminal for MySQL settings and return a SQLAlchemy engine.

    Pressing Enter at a prompt accepts the default shown in that prompt. The
    password is read with ``getpass`` so it is not echoed. A non-numeric port
    is replaced by 3306 after logging a warning.

    Returns:
        The engine built by ``create_engine`` once a test connection has been
        opened and closed successfully.

    Raises:
        SystemExit: status 1 when the engine cannot be created or the test
            connection fails; the error is logged first.
    """
    # Local imports keep this block independent of the file's import header.
    import sys
    from urllib.parse import quote_plus

    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
    try:
        port = int(port_str)
    except ValueError:
        logging.warning("端口号无效,使用默认端口 3306。")
        port = 3306
    user = input(" 3. 用户名 (默认: root): ") or "root"
    password = getpass(" 4. 密码 (默认: 12345678): ") or "12345678"
    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"

    # URL-escape the credentials: '@', ':' or '/' inside a user name or
    # password would otherwise be parsed as URL delimiters.
    connection_str = (
        f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}"
        f"@{host}:{port}/{db_name}?charset=utf8mb4"
    )

    try:
        engine = create_engine(connection_str)
        # Open and immediately release one connection to validate the settings.
        with engine.connect():
            logging.info(f"成功连接到数据库: {user}@{host}:{port}/{db_name}")
        return engine
    except Exception as e:
        logging.error(f"无法连接到数据库: {e}")
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is not guaranteed to exist.
        sys.exit(1)
def saveData(engine):
    """Merge newly scraped CSV rows into the database, then delete the CSVs.

    Steps, all performed through *engine*:

    1. Read the existing ``article`` and ``comments`` tables.
    2. Read the new rows from the CSV files at ``articleAddr`` and
       ``commentsAddr``.
    3. Concatenate new rows followed by old rows and drop duplicates,
       articles by ``id`` and comments by ``content``, keeping the last
       occurrence (i.e. the row that was already in the database).
    4. Write both frames back with ``if_exists='replace'``.

    Failures are logged and not re-raised. The CSV files are removed only
    when every step above succeeded.

    NOTE(review): a failure while reading the old tables is only logged and
    nothing is written; confirm that is acceptable for a first run where the
    tables may not exist yet.

    :param engine: SQLAlchemy engine used for all reads and writes
    """
    try:
        # Existing rows.
        oldArticle = pd.read_sql('SELECT * FROM article', engine)
        oldComment = pd.read_sql('SELECT * FROM comments', engine)
        logging.info("成功从数据库读取旧的文章和评论数据。")

        # Newly scraped rows.
        newArticle = pd.read_csv(articleAddr)
        newComment = pd.read_csv(commentsAddr)
        logging.info("成功从CSV文件读取新的文章和评论数据。")

        # New rows first, so keep='last' below prefers the stored row.
        mergeArticle = pd.concat([newArticle, oldArticle], ignore_index=True, sort=False)
        mergeComment = pd.concat([newComment, oldComment], ignore_index=True, sort=False)
        logging.info("成功合并新旧文章和评论数据。")

        mergeArticle.drop_duplicates(subset='id', keep='last', inplace=True)
        mergeComment.drop_duplicates(subset='content', keep='last', inplace=True)
        logging.info("成功去除重复的文章和评论数据。")

        mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False)
        mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False)
        logging.info("成功将合并后的数据保存回数据库。")
    except pd.errors.EmptyDataError as e:
        logging.error(f"读取CSV文件时出错: {e}")
    except Exception as e:
        logging.error(f"保存数据时出错: {e}")
    else:
        # Only discard the CSV inputs once the data is safely stored.
        try:
            os.remove(articleAddr)
            os.remove(commentsAddr)
            logging.info("成功删除CSV文件。")
        except Exception as e:
            logging.warning(f"删除CSV文件时出错: {e}")
def main():
    """Prompt for a database connection, run ``saveData`` and release the engine."""
    engine = get_db_connection_interactive()
    try:
        saveData(engine)
    finally:
        # Dispose of the connection pool even if saveData raises unexpectedly.
        engine.dispose()
        logging.info("数据库连接已关闭。")
# Script entry point: main() obtains the engine and passes it to saveData().
if __name__ == '__main__':
    main()
...
...
utils/query.py
View file @
a307737
from
pymysql
import
*
# NOTE(review): connection opened at import time with hard-coded host, user and
# password. `conn` is re-assigned further down from
# get_db_connection_interactive(), so this looks like a leftover — confirm,
# then remove it and rotate the exposed password.
conn
=
connect
(
host
=
'47.92.235.6'
,
port
=
3306
,
user
=
'XiaoXueQi'
,
password
=
'XiaoXueQi'
,
database
=
'Weibo_PublicOpinion_AnalysisSystem'
)
import
getpass
import
pymysql
import
logging
# 配置日志
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
'
%(asctime)
s [
%(levelname)
s]
%(message)
s'
,
handlers
=
[
logging
.
FileHandler
(
"database_operations.log"
),
logging
.
StreamHandler
()
]
)
def get_db_connection_interactive():
    """Prompt on the terminal for MySQL settings and return a PyMySQL connection.

    Pressing Enter at a prompt accepts the default shown in that prompt. The
    password is read with ``getpass.getpass`` so it is not echoed. A
    non-numeric port is replaced by 3306 after logging a warning.

    Returns:
        A ``pymysql`` connection opened with ``charset='utf8mb4'`` and
        ``DictCursor`` as the cursor class.

    Raises:
        SystemExit: status 1 when ``pymysql.connect`` raises
            ``pymysql.MySQLError``; the error is logged first.
    """
    # Local import keeps this block independent of the file's import header.
    import sys

    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
    try:
        port = int(port_str)
    except ValueError:
        logging.warning("端口号无效,使用默认端口 3306。")
        port = 3306
    user = input(" 3. 用户名 (默认: root): ") or "root"
    password = getpass.getpass(" 4. 密码 (默认: 312517): ") or "312517"
    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"

    logging.info(f"尝试连接到数据库: {user}@{host}:{port}/{db_name}")
    try:
        connection = pymysql.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            database=db_name,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
        )
        logging.info("数据库连接成功。")
        return connection
    except pymysql.MySQLError as e:
        logging.error(f"数据库连接失败: {e}")
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is not guaranteed to exist.
        sys.exit(1)
# Open the module-wide database connection. This runs at import time, so
# importing this module prompts on the terminal.
conn
=
get_db_connection_interactive
()
# Shared cursor created from that connection.
cursor
=
conn
.
cursor
()
def query(sql, params=None, query_type="no_select"):
    """Execute one SQL statement on the module-level connection and commit.

    The third parameter is still accepted positionally, so calls written as
    ``query(sql, params, "select")`` keep working.

    :param sql: SQL statement, optionally containing ``%s`` placeholders
    :param params: optional sequence of placeholder values; when empty or
        ``None`` the statement is executed without parameters
    :param query_type: ``"no_select"`` (default) just commits; any other value
        fetches all rows before committing
    :return: the fetched rows when ``query_type`` is not ``"no_select"``;
        ``None`` otherwise, and also ``None`` when a ``pymysql.MySQLError``
        occurred (the error is logged and the transaction rolled back)
    """
    try:
        # Verify (and if needed re-establish) the connection BEFORE running
        # the statement; pinging afterwards could reconnect and silently drop
        # the work that was just executed.
        conn.ping(reconnect=True)

        if params:
            cursor.execute(sql, tuple(params))
        else:
            cursor.execute(sql)

        if query_type != "no_select":
            data_list = cursor.fetchall()
            conn.commit()
            logging.info("查询成功,已获取数据。")
            return data_list

        conn.commit()
        logging.info("操作成功,已提交事务。")
    except pymysql.MySQLError as e:
        logging.error(f"执行SQL时出错: {e}")
        conn.rollback()
    return None
def main():
    """Example usage: run one SELECT and one INSERT, then close the connection.

    NOTE(review): ``query`` returns ``None`` both for a successful non-select
    statement and after a logged error, so the "insert finished" message below
    does not prove the insert succeeded.
    """
    try:
        # Read a few rows.
        select_sql = "SELECT * FROM article LIMIT 5"
        articles = query(select_sql, query_type="select")
        if articles:
            for article in articles:
                print(article)

        # Insert one row (adjust the columns to the real table schema).
        insert_sql = "INSERT INTO article (id, content) VALUES (%s, %s)"
        new_article = (12345, "这是一条新的文章内容。")
        result = query(insert_sql, params=new_article, query_type="no_select")
        if result is None:
            logging.info("插入操作完成。")
    finally:
        # Release the shared cursor and connection even if a step above raised.
        cursor.close()
        conn.close()
        logging.info("数据库连接已关闭。")
if
__name__
==
'__main__'
:
main
()
...
...
wordCloudPicture.py
View file @
a307737
import
os
import
jieba
from
wordcloud
import
WordCloud
import
matplotlib.pyplot
as
plt
from
PIL
import
Image
,
ImageDraw
from
pymysql
import
*
import
json
from
PIL
import
Image
import
numpy
as
np
def stopWordList():
    """Return the stripped lines of ``./model/stopWords.txt`` as a list.

    The file is read as UTF-8 and closed before returning; blank lines are
    kept as empty strings.
    """
    with open('./model/stopWords.txt', encoding='utf8') as f:
        return [line.strip() for line in f.readlines()]
def
get_img
(
field
,
tableName
,
targetImgSrc
,
resImgSrc
):
con
=
connect
(
host
=
'47.92.235.6'
,
user
=
'XiaoXueQi'
,
password
=
'XiaoXueQi'
,
database
=
'Weibo_PublicOpinion_AnalysisSystem'
,
port
=
3306
,
charset
=
'utf8mb4'
)
cuser
=
con
.
cursor
()
sql
=
f
'select {field} from {tableName}'
cuser
.
execute
(
sql
)
data
=
cuser
.
fetchall
()
text
=
''
for
item
in
data
:
text
+=
item
[
0
]
cuser
.
close
()
con
.
close
()
import
pymysql
cut
=
jieba
.
cut
(
text
)
newCut
=
[]
for
word
in
cut
:
if
word
not
in
stopWordList
():
newCut
.
append
(
word
)
string
=
' '
.
join
(
newCut
)
def stopWordList():
    """Load the stop words from ``./model/stopWords.txt``, one per line.

    If the file is large or read frequently, consider caching the result to
    avoid re-reading the file on every call.
    """
    with open('./model/stopWords.txt', encoding='utf8') as handle:
        words = []
        for raw_line in handle:
            words.append(raw_line.strip())
    return words
img
=
Image
.
open
(
targetImgSrc
)
def generate_word_cloud(text, mask_path, font_path, output_path):
    """Render *text* as a word cloud shaped by a mask image and save it.

    :param text: text handed to ``WordCloud.generate_from_text``
    :param mask_path: image file whose pixel array is used as the cloud mask
    :param font_path: font file passed to ``WordCloud``
    :param output_path: destination file; saved at 300 dpi with a tight
        bounding box
    """
    # Copy the pixels into an array and close the image file right away.
    with Image.open(mask_path) as img:
        img_arr = np.array(img)

    wc = WordCloud(
        background_color="#fff",
        mask=img_arr,
        font_path=font_path,
    )
    wc.generate_from_text(text)

    plt.figure(figsize=(8, 6))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    # Close the figure after saving so repeated calls do not accumulate figures.
    plt.close()
def get_db_connection_interactive():
    """Prompt on the terminal for MySQL settings and return a PyMySQL connection.

    Pressing Enter at a prompt accepts the default shown in that prompt. The
    password is read without echo. A non-numeric port is replaced by 3306
    after printing a warning.

    Returns:
        The connection returned by ``pymysql.connect`` (``charset='utf8mb4'``).
        Connection errors from ``pymysql.connect`` propagate to the caller.
    """
    # Local import: getpass is not among this file's top-level imports.
    from getpass import getpass as read_password

    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
    try:
        port = int(port_str)
    except ValueError:
        # Fall back instead of crashing on a typo.
        print("端口号无效,使用默认端口 3306。")
        port = 3306
    user = input(" 3. 用户名 (默认: root): ") or "root"
    # The prompt now names the default that is actually applied (12345678).
    password = read_password(" 4. 密码 (默认: 12345678): ") or "12345678"
    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"

    print(f"\n即将连接到数据库: {user}@{host}:{port}/{db_name}\n")

    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=db_name,
        port=port,
        charset='utf8mb4',
    )
def get_img(field, table_name, target_img_src, res_img_src, connection, font_path='STHUPO.TTF'):
    """Build a word cloud from one text column of a database table.

    Fetches every value of *field* from *table_name*, concatenates them,
    segments the text with jieba, drops stop words and renders the result
    with ``generate_word_cloud``.

    :param field: column name to read
    :param table_name: table to read from
    :param target_img_src: mask image that shapes the cloud
    :param res_img_src: output image path
    :param connection: open database connection; it is not closed here
    :param font_path: font file used for rendering
    """
    # NOTE(review): column and table names cannot be bound as query
    # parameters, so they are interpolated; pass trusted identifiers only.
    sql = f'SELECT {field} FROM {table_name}'

    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        data = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    # Each row is a one-element tuple; skip NULL/empty values and join once
    # instead of growing a string inside a loop.
    text = ''.join(item[0] for item in data if item[0])

    # Segment and filter; a set makes each stop-word lookup O(1).
    cut_words = jieba.cut(text)
    stop_words = set(stopWordList())
    filtered_words = [word for word in cut_words if word not in stop_words]
    final_text = ' '.join(filtered_words)

    generate_word_cloud(final_text, target_img_src, font_path, res_img_src)
def main():
    """Prompt for a database connection and generate the article word cloud."""
    # 1. Obtain the connection interactively.
    connection = get_db_connection_interactive()

    # 2. Build the cloud from the `content` column of the `article` table.
    try:
        get_img(
            field='content',
            table_name='article',
            target_img_src='./static/content.jpg',
            res_img_src='./static/contentCloud.jpg',
            connection=connection,
        )
        print("词云生成完毕!")
    finally:
        # Always release the connection, even if generation fails.
        connection.close()
if
__name__
==
'__main__'
:
main
()
...
...
Please
register
or
login
to post a comment