# wordCloudPicture.py
import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import pymysql
def stopWordList():
    """Load the stop-word list from ``./model/stopWords.txt``.

    The file is decoded as UTF-8 and read one entry per line. Surrounding
    whitespace is stripped from every line, and lines that are empty after
    stripping are skipped, so the result never contains ``''``.

    The file is re-read on every call; a caller that needs the list more
    than once should keep the returned value rather than call again.

    :return: list of stop words, in file order
    :raises OSError: if the file cannot be opened (the path is relative to
        the current working directory)
    """
    with open('./model/stopWords.txt', encoding='utf8') as f:
        # Iterate the file object directly instead of materialising every
        # line with readlines() first.
        return [word for word in map(str.strip, f) if word]
def generate_word_cloud(text, mask_path, font_path, output_path):
    """Render a word cloud for ``text`` and save it to ``output_path``.

    :param text: text handed to ``WordCloud.generate_from_text``
    :param mask_path: path of the image that is loaded into an array and
        passed to ``WordCloud`` as its ``mask``
    :param font_path: path of the font file passed to ``WordCloud``
    :param output_path: destination file for the rendered figure
    """
    # Open the mask in a context manager so the underlying file handle is
    # released as soon as the pixel data has been copied into the array.
    with Image.open(mask_path) as img:
        img_arr = np.array(img)

    wc = WordCloud(
        background_color="#fff",
        mask=img_arr,
        font_path=font_path,
    )
    wc.generate_from_text(text)

    fig = plt.figure(figsize=(8, 6))
    try:
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Close the figure even when drawing or saving fails; otherwise
        # pyplot keeps it registered and its memory is never released.
        plt.close(fig)
def get_db_connection_interactive():
    """Prompt on the terminal for MySQL connection settings and connect.

    Each prompt accepts an empty answer (just Enter), in which case the
    default shown in the prompt is used. Host, port, user name and database
    name are stripped of surrounding whitespace; the password is used
    exactly as typed.

    :return: the connection object returned by ``pymysql.connect``
    :raises ValueError: if the port is not an integer in the range 1-65535
    """
    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    host = input(" 1. 主机 (默认: localhost): ").strip() or "localhost"

    port_str = input(" 2. 端口 (默认: 3306): ").strip() or "3306"
    try:
        port = int(port_str)
    except ValueError as err:
        raise ValueError(f"端口必须是 1-65535 之间的整数: {port_str!r}") from err
    if not 0 < port < 65536:
        raise ValueError(f"端口必须是 1-65535 之间的整数: {port_str!r}")

    user = input(" 3. 用户名 (默认: root): ").strip() or "root"
    # The prompt previously advertised a different default than the one the
    # code actually falls back to; the text now matches the real default.
    password = input(" 4. 密码 (默认: 12345678): ") or "12345678"
    db_name = (
        input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ").strip()
        or "Weibo_PublicOpinion_AnalysisSystem"
    )

    print(f"\n即将连接到数据库: {user}@{host}:{port}/{db_name}\n")
    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=db_name,
        port=port,
        charset='utf8mb4',
    )
def get_img(field, table_name, target_img_src, res_img_src, connection, font_path='STHUPO.TTF'):
    """Build a word cloud from one text column of a database table.

    Every value of ``field`` in ``table_name`` is fetched, the values are
    segmented with jieba, stop words are removed, and the remaining words
    are rendered with ``generate_word_cloud``.

    :param field: column name to read
    :param table_name: table to read from
    :param target_img_src: path of the mask image for the word cloud
    :param res_img_src: path the rendered word cloud is written to
    :param connection: an already-open database connection; it is not
        closed here
    :param font_path: path of the font file used for rendering
    """
    # NOTE: column and table names cannot be bound as query parameters, so
    # they are interpolated into the statement. Pass trusted values only.
    sql = f'SELECT {field} FROM {table_name}'

    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    # Each row is a one-element tuple. NULL values are skipped (they used to
    # raise TypeError on concatenation). Rows are joined with a newline so
    # the last word of one row cannot fuse with the first word of the next
    # during segmentation.
    text = '\n'.join(str(row[0]) for row in rows if row[0] is not None)

    # Segment, then drop stop words and whitespace-only tokens.
    stop_words = set(stopWordList())
    filtered_words = [
        word
        for word in jieba.cut(text)
        if word.strip() and word not in stop_words
    ]
    final_text = ' '.join(filtered_words)

    generate_word_cloud(final_text, target_img_src, font_path, res_img_src)
def main():
    """Ask for database settings, then render the article-content word cloud.

    The connection is closed when the function finishes, whether or not the
    word cloud was generated successfully.
    """
    connection = get_db_connection_interactive()

    # Word cloud built from the `content` column of the `article` table.
    cloud_args = {
        'field': 'content',
        'table_name': 'article',
        'target_img_src': './static/content.jpg',
        'res_img_src': './static/contentCloud.jpg',
        'connection': connection,
    }
    try:
        get_img(**cloud_args)
        print("词云生成完毕!")
    finally:
        connection.close()
# Run the interactive workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()