戒酒的李白
Committed by GitHub

Merge pull request #14 from tianjy12/main

Removed unnecessary introduction and optimized the word cloud generation component. Added dynamic scheduling functionality to adjust crawler execution intervals based on crawl duration and data volume.
1 -### 项目中的大部分UI元素使用该模板搭建  
2 -  
3 -模板链接:https://iqonic.design/product/admin-templates/datum-crm-admin-deshboard-template/  
1 -2025-01-09 23:29:06,246 [INFO] 尝试连接到数据库: root@localhost:3306/Weibo_PublicOpinion_AnalysisSystem  
2 -2025-01-09 23:29:06,346 [ERROR] 数据库连接失败: (1045, "Access denied for user 'root'@'localhost' (using password: YES)")  
@@ -7,6 +7,8 @@ import subprocess @@ -7,6 +7,8 @@ import subprocess
7 from flask import Flask, session, request, redirect, render_template 7 from flask import Flask, session, request, redirect, render_template
8 from apscheduler.schedulers.background import BackgroundScheduler 8 from apscheduler.schedulers.background import BackgroundScheduler
9 from pytz import utc 9 from pytz import utc
  10 +from datetime import datetime, timedelta
  11 +import time
10 12
11 # 初始化日志记录 13 # 初始化日志记录
12 logging.basicConfig( 14 logging.basicConfig(
@@ -153,11 +155,90 @@ def run_script(): @@ -153,11 +155,90 @@ def run_script():
153 # 执行所有脚本 155 # 执行所有脚本
154 for script_name, script_path in scripts: 156 for script_name, script_path in scripts:
155 try: 157 try:
156 - print(f"Running {script_name}...") # 打印运行开始的信息 158 + logging.info(f"Running {script_name}...")
157 subprocess.run(['python', script_path], check=True) # 使用 subprocess 执行脚本 159 subprocess.run(['python', script_path], check=True) # 使用 subprocess 执行脚本
158 - print(f"{script_name} finished successfully.") # 打印脚本成功完成的消息 160 + logging.info(f"{script_name} finished successfully.")
159 except subprocess.CalledProcessError as e: 161 except subprocess.CalledProcessError as e:
160 - print(f"An error occurred while running {script_name}: {e}") # 打印错误信息 162 + logging.error(f"An error occurred while running {script_name}: {e}")
  163 +
  164 +# 新增功能:动态调度爬虫脚本
  165 +def check_database_empty():
  166 + """
  167 + 检查数据库中的指定表是否为空。
  168 +
  169 + :return: 如果表为空则返回 True,否则返回 False
  170 + """
  171 + try:
  172 + connection = pymysql.connect(**DB_CONFIG)
  173 + with connection.cursor() as cursor:
  174 + cursor.execute("SELECT COUNT(*) as count FROM article")
  175 + result = cursor.fetchone()
  176 + count = result['count'] if result and 'count' in result else 0
  177 + logging.info(f"数据库中共有 {count} 条记录。")
  178 + return count == 0
  179 + except pymysql.MySQLError as e:
  180 + logging.error(f"检查数据库失败: {e}")
  181 + return True # 连接失败时假设数据库为空,以防止阻塞
  182 + finally:
  183 + if 'connection' in locals():
  184 + connection.close()
  185 +
  186 +def dynamic_crawl():
  187 + """
  188 + 执行爬取任务并根据爬取耗时和获取的数据量动态调度下次爬取时间。
  189 + """
  190 + try:
  191 + start_time = time.time()
  192 + logging.info("开始爬取数据。")
  193 +
  194 + run_script() # 执行爬虫脚本
  195 +
  196 + end_time = time.time()
  197 + duration = end_time - start_time # 爬取耗时
  198 +
  199 + # 获取爬取后数据库中记录的数量作为数据量
  200 + try:
  201 + connection = pymysql.connect(**DB_CONFIG)
  202 + with connection.cursor() as cursor:
  203 + cursor.execute("SELECT COUNT(*) as count FROM article")
  204 + result = cursor.fetchone()
  205 + data_fetched = result['count'] if result and 'count' in result else 0
  206 + logging.info(f"爬取完成,耗时 {duration:.2f} 秒,数据库中共有 {data_fetched} 条记录。")
  207 + except pymysql.MySQLError as e:
  208 + logging.error(f"获取数据量失败: {e}")
  209 + data_fetched = 0
  210 + finally:
  211 + if 'connection' in locals():
  212 + connection.close()
  213 +
  214 + # 根据爬取耗时和数据量调整下次爬取时间
  215 + base_interval = 5 * 60 * 60 # 5小时的基础时间间隔(秒)
  216 +
  217 + if duration > 3600: # 爬取耗时超过1小时
  218 + next_interval = base_interval + duration
  219 + logging.info(f"检测到长时间爬取。下次爬取将在 {next_interval/3600:.2f} 小时后执行。")
  220 + elif data_fetched < 50: # 获取的数据量少于50条
  221 + next_interval = base_interval / 2
  222 + logging.info(f"获取数据量较少。下次爬取将在 {next_interval/60:.2f} 分钟后执行。")
  223 + else:
  224 + next_interval = base_interval
  225 + logging.info(f"标准爬取完成。下次爬取将在 {next_interval/3600:.2f} 小时后执行。")
  226 +
  227 + # 安排下次爬取任务
  228 + scheduler.add_job(dynamic_crawl, 'date', run_date=datetime.now() + timedelta(seconds=next_interval), id='dynamic_crawl')
  229 +
  230 + except Exception as e:
  231 + logging.error(f"动态爬取过程中发生错误: {e}")
  232 +
# Database settings used by the dynamic-scheduling helpers.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be edited into the source; the literals below
# remain the defaults when a variable is not set.
import os  # imported here so this block does not rely on an earlier import

DB_CONFIG = {
    'host': os.environ.get('WEIBO_DB_HOST', 'localhost'),
    'user': os.environ.get('WEIBO_DB_USER', 'root'),
    'password': os.environ.get('WEIBO_DB_PASSWORD', '12345678'),
    'database': os.environ.get('WEIBO_DB_NAME', 'Weibo_PublicOpinion_AnalysisSystem'),
    'port': int(os.environ.get('WEIBO_DB_PORT', '3306')),
    'charset': 'utf8mb4',
}
161 242
162 # 主程序入口 243 # 主程序入口
163 if __name__ == '__main__': 244 if __name__ == '__main__':
@@ -174,11 +255,19 @@ if __name__ == '__main__': @@ -174,11 +255,19 @@ if __name__ == '__main__':
174 connection.close() 255 connection.close()
175 logging.info("数据库连接已关闭。") 256 logging.info("数据库连接已关闭。")
176 257
177 - # 设置定时任务,定期执行爬虫脚本 258 + # 设置定时任务,动态执行爬虫脚本
178 scheduler = BackgroundScheduler(timezone=utc) # 创建后台任务调度器 259 scheduler = BackgroundScheduler(timezone=utc) # 创建后台任务调度器
179 - scheduler.add_job(run_script, 'interval', hours=5) # 每5小时执行一次爬虫脚本  
180 scheduler.start() # 启动调度器 260 scheduler.start() # 启动调度器
181 - 261 +
  262 + # 初始化调度:如果数据库为空,立即爬取;否则,按照基础时间间隔安排首次爬取
  263 + if check_database_empty():
  264 + logging.info("数据库为空。立即开始初始爬取。")
  265 + dynamic_crawl()
  266 + else:
  267 + logging.info("数据库已有数据。安排首次爬取。")
  268 + base_interval = 5 * 60 * 60 # 5小时
  269 + scheduler.add_job(dynamic_crawl, 'date', run_date=datetime.now() + timedelta(seconds=base_interval), id='dynamic_crawl')
  270 +
182 try: 271 try:
183 app.run() # 启动 Flask 应用 272 app.run() # 启动 Flask 应用
184 finally: 273 finally:
@@ -5,95 +5,180 @@ import matplotlib.pyplot as plt @@ -5,95 +5,180 @@ import matplotlib.pyplot as plt
5 from PIL import Image 5 from PIL import Image
6 import numpy as np 6 import numpy as np
7 import pymysql 7 import pymysql
8 -  
9 -def stopWordList(): 8 +import logging
  9 +
# Configure logging: INFO and above go to both a log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII text in messages (paths, database
        # errors) is written the same way whatever the platform default is.
        logging.FileHandler("wordcloud_generator.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
  19 +
# Module-level cache of stop words; filled by the first successful load.
STOP_WORDS = set()


def load_stop_words():
    """
    Load the stop words from ./model/stopWords.txt and cache them.

    The file is read once; later calls return the cached set. If the file is
    missing or cannot be read, an error is logged and an empty set is
    returned, and the next call tries again.

    :return: Set of stop words (empty if none could be loaded)
    """
    global STOP_WORDS
    if STOP_WORDS:
        return STOP_WORDS

    stop_words_path = './model/stopWords.txt'
    try:
        # Open directly rather than checking for existence first, which
        # avoids a check-then-open race.
        with open(stop_words_path, encoding='utf8') as f:
            stripped_lines = (line.strip() for line in f)
            STOP_WORDS = {word for word in stripped_lines if word}
    except FileNotFoundError:
        logging.error(f"Stop words file does not exist: {stop_words_path}")
        return set()
    except (OSError, UnicodeDecodeError) as e:
        logging.error(f"Failed to load stop words file: {e}")
        return STOP_WORDS

    logging.info(f"Loaded {len(STOP_WORDS)} stop words")
    return STOP_WORDS
16 42
def generate_word_cloud(text, mask_path, font_path, output_path):
    """
    Generate a word cloud from text and save it as an image.

    Every failure (missing mask, unreadable image, generation or save error)
    is logged and the function returns without raising.

    :param text: Processed text, words separated by whitespace
    :param mask_path: Path to the mask image that shapes the cloud
    :param font_path: Path to the font file used to render words
    :param output_path: Path where the generated image is written
    :return: None
    """
    try:
        # The context manager releases the image file handle once the pixel
        # data has been copied into the array.
        with Image.open(mask_path) as img:
            img_arr = np.array(img)
    except FileNotFoundError:
        logging.error(f"Mask image file does not exist: {mask_path}")
        return
    except Exception as e:
        logging.error(f"Failed to load mask image: {e}")
        return
    logging.info(f"Successfully loaded mask image: {mask_path}")

    try:
        # NOTE(review): the wordcloud docs say width/height are ignored when
        # a mask is given; they are kept here unchanged - confirm and drop.
        wc = WordCloud(
            background_color="#fff",
            mask=img_arr,
            font_path=font_path,
            max_words=2000,
            max_font_size=100,
            random_state=42,  # fixed seed so repeated runs give the same layout
            width=800,
            height=600,
        )
        wc.generate_from_text(text)
    except Exception as e:
        logging.error(f"Failed to generate word cloud: {e}")
        return
    logging.info("Word cloud generated successfully")

    fig = None
    try:
        fig = plt.figure(figsize=(8, 6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        logging.info(f"Word cloud saved to: {output_path}")
    except Exception as e:
        logging.error(f"Failed to save word cloud image: {e}")
    finally:
        # Close the figure even when saving fails so it is not leaked.
        if fig is not None:
            plt.close(fig)
48 89
49 - print(f"\n即将连接到数据库: {user}@{host}:{port}/{db_name}\n") 90 +def get_db_connection_interactive():
  91 + """
  92 + Interactively obtain database connection parameters from the terminal.
  93 + Press Enter to use default values.
50 94
51 - return pymysql.connect(  
52 - host=host,  
53 - user=user,  
54 - password=password,  
55 - database=db_name,  
56 - port=port,  
57 - charset='utf8mb4'  
58 - )  
59 -  
60 -def get_img(field, table_name, target_img_src, res_img_src, connection, font_path='STHUPO.TTF'):  
61 - """  
62 - 从数据库拉取指定字段的文本数据,分词处理后生成词云。  
63 - :param field: 数据库字段名  
64 - :param table_name: 数据表名  
65 - :param target_img_src: 词云形状图  
66 - :param res_img_src: 输出词云文件路径  
67 - :param connection: 已建立的数据库连接  
68 - :param font_path: 字体文件路径 95 + :return: pymysql.connections.Connection object
69 """ 96 """
70 - cursor = connection.cursor()  
71 - sql = f'SELECT {field} FROM {table_name}'  
72 - cursor.execute(sql)  
73 - data = cursor.fetchall() 97 + print("Please enter database connection information (press Enter to use default values):")
74 98
75 - text = ''  
76 - for item in data:  
77 - text += item[0] # item 是元组 (内容,),取第一个元素即可 99 + host = input(" 1. Host (default: localhost): ") or "localhost"
  100 + port_str = input(" 2. Port (default: 3306): ") or "3306"
  101 + try:
  102 + port = int(port_str)
  103 + except ValueError:
  104 + logging.error(f"Invalid port number: {port_str}")
  105 + port = 3306
  106 +
  107 + user = input(" 3. Username (default: root): ") or "root"
  108 + password = input(" 4. Password (default: 12345678): ") or "12345678"
  109 + db_name = input(" 5. Database name (default: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"
78 110
79 - cursor.close() 111 + logging.info(f"Attempting to connect to database: {user}@{host}:{port}/{db_name}")
80 112
81 - # 分词 & 去停用词  
82 - cut_words = jieba.cut(text)  
83 - stop_words = set(stopWordList())  
84 - filtered_words = [word for word in cut_words if word not in stop_words]  
85 - final_text = ' '.join(filtered_words) 113 + try:
  114 + connection = pymysql.connect(
  115 + host=host,
  116 + user=user,
  117 + password=password,
  118 + database=db_name,
  119 + port=port,
  120 + charset='utf8mb4'
  121 + )
  122 + logging.info("Database connection successful")
  123 + return connection
  124 + except pymysql.MySQLError as e:
  125 + logging.error(f"Database connection failed: {e}")
  126 + raise
86 127
def get_img(field, table_name, target_img_src, res_img_src, connection, font_path='STHUPO.TTF'):
    """
    Build a word cloud from one text column of a database table.

    Rows are fetched, segmented with jieba, filtered against the stop-word
    set and passed to generate_word_cloud. Query and text-processing failures
    are logged and end the function early without raising.

    :param field: Database field name
    :param table_name: Database table name
    :param target_img_src: Path to the mask image
    :param res_img_src: Path to save the generated word cloud image
    :param connection: Established database connection
    :param font_path: Path to the font file
    :return: None
    """
    try:
        with connection.cursor() as cursor:
            # NOTE: identifiers cannot be bound as query parameters, so field
            # and table_name are interpolated; pass only trusted values.
            sql = f'SELECT {field} FROM {table_name}'
            cursor.execute(sql)
            data = cursor.fetchall()
        logging.info(f"Fetched {len(data)} records from '{table_name}' table, field '{field}'")
    except pymysql.MySQLError as e:
        logging.error(f"Database query failed: {e}")
        return

    # Separate records with a newline so the last word of one record cannot
    # merge with the first word of the next during segmentation.
    text = '\n'.join(item[0] for item in data if item[0])
    if not text.strip():
        logging.warning(f"No text found in '{table_name}' table, field '{field}'; skipping word cloud generation")
        return

    # Tokenization & stop word removal
    try:
        stop_words = load_stop_words()
        if not stop_words:
            logging.warning("Stop words set is empty, proceeding without stop word removal")
        # Whitespace-only tokens (including the record separators) are dropped.
        filtered_words = [
            word for word in jieba.cut(text)
            if word.strip() and word not in stop_words
        ]
        final_text = ' '.join(filtered_words)
        logging.info(f"Completed tokenization and stop word removal, generated {len(filtered_words)} words")
    except Exception as e:
        logging.error(f"Text processing failed: {e}")
        return

    if not filtered_words:
        # Nothing is left to draw once stop words are removed.
        logging.warning("No words left after stop word removal; skipping word cloud generation")
        return

    # Generate word cloud
    generate_word_cloud(final_text, target_img_src, font_path, res_img_src)
89 167
90 def main(): 168 def main():
91 - # 1. 获取数据库连接(交互式输入)  
92 - connection = get_db_connection_interactive() 169 + """
  170 + Main function to execute the word cloud generation process.
  171 + """
  172 + try:
  173 + # Obtain database connection interactively
  174 + connection = get_db_connection_interactive()
  175 + except Exception:
  176 + logging.error("Failed to establish database connection, terminating program")
  177 + return
93 178
94 - # 2. 根据需求生成词云  
95 - # 例如:从 article 表的 content 字段生成词云  
96 try: 179 try:
  180 + # Generate word cloud as per requirements
  181 + # Example: Generate word cloud from 'content' field in 'article' table
97 get_img( 182 get_img(
98 field='content', 183 field='content',
99 table_name='article', 184 table_name='article',
@@ -101,10 +186,16 @@ def main(): @@ -101,10 +186,16 @@ def main():
101 res_img_src='./static/contentCloud.jpg', 186 res_img_src='./static/contentCloud.jpg',
102 connection=connection 187 connection=connection
103 ) 188 )
104 - print("词云生成完毕!") 189 + print("Word cloud generation completed!")
  190 + except Exception as e:
  191 + logging.error(f"An error occurred during word cloud generation: {e}")
105 finally: 192 finally:
106 - # 关闭数据库连接  
107 - connection.close() 193 + # Close the database connection
  194 + try:
  195 + connection.close()
  196 + logging.info("Database connection closed")
  197 + except Exception as e:
  198 + logging.error(f"Error closing database connection: {e}")
108 199
109 if __name__ == '__main__': 200 if __name__ == '__main__':
110 main() 201 main()