Modify the database hardcoding to switch to command-line interactive database connection.
Showing
4 changed files
with
292 additions
and
54 deletions
| 1 | import os | 1 | import os |
| 2 | -from sqlalchemy import create_engine | ||
| 3 | import pandas as pd | 2 | import pandas as pd |
| 4 | -from spiderDataPackage.settings import articleAddr,commentsAddr | ||
| 5 | -# from ..model.topicDefine import * | 3 | +from sqlalchemy import create_engine |
| 4 | +from getpass import getpass | ||
| 5 | +import logging | ||
| 6 | + | ||
| 7 | +# 配置日志 | ||
| 8 | +logging.basicConfig( | ||
| 9 | + level=logging.INFO, | ||
| 10 | + format='%(asctime)s [%(levelname)s] %(message)s', | ||
| 11 | + handlers=[ | ||
| 12 | + logging.FileHandler("save_data.log"), | ||
| 13 | + logging.StreamHandler() | ||
| 14 | + ] | ||
| 15 | +) | ||
| 6 | 16 | ||
| 7 | -engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4') | 17 | +# 假设 articleAddr 和 commentsAddr 是绝对路径或相对于脚本的路径 |
| 18 | +from spiderDataPackage.settings import articleAddr, commentsAddr | ||
| 8 | 19 | ||
| 9 | -def saveData(): | 20 | +def get_db_connection_interactive(): |
| 21 | + """ | ||
| 22 | + 通过终端交互获取数据库连接参数,若按回车则使用默认值。 | ||
| 23 | + 返回 SQLAlchemy 的数据库引擎。 | ||
| 24 | + """ | ||
| 25 | + print("请依次输入数据库连接信息(直接按回车使用默认值):") | ||
| 26 | + | ||
| 27 | + host = input(" 1. 主机 (默认: localhost): ") or "localhost" | ||
| 28 | + port_str = input(" 2. 端口 (默认: 3306): ") or "3306" | ||
| 10 | try: | 29 | try: |
| 11 | - oldArticle = pd.read_sql('select * from article',engine) | 30 | + port = int(port_str) |
| 31 | + except ValueError: | ||
| 32 | + logging.warning("端口号无效,使用默认端口 3306。") | ||
| 33 | + port = 3306 | ||
| 34 | + | ||
| 35 | + user = input(" 3. 用户名 (默认: root): ") or "root" | ||
| 36 | + password = getpass(" 4. 密码 (默认: 12345678): ") or "12345678" | ||
| 37 | + db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" | ||
| 38 | + | ||
| 39 | + # 构建数据库连接字符串 | ||
| 40 | + connection_str = f"mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}?charset=utf8mb4" | ||
| 41 | + | ||
| 42 | + try: | ||
| 43 | + engine = create_engine(connection_str) | ||
| 44 | + # 测试连接 | ||
| 45 | + with engine.connect() as connection: | ||
| 46 | + logging.info(f"成功连接到数据库: {user}@{host}:{port}/{db_name}") | ||
| 47 | + return engine | ||
| 48 | + except Exception as e: | ||
| 49 | + logging.error(f"无法连接到数据库: {e}") | ||
| 50 | + exit(1) | ||
| 51 | + | ||
| 52 | +def saveData(engine): | ||
| 53 | + """ | ||
| 54 | + 从数据库和CSV文件读取数据,合并后去重并保存回数据库。 | ||
| 55 | + 最后删除CSV文件。 | ||
| 56 | + """ | ||
| 57 | + try: | ||
| 58 | + # 读取旧数据 | ||
| 59 | + oldArticle = pd.read_sql('SELECT * FROM article', engine) | ||
| 60 | + oldComment = pd.read_sql('SELECT * FROM comments', engine) | ||
| 61 | + logging.info("成功从数据库读取旧的文章和评论数据。") | ||
| 62 | + | ||
| 63 | + # 读取新数据 | ||
| 12 | newArticle = pd.read_csv(articleAddr) | 64 | newArticle = pd.read_csv(articleAddr) |
| 13 | - oldComment = pd.read_sql('select * from comments',engine) | ||
| 14 | newComment = pd.read_csv(commentsAddr) | 65 | newComment = pd.read_csv(commentsAddr) |
| 66 | + logging.info("成功从CSV文件读取新的文章和评论数据。") | ||
| 15 | 67 | ||
| 16 | - mergeArticle = pd.concat([newArticle,oldArticle],join='inner') | ||
| 17 | - mergeComment = pd.concat([newComment,oldComment],join='inner') | 68 | + # 合并数据 |
| 69 | + mergeArticle = pd.concat([newArticle, oldArticle], ignore_index=True, sort=False) | ||
| 70 | + mergeComment = pd.concat([newComment, oldComment], ignore_index=True, sort=False) | ||
| 71 | + logging.info("成功合并新旧文章和评论数据。") | ||
| 18 | 72 | ||
| 19 | - mergeArticle.drop_duplicates(subset='id',keep='last',inplace=True) | ||
| 20 | - mergeComment.drop_duplicates(subset='content',keep='last',inplace=True) | 73 | + # 去重 |
| 74 | + mergeArticle.drop_duplicates(subset='id', keep='last', inplace=True) | ||
| 75 | + mergeComment.drop_duplicates(subset='content', keep='last', inplace=True) | ||
| 76 | + logging.info("成功去除重复的文章和评论数据。") | ||
| 21 | 77 | ||
| 78 | + # 保存回数据库 | ||
| 22 | mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False) | 79 | mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False) |
| 23 | mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False) | 80 | mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False) |
| 24 | - except: | ||
| 25 | - newArticle = pd.read_csv(articleAddr) | ||
| 26 | - newComment = pd.read_csv(commentsAddr) | ||
| 27 | - newArticle.to_sql('article',con=engine,if_exists='replace',index=False) | ||
| 28 | - newComment.to_sql('comments',con=engine,if_exists='replace',index=False) | 81 | + logging.info("成功将合并后的数据保存回数据库。") |
| 29 | 82 | ||
| 83 | + except pd.errors.EmptyDataError as e: | ||
| 84 | + logging.error(f"读取CSV文件时出错: {e}") | ||
| 85 | + except Exception as e: | ||
| 86 | + logging.error(f"保存数据时出错: {e}") | ||
| 87 | + else: | ||
| 88 | + # 删除CSV文件 | ||
| 89 | + try: | ||
| 30 | os.remove(articleAddr) | 90 | os.remove(articleAddr) |
| 31 | os.remove(commentsAddr) | 91 | os.remove(commentsAddr) |
| 32 | - # update_data() | 92 | + logging.info("成功删除CSV文件。") |
| 93 | + except Exception as e: | ||
| 94 | + logging.warning(f"删除CSV文件时出错: {e}") | ||
| 95 | + | ||
| 96 | +def main(): | ||
| 97 | + # 获取数据库连接 | ||
| 98 | + engine = get_db_connection_interactive() | ||
| 99 | + | ||
| 100 | + # 保存数据 | ||
| 101 | + saveData(engine) | ||
| 102 | + | ||
| 103 | + # 关闭引擎(可选,因为SQLAlchemy引擎会自动管理连接池) | ||
| 104 | + engine.dispose() | ||
| 105 | + logging.info("数据库连接已关闭。") | ||
| 33 | 106 | ||
| 34 | if __name__ == '__main__': | 107 | if __name__ == '__main__': |
| 35 | - saveData() | ||
| 108 | + main() |
| 1 | -from pymysql import * | ||
| 2 | -conn = connect(host='47.92.235.6',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem') | 1 | +import getpass |
| 2 | +import pymysql | ||
| 3 | +import logging | ||
| 4 | + | ||
| 5 | +# 配置日志 | ||
| 6 | +logging.basicConfig( | ||
| 7 | + level=logging.INFO, | ||
| 8 | + format='%(asctime)s [%(levelname)s] %(message)s', | ||
| 9 | + handlers=[ | ||
| 10 | + logging.FileHandler("database_operations.log"), | ||
| 11 | + logging.StreamHandler() | ||
| 12 | + ] | ||
| 13 | +) | ||
| 14 | + | ||
| 15 | +def get_db_connection_interactive(): | ||
| 16 | + """ | ||
| 17 | + 通过终端交互获取数据库连接参数,若按回车则使用默认值。 | ||
| 18 | + 返回一个连接对象。 | ||
| 19 | + """ | ||
| 20 | + print("请依次输入数据库连接信息(直接按回车使用默认值):") | ||
| 21 | + | ||
| 22 | + host = input(" 1. 主机 (默认: localhost): ") or "localhost" | ||
| 23 | + port_str = input(" 2. 端口 (默认: 3306): ") or "3306" | ||
| 24 | + try: | ||
| 25 | + port = int(port_str) | ||
| 26 | + except ValueError: | ||
| 27 | + logging.warning("端口号无效,使用默认端口 3306。") | ||
| 28 | + port = 3306 | ||
| 29 | + | ||
| 30 | + user = input(" 3. 用户名 (默认: root): ") or "root" | ||
| 31 | + password = getpass.getpass(" 4. 密码 (默认: 312517): ") or "312517" | ||
| 32 | + db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" | ||
| 33 | + | ||
| 34 | + logging.info(f"尝试连接到数据库: {user}@{host}:{port}/{db_name}") | ||
| 35 | + | ||
| 36 | + try: | ||
| 37 | + connection = pymysql.connect( | ||
| 38 | + host=host, | ||
| 39 | + port=port, | ||
| 40 | + user=user, | ||
| 41 | + password=password, | ||
| 42 | + database=db_name, | ||
| 43 | + charset='utf8mb4', | ||
| 44 | + cursorclass=pymysql.cursors.DictCursor # 返回字典格式 | ||
| 45 | + ) | ||
| 46 | + logging.info("数据库连接成功。") | ||
| 47 | + return connection | ||
| 48 | + except pymysql.MySQLError as e: | ||
| 49 | + logging.error(f"数据库连接失败: {e}") | ||
| 50 | + exit(1) | ||
| 51 | + | ||
| 52 | +# 获取数据库连接 | ||
| 53 | +conn = get_db_connection_interactive() | ||
| 54 | + | ||
| 55 | +# 获取游标 | ||
| 3 | cursor = conn.cursor() | 56 | cursor = conn.cursor() |
| 4 | -def query(sql,params,type="no_select"): | 57 | + |
| 58 | +def query(sql, params=None, query_type="no_select"): | ||
| 59 | + """ | ||
| 60 | + 执行SQL查询或操作。 | ||
| 61 | + | ||
| 62 | + :param sql: SQL语句 | ||
| 63 | + :param params: SQL参数(可选) | ||
| 64 | + :param query_type: 查询类型,默认为 "no_select" | ||
| 65 | + 如果不是 "no_select",则执行 fetch 操作 | ||
| 66 | + :return: 如果是查询操作,返回数据列表;否则返回 None | ||
| 67 | + """ | ||
| 68 | + try: | ||
| 69 | + if params: | ||
| 5 | params = tuple(params) | 70 | params = tuple(params) |
| 6 | - cursor.execute(sql,params) | 71 | + cursor.execute(sql, params) |
| 72 | + else: | ||
| 73 | + cursor.execute(sql) | ||
| 74 | + | ||
| 75 | + # 确保连接保持活跃 | ||
| 7 | conn.ping(reconnect=True) | 76 | conn.ping(reconnect=True) |
| 8 | - if type != 'no_select': | 77 | + |
| 78 | + if query_type != "no_select": | ||
| 9 | data_list = cursor.fetchall() | 79 | data_list = cursor.fetchall() |
| 10 | conn.commit() | 80 | conn.commit() |
| 81 | + logging.info("查询成功,已获取数据。") | ||
| 11 | return data_list | 82 | return data_list |
| 12 | else: | 83 | else: |
| 13 | conn.commit() | 84 | conn.commit() |
| 85 | + logging.info("操作成功,已提交事务。") | ||
| 86 | + except pymysql.MySQLError as e: | ||
| 87 | + logging.error(f"执行SQL时出错: {e}") | ||
| 88 | + conn.rollback() | ||
| 89 | + return None | ||
| 90 | + | ||
| 91 | +def main(): | ||
| 92 | + # 示例用法 | ||
| 93 | + | ||
| 94 | + # 执行查询操作 | ||
| 95 | + select_sql = "SELECT * FROM article LIMIT 5" | ||
| 96 | + articles = query(select_sql, query_type="select") | ||
| 97 | + if articles: | ||
| 98 | + for article in articles: | ||
| 99 | + print(article) | ||
| 100 | + | ||
| 101 | + # 执行插入操作(根据实际表结构修改) | ||
| 102 | + insert_sql = "INSERT INTO article (id, content) VALUES (%s, %s)" | ||
| 103 | + new_article = (12345, "这是一条新的文章内容。") | ||
| 104 | + result = query(insert_sql, params=new_article, query_type="no_select") | ||
| 105 | + if result is None: | ||
| 106 | + logging.info("插入操作完成。") | ||
| 107 | + | ||
| 108 | + # 关闭游标和连接 | ||
| 109 | + cursor.close() | ||
| 110 | + conn.close() | ||
| 111 | + logging.info("数据库连接已关闭。") | ||
| 112 | + | ||
| 113 | +if __name__ == '__main__': | ||
| 114 | + main() |
| 1 | +import os | ||
| 1 | import jieba | 2 | import jieba |
| 2 | from wordcloud import WordCloud | 3 | from wordcloud import WordCloud |
| 3 | import matplotlib.pyplot as plt | 4 | import matplotlib.pyplot as plt |
| 4 | -from PIL import Image,ImageDraw | ||
| 5 | -from pymysql import * | ||
| 6 | -import json | 5 | +from PIL import Image |
| 7 | import numpy as np | 6 | import numpy as np |
| 8 | -def stopWordList(): | ||
| 9 | - return [line.strip() for line in open('./model/stopWords.txt',encoding='utf8').readlines()] | ||
| 10 | - | ||
| 11 | -def get_img(field,tableName,targetImgSrc,resImgSrc): | ||
| 12 | - con = connect(host='47.92.235.6',user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem',port=3306,charset='utf8mb4') | ||
| 13 | - cuser = con.cursor() | ||
| 14 | - sql = f'select {field} from {tableName}' | ||
| 15 | - cuser.execute(sql) | ||
| 16 | - data = cuser.fetchall() | ||
| 17 | - text = '' | ||
| 18 | - for item in data: | ||
| 19 | - text += item[0] | ||
| 20 | - cuser.close() | ||
| 21 | - con.close() | 7 | +import pymysql |
| 22 | 8 | ||
| 23 | - cut = jieba.cut(text) | ||
| 24 | - newCut = [] | ||
| 25 | - for word in cut: | ||
| 26 | - if word not in stopWordList():newCut.append(word) | ||
| 27 | - string = ' '.join(newCut) | 9 | +def stopWordList(): |
| 10 | + """ | ||
| 11 | + 如果 stopWords.txt 文件内容较大,或被频繁读取, | ||
| 12 | + 可以考虑将其缓存起来,避免重复读文件。 | ||
| 13 | + """ | ||
| 14 | + with open('./model/stopWords.txt', encoding='utf8') as f: | ||
| 15 | + return [line.strip() for line in f.readlines()] | ||
| 28 | 16 | ||
| 29 | - img = Image.open(targetImgSrc) | 17 | +def generate_word_cloud(text, mask_path, font_path, output_path): |
| 18 | + """生成词云并保存到 output_path""" | ||
| 19 | + img = Image.open(mask_path) | ||
| 30 | img_arr = np.array(img) | 20 | img_arr = np.array(img) |
| 21 | + | ||
| 31 | wc = WordCloud( | 22 | wc = WordCloud( |
| 32 | background_color="#fff", | 23 | background_color="#fff", |
| 33 | mask=img_arr, | 24 | mask=img_arr, |
| 34 | - font_path='STHUPO.TTF' | 25 | + font_path=font_path |
| 35 | ) | 26 | ) |
| 36 | - wc.generate_from_text(string) | ||
| 37 | - | ||
| 38 | - fig = plt.figure(1) | ||
| 39 | - plt.imshow(wc) | 27 | + wc.generate_from_text(text) |
| 40 | 28 | ||
| 29 | + plt.figure(figsize=(8, 6)) | ||
| 30 | + plt.imshow(wc, interpolation='bilinear') | ||
| 41 | plt.axis('off') | 31 | plt.axis('off') |
| 32 | + plt.savefig(output_path, dpi=300, bbox_inches='tight') | ||
| 33 | + plt.close() # 保存后关闭 | ||
| 34 | + | ||
| 35 | +def get_db_connection_interactive(): | ||
| 36 | + """ | ||
| 37 | + 通过终端交互获取数据库连接参数,若按回车则使用默认值。 | ||
| 38 | + """ | ||
| 39 | + print("请依次输入数据库连接信息(直接按回车使用默认值):") | ||
| 40 | + | ||
| 41 | + host = input(" 1. 主机 (默认: localhost): ") or "localhost" | ||
| 42 | + port_str = input(" 2. 端口 (默认: 3306): ") or "3306" | ||
| 43 | + port = int(port_str) | ||
| 44 | + | ||
| 45 | + user = input(" 3. 用户名 (默认: root): ") or "root" | ||
| 46 | + password = input(" 4. 密码 (默认: 312517): ") or "12345678" | ||
| 47 | + db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" | ||
| 48 | + | ||
| 49 | + print(f"\n即将连接到数据库: {user}@{host}:{port}/{db_name}\n") | ||
| 42 | 50 | ||
| 43 | - plt.savefig(resImgSrc,dpi=500) | 51 | + return pymysql.connect( |
| 52 | + host=host, | ||
| 53 | + user=user, | ||
| 54 | + password=password, | ||
| 55 | + database=db_name, | ||
| 56 | + port=port, | ||
| 57 | + charset='utf8mb4' | ||
| 58 | + ) | ||
| 59 | + | ||
| 60 | +def get_img(field, table_name, target_img_src, res_img_src, connection, font_path='STHUPO.TTF'): | ||
| 61 | + """ | ||
| 62 | + 从数据库拉取指定字段的文本数据,分词处理后生成词云。 | ||
| 63 | + :param field: 数据库字段名 | ||
| 64 | + :param table_name: 数据表名 | ||
| 65 | + :param target_img_src: 词云形状图 | ||
| 66 | + :param res_img_src: 输出词云文件路径 | ||
| 67 | + :param connection: 已建立的数据库连接 | ||
| 68 | + :param font_path: 字体文件路径 | ||
| 69 | + """ | ||
| 70 | + cursor = connection.cursor() | ||
| 71 | + sql = f'SELECT {field} FROM {table_name}' | ||
| 72 | + cursor.execute(sql) | ||
| 73 | + data = cursor.fetchall() | ||
| 74 | + | ||
| 75 | + text = '' | ||
| 76 | + for item in data: | ||
| 77 | + text += item[0] # item 是元组 (内容,),取第一个元素即可 | ||
| 44 | 78 | ||
| 79 | + cursor.close() | ||
| 80 | + | ||
| 81 | + # 分词 & 去停用词 | ||
| 82 | + cut_words = jieba.cut(text) | ||
| 83 | + stop_words = set(stopWordList()) | ||
| 84 | + filtered_words = [word for word in cut_words if word not in stop_words] | ||
| 85 | + final_text = ' '.join(filtered_words) | ||
| 86 | + | ||
| 87 | + # 生成词云 | ||
| 88 | + generate_word_cloud(final_text, target_img_src, font_path, res_img_src) | ||
| 89 | + | ||
| 90 | +def main(): | ||
| 91 | + # 1. 获取数据库连接(交互式输入) | ||
| 92 | + connection = get_db_connection_interactive() | ||
| 93 | + | ||
| 94 | + # 2. 根据需求生成词云 | ||
| 95 | + # 例如:从 article 表的 content 字段生成词云 | ||
| 96 | + try: | ||
| 97 | + get_img( | ||
| 98 | + field='content', | ||
| 99 | + table_name='article', | ||
| 100 | + target_img_src='./static/content.jpg', | ||
| 101 | + res_img_src='./static/contentCloud.jpg', | ||
| 102 | + connection=connection | ||
| 103 | + ) | ||
| 104 | + print("词云生成完毕!") | ||
| 105 | + finally: | ||
| 106 | + # 关闭数据库连接 | ||
| 107 | + connection.close() | ||
| 45 | 108 | ||
| 46 | -# get_img('content','comments','./static/comment.jpg','./static/commentCloud.jpg') | ||
| 47 | -get_img('content','article','./static/content.jpg','./static/contentCloud.jpg') | 109 | +if __name__ == '__main__': |
| 110 | + main() |
-
Please register or login to post a comment