-- redhong-xy
-- Disable FK checks so table (re)creation order does not matter in this script.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- article表 — crawled Weibo posts
-- ----------------------------
-- NOTE: the pipeline rebuilds this table via pandas to_sql(if_exists='replace'),
-- so this DDL only seeds the initial schema; no PRIMARY KEY is declared here.
-- bigint(20) display widths dropped: they are deprecated/ignored in MySQL 8
-- (which the utf8mb4_0900_ai_ci collation implies).
CREATE TABLE IF NOT EXISTS `article` (
  `id` bigint DEFAULT NULL,
  `likeNum` bigint DEFAULT NULL,
  `commentsLen` bigint DEFAULT NULL,
  `reposts_count` bigint DEFAULT NULL,
  `region` text,
  `content` text,
  `contentLen` bigint DEFAULT NULL,
  `created_at` text,             -- stored as text by the crawler, not as DATE
  `type` text,
  `detailUrl` text,
  `authorAvatar` text,
  `authorName` text,
  `authorDetail` text,
  `isVip` double DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
-- ----------------------------
-- comments表 — crawled comments, one row per comment, keyed by articleId
-- ----------------------------
-- NOTE: rebuilt at runtime by pandas to_sql(if_exists='replace'); this DDL
-- only seeds the initial schema.  IF NOT EXISTS makes the script idempotent,
-- and the deprecated bigint(20) display widths are dropped (MySQL 8).
CREATE TABLE IF NOT EXISTS `comments` (
  `articleId` bigint DEFAULT NULL,    -- references article.id (no FK: tables are rebuilt)
  `created_at` text,                  -- stored as text by the crawler, not as DATE
  `likes_counts` bigint DEFAULT NULL,
  `region` text,
  `content` text,
  `authorName` text,
  `authorGender` text,
  `authorAddress` text,
  `authorAvatar` text
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
-- ----------------------------
-- user表 — application login accounts
-- ----------------------------
-- Charset upgraded from legacy 3-byte utf8 to utf8mb4 for consistency with the
-- other tables; int(11) display width dropped (deprecated in MySQL 8).
CREATE TABLE IF NOT EXISTS `user` (
  `username` varchar(255) DEFAULT NULL,
  -- SECURITY NOTE(review): password appears to be stored in plain text;
  -- the application should store a salted hash (e.g. bcrypt) instead.
  `password` varchar(255) DEFAULT NULL,
  `id` int NOT NULL AUTO_INCREMENT,
  `createTime` varchar(255) DEFAULT NULL,   -- stored as text, not as DATETIME
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4;
\ No newline at end of file
... ...
from spiderContent import start as spiderContentStart
from spiderComments import start as spiderCommentsStart
import os
from sqlalchemy import create_engine
import pandas as pd
# SECURITY NOTE(review): DB credentials and host are hard-coded in source;
# consider loading them from environment variables or a config file.
engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
def save_to_sql():
    """Merge freshly crawled CSV data with existing DB rows and persist.

    Reads articleData.csv / articleComments.csv, concatenates them with the
    current ``article`` / ``comments`` tables, de-duplicates (articles by
    ``id``, comments by ``content``, keeping the previously stored row), and
    rewrites both tables.  If reading the existing tables fails (e.g. first
    run, tables absent), it falls back to seeding the DB straight from the
    CSVs.  The CSV files are removed afterwards in either case.
    """
    try:
        articleOldPd = pd.read_sql('select * from article', engine)
        articleNewPd = pd.read_csv('articleData.csv')
        commentOldPd = pd.read_sql('select * from comments', engine)
        commentNewPd = pd.read_csv('articleComments.csv')
        concatArticlePd = pd.concat([articleNewPd, articleOldPd], join='inner')
        concatCommentsPd = pd.concat([commentNewPd, commentOldPd], join='inner')
        # keep='last' prefers the row already stored in the DB on duplicates.
        concatArticlePd.drop_duplicates(subset='id', keep='last', inplace=True)
        concatCommentsPd.drop_duplicates(subset='content', keep='last', inplace=True)
        concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
        concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
        # propagate.  This branch is the deliberate first-run fallback.
        articleNewPd = pd.read_csv('articleData.csv')
        commentNewPd = pd.read_csv('articleComments.csv')
        articleNewPd.to_sql('article', con=engine, if_exists='replace', index=False)
        commentNewPd.to_sql('comments', con=engine, if_exists='replace', index=False)
    # Consume the crawl output so the next run starts from fresh CSVs.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
def main():
    """Run the full pipeline: crawl articles, crawl their comments, persist."""
    pipeline = (
        ('正在爬取文章数据', lambda: spiderContentStart(1, 1)),
        ('正在爬取文章评论数据', spiderCommentsStart),
        ('正在存储数据', save_to_sql),
    )
    for banner, step in pipeline:
        print(banner)
        step()


if __name__ == '__main__':
    main()
\ No newline at end of file
... ...
import time
import requests
import csv
import os
from datetime import datetime
def init():
    """Create articleComments.csv with its header row, unless it already exists."""
    path = './articleComments.csv'
    if os.path.exists(path):
        return
    header = [
        'articleId',
        'created_at',
        'likes_counts',
        'region',
        'content',
        'authorName',
        'authorGender',
        'authorAddress',
        'authorAvatar',
    ]
    with open(path, 'w', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(header)
def writerRow(row):
    """Append one data row to articleComments.csv."""
    with open('./articleComments.csv', 'a', encoding='utf-8', newline='') as fh:
        csv.writer(fh).writerow(row)
def get_data(url, params):
    """GET a Weibo ajax endpoint and return the ``data`` field of its JSON body.

    Returns None for any non-200 response.
    NOTE(review): the Cookie below is a hard-coded personal session and will
    expire; it should be moved out of source control.
    """
    headers = {
        'Cookie': 'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0',
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        return None
    return response.json()['data']
def getAllArticleList():
    """Read every crawled article row from articleData.csv.

    Returns a list of rows (each a list of strings), header excluded.
    """
    articleList = []
    with open('./articleData.csv', 'r', encoding='utf-8') as reader:
        readerCsv = csv.reader(reader)
        # Skip the header through the csv reader itself.  The old code called
        # next() on the raw file handle, which desynchronizes the csv parser
        # if any field contains an embedded newline; the None default also
        # stops an empty file from raising StopIteration.
        next(readerCsv, None)
        for nav in readerCsv:
            articleList.append(nav)
    return articleList
def parse_json(response, artileId):
    """Extract comment fields from a buildComments payload and append each to the CSV.

    response -- the ``data`` list returned by get_data, or None on failure.
    artileId -- id of the article the comments belong to (original spelling kept).
    """
    if not response:
        # get_data returns None on non-200 responses; iterating None previously
        # crashed with "TypeError: 'NoneType' object is not iterable".
        return
    for comment in response:
        created_at = datetime.strptime(comment['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
        likes_counts = comment['like_counts']
        try:
            # Strip the "来自" ("from") prefix off the source/region string.
            region = comment['source'].replace('来自', '')
        except (KeyError, AttributeError):
            # Narrowed from a bare except: the comment may lack `source`,
            # or it may not be a string.
            region = '无'
        content = comment['text_raw']
        authorName = comment['user']['screen_name']
        authorGender = comment['user']['gender']
        authorAddress = comment['user']['location']
        authorAvatar = comment['user']['avatar_large']
        writerRow([
            artileId,
            created_at,
            likes_counts,
            region,
            content,
            authorName,
            authorGender,
            authorAddress,
            authorAvatar
        ])
def start():
    """Crawl the comment list of every article already saved in articleData.csv."""
    commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
    init()
    for article in getAllArticleList():
        articleId = article[0]
        print('正在爬取id值为%s的文章评论' % articleId)
        # Throttle requests to avoid tripping Weibo's rate limiting.
        time.sleep(2)
        response = get_data(commentUrl, {
            'id': int(articleId),
            'is_show_bulletin': 2,
        })
        parse_json(response, articleId)


if __name__ == '__main__':
    start()
... ...
... ... @@ -152,3 +152,59 @@ def getCommentCharDataTwo():# 统计评论数据中不同性别的数量
})
return resultData
def getYuQingCharDataOne():  # 统计热词中正面、中性、负面的数量
    """Bucket every hot word into positive / neutral / negative via SnowNLP.

    Returns (labels, counts, [{'name': label, 'value': count}, ...]).
    """
    xData = ['正面', '中性', '负面']
    yData = [0, 0, 0]
    for word in getAllHotWords():
        score = SnowNLP(word[0]).sentiments
        if score > 0.5:
            yData[0] += 1
        elif score == 0.5:
            yData[1] += 1
        elif score < 0.5:
            yData[2] += 1
    finalData = [
        {'name': label, 'value': count}
        for label, count in zip(xData, yData)
    ]
    return xData, yData, finalData
def getYuQingCharDataTwo():  # 统计评论列表和文章列表中的情感值
    """Tally sentiment (positive / neutral / negative) over comments and articles.

    Returns (commentCounts, articleCounts), each a list of
    {'name': label, 'value': count} dicts in 正面/中性/负面 order.
    """
    xData = ['正面', '中性', '负面']

    def tally(rows, textIndex):
        # One counter dict per sentiment label, filled from SnowNLP scores.
        counts = [{'name': label, 'value': 0} for label in xData]
        for row in rows:
            score = SnowNLP(row[textIndex]).sentiments
            if score > 0.5:
                counts[0]['value'] += 1
            elif score == 0.5:
                counts[1]['value'] += 1
            elif score < 0.5:
                counts[2]['value'] += 1
        return counts

    # commentList / articleList are module-level globals loaded elsewhere
    # in this file; comment text is column 4, article text is column 5.
    return tally(commentList, 4), tally(articleList, 5)
def getYuQingCharDataThree():  # 提取前10个热词及其对应的出现频率
    """Return the top-10 hot words and their frequencies as two parallel lists."""
    topWords = getAllHotWords()[:10]
    xData = [entry[0] for entry in topWords]
    yData = [int(entry[1]) for entry in topWords]
    return xData, yData
... ...
from utils.getPublicData import getAllArticleData
from snownlp import SnowNLP
def getTableDataList(flag):
    """Return all article rows, optionally tagged with a sentiment label.

    flag -- truthy: each row gets an extra trailing column, one of
            '正面' / '中性' / '负面', derived from SnowNLP over the article
            text at column index 5; falsy: the raw rows are returned as-is.
    """
    if not flag:
        return getAllArticleData()
    tableList = []
    for article in getAllArticleData():
        item = list(article)
        # Score once per row — the old code invoked SnowNLP twice per article,
        # doubling the cost of an already expensive model evaluation.
        score = SnowNLP(item[5]).sentiments
        if score > 0.5:
            value = '正面'
        elif score < 0.5:
            value = '负面'
        else:
            value = '中性'
        item.append(value)
        tableList.append(item)
    return tableList
\ No newline at end of file
... ...