戒酒的李白

【大修bug】添加csv表格原始数据,修改词频统计函数bug

('哈哈', 1236)
('哈哈哈', 537)
('哈哈哈哈', 157)
('真的', 154)
('期待', 89)
('喜欢', 89)
('doge', 88)
('宝宝', 87)
('可爱', 79)
('第一', 73)
('演唱', 71)
('亲亲', 71)
('苦涩', 70)
('啊啊啊', 68)
('抱抱', 64)
('cry', 64)
('宝贝', 62)
('姐姐', 51)
('花花', 50)
('送花', 48)
('开心', 47)
('加油', 47)
('老师', 46)
('call', 45)
('特别', 42)
('一个', 42)
('抓狂', 40)
('嘻嘻', 39)
('心心', 38)
('悲伤', 38)
('世界', 37)
('感觉', 35)
('孩子', 35)
('朋友', 34)
('鲜花', 34)
('开学', 34)
('好好', 34)
('演唱会', 33)
('感谢', 32)
('憧憬', 31)
('学季', 31)
('快乐', 30)
('漂亮', 30)
('中国', 30)
('音乐', 29)
('电影', 28)
('莲花', 28)
('骄阳', 28)
('视频', 27)
('老公', 27)
('老婆', 27)
('值得', 26)
('好看', 26)
('消失', 26)
('希望', 25)
('呜呜', 25)
('少年', 25)
('东西', 25)
('实力', 24)
('评论', 24)
('舞台', 24)
('生活', 24)
('单身', 24)
('努力', 23)
('唯一', 23)
('幸福', 23)
('时间', 23)
('超级', 23)
('辈子', 22)
('童年', 22)
('时代', 22)
('可怜', 21)
('不见', 21)
('工作', 21)
('有人', 21)
('终于', 21)
('粉丝', 21)
('国家', 21)
('callcallcall', 21)
('永远', 21)
('太阳', 20)
('直播', 20)
('小时', 20)
('星期', 20)
('安全', 20)
('代言', 19)
('支持', 19)
('彩虹', 19)
('妈妈', 18)
('华为', 18)
('优秀', 18)
('好像', 18)
('越来', 18)
('大人', 18)
('父母', 18)
('害怕', 18)
('安哥', 18)
('加班', 18)
('一点', 18)
('一场', 17)
... ...
This diff could not be displayed because it is too large.
typeName,gid,containerid
热门,102803,102803
同城,1028032222,102803_2222
榜单,102803600169,102803_ctg1_600169_-_ctg1_600169
男篮,102803600279,102803_ctg1_600279_-_ctg1_600279
明星,1028034288,102803_ctg1_4288_-_ctg1_4288
车展,1028035188,102803_ctg1_5188_-_ctg1_5188
搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388
情感,1028031988,102803_ctg1_1988_-_ctg1_1988
周末,102803600195,102803_ctg1_600195_-_ctg1_600195
电影,1028033288,102803_ctg1_3288_-_ctg1_3288
社会,1028034188,102803_ctg1_4188_-_ctg1_4188
电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488
美食,1028032688,102803_ctg1_2688_-_ctg1_2688
俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267
国际,1028036288,102803_ctg1_6288_-_ctg1_6288
深度,102803600155,102803_ctg1_600155_-_ctg1_600155
财经,1028036388,102803_ctg1_6388_-_ctg1_6388
读书,1028034588,102803_ctg1_4588_-_ctg1_4588
摄影,1028034988,102803_ctg1_4988_-_ctg1_4988
颜值,102803600165,102803_ctg1_600165_-_ctg1_600165
体育,1028031388,102803_ctg1_1388_-_ctg1_1388
数码,1028035088,102803_ctg1_5088_-_ctg1_5088
综艺,1028034688,102803_ctg1_4688_-_ctg1_4688
时尚,1028034488,102803_ctg1_4488_-_ctg1_4488
星座,1028031688,102803_ctg1_1688_-_ctg1_1688
军事,1028036688,102803_ctg1_6688_-_ctg1_6688
股市,1028031288,102803_ctg1_1288_-_ctg1_1288
房产,1028035588,102803_ctg1_5588_-_ctg1_5588
家居,1028035888,102803_ctg1_5888_-_ctg1_5888
萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788
科技,1028032088,102803_ctg1_2088_-_ctg1_2088
科普,1028035988,102803_ctg1_5988_-_ctg1_5988
动漫,1028032388,102803_ctg1_2388_-_ctg1_2388
运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788
旅游,1028032588,102803_ctg1_2588_-_ctg1_2588
瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488
好物,102803600094,102803_ctg1_600094_-_ctg1_600094
历史,1028036788,102803_ctg1_6788_-_ctg1_6788
艺术,1028035488,102803_ctg1_5488_-_ctg1_5488
美妆,1028031588,102803_ctg1_1588_-_ctg1_1588
法律,1028037388,102803_ctg1_7388_-_ctg1_7388
设计,1028035388,102803_ctg1_5388_-_ctg1_5388
健康,1028032188,102803_ctg1_2188_-_ctg1_2188
音乐,1028035288,102803_ctg1_5288_-_ctg1_5288
游戏,1028034888,102803_ctg1_4888_-_ctg1_4888
新时代,1028037968,102803_ctg1_7968_-_ctg1_7968
校园,102803600177,102803_ctg1_600177_-_ctg1_600177
收藏,1028038189,102803_ctg1_8189_-_ctg1_8189
政务,1028035788,102803_ctg1_5788_-_ctg1_5788
养生,1028036588,102803_ctg1_6588_-_ctg1_6588
育儿,1028033188,102803_ctg1_3188_-_ctg1_3188
抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037
教育,102803600080,102803_ctg1_600080_-_ctg1_600080
婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788
舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788
辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988
公益,102803600057,102803_ctg1_600057_-_ctg1_600057
问答,1028037977,102803_ctg1_7977_-_ctg1_7977
三农,1028037188,102803_ctg1_7188_-_ctg1_7188
... ...
from utils.getPublicData import *
from snownlp import SnowNLP
# Fetched once, when this module is imported; the helper functions below
# read these two module-level collections instead of re-querying.
articleList = getAllArticleData()
commentList = getAllCommentsData()
def getTypeList():
    """Return the distinct article types found in the crawled articles.

    The type is read from column 8 of every row returned by
    ``getAllArticleData()``.

    Returns:
        list: each type exactly once. The order is unspecified because the
        values are de-duplicated through a set.
    """
    return list({article[8] for article in getAllArticleData()})
def getArticleByType(type):
    """Return the rows of ``articleList`` whose type column equals *type*.

    Args:
        type: the article type to keep; compared with ``==`` against
            column 8 of each row. The name shadows the builtin but is kept
            so existing keyword callers continue to work.

    Returns:
        list: the matching rows, in their original order (empty when
        nothing matches).
    """
    return [article for article in articleList if article[8] == type]
# Tally how many articles of the given type fall into each like-count range.
# Returns (xData, yData): the range labels and the per-range counts.
# NOTE(review): this span shows both the pre- and post-change lines of a diff
# and part of the body is elided at the hunk marker below.
def getArticleCharLikeCount(type):# Distribution of like counts for articles of one type
def getArticleCharLikeCount(type):
articles = getArticleByType(type)
xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
yData = [0 for x in range(len(xData))]# Zero-filled list with the same length as xData
yData = [0 for x in range(len(xData))]
for article in articles:
# Column 1 of an article row is read as the like count.
likeCount = int(article[1])
if likeCount < 100:
... ... @@ -34,10 +35,10 @@ def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
yData[6] += 1
return xData,yData
# Tally how many articles of the given type fall into each comment-count range.
# Returns (xData, yData): the range labels and the per-range counts.
# NOTE(review): this span shows both the pre- and post-change lines of a diff
# and part of the body is elided at the hunk marker below.
def getArticleCharCommentsLen(type):# Distribution of comment counts for articles of one type
def getArticleCharCommentsLen(type):
articles = getArticleByType(type)
xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
yData = [0 for x in range(len(xData))]# Zero-filled list with the same length as xData
yData = [0 for x in range(len(xData))]
for article in articles:
# Column 2 of an article row is read as the comment count.
commentLen = int(article[2])
if commentLen < 100:
... ... @@ -60,7 +61,7 @@ def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
yData[8] += 1
return xData,yData
# Tally how many articles of the given type fall into each repost-count range.
# Returns (xData, yData): the range labels and the per-range counts.
# NOTE(review): this span shows both the pre- and post-change lines of a diff
# and the bucketing logic is elided at the hunk marker below.
def getArticleCharRepotsLen(type):# Distribution of repost counts for articles of one type
def getArticleCharRepotsLen(type):
articles = getArticleByType(type)
xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
# One zero-initialised counter per label in xData.
yData = [0 for x in range(len(xData))]
... ... @@ -92,14 +93,14 @@ def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
yData[11] += 1
return xData,yData
# Count articles per publishing region (column 4 of each row in articleList)
# and return the counts as a list of dicts, one per region.
# NOTE(review): this span shows both the pre- and post-change lines of a diff
# and the dict literal body is elided at the hunk marker below.
def getIPCharByArticleRegion():# Distribution of article publishing regions
def getIPCharByArticleRegion():
articleRegionDic = {}
for i in articleList:
if i[4] != '无':# Only continue when the IP location is a definite value
if i[4] in articleRegionDic.keys():
articleRegionDic[i[4]] += 1
else:
if i[4] != '无':
# -1 is used as the "region not seen yet" sentinel.
if articleRegionDic.get(i[4],-1) == -1:
articleRegionDic[i[4]] = 1
else:
articleRegionDic[i[4]] += 1
resultData = []
for key,value in articleRegionDic.items():
resultData.append({
... ... @@ -108,14 +109,14 @@ def getIPCharByArticleRegion():#统计文章发布地域的分布情况
})
return resultData
# Count comments per publishing region (column 3 of each row in commentList)
# and return the counts as a list of dicts, one per region.
# NOTE(review): this span shows both the pre- and post-change lines of a diff
# and the dict literal body is elided at the hunk marker below.
def getIPCharByCommentsRegion():# Distribution of comment publishing regions
def getIPCharByCommentsRegion():
commentRegionDic = {}
for i in commentList:
# Rows whose region column is the placeholder '无' are skipped.
if i[3] != '无':
if i[3] in commentRegionDic.keys():
commentRegionDic[i[3]] += 1
else:
# -1 is used as the "region not seen yet" sentinel.
if commentRegionDic.get(i[3],-1) == -1:
commentRegionDic[i[3]] = 1
else:
commentRegionDic[i[3]] += 1
resultData = []
for key,value in commentRegionDic.items():
resultData.append({
... ... @@ -124,35 +125,33 @@ def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
})
return resultData
def getCommentCharDataOne():
    """Bucket the like counts of all comments into 20-wide ranges.

    The like count is read from column 2 of each row in ``commentList``.

    Returns:
        tuple: ``(xData, yData)`` where ``xData`` holds the 99 range labels
        ``'20-40'`` ... ``'1980-2000'`` and ``yData`` holds the number of
        comments counted in each range. Comments with 2000 or more likes
        are not counted.
    """
    rangeNum = 20
    bucketCount = 99
    xData = [
        str(rangeNum * item) + '-' + str(rangeNum * (item + 1))
        for item in range(1, bucketCount + 1)
    ]
    yData = [0] * bucketCount
    for comment in commentList:
        # Bucket k collects values below rangeNum * (k + 2), so floor
        # division finds it directly instead of scanning every bucket.
        # NOTE(review): values below 20 land in the first ('20-40') bucket,
        # exactly as in the post-change code; confirm that is intended.
        index = max(int(comment[2]) // rangeNum - 1, 0)
        if index < bucketCount:
            yData[index] += 1
    return xData, yData
def getCommentCharDataTwo():
    """Count the comments in ``commentList`` per gender value.

    The gender is read from column 6 of each comment row.

    Returns:
        list: one ``{'name': gender, 'value': count}`` dict per distinct
        gender value, in order of first appearance.
    """
    genderDic = {}
    for comment in commentList:
        gender = comment[6]
        # get() with a default of 0 covers both the first and later sightings.
        genderDic[gender] = genderDic.get(gender, 0) + 1
    return [{'name': name, 'value': count} for name, count in genderDic.items()]
# Count how many hot words are positive, neutral and negative.
# Returns the labels, the three counts, and the same counts as a list of
# {'name', 'value'} dicts.
# NOTE(review): this span shows both the pre- and post-change lines of a diff;
# how emotionValue is computed is elided at the hunk marker below.
def getYuQingCharDataOne():# Count positive, neutral and negative hot words
def getYuQingCharDataOne():
hotWordList = getAllHotWords()
# Labels are: positive, neutral, negative (runtime strings, left untranslated).
xData = ['正面','中性','负面']
yData = [0,0,0]
... ... @@ -164,19 +163,19 @@ def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
yData[1] += 1
elif emotionValue < 0.5:
yData[2] += 1
finalData = [{
bieData = [{
'name':x,
'value':yData[index]
} for index,x in enumerate(xData)]
return xData,yData,finalData
return xData,yData,bieData
# Tally SnowNLP sentiment classes for every comment and every article.
# Returns two lists of {'name', 'value'} dicts: the first for commentList,
# the second for articleList.
# NOTE(review): this span shows both the pre- and post-change lines of a diff
# (finalData* before, bieData* after) with a hunk marker in the middle.
def getYuQingCharDataTwo():# Tally sentiment values for the comment list and the article list
def getYuQingCharDataTwo():
# Labels are: positive, neutral, negative (runtime strings, left untranslated).
xData = ['正面', '中性', '负面']
finalData1 = [{
bieData1 = [{
'name':x,
'value':0
} for x in xData]
finalData2 = [{
bieData2 = [{
'name': x,
'value': 0
} for x in xData]
... ... @@ -184,27 +183,27 @@ def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
for comment in commentList:
# Column 4 of a comment row is the text passed to SnowNLP.
emotionValue = SnowNLP(comment[4]).sentiments
# Above 0.5 counts as positive, exactly 0.5 as neutral, below as negative.
if emotionValue > 0.5:
finalData1[0]['value'] += 1
bieData1[0]['value'] += 1
elif emotionValue == 0.5:
finalData1[1]['value'] += 1
bieData1[1]['value'] += 1
elif emotionValue < 0.5:
finalData1[2]['value'] += 1
bieData1[2]['value'] += 1
for artile in articleList:
# Column 5 of an article row is the text passed to SnowNLP.
emotionValue = SnowNLP(artile[5]).sentiments
if emotionValue > 0.5:
finalData2[0]['value'] += 1
bieData2[0]['value'] += 1
elif emotionValue == 0.5:
finalData2[1]['value'] += 1
bieData2[1]['value'] += 1
elif emotionValue < 0.5:
finalData2[2]['value'] += 1
return finalData1,finalData2
bieData2[2]['value'] += 1
return bieData1,bieData2
def getYuQingCharDataThree():
    """Return the first ten hot words and their frequencies.

    Rows come from ``getAllHotWords()``; column 0 is used as the word and
    column 1 is converted to ``int`` as its frequency.

    Returns:
        tuple: ``(x1Data, y1Data)``, the words and their integer
        frequencies, in the order the rows are returned (at most ten each).
    """
    topWords = getAllHotWords()[:10]
    x1Data = [row[0] for row in topWords]
    y1Data = [int(row[1]) for row in topWords]
    return x1Data, y1Data
... ...