【大修bug】添加csv表格原始数据，修改词频统计函数bug

戒酒的李白
Commit f98d111c323a873bd13486cad1c8c690e73faa6a f98d111c 1 parent d93da880
Showing 6 changed files with 208 additions and 49 deletions
model/cipingTotal.csv
utils/cipingTotal.py → model/cipingTotal.py
utils/cutComments.py → model/cutComments.py
model/target.csv
spider/navData.csv
utils/getEchartsData.py
--- a/model/cipingTotal.csv 0 → 100644
View file @f98d111
+++ b/model/cipingTotal.csv 0 → 100644
View file @f98d111
+ ('哈哈', 1236)
+ ('哈哈哈', 537)
+ ('哈哈哈哈', 157)
+ ('真的', 154)
+ ('期待', 89)
+ ('喜欢', 89)
+ ('doge', 88)
+ ('宝宝', 87)
+ ('可爱', 79)
+ ('第一', 73)
+ ('演唱', 71)
+ ('亲亲', 71)
+ ('苦涩', 70)
+ ('啊啊啊', 68)
+ ('抱抱', 64)
+ ('cry', 64)
+ ('宝贝', 62)
+ ('姐姐', 51)
+ ('花花', 50)
+ ('送花', 48)
+ ('开心', 47)
+ ('加油', 47)
+ ('老师', 46)
+ ('call', 45)
+ ('特别', 42)
+ ('一个', 42)
+ ('抓狂', 40)
+ ('嘻嘻', 39)
+ ('心心', 38)
+ ('悲伤', 38)
+ ('世界', 37)
+ ('感觉', 35)
+ ('孩子', 35)
+ ('朋友', 34)
+ ('鲜花', 34)
+ ('开学', 34)
+ ('好好', 34)
+ ('演唱会', 33)
+ ('感谢', 32)
+ ('憧憬', 31)
+ ('学季', 31)
+ ('快乐', 30)
+ ('漂亮', 30)
+ ('中国', 30)
+ ('音乐', 29)
+ ('电影', 28)
+ ('莲花', 28)
+ ('骄阳', 28)
+ ('视频', 27)
+ ('老公', 27)
+ ('老婆', 27)
+ ('值得', 26)
+ ('好看', 26)
+ ('消失', 26)
+ ('希望', 25)
+ ('呜呜', 25)
+ ('少年', 25)
+ ('东西', 25)
+ ('实力', 24)
+ ('评论', 24)
+ ('舞台', 24)
+ ('生活', 24)
+ ('单身', 24)
+ ('努力', 23)
+ ('唯一', 23)
+ ('幸福', 23)
+ ('时间', 23)
+ ('超级', 23)
+ ('辈子', 22)
+ ('童年', 22)
+ ('时代', 22)
+ ('可怜', 21)
+ ('不见', 21)
+ ('工作', 21)
+ ('有人', 21)
+ ('终于', 21)
+ ('粉丝', 21)
+ ('国家', 21)
+ ('callcallcall', 21)
+ ('永远', 21)
+ ('太阳', 20)
+ ('直播', 20)
+ ('小时', 20)
+ ('星期', 20)
+ ('安全', 20)
+ ('代言', 19)
+ ('支持', 19)
+ ('彩虹', 19)
+ ('妈妈', 18)
+ ('华为', 18)
+ ('优秀', 18)
+ ('好像', 18)
+ ('越来', 18)
+ ('大人', 18)
+ ('父母', 18)
+ ('害怕', 18)
+ ('安哥', 18)
+ ('加班', 18)
+ ('一点', 18)
+ ('一场', 17)
--- a/utils/cipingTotal.py → model/cipingTotal.py
View file @f98d111
+++ b/utils/cipingTotal.py → model/cipingTotal.py
View file @f98d111
--- a/utils/cutComments.py → model/cutComments.py
View file @f98d111
+++ b/utils/cutComments.py → model/cutComments.py
View file @f98d111
--- a/model/target.csv 0 → 100644
View file @f98d111
+++ b/model/target.csv 0 → 100644
View file @f98d111
--- a/spider/navData.csv 0 → 100644
View file @f98d111
+++ b/spider/navData.csv 0 → 100644
View file @f98d111
+ typeName,gid,containerid
+ 热门,102803,102803
+ 同城,1028032222,102803_2222
+ 榜单,102803600169,102803_ctg1_600169_-_ctg1_600169
+ 男篮,102803600279,102803_ctg1_600279_-_ctg1_600279
+ 明星,1028034288,102803_ctg1_4288_-_ctg1_4288
+ 车展,1028035188,102803_ctg1_5188_-_ctg1_5188
+ 搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388
+ 情感,1028031988,102803_ctg1_1988_-_ctg1_1988
+ 周末,102803600195,102803_ctg1_600195_-_ctg1_600195
+ 电影,1028033288,102803_ctg1_3288_-_ctg1_3288
+ 社会,1028034188,102803_ctg1_4188_-_ctg1_4188
+ 电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488
+ 美食,1028032688,102803_ctg1_2688_-_ctg1_2688
+ 俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267
+ 国际,1028036288,102803_ctg1_6288_-_ctg1_6288
+ 深度,102803600155,102803_ctg1_600155_-_ctg1_600155
+ 财经,1028036388,102803_ctg1_6388_-_ctg1_6388
+ 读书,1028034588,102803_ctg1_4588_-_ctg1_4588
+ 摄影,1028034988,102803_ctg1_4988_-_ctg1_4988
+ 颜值,102803600165,102803_ctg1_600165_-_ctg1_600165
+ 体育,1028031388,102803_ctg1_1388_-_ctg1_1388
+ 数码,1028035088,102803_ctg1_5088_-_ctg1_5088
+ 综艺,1028034688,102803_ctg1_4688_-_ctg1_4688
+ 时尚,1028034488,102803_ctg1_4488_-_ctg1_4488
+ 星座,1028031688,102803_ctg1_1688_-_ctg1_1688
+ 军事,1028036688,102803_ctg1_6688_-_ctg1_6688
+ 股市,1028031288,102803_ctg1_1288_-_ctg1_1288
+ 房产,1028035588,102803_ctg1_5588_-_ctg1_5588
+ 家居,1028035888,102803_ctg1_5888_-_ctg1_5888
+ 萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788
+ 科技,1028032088,102803_ctg1_2088_-_ctg1_2088
+ 科普,1028035988,102803_ctg1_5988_-_ctg1_5988
+ 动漫,1028032388,102803_ctg1_2388_-_ctg1_2388
+ 运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788
+ 旅游,1028032588,102803_ctg1_2588_-_ctg1_2588
+ 瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488
+ 好物,102803600094,102803_ctg1_600094_-_ctg1_600094
+ 历史,1028036788,102803_ctg1_6788_-_ctg1_6788
+ 艺术,1028035488,102803_ctg1_5488_-_ctg1_5488
+ 美妆,1028031588,102803_ctg1_1588_-_ctg1_1588
+ 法律,1028037388,102803_ctg1_7388_-_ctg1_7388
+ 设计,1028035388,102803_ctg1_5388_-_ctg1_5388
+ 健康,1028032188,102803_ctg1_2188_-_ctg1_2188
+ 音乐,1028035288,102803_ctg1_5288_-_ctg1_5288
+ 游戏,1028034888,102803_ctg1_4888_-_ctg1_4888
+ 新时代,1028037968,102803_ctg1_7968_-_ctg1_7968
+ 校园,102803600177,102803_ctg1_600177_-_ctg1_600177
+ 收藏,1028038189,102803_ctg1_8189_-_ctg1_8189
+ 政务,1028035788,102803_ctg1_5788_-_ctg1_5788
+ 养生,1028036588,102803_ctg1_6588_-_ctg1_6588
+ 育儿,1028033188,102803_ctg1_3188_-_ctg1_3188
+ 抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037
+ 教育,102803600080,102803_ctg1_600080_-_ctg1_600080
+ 婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788
+ 舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788
+ 辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988
+ 公益,102803600057,102803_ctg1_600057_-_ctg1_600057
+ 问答,1028037977,102803_ctg1_7977_-_ctg1_7977
+ 三农,1028037188,102803_ctg1_7188_-_ctg1_7188
--- a/utils/getEchartsData.py
View file @f98d111
+++ b/utils/getEchartsData.py
View file @f98d111
 from utils.getPublicData import *
+ from snownlp import SnowNLP
 articleList = getAllArticleData()
 commentList = getAllCommentsData()
 
- def getTypeList():# 返回爬取到的所有文章的类型（已去重）
+ def getTypeList():
     return list(set([x[8] for x in getAllArticleData()]))
 
- def getArticleByType(type):# 根据特定文章类型筛选文章
+ def getArticleByType(type):
     articles = []
     for i in articleList:
         if i[8] == type:
             articles.append(i)
     return articles
 
- def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
+ def getArticleCharLikeCount(type):
     articles = getArticleByType(type)
     xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
-     yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
+     yData = [0 for x in range(len(xData))]
     for article in articles:
         likeCount = int(article[1])
         if likeCount < 100:
@@ -34,10 +35,10 @@ def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
             yData[6] += 1
     return xData,yData
 
- def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
+ def getArticleCharCommentsLen(type):
     articles = getArticleByType(type)
     xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
-     yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
+     yData = [0 for x in range(len(xData))]
     for article in articles:
         commentLen = int(article[2])
         if commentLen < 100:
@@ -60,7 +61,7 @@ def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
             yData[8] += 1
     return xData,yData
 
- def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
+ def getArticleCharRepotsLen(type):
     articles = getArticleByType(type)
     xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
     yData = [0 for x in range(len(xData))]
@@ -92,14 +93,14 @@ def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
             yData[11] += 1
     return xData,yData
 
- def getIPCharByArticleRegion():#统计文章发布地域的分布情况
+ def getIPCharByArticleRegion():
     articleRegionDic = {}
     for i in articleList:
-         if i[4] != '无':# 如果ip为确定值的话就进行下一步
-             if i[4] in articleRegionDic.keys():
-                 articleRegionDic[i[4]] += 1
-             else:
+         if i[4] != '无':
+             if articleRegionDic.get(i[4],-1) == -1:
                 articleRegionDic[i[4]] = 1
+             else:
+                 articleRegionDic[i[4]] += 1
     resultData = []
     for key,value in articleRegionDic.items():
         resultData.append({
@@ -108,14 +109,14 @@ def getIPCharByArticleRegion():#统计文章发布地域的分布情况
         })
     return resultData
 
- def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
+ def getIPCharByCommentsRegion():
     commentRegionDic = {}
     for i in commentList:
         if i[3] != '无':
-             if i[3] in commentRegionDic.keys():
-                 commentRegionDic[i[3]] += 1
-             else:
+             if commentRegionDic.get(i[3],-1) == -1:
                 commentRegionDic[i[3]] = 1
+             else:
+                 commentRegionDic[i[3]] += 1
     resultData = []
     for key,value in commentRegionDic.items():
         resultData.append({
@@ -124,35 +125,33 @@ def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
         })
     return resultData
 
- def getCommentCharDataOne():# 统计评论点赞数的分布情况
+ def getCommentCharDataOne():
     xData = []
     rangeNum = 20
-     for item in range(100):
+     for item in range(1,100):
         xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))
     yData = [0 for x in range(len(xData))]
     for comment in commentList:
-         for item in range(100):
-             if int(comment[2]) < rangeNum * (item + 1):
+         for item in range(99):
+             if int(comment[2]) < rangeNum * (item + 2):
                 yData[item] += 1
                 break
     return xData,yData
 
- def getCommentCharDataTwo():# 统计评论数据中不同性别的数量
+ def getCommentCharDataTwo():
     genderDic = {}
     for i in commentList:
-         if i[6] in genderDic.keys():
-             genderDic[i[6]] += 1
-         else:
+         if genderDic.get(i[6],-1) == -1:
             genderDic[i[6]] = 1
-     resultData = []
-     for key,value in genderDic.items():
-         resultData.append({
-             'name':key,
-             'value':value
-         })
+         else:
+             genderDic[i[6]] += 1
+     resultData = [{
+         'name':x[0],
+         'value':x[1]
+     } for x in genderDic.items()]
     return resultData
 
- def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
+ def getYuQingCharDataOne():
     hotWordList = getAllHotWords()
     xData = ['正面','中性','负面']
     yData = [0,0,0]
@@ -164,19 +163,19 @@ def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
             yData[1] += 1
         elif emotionValue < 0.5:
             yData[2] += 1
-     finalData = [{
+     bieData = [{
         'name':x,
         'value':yData[index]
     } for index,x in enumerate(xData)]
-     return xData,yData,finalData
+     return xData,yData,bieData
 
- def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
+ def getYuQingCharDataTwo():
     xData = ['正面', '中性', '负面']
-     finalData1 = [{
+     bieData1 = [{
         'name':x,
         'value':0
     } for x in xData]
-     finalData2 = [{
+     bieData2 = [{
         'name': x,
         'value': 0
     } for x in xData]
@@ -184,27 +183,27 @@ def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
     for comment in commentList:
         emotionValue = SnowNLP(comment[4]).sentiments
         if emotionValue > 0.5:
-             finalData1[0]['value'] += 1
+             bieData1[0]['value'] += 1
         elif emotionValue == 0.5:
-             finalData1[1]['value'] += 1
+             bieData1[1]['value'] += 1
         elif emotionValue < 0.5:
-             finalData1[2]['value'] += 1
+             bieData1[2]['value'] += 1
     for artile in articleList:
         emotionValue = SnowNLP(artile[5]).sentiments
         if emotionValue > 0.5:
-             finalData2[0]['value'] += 1
+             bieData2[0]['value'] += 1
         elif emotionValue == 0.5:
-             finalData2[1]['value'] += 1
+             bieData2[1]['value'] += 1
         elif emotionValue < 0.5:
-             finalData2[2]['value'] += 1
-     return finalData1,finalData2
+             bieData2[2]['value'] += 1
+     return bieData1,bieData2
 
- def getYuQingCharDataThree():# 提取前10个热词及其对应的出现频率
+ def getYuQingCharDataThree():
     hotWordList = getAllHotWords()
-     xData = []
-     yData = []
+     x1Data = []
+     y1Data = []
     for i in hotWordList[:10]:
-         xData.append(i[0])
-         yData.append(int(i[1]))
-     return xData,yData
+         x1Data.append(i[0])
+         y1Data.append(int(i[1]))
+     return x1Data,y1Data