戒酒的李白

【大修bug】添加csv表格原始数据,修改词频统计函数bug

  1 +('哈哈', 1236)
  2 +('哈哈哈', 537)
  3 +('哈哈哈哈', 157)
  4 +('真的', 154)
  5 +('期待', 89)
  6 +('喜欢', 89)
  7 +('doge', 88)
  8 +('宝宝', 87)
  9 +('可爱', 79)
  10 +('第一', 73)
  11 +('演唱', 71)
  12 +('亲亲', 71)
  13 +('苦涩', 70)
  14 +('啊啊啊', 68)
  15 +('抱抱', 64)
  16 +('cry', 64)
  17 +('宝贝', 62)
  18 +('姐姐', 51)
  19 +('花花', 50)
  20 +('送花', 48)
  21 +('开心', 47)
  22 +('加油', 47)
  23 +('老师', 46)
  24 +('call', 45)
  25 +('特别', 42)
  26 +('一个', 42)
  27 +('抓狂', 40)
  28 +('嘻嘻', 39)
  29 +('心心', 38)
  30 +('悲伤', 38)
  31 +('世界', 37)
  32 +('感觉', 35)
  33 +('孩子', 35)
  34 +('朋友', 34)
  35 +('鲜花', 34)
  36 +('开学', 34)
  37 +('好好', 34)
  38 +('演唱会', 33)
  39 +('感谢', 32)
  40 +('憧憬', 31)
  41 +('学季', 31)
  42 +('快乐', 30)
  43 +('漂亮', 30)
  44 +('中国', 30)
  45 +('音乐', 29)
  46 +('电影', 28)
  47 +('莲花', 28)
  48 +('骄阳', 28)
  49 +('视频', 27)
  50 +('老公', 27)
  51 +('老婆', 27)
  52 +('值得', 26)
  53 +('好看', 26)
  54 +('消失', 26)
  55 +('希望', 25)
  56 +('呜呜', 25)
  57 +('少年', 25)
  58 +('东西', 25)
  59 +('实力', 24)
  60 +('评论', 24)
  61 +('舞台', 24)
  62 +('生活', 24)
  63 +('单身', 24)
  64 +('努力', 23)
  65 +('唯一', 23)
  66 +('幸福', 23)
  67 +('时间', 23)
  68 +('超级', 23)
  69 +('辈子', 22)
  70 +('童年', 22)
  71 +('时代', 22)
  72 +('可怜', 21)
  73 +('不见', 21)
  74 +('工作', 21)
  75 +('有人', 21)
  76 +('终于', 21)
  77 +('粉丝', 21)
  78 +('国家', 21)
  79 +('callcallcall', 21)
  80 +('永远', 21)
  81 +('太阳', 20)
  82 +('直播', 20)
  83 +('小时', 20)
  84 +('星期', 20)
  85 +('安全', 20)
  86 +('代言', 19)
  87 +('支持', 19)
  88 +('彩虹', 19)
  89 +('妈妈', 18)
  90 +('华为', 18)
  91 +('优秀', 18)
  92 +('好像', 18)
  93 +('越来', 18)
  94 +('大人', 18)
  95 +('父母', 18)
  96 +('害怕', 18)
  97 +('安哥', 18)
  98 +('加班', 18)
  99 +('一点', 18)
  100 +('一场', 17)
This diff could not be displayed because it is too large.
  1 +typeName,gid,containerid
  2 +热门,102803,102803
  3 +同城,1028032222,102803_2222
  4 +榜单,102803600169,102803_ctg1_600169_-_ctg1_600169
  5 +男篮,102803600279,102803_ctg1_600279_-_ctg1_600279
  6 +明星,1028034288,102803_ctg1_4288_-_ctg1_4288
  7 +车展,1028035188,102803_ctg1_5188_-_ctg1_5188
  8 +搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388
  9 +情感,1028031988,102803_ctg1_1988_-_ctg1_1988
  10 +周末,102803600195,102803_ctg1_600195_-_ctg1_600195
  11 +电影,1028033288,102803_ctg1_3288_-_ctg1_3288
  12 +社会,1028034188,102803_ctg1_4188_-_ctg1_4188
  13 +电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488
  14 +美食,1028032688,102803_ctg1_2688_-_ctg1_2688
  15 +俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267
  16 +国际,1028036288,102803_ctg1_6288_-_ctg1_6288
  17 +深度,102803600155,102803_ctg1_600155_-_ctg1_600155
  18 +财经,1028036388,102803_ctg1_6388_-_ctg1_6388
  19 +读书,1028034588,102803_ctg1_4588_-_ctg1_4588
  20 +摄影,1028034988,102803_ctg1_4988_-_ctg1_4988
  21 +颜值,102803600165,102803_ctg1_600165_-_ctg1_600165
  22 +体育,1028031388,102803_ctg1_1388_-_ctg1_1388
  23 +数码,1028035088,102803_ctg1_5088_-_ctg1_5088
  24 +综艺,1028034688,102803_ctg1_4688_-_ctg1_4688
  25 +时尚,1028034488,102803_ctg1_4488_-_ctg1_4488
  26 +星座,1028031688,102803_ctg1_1688_-_ctg1_1688
  27 +军事,1028036688,102803_ctg1_6688_-_ctg1_6688
  28 +股市,1028031288,102803_ctg1_1288_-_ctg1_1288
  29 +房产,1028035588,102803_ctg1_5588_-_ctg1_5588
  30 +家居,1028035888,102803_ctg1_5888_-_ctg1_5888
  31 +萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788
  32 +科技,1028032088,102803_ctg1_2088_-_ctg1_2088
  33 +科普,1028035988,102803_ctg1_5988_-_ctg1_5988
  34 +动漫,1028032388,102803_ctg1_2388_-_ctg1_2388
  35 +运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788
  36 +旅游,1028032588,102803_ctg1_2588_-_ctg1_2588
  37 +瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488
  38 +好物,102803600094,102803_ctg1_600094_-_ctg1_600094
  39 +历史,1028036788,102803_ctg1_6788_-_ctg1_6788
  40 +艺术,1028035488,102803_ctg1_5488_-_ctg1_5488
  41 +美妆,1028031588,102803_ctg1_1588_-_ctg1_1588
  42 +法律,1028037388,102803_ctg1_7388_-_ctg1_7388
  43 +设计,1028035388,102803_ctg1_5388_-_ctg1_5388
  44 +健康,1028032188,102803_ctg1_2188_-_ctg1_2188
  45 +音乐,1028035288,102803_ctg1_5288_-_ctg1_5288
  46 +游戏,1028034888,102803_ctg1_4888_-_ctg1_4888
  47 +新时代,1028037968,102803_ctg1_7968_-_ctg1_7968
  48 +校园,102803600177,102803_ctg1_600177_-_ctg1_600177
  49 +收藏,1028038189,102803_ctg1_8189_-_ctg1_8189
  50 +政务,1028035788,102803_ctg1_5788_-_ctg1_5788
  51 +养生,1028036588,102803_ctg1_6588_-_ctg1_6588
  52 +育儿,1028033188,102803_ctg1_3188_-_ctg1_3188
  53 +抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037
  54 +教育,102803600080,102803_ctg1_600080_-_ctg1_600080
  55 +婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788
  56 +舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788
  57 +辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988
  58 +公益,102803600057,102803_ctg1_600057_-_ctg1_600057
  59 +问答,1028037977,102803_ctg1_7977_-_ctg1_7977
  60 +三农,1028037188,102803_ctg1_7188_-_ctg1_7188
1 from utils.getPublicData import * 1 from utils.getPublicData import *
  2 +from snownlp import SnowNLP
2 articleList = getAllArticleData() 3 articleList = getAllArticleData()
3 commentList = getAllCommentsData() 4 commentList = getAllCommentsData()
4 5
5 -def getTypeList():# 返回爬取到的所有文章的类型(已去重) 6 +def getTypeList():
6 return list(set([x[8] for x in getAllArticleData()])) 7 return list(set([x[8] for x in getAllArticleData()]))
7 8
8 -def getArticleByType(type):# 根据特定文章类型筛选文章 9 +def getArticleByType(type):
9 articles = [] 10 articles = []
10 for i in articleList: 11 for i in articleList:
11 if i[8] == type: 12 if i[8] == type:
12 articles.append(i) 13 articles.append(i)
13 return articles 14 return articles
14 15
15 -def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布 16 +def getArticleCharLikeCount(type):
16 articles = getArticleByType(type) 17 articles = getArticleByType(type)
17 xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~'] 18 xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
18 - yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表 19 + yData = [0 for x in range(len(xData))]
19 for article in articles: 20 for article in articles:
20 likeCount = int(article[1]) 21 likeCount = int(article[1])
21 if likeCount < 100: 22 if likeCount < 100:
@@ -34,10 +35,10 @@ def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布 @@ -34,10 +35,10 @@ def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
34 yData[6] += 1 35 yData[6] += 1
35 return xData,yData 36 return xData,yData
36 37
37 -def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布 38 +def getArticleCharCommentsLen(type):
38 articles = getArticleByType(type) 39 articles = getArticleByType(type)
39 xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~'] 40 xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
40 - yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表 41 + yData = [0 for x in range(len(xData))]
41 for article in articles: 42 for article in articles:
42 commentLen = int(article[2]) 43 commentLen = int(article[2])
43 if commentLen < 100: 44 if commentLen < 100:
@@ -60,7 +61,7 @@ def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布 @@ -60,7 +61,7 @@ def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
60 yData[8] += 1 61 yData[8] += 1
61 return xData,yData 62 return xData,yData
62 63
63 -def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布 64 +def getArticleCharRepotsLen(type):
64 articles = getArticleByType(type) 65 articles = getArticleByType(type)
65 xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~'] 66 xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
66 yData = [0 for x in range(len(xData))] 67 yData = [0 for x in range(len(xData))]
@@ -92,14 +93,14 @@ def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布 @@ -92,14 +93,14 @@ def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
92 yData[11] += 1 93 yData[11] += 1
93 return xData,yData 94 return xData,yData
94 95
95 -def getIPCharByArticleRegion():#统计文章发布地域的分布情况 96 +def getIPCharByArticleRegion():
96 articleRegionDic = {} 97 articleRegionDic = {}
97 for i in articleList: 98 for i in articleList:
98 - if i[4] != '无':# 如果ip为确定值的话就进行下一步  
99 - if i[4] in articleRegionDic.keys():  
100 - articleRegionDic[i[4]] += 1  
101 - else: 99 + if i[4] != '无':
  100 + if articleRegionDic.get(i[4],-1) == -1:
102 articleRegionDic[i[4]] = 1 101 articleRegionDic[i[4]] = 1
  102 + else:
  103 + articleRegionDic[i[4]] += 1
103 resultData = [] 104 resultData = []
104 for key,value in articleRegionDic.items(): 105 for key,value in articleRegionDic.items():
105 resultData.append({ 106 resultData.append({
@@ -108,14 +109,14 @@ def getIPCharByArticleRegion():#统计文章发布地域的分布情况 @@ -108,14 +109,14 @@ def getIPCharByArticleRegion():#统计文章发布地域的分布情况
108 }) 109 })
109 return resultData 110 return resultData
110 111
111 -def getIPCharByCommentsRegion():#统计评论发布地域的分布情况 112 +def getIPCharByCommentsRegion():
112 commentRegionDic = {} 113 commentRegionDic = {}
113 for i in commentList: 114 for i in commentList:
114 if i[3] != '无': 115 if i[3] != '无':
115 - if i[3] in commentRegionDic.keys():  
116 - commentRegionDic[i[3]] += 1  
117 - else: 116 + if commentRegionDic.get(i[3],-1) == -1:
118 commentRegionDic[i[3]] = 1 117 commentRegionDic[i[3]] = 1
  118 + else:
  119 + commentRegionDic[i[3]] += 1
119 resultData = [] 120 resultData = []
120 for key,value in commentRegionDic.items(): 121 for key,value in commentRegionDic.items():
121 resultData.append({ 122 resultData.append({
@@ -124,35 +125,33 @@ def getIPCharByCommentsRegion():#统计评论发布地域的分布情况 @@ -124,35 +125,33 @@ def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
124 }) 125 })
125 return resultData 126 return resultData
126 127
127 -def getCommentCharDataOne():# 统计评论点赞数的分布情况 128 +def getCommentCharDataOne():
128 xData = [] 129 xData = []
129 rangeNum = 20 130 rangeNum = 20
130 - for item in range(100): 131 + for item in range(1,100):
131 xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1))) 132 xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))
132 yData = [0 for x in range(len(xData))] 133 yData = [0 for x in range(len(xData))]
133 for comment in commentList: 134 for comment in commentList:
134 - for item in range(100):  
135 - if int(comment[2]) < rangeNum * (item + 1): 135 + for item in range(99):
  136 + if int(comment[2]) < rangeNum * (item + 2):
136 yData[item] += 1 137 yData[item] += 1
137 break 138 break
138 return xData,yData 139 return xData,yData
139 140
140 -def getCommentCharDataTwo():# 统计评论数据中不同性别的数量 141 +def getCommentCharDataTwo():
141 genderDic = {} 142 genderDic = {}
142 for i in commentList: 143 for i in commentList:
143 - if i[6] in genderDic.keys():  
144 - genderDic[i[6]] += 1  
145 - else: 144 + if genderDic.get(i[6],-1) == -1:
146 genderDic[i[6]] = 1 145 genderDic[i[6]] = 1
147 - resultData = []  
148 - for key,value in genderDic.items():  
149 - resultData.append({  
150 - 'name':key,  
151 - 'value':value  
152 - }) 146 + else:
  147 + genderDic[i[6]] += 1
  148 + resultData = [{
  149 + 'name':x[0],
  150 + 'value':x[1]
  151 + } for x in genderDic.items()]
153 return resultData 152 return resultData
154 153
155 -def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量 154 +def getYuQingCharDataOne():
156 hotWordList = getAllHotWords() 155 hotWordList = getAllHotWords()
157 xData = ['正面','中性','负面'] 156 xData = ['正面','中性','负面']
158 yData = [0,0,0] 157 yData = [0,0,0]
@@ -164,19 +163,19 @@ def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量 @@ -164,19 +163,19 @@ def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
164 yData[1] += 1 163 yData[1] += 1
165 elif emotionValue < 0.5: 164 elif emotionValue < 0.5:
166 yData[2] += 1 165 yData[2] += 1
167 - finalData = [{ 166 + bieData = [{
168 'name':x, 167 'name':x,
169 'value':yData[index] 168 'value':yData[index]
170 } for index,x in enumerate(xData)] 169 } for index,x in enumerate(xData)]
171 - return xData,yData,finalData 170 + return xData,yData,bieData
172 171
173 -def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值 172 +def getYuQingCharDataTwo():
174 xData = ['正面', '中性', '负面'] 173 xData = ['正面', '中性', '负面']
175 - finalData1 = [{ 174 + bieData1 = [{
176 'name':x, 175 'name':x,
177 'value':0 176 'value':0
178 } for x in xData] 177 } for x in xData]
179 - finalData2 = [{ 178 + bieData2 = [{
180 'name': x, 179 'name': x,
181 'value': 0 180 'value': 0
182 } for x in xData] 181 } for x in xData]
@@ -184,27 +183,27 @@ def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值 @@ -184,27 +183,27 @@ def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
184 for comment in commentList: 183 for comment in commentList:
185 emotionValue = SnowNLP(comment[4]).sentiments 184 emotionValue = SnowNLP(comment[4]).sentiments
186 if emotionValue > 0.5: 185 if emotionValue > 0.5:
187 - finalData1[0]['value'] += 1 186 + bieData1[0]['value'] += 1
188 elif emotionValue == 0.5: 187 elif emotionValue == 0.5:
189 - finalData1[1]['value'] += 1 188 + bieData1[1]['value'] += 1
190 elif emotionValue < 0.5: 189 elif emotionValue < 0.5:
191 - finalData1[2]['value'] += 1 190 + bieData1[2]['value'] += 1
192 for artile in articleList: 191 for artile in articleList:
193 emotionValue = SnowNLP(artile[5]).sentiments 192 emotionValue = SnowNLP(artile[5]).sentiments
194 if emotionValue > 0.5: 193 if emotionValue > 0.5:
195 - finalData2[0]['value'] += 1 194 + bieData2[0]['value'] += 1
196 elif emotionValue == 0.5: 195 elif emotionValue == 0.5:
197 - finalData2[1]['value'] += 1 196 + bieData2[1]['value'] += 1
198 elif emotionValue < 0.5: 197 elif emotionValue < 0.5:
199 - finalData2[2]['value'] += 1  
200 - return finalData1,finalData2 198 + bieData2[2]['value'] += 1
  199 + return bieData1,bieData2
201 200
202 -def getYuQingCharDataThree():# 提取前10个热词及其对应的出现频率 201 +def getYuQingCharDataThree():
203 hotWordList = getAllHotWords() 202 hotWordList = getAllHotWords()
204 - xData = []  
205 - yData = [] 203 + x1Data = []
  204 + y1Data = []
206 for i in hotWordList[:10]: 205 for i in hotWordList[:10]:
207 - xData.append(i[0])  
208 - yData.append(int(i[1]))  
209 - return xData,yData 206 + x1Data.append(i[0])
  207 + y1Data.append(int(i[1]))
  208 + return x1Data,y1Data
210 209