Committed by
GitHub
Merge pull request #15 from zhaisang111/main
Optimized the getEchartsData.py script, improving code efficiency and…
Showing 1 changed file with 101 additions and 155 deletions.
from bisect import bisect_right  # O(log n) bucket lookup for the histogram helpers
from collections import Counter  # frequency counting for region/gender/sentiment tallies

from utils.getPublicData import *  # project data-access helpers (getAllArticleData, getAllCommentsData, getAllHotWords)
from utils.mynlp import SnowNLP  # sentiment scoring (0.0 = negative .. 1.0 = positive)
| 4 | + | ||
| 5 | +articleList = getAllArticleData() # Retrieve all article data | ||
| 6 | +commentList = getAllCommentsData() # Retrieve all comment data | ||
| 5 | 7 | ||
| 6 | def getTypeList(): | 8 | def getTypeList(): |
| 7 | - return list(set([x[8] for x in getAllArticleData()])) | 9 | + # Return a list of unique article types |
| 10 | + return list(set([x[8] for x in articleList])) | ||
| 8 | 11 | ||
| 9 | def getArticleByType(type): | 12 | def getArticleByType(type): |
| 10 | - articles = [] | ||
| 11 | - for i in articleList: | ||
| 12 | - if i[8] == type: | ||
| 13 | - articles.append(i) | ||
| 14 | - return articles | 13 | + # Return a list of articles that match the specified type |
| 14 | + return [article for article in articleList if article[8] == type] | ||
| 15 | 15 | ||
| 16 | def getArticleLikeCount(type): | 16 | def getArticleLikeCount(type): |
| 17 | + # Categorize articles by the number of likes they have | ||
| 17 | articles = getArticleByType(type) | 18 | articles = getArticleByType(type) |
| 18 | - X = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~'] | ||
| 19 | - Y = [0 for x in range(len(X))] | 19 | + intervals = [(0, 100), (100, 1000), (1000, 5000), (5000, 15000), |
| 20 | + (15000, 30000), (30000, 50000), (50000, float('inf'))] | ||
| 21 | + X = ['0-100','100-1000','1000-5000','5000-15000','15000-30000', | ||
| 22 | + '30000-50000','50000-~'] | ||
| 23 | + Y = [0] * len(intervals) | ||
| 20 | for article in articles: | 24 | for article in articles: |
| 21 | likeCount = int(article[1]) | 25 | likeCount = int(article[1]) |
| 22 | - if likeCount < 100: | ||
| 23 | - Y[0] += 1 | ||
| 24 | - elif likeCount < 1000: | ||
| 25 | - Y[1] += 1 | ||
| 26 | - elif likeCount < 5000: | ||
| 27 | - Y[2] += 1 | ||
| 28 | - elif likeCount < 15000: | ||
| 29 | - Y[3] += 1 | ||
| 30 | - elif likeCount < 30000: | ||
| 31 | - Y[4] += 1 | ||
| 32 | - elif likeCount < 50000: | ||
| 33 | - Y[5] += 1 | ||
| 34 | - elif likeCount >= 50000: | ||
| 35 | - Y[6] += 1 | ||
| 36 | - return X,Y | 26 | + for i, (lower, upper) in enumerate(intervals): |
| 27 | + if lower <= likeCount < upper: | ||
| 28 | + Y[i] += 1 | ||
| 29 | + break | ||
| 30 | + return X, Y | ||
| 37 | 31 | ||
| 38 | def getArticleCommentsLen(type): | 32 | def getArticleCommentsLen(type): |
| 33 | + # Categorize articles by the length of comments they have | ||
| 39 | articles = getArticleByType(type) | 34 | articles = getArticleByType(type) |
| 40 | - X = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~'] | ||
| 41 | - Y = [0 for x in range(len(X))] | 35 | + intervals = [(0, 100), (100, 500), (500, 1000), (1000, 1500), |
| 36 | + (1500, 3000), (3000, 5000), (5000, 10000), | ||
| 37 | + (10000, 15000), (15000, float('inf'))] | ||
| 38 | + X = ['0-100','100-500','500-1000','1000-1500','1500-3000', | ||
| 39 | + '3000-5000','5000-10000','10000-15000','15000-~'] | ||
| 40 | + Y = [0] * len(intervals) | ||
| 42 | for article in articles: | 41 | for article in articles: |
| 43 | commentLen = int(article[2]) | 42 | commentLen = int(article[2]) |
| 44 | - if commentLen < 100: | ||
| 45 | - Y[0] += 1 | ||
| 46 | - elif commentLen < 500: | ||
| 47 | - Y[1] += 1 | ||
| 48 | - elif commentLen < 5000: | ||
| 49 | - Y[2] += 1 | ||
| 50 | - elif commentLen < 1000: | ||
| 51 | - Y[3] += 1 | ||
| 52 | - elif commentLen < 1500: | ||
| 53 | - Y[4] += 1 | ||
| 54 | - elif commentLen < 3000: | ||
| 55 | - Y[5] += 1 | ||
| 56 | - elif commentLen < 5000: | ||
| 57 | - Y[6] += 1 | ||
| 58 | - elif commentLen < 10000: | ||
| 59 | - Y[7] += 1 | ||
| 60 | - elif commentLen >= 15000: | ||
| 61 | - Y[8] += 1 | ||
| 62 | - return X,Y | 43 | + for i, (lower, upper) in enumerate(intervals): |
| 44 | + if lower <= commentLen < upper: | ||
| 45 | + Y[i] += 1 | ||
| 46 | + break | ||
| 47 | + return X, Y | ||
| 63 | 48 | ||
| 64 | def getArticleRepotsLen(type): | 49 | def getArticleRepotsLen(type): |
| 50 | + # Categorize articles by the number of reposts | ||
| 65 | articles = getArticleByType(type) | 51 | articles = getArticleByType(type) |
| 66 | - X = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~'] | ||
| 67 | - Y = [0 for x in range(len(X))] | 52 | + intervals = [(0, 100), (100, 300), (300, 500), (500, 1000), |
| 53 | + (1000, 2000), (2000, 3000), (3000, 4000), | ||
| 54 | + (4000, 5000), (5000, 10000), (10000, 15000), | ||
| 55 | + (15000, 30000), (30000, 70000), (70000, float('inf'))] | ||
| 56 | + X = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000', | ||
| 57 | + '3000-4000','4000-5000','5000-10000','10000-15000','15000-30000', | ||
| 58 | + '30000-70000','70000-~'] | ||
| 59 | + Y = [0] * len(intervals) | ||
| 68 | for article in articles: | 60 | for article in articles: |
| 69 | repostsCount = int(article[3]) | 61 | repostsCount = int(article[3]) |
| 70 | - if repostsCount < 100: | ||
| 71 | - Y[0] += 1 | ||
| 72 | - elif repostsCount < 300: | ||
| 73 | - Y[1] += 1 | ||
| 74 | - elif repostsCount < 500: | ||
| 75 | - Y[2] += 1 | ||
| 76 | - elif repostsCount < 1000: | ||
| 77 | - Y[3] += 1 | ||
| 78 | - elif repostsCount < 3000: | ||
| 79 | - Y[4] += 1 | ||
| 80 | - elif repostsCount < 4000: | ||
| 81 | - Y[5] += 1 | ||
| 82 | - elif repostsCount < 5000: | ||
| 83 | - Y[6] += 1 | ||
| 84 | - elif repostsCount < 10000: | ||
| 85 | - Y[7] += 1 | ||
| 86 | - elif repostsCount < 15000: | ||
| 87 | - Y[8] += 1 | ||
| 88 | - elif repostsCount < 30000: | ||
| 89 | - Y[9] += 1 | ||
| 90 | - elif repostsCount < 70000: | ||
| 91 | - Y[10] += 1 | ||
| 92 | - elif repostsCount >= 70000: | ||
| 93 | - Y[11] += 1 | ||
| 94 | - return X,Y | 62 | + for i, (lower, upper) in enumerate(intervals): |
| 63 | + if lower <= repostsCount < upper: | ||
| 64 | + Y[i] += 1 | ||
| 65 | + break | ||
| 66 | + return X, Y | ||
| 95 | 67 | ||
| 96 | def getIPByArticleRegion(): | 68 | def getIPByArticleRegion(): |
| 97 | - articleRegionDic = {} | ||
| 98 | - for i in articleList: | ||
| 99 | - if i[4] != '无': | ||
| 100 | - if i[4] in articleRegionDic.keys(): | ||
| 101 | - articleRegionDic[i[4]] += 1 | ||
| 102 | - else: | ||
| 103 | - articleRegionDic[i[4]] = 1 | ||
| 104 | - resultData = [] | ||
| 105 | - for key,value in articleRegionDic.items(): | ||
| 106 | - resultData.append({ | ||
| 107 | - 'name':key, | ||
| 108 | - 'value':value | ||
| 109 | - }) | 69 | + # Count articles by their regions, excluding '无' |
| 70 | + regions = [article[4] for article in articleList if article[4] != '无'] | ||
| 71 | + region_counts = Counter(regions) | ||
| 72 | + resultData = [{'name': key, 'value': value} for key, value in region_counts.items()] | ||
| 110 | return resultData | 73 | return resultData |
| 111 | 74 | ||
| 112 | def getIPByCommentsRegion(): | 75 | def getIPByCommentsRegion(): |
| 113 | - commentRegionDic = {} | ||
| 114 | - for i in commentList: | ||
| 115 | - if i[3] != '无': | ||
| 116 | - if i[3] in commentRegionDic.keys(): | ||
| 117 | - commentRegionDic[i[3]] += 1 | ||
| 118 | - else: | ||
| 119 | - commentRegionDic[i[3]] = 1 | ||
| 120 | - resultData = [] | ||
| 121 | - for key,value in commentRegionDic.items(): | ||
| 122 | - resultData.append({ | ||
| 123 | - 'name':key, | ||
| 124 | - 'value':value | ||
| 125 | - }) | 76 | + # Count comments by their regions, excluding '无' |
| 77 | + regions = [comment[3] for comment in commentList if comment[3] != '无'] | ||
| 78 | + region_counts = Counter(regions) | ||
| 79 | + resultData = [{'name': key, 'value': value} for key, value in region_counts.items()] | ||
| 126 | return resultData | 80 | return resultData |
| 127 | 81 | ||
| 128 | def getCommentDataOne(): | 82 | def getCommentDataOne(): |
| 129 | - X = [] | 83 | + # Categorize comments based on some numerical value, possibly length or count |
| 130 | rangeNum = 20 | 84 | rangeNum = 20 |
| 131 | - for item in range(100): | ||
| 132 | - X.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1))) | ||
| 133 | - Y = [0 for x in range(len(X))] | 85 | + intervals = [(rangeNum * i, rangeNum * (i + 1)) for i in range(100)] |
| 86 | + X = [f"{lower}-{upper}" for lower, upper in intervals] | ||
| 87 | + Y = [0] * len(intervals) | ||
| 134 | for comment in commentList: | 88 | for comment in commentList: |
| 135 | - for item in range(100): | ||
| 136 | - if int(comment[2]) < rangeNum * (item + 1): | ||
| 137 | - Y[item] += 1 | 89 | + comment_value = int(comment[2]) |
| 90 | + for i, (lower, upper) in enumerate(intervals): | ||
| 91 | + if lower <= comment_value < upper: | ||
| 92 | + Y[i] += 1 | ||
| 138 | break | 93 | break |
| 139 | - return X,Y | 94 | + return X, Y |
| 140 | 95 | ||
| 141 | def getCommentDataTwo(): | 96 | def getCommentDataTwo(): |
| 142 | - genderDic = {} | ||
| 143 | - for i in commentList: | ||
| 144 | - if i[6] in genderDic.keys(): | ||
| 145 | - genderDic[i[6]] += 1 | ||
| 146 | - else: | ||
| 147 | - genderDic[i[6]] = 1 | ||
| 148 | - resultData = [{ | ||
| 149 | - 'name':x[0], | ||
| 150 | - 'value':x[1] | ||
| 151 | - } for x in genderDic.items()] | 97 | + # Count comments by gender |
| 98 | + genders = [comment[6] for comment in commentList] | ||
| 99 | + gender_counts = Counter(genders) | ||
| 100 | + resultData = [{'name': key, 'value': value} for key, value in gender_counts.items()] | ||
| 152 | return resultData | 101 | return resultData |
| 153 | 102 | ||
| 154 | def getYuQingCharDataOne(): | 103 | def getYuQingCharDataOne(): |
| 104 | + # Analyze sentiment of hot words | ||
| 155 | hotWordList = getAllHotWords() | 105 | hotWordList = getAllHotWords() |
| 156 | - X = ['正面','中性','负面'] | ||
| 157 | - Y = [0,0,0] | 106 | + sentiments = [] |
| 158 | for word in hotWordList: | 107 | for word in hotWordList: |
| 159 | emotionValue = SnowNLP(word[0]).sentiments | 108 | emotionValue = SnowNLP(word[0]).sentiments |
| 160 | if emotionValue > 0.4: | 109 | if emotionValue > 0.4: |
| 161 | - Y[0] += 1 | 110 | + sentiments.append('正面') |
| 162 | elif emotionValue < 0.2: | 111 | elif emotionValue < 0.2: |
| 163 | - Y[2] += 1 | 112 | + sentiments.append('负面') |
| 164 | else: | 113 | else: |
| 165 | - Y[1] += 1 | ||
| 166 | - biedata = [{ | ||
| 167 | - 'name':x, | ||
| 168 | - 'value':Y[index] | ||
| 169 | - } for index,x in enumerate(X)] | ||
| 170 | - return X,Y,biedata | 114 | + sentiments.append('中性') |
| 115 | + counts = Counter(sentiments) | ||
| 116 | + X = ['正面','中性','负面'] | ||
| 117 | + Y = [counts.get(sentiment, 0) for sentiment in X] | ||
| 118 | + biedata = [{'name': x, 'value': y} for x, y in zip(X, Y)] | ||
| 119 | + return X, Y, biedata | ||
| 171 | 120 | ||
| 172 | def getYuQingCharDataTwo(): | 121 | def getYuQingCharDataTwo(): |
| 173 | - X = ['正面', '中性', '负面'] | ||
| 174 | - biedata1 = [{ | ||
| 175 | - 'name':x, | ||
| 176 | - 'value':0 | ||
| 177 | - } for x in X] | ||
| 178 | - biedata2 = [{ | ||
| 179 | - 'name': x, | ||
| 180 | - 'value': 0 | ||
| 181 | - } for x in X] | ||
| 182 | - | 122 | + # Analyze sentiment of comments and articles |
| 123 | + comment_sentiments = [] | ||
| 183 | for comment in commentList: | 124 | for comment in commentList: |
| 184 | emotionValue = SnowNLP(comment[4]).sentiments | 125 | emotionValue = SnowNLP(comment[4]).sentiments |
| 185 | if emotionValue > 0.4: | 126 | if emotionValue > 0.4: |
| 186 | - biedata1[0]['value'] += 1 | 127 | + comment_sentiments.append('正面') |
| 187 | elif emotionValue < 0.2: | 128 | elif emotionValue < 0.2: |
| 188 | - biedata1[2]['value'] += 1 | 129 | + comment_sentiments.append('负面') |
| 189 | else: | 130 | else: |
| 190 | - biedata1[1]['value'] += 1 | ||
| 191 | - for artile in articleList: | ||
| 192 | - emotionValue = SnowNLP(artile[5]).sentiments | 131 | + comment_sentiments.append('中性') |
| 132 | + comment_counts = Counter(comment_sentiments) | ||
| 133 | + | ||
| 134 | + article_sentiments = [] | ||
| 135 | + for article in articleList: | ||
| 136 | + emotionValue = SnowNLP(article[5]).sentiments | ||
| 193 | if emotionValue > 0.4: | 137 | if emotionValue > 0.4: |
| 194 | - biedata2[0]['value'] += 1 | 138 | + article_sentiments.append('正面') |
| 195 | elif emotionValue < 0.2: | 139 | elif emotionValue < 0.2: |
| 196 | - biedata2[2]['value'] += 1 | 140 | + article_sentiments.append('负面') |
| 197 | else: | 141 | else: |
| 198 | - biedata2[1]['value'] += 1 | ||
| 199 | - return biedata1,biedata2 | 142 | + article_sentiments.append('中性') |
| 143 | + article_counts = Counter(article_sentiments) | ||
| 144 | + | ||
| 145 | + X = ['正面', '中性', '负面'] | ||
| 146 | + biedata1 = [{'name': x, 'value': comment_counts.get(x, 0)} for x in X] | ||
| 147 | + biedata2 = [{'name': x, 'value': article_counts.get(x, 0)} for x in X] | ||
| 148 | + return biedata1, biedata2 | ||
| 200 | 149 | ||
| 201 | def getYuQingCharDataThree(): | 150 | def getYuQingCharDataThree(): |
| 151 | + # Retrieve top 10 hot words and their counts | ||
| 202 | hotWordList = getAllHotWords() | 152 | hotWordList = getAllHotWords() |
| 203 | - x1Data = [] | ||
| 204 | - y1Data = [] | ||
| 205 | - for i in hotWordList[:10]: | ||
| 206 | - x1Data.append(i[0]) | ||
| 207 | - y1Data.append(int(i[1])) | ||
| 208 | - return x1Data,y1Data | ||
| 209 | - | 153 | + x1Data = [word[0] for word in hotWordList[:10]] |
| 154 | + y1Data = [int(word[1]) for word in hotWordList[:10]] | ||
| 155 | + return x1Data, y1Data |
-
Please register or login to post a comment