juanboy

【cutComments.py】分词统计词频函数定义

import jieba
import re
def main():
    """Read pre-segmented comment text, count word frequencies, and write
    the 100 most frequent words (as ``(word, count)`` tuples, one per line)
    to ./cipingTotal.csv.

    Fixes over the original:
    - files are closed via ``with`` (both handles were leaked);
    - frequencies are tallied in one pass (the ``list.count`` loop was O(n^2));
    - regex patterns are raw strings (``"\\d+"`` triggers invalid-escape warnings);
    - slicing ``[:100]`` avoids IndexError when fewer than 100 words survive.
    """
    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
        strs = reader.read()

    # Segment in full mode, then keep only multi-character tokens that
    # contain no digits and no non-word characters.
    word_count = {}
    for word in jieba.cut(strs, cut_all=True):
        if len(word) > 1 and not re.search(r"\d+", word) and not re.search(r"\W+", word):
            word_count[word] = word_count.get(word, 0) + 1

    # Sort by frequency, highest first.
    list_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)

    with open('./cipingTotal.csv', 'w', encoding='utf8') as result:
        # [:100] is safe even when fewer than 100 distinct words exist.
        for item in list_count[:100]:
            print(item, file=result)
if __name__ == '__main__':
    main()
\ No newline at end of file
... ...
from utils.getPublicData import getAllCommentsData
import jieba
targetTxt = 'cutComments.txt'
def stopWordList():
    """Load ./stopWords.txt and return its lines as a list of stripped words.

    Uses ``with`` so the file handle is closed deterministically — the
    original ``open(...).readlines()`` leaked the handle.
    """
    with open('./stopWords.txt', encoding='utf8') as f:
        return [line.strip() for line in f]
def seg_depart(sentence):
    """Segment comment rows with jieba and drop stop words.

    ``sentence`` is an iterable of rows whose index 4 holds the comment
    text (assumed from ``x[4]`` in the original — TODO confirm against
    getAllCommentsData's row shape). Returns the surviving segments
    concatenated into a single string.

    Fixes over the original: stop words go into a set (O(1) membership
    instead of an O(n) list scan per word) and the output is built with
    ``str.join`` instead of quadratic ``+=`` concatenation.
    """
    text = " ".join(row[4] for row in sentence).strip()
    stop_words = set(stopWordList())
    return ''.join(
        word
        for word in jieba.cut(text)
        if word not in stop_words and word != '\t'
    )
def writer_comments_cuts():
    """Fetch every comment, strip stop words, re-segment the result in
    jieba full mode, and append the space-joined tokens (plus a newline)
    to ``targetTxt``. Prints a confirmation message on success.
    """
    with open(targetTxt, 'a+', encoding='utf-8') as out_file:
        tokens = jieba.cut(seg_depart(getAllCommentsData()), cut_all=True)
        out_file.write(' '.join(tokens))
        out_file.write('\n')
        print('写入成功')
if __name__ == '__main__':
    writer_comments_cuts()
\ No newline at end of file
... ...