kpt
  1 +import jieba
  2 +import re
  3 +
def main():
    """Write the 100 most frequent words of ./cutComments.txt to ./cipingTotal.csv.

    The input text is read as UTF-8 and segmented with ``jieba.cut`` using
    ``cut_all=True``. Tokens containing a digit or a non-word character, and
    tokens of length one or less, are discarded. The remaining tokens are
    counted and the most frequent ones are written one per line as
    ``(word, count)`` tuples, highest count first. If fewer than 100 distinct
    words survive the filter, all of them are written.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
        text = reader.read()

    # A single character class replaces the two separate searches for
    # "\d+" and "\W+": a token is rejected if it holds either kind of char.
    unwanted = re.compile(r'[\d\W]')

    # Segment, filter and count in one pass.
    word_count = Counter(
        word
        for word in jieba.cut(text, cut_all=True)
        if len(word) > 1 and not unwanted.search(word)
    )

    with open('./cipingTotal.csv', 'w', encoding='utf8') as result:
        # most_common(100) returns at most 100 entries, so short inputs
        # no longer raise IndexError.
        for entry in word_count.most_common(100):
            print(entry, file=result)
  29 +
# Run the word-frequency export only when executed as a script.
if "__main__" == __name__:
    main()
  1 +from utils.getPublicData import getAllCommentsData
  2 +import jieba
# Path of the text file that writer_comments_cuts() appends to.
targetTxt = 'cutComments.txt'
  4 +
def stopWordList():
    """Return the entries of ./stopWords.txt as a list of strings.

    The file is read as UTF-8, one entry per line, with surrounding
    whitespace stripped from each line. Blank lines are kept as empty
    strings, and the order of the file is preserved.

    Returns:
        list[str]: the stripped lines of the stop-word file.
    """
    # The context manager closes the handle; the original left it open.
    with open('./stopWords.txt', encoding='utf8') as stop_file:
        return [line.strip() for line in stop_file]
  8 +
def seg_depart(sentence):
    """Join field 4 of every record, segment it, and drop stop words.

    Args:
        sentence: an iterable of indexable records. Element ``[4]`` of each
            record is used as text (presumably the comment body; confirm
            against ``getAllCommentsData``).

    Returns:
        str: the surviving tokens concatenated with no separator. Tokens
        found in the stop-word list, and tab tokens, are removed.
    """
    text = " ".join([record[4] for record in sentence]).strip()

    # A set gives constant-time membership tests; the original scanned the
    # whole stop-word list for every token.
    stop_words = set(stopWordList())

    # Tokens are joined without a separator, matching the original
    # accumulation via ``+=``.
    return ''.join(
        word
        for word in jieba.cut(text)
        if word not in stop_words and word != '\t'
    )
  18 +
def writer_comments_cuts():
    """Append one line of space-separated tokens to the file named by targetTxt.

    The text returned by ``seg_depart(getAllCommentsData())`` is segmented
    with ``jieba.cut`` using ``cut_all=True``. The tokens are joined with
    single spaces and appended, followed by a newline, to the target file,
    which is opened in ``a+`` mode as UTF-8. A confirmation message is
    printed afterwards.
    """
    with open(targetTxt, 'a+', encoding='utf-8') as out_file:
        cleaned_text = seg_depart(getAllCommentsData())
        tokens = jieba.cut(cleaned_text, cut_all=True)
        out_file.write(' '.join(tokens) + '\n')
        print('写入成功')
  26 +
  27 +
# Append the segmented comments only when executed as a script.
if "__main__" == __name__:
    writer_comments_cuts()