__init__.py
1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import re
import codecs
from . import zh
from . import pinyin
# Directory containing this module; the bundled data files are resolved
# relative to it so loading does not depend on the current working directory.
_data_dir = os.path.dirname(os.path.abspath(__file__))
stop_path = os.path.join(_data_dir, 'stopwords.txt')
pinyin_path = os.path.join(_data_dir, 'pinyin.txt')

# Stop-word set: one entry per line of stopwords.txt, whitespace-trimmed.
stop = set()
# The context manager guarantees the file is closed even if reading or
# UTF-8 decoding raises part-way through.
with codecs.open(stop_path, 'r', 'utf-8') as fr:
    for word in fr:
        stop.add(word.strip())

# Shared PinYin instance constructed from pinyin.txt; get_pinyin() queries it.
pin = pinyin.PinYin(pinyin_path)

# Matches runs of characters in the range U+4E00..U+9FA5. The capturing group
# makes re_zh.split() keep the matched runs in its result.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
def filter_stop(words):
    """Return a new list with the items of ``words`` that are not stop words.

    An item is dropped when it is a member of the module-level ``stop`` set.
    The relative order of the remaining items, and any duplicates among
    them, are preserved.
    """
    return [token for token in words if token not in stop]
def zh2hans(sent):
    """Pass ``sent`` through ``zh.transfer`` and return its result.

    NOTE(review): judging by the function name this presumably converts the
    text to Simplified Chinese; the actual mapping lives in the ``zh``
    module -- confirm there.
    """
    converted = zh.transfer(sent)
    return converted
def get_sentences(doc):
line_break = re.compile('[\r\n]')
delimiter = re.compile('[,。?!;]')
sentences = []
for line in line_break.split(doc):
line = line.strip()
if not line:
continue
for sent in delimiter.split(line):
sent = sent.strip()
if not sent:
continue
sentences.append(sent)
return sentences
def get_pinyin(sentence):
    """Return a flat list of tokens for ``sentence``.

    The sentence is split with ``re_zh`` into alternating runs. Runs that
    match ``re_zh`` are handed to ``pin.get`` and whatever it returns is
    appended item by item; every other run is split on whitespace and its
    words are appended unchanged. Runs that are empty after stripping are
    skipped.

    NOTE(review): ``pin.get`` presumably yields the pinyin readings of the
    run -- confirm against the ``pinyin`` module.
    """
    tokens = []
    for chunk in re_zh.split(sentence):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk):
            tokens.extend(pin.get(chunk))
        else:
            # str.split() with no separator already yields only non-empty,
            # whitespace-free words.
            tokens.extend(chunk.split())
    return tokens