bayes.py
2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
import gzip
import marshal
from math import log, exp
from ..utils.frequency import AddOneProb
class Bayes(object):
    """Multinomial naive Bayes classifier.

    ``self.d`` maps each class label to an ``AddOneProb`` frequency table
    (add-one / Laplace smoothed word counts); ``self.total`` is the sum of
    all per-class counts and serves as the prior's denominator.
    """

    def __init__(self):
        # label -> AddOneProb word-frequency table
        self.d = {}
        # total count across all classes (prior denominator)
        self.total = 0

    def save(self, fname, iszip=True):
        """Serialize the model to *fname* with ``marshal``.

        On Python 3 a ``.3`` suffix is appended (marshal output is not
        compatible across major versions).  When *iszip* is true the
        payload is gzip-compressed.
        """
        d = {}
        d['total'] = self.total
        d['d'] = {}
        for k, v in self.d.items():
            # AddOneProb instances are stored as their raw __dict__
            d['d'][k] = v.__dict__
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        # Use context managers so the handle is closed even on error
        # (the original leaked the plain-file handle).
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(d, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore a model previously written by :meth:`save`.

        Falls back to reading *fname* as an uncompressed file when the
        gzip read fails (e.g. the file was saved with ``iszip=False``).
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                d = marshal.load(f)
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
            except IOError:
                # Not a gzip file — retry as a plain marshal dump.
                with open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
        self.total = d['total']
        self.d = {}
        for k, v in d['d'].items():
            self.d[k] = AddOneProb()
            # Rehydrate the frequency table from its saved __dict__.
            self.d[k].__dict__ = v

    def train(self, data):
        """Train on *data*, an iterable of ``(words, label)`` pairs.

        *words* is a sequence of tokens; each token increments the count
        for *label*.  Training is cumulative across calls.
        """
        for words, label in ((item[0], item[1]) for item in data):
            if label not in self.d:
                self.d[label] = AddOneProb()
            for word in words:
                self.d[label].add(word, 1)
        # Recompute the prior denominator from all class tables.
        self.total = sum(v.getsum() for v in self.d.values())

    def classify(self, x):
        """Classify token sequence *x*.

        Returns ``(label, probability)`` — the most probable class and its
        normalized posterior — or ``(0, 0)`` when no class wins (e.g. the
        model is untrained or every posterior underflows).
        """
        tmp = {}
        for k in self.d:
            # Log prior + sum of smoothed log likelihoods.
            tmp[k] = log(self.d[k].getsum()) - log(self.total)
            for word in x:
                tmp[k] += log(self.d[k].freq(word))
        ret, prob = 0, 0
        for k in self.d:
            now = 0
            try:
                # Normalize in log space: 1 / sum_j exp(tmp[j] - tmp[k]).
                for otherk in self.d:
                    now += exp(tmp[otherk] - tmp[k])
                now = 1 / now
            except OverflowError:
                # exp() overflowed: this class is vanishingly unlikely.
                now = 0
            if now > prob:
                ret, prob = k, now
        return (ret, prob)