-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_classifications.py
101 lines (59 loc) · 2.08 KB
/
test_classifications.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# encoding=utf-8
from cPickle import Unpickler
import math
import time
import Utility
from machinelearning import naivebayes_classification
def _load_pickled_dict(path):
    """Return the object unpickled from the file at *path*.

    The file handle is closed even when unpickling raises.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # trusted model files.
    # NOTE(review): the file is opened in the default (text) mode, exactly as
    # the original code did; a binary pickle protocol would need 'rb' --
    # confirm how these files are written.
    with open(path) as pickled_file:
        return Unpickler(pickled_file).load()


def testNaiveBayes():
    """Classify every dataset line and print the elapsed time and accuracy.

    Loads the three pickled dictionaries whose paths are exposed by
    ``naivebayes_classification`` (per-category word counts, per-category
    totals and priors), then reads ``Utility.path_dataset`` line by line.
    Each line is split on a tab into ``category`` and ``text``; the text is
    passed through ``Utility.getWords`` and classified with
    ``get_NB_category``.  A progress message is printed every 5000 lines.

    The accuracy is printed as a percentage.  When the dataset contains no
    lines, a short notice is printed instead of dividing by zero.
    """
    begin = time.time()

    dict_words = _load_pickled_dict(naivebayes_classification.str_dict_word_in_cat)
    dict_cat_count = _load_pickled_dict(naivebayes_classification.str_dict_cat_count)
    dict_priors = _load_pickled_dict(naivebayes_classification.str_dict_priors)

    counter = 0
    numErrors = 0
    # The context manager guarantees the dataset handle is released, which the
    # bare ``for line in open(...)`` form did not.
    with open(Utility.path_dataset) as dataset:
        for line in dataset:
            # Lines are byte strings here (Python 2), hence the explicit decode.
            parts = line.decode('utf-8').strip().split('\t')
            category = parts[0]
            text = parts[1]
            words = Utility.getWords(text)
            nb_category = get_NB_category(words, dict_words, dict_cat_count, dict_priors)
            if nb_category != category:
                numErrors += 1
            counter += 1
            if counter % 5000 == 0:
                print('counter: %d\n' % counter)

    print('time: %d\n' % (time.time() - begin))
    if counter == 0:
        # Nothing was classified, so an accuracy figure would be meaningless
        # (and the division below would raise ZeroDivisionError).
        print('accuracy: n/a (no samples in dataset)')
        return
    print('accuracy: %f' % (100*(1 - numErrors*1.0/counter)))
def get_NB_category(words, dict_words, dict_cats, dict_priors):
    """Return the category whose Naive Bayes score for *words* is highest.

    Every key of *dict_cats* is scored with ``compute_NB_prob`` and the key
    with the largest score is returned.  On ties the category encountered
    first during iteration wins.  An empty *dict_cats* yields ``''``.

    Parameters:
        words: iterable of tokens to classify.
        dict_words: mapping passed through to ``compute_NB_prob``.
        dict_cats: mapping whose keys are the candidate categories.
        dict_priors: mapping passed through to ``compute_NB_prob``.
    """
    maxCat = ''
    maxProb = None  # None means "no category scored yet"
    for cat in dict_cats:
        prob = compute_NB_prob(words, dict_words, dict_cats, cat, dict_priors)
        # The first category must be recorded together with its score;
        # previously only the score was kept, so '' was returned whenever the
        # first category turned out to be the best one.
        if maxProb is None or prob > maxProb:
            maxProb = prob
            maxCat = cat
    return maxCat
def compute_NB_prob(words, dict_words, dict_cats, cat, priors):
    """Return the log10-based score of *words* under category *cat*.

    For each word the add-one count ``dict_words[cat].get(word, 0) + 1`` is
    divided by ``dict_cats[cat] + len(dict_words[cat])`` and the base-10
    logarithm of that ratio is accumulated.  The logarithm of
    ``priors[cat]`` is added once at the end.

    The accumulator starts at 1 rather than 0, so the result is offset by a
    constant that is identical for every category.
    """
    category_total = dict_cats[cat]
    word_counts = dict_words[cat]
    denominator = category_total + len(word_counts)

    score = 1
    for token in words:
        smoothed_count = word_counts.get(token, 0) + 1
        score += math.log10((1.0 * smoothed_count) / denominator)

    return score + math.log10(priors[cat])
#testNaiveBayes()