# Use the top_keywords function to get the top n most relevant keywords
# from a single document.
# Based on a paper by Yutaka Matsuo and Mitsuru Ishizuka.
# Modified to extract unigrams AND bigrams.
# A usage sketch appears at the bottom of the file.
from gensim.parsing.porter import PorterStemmer

# Build the set of stopwords from the 'stopwords' file (one word per line)
with open('stopwords') as f:
    stoplist = set(line.strip() for line in f)

# Initialize the stemmer
stemmer = PorterStemmer()
# Build a stem lookup.
# This lookup prevents words with the same stemmed form
# from being treated as different words in the corpus.
class stemmingHelper:
    def __init__(self):
        self.word_lookup = {}

    def __call__(self, word):
        # Return the first surface form ever seen for this word's stem
        stemmed = stemmer.stem(word)
        if stemmed not in self.word_lookup:
            self.word_lookup[stemmed] = word
            return word
        else:
            return self.word_lookup[stemmed]

primary_form = stemmingHelper()
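# Illustrative example (assumes the Porter stemmer maps both forms to 'run'):
# primary_form('running') returns 'running' and registers it for the stem,
# so a later call primary_form('runs') also returns 'running'.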
def get_word_sets(filename):
    """ Returns dict of total vocabulary and dicts of sentence vocabularies """
    # Split the document into lowercase sentences, stripping punctuation
    # and digits before splitting on periods
    sentences = []
    strip_chars = '?,:;()[]"`-|><^%"*1234567890'
    with open(filename, 'r') as f:
        for line in f:
            temp = line.strip().lower().translate(
                str.maketrans('', '', strip_chars)).split('.')
            for x in temp:
                if len(x) > 2:
                    sentences.append(x)
    sentence_words = []
    vocabulary = {}
    for x in sentences:
        sentence_vocab = {}
        temp = x.split()
        for i in range(len(temp) - 1):
            # Count a bigram when two consecutive words are both non-stopwords
            if temp[i] not in stoplist and temp[i + 1] not in stoplist:
                bigram = ' '.join([temp[i], temp[i + 1]])
                sentence_vocab[bigram] = sentence_vocab.get(bigram, 0) + 1
                vocabulary[bigram] = vocabulary.get(bigram, 0) + 1
            # Count the unigram, normalized to the primary form of its stem
            if temp[i] not in stoplist and len(temp[i]) > 2:
                word = primary_form(temp[i])
                sentence_vocab[word] = sentence_vocab.get(word, 0) + 1
                vocabulary[word] = vocabulary.get(word, 0) + 1
        # The loop above stops one short, so count the last word separately
        if temp and temp[-1] not in stoplist and len(temp[-1]) > 2:
            word = primary_form(temp[-1])
            sentence_vocab[word] = sentence_vocab.get(word, 0) + 1
            vocabulary[word] = vocabulary.get(word, 0) + 1
        sentence_words.append(sentence_vocab)
    return vocabulary, sentence_words
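# Worked example (illustrative, assuming 'the' is in the stoplist):
# for a file containing "The cat sat. The cat ran.", get_word_sets returns
# vocabulary == {'cat sat': 1, 'cat': 2, 'sat': 1, 'cat ran': 1, 'ran': 1}
# plus one per-sentence count dict for each of the two sentences.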
def get_param_matrices(vocabulary, sentence_words, n=None):
    """
    Returns -
    1. Top n most frequent words
    2. Co-occurrence matrix wrt the top-n words (dict)
    3. Vector of Pg of the most frequent n words (dict)
    4. nw of each word (dict)
    """
    # Default to roughly the top third of the vocabulary, as in the paper
    if n is None or n > len(vocabulary):
        n = int(0.33 * len(vocabulary))
    topwords = sorted(vocabulary, key=vocabulary.get, reverse=True)[:n]
    nw = {}
    co_occur = {}
    for x in vocabulary:
        co_occur[x] = [0] * len(topwords)
    for sentence in sentence_words:
        total_words = sum(sentence.values())
        # Indices of the top-n words present in this sentence
        top_indices = [topwords.index(word) for word in sentence
                       if word in topwords]
        for word in sentence:
            # nw[word]: total number of terms in sentences where word appears
            nw[word] = nw.get(word, 0) + total_words
            for index in top_indices:
                co_occur[word][index] += (sentence[word] *
                                          sentence[topwords[index]])
    # Pg[g]: expected share of co-occurrence with frequent term g
    Pg = {}
    N = sum(vocabulary.values())
    for x in topwords:
        Pg[x] = nw[x] / N
    return topwords, co_occur, Pg, nw
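# Continuing the example above: with 5 vocabulary entries, n defaults to
# int(0.33 * 5) == 1 and topwords == ['cat']; each sentence contains 'cat'
# and holds 3 counted terms, so nw['cat'] == 6 and Pg['cat'] == 6/6 == 1.0.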
def get_main_values(vocabulary, topwords, co_occur, Pg, nw):
    """ Calculates the chi-square-style scores used to rank keywords """
    # For each word w: score(w) = sum over frequent terms g of
    # (freq(w, g) - nw[w] * Pg[g])**2 / (nw[w] * Pg[g]),
    # weighted by the relative frequency of w in the document
    result = {}
    N = sum(vocabulary.values())
    for word in vocabulary:
        result[word] = 0
        for x in Pg:
            expected_cooccur = nw[word] * Pg[x]
            result[word] += ((co_occur[word][topwords.index(x)] -
                              expected_cooccur) ** 2 / expected_cooccur)
        result[word] *= vocabulary[word] / N
    return result
def top_keywords(filename, n=5):
    """ Returns the top n keywords (unigrams and bigrams) from a document """
    vocabulary, sentence_words = get_word_sets(filename)
    topwords, co_occur, Pg, nw = get_param_matrices(vocabulary, sentence_words)
    result = get_main_values(vocabulary, topwords, co_occur, Pg, nw)
    if n > len(vocabulary):
        raise ValueError("Value of n cannot exceed vocabulary size: " +
                         str(len(vocabulary)))
    words = sorted(result, key=result.get, reverse=True)
    count = 0
    i = 0
    wordset = set()
    topn = []
    # Walk down the ranked list; prefer a bigram over any unigram it contains.
    # The i < len(words) guard stops cleanly if the list runs out early.
    while count < n and i < len(words):
        if ' ' in words[i]:
            # Bigram: drop a previously kept unigram component, if any
            for x in words[i].split():
                if x in wordset:
                    if x in topn:
                        topn.remove(x)
                        count -= 1
                else:
                    wordset.add(x)
            topn.append(words[i])
            count += 1
        else:
            if words[i] not in wordset:
                topn.append(words[i])
                wordset.add(words[i])
                count += 1
        i += 1
    return topn
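# Usage sketch (illustrative): 'sample.txt' is a hypothetical plain-text
# file in the working directory; substitute any document you want to mine.
if __name__ == '__main__':
    for keyword in top_keywords('sample.txt', n=5):
        print(keyword)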