# Utility.py
#encoding=utf-8
# Description:
# This file contains utility functions and dictionaries which are used throughout the application.
import math
import time
from cPickle import Pickler
from cPickle import Unpickler
from model.Source import Source
from model.Category import Category
training_file_path='/Users/igorpetrovski/Desktop/Fakultet/7 semestar/NLP/dataset-hw.txt'
categories_file_path='/Users/igorpetrovski/Desktop/NewsReaderServer/categories.txt'
def isInStopWords(word):
    return word in stop_words
# source boundaries:
# specific feeds put constant tokens at the beginning/end of a document,
# so those tokens are ignored (deleted)
# set of stop words which will be ignored when processing a text
stop_words = set(['сте', 'ве', 'ви', 'вие', 'вас', 'но', 'го', 'а', 'е', 'на', 'во', 'и', 'ни', 'ние', 'или',
                  'та', 'ма', 'ти', 'се', 'за', 'од', 'да', 'со', 'ќе', 'дека', 'што', 'не', 'ги', 'ја', 'јас',
                  'тие', 'тоа', 'таа', 'тој', 'мк', 'отсто', 'гр', 'мл', 'тв', 'ул', 'врз', 'сите', 'иако', 'друг', 'друга',
                  'при', 'цел', 'меѓу', 'околу', 'нив', 'кои', 'кога', 'поради', 'има', 'без', 'биле', 'она', 'кое', 'кај',
                  'овој', 'него', 'некои', 'оваа', 'веќе', 'оние', 'уште', 'може', 'меѓутоа',
                  'the', 'of', 'in', 'and', 'is', 'a', 'to', 'are', 'for', 'that', 'all', 'an', 'i', 'on',
                  'fg', '3p', 'ft', 'reb', 'pos', 'pts', 'min', 'eff', 'bs', 'pf', 'am', 'pm', '3rd', '4th', '5th', '6th', 'nbsp',
                  'kurir', 'црнобело', 'kanal5'])
# categories
categories = [
    Category(0,'MAKEDONIJA', title='Македонија', imgUrl='https://doc-14-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/7kju035bftsor3bdaoffdp7dhs14032c/1466272800000/05927223757565951184/*/0B_Ady6W8AROOR3ZSeWZFYjVRYjg'),
    Category(1,'SVET', title='Свет', imgUrl='https://doc-14-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/fu3mr23e95fjtt9s73sr9s2fsapdu9dk/1466272800000/05927223757565951184/*/0B_Ady6W8AROONVBvSkNTMjF6RXM'),
    Category(2,'EKONOMIJA', title='Економија', imgUrl='https://doc-08-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/au19653q6edu19m7cq1dlfnjt7mfg12a/1466272800000/05927223757565951184/*/0B_Ady6W8AROOYlVGWDFTaFJFeU0'),
    Category(3,'SCENA', title='Сцена', imgUrl='https://doc-04-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6poc3gjlaiiif51gvtekk172n3hpp0k8/1466272800000/05927223757565951184/*/0B_Ady6W8AROOaEl6V3JrNU12aGc'),
    Category(4,'ZIVOT', title='Живот', imgUrl='https://doc-0c-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/8169c509hpbd2ogqaorajsg69cle9n8l/1466272800000/05927223757565951184/*/0B_Ady6W8AROORE1ZN3hrV2VfV1E'),
    Category(5,'KULTURA', title='Култура', imgUrl='https://doc-04-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/129j7mcjdbgjim3n5ue6beggtcfhebfo/1466272800000/05927223757565951184/*/0B_Ady6W8AROOVkFkRE83WUI4UXM'),
    Category(6,'ZDRAVJE', title='Здравје', imgUrl='https://doc-00-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/5rqorhjc11k0uc93ap9b2e2dckfipt6i/1466272800000/05927223757565951184/*/0B_Ady6W8AROOWUVVSUQzQ1ZoVzQ'),
    Category(7,'TEHNOLOGIJA', title='Технологија', imgUrl='https://doc-0g-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/um6go4s5migkjfij5nm1f9dvom0b28bp/1466272800000/05927223757565951184/*/0B_Ady6W8AROOSnM5TFRXZlBscE0'),
    Category(8,'FUDBAL', title='Фудбал', imgUrl='https://doc-0c-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/hrfm918d5v2238nkff5a7csg4qictedk/1466272800000/05927223757565951184/*/0B_Ady6W8AROOZXpZUG9HZFNDLVk'),
    Category(9,'KOSARKA', title='Кошарка', imgUrl='https://doc-10-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/k32nuatls1vg3cvj0seash7bhuciagik/1466272800000/05927223757565951184/*/0B_Ady6W8AROOWWxtTm5ocFUwYnM'),
    Category(10,'RAKOMET', title='Ракомет', imgUrl='https://doc-0k-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/qunsh2q0aqkfs64aluf7c8tg05erkvcf/1466272800000/05927223757565951184/*/0B_Ady6W8AROOdWlIRDFVNE4zV2s'),
    Category(11,'TENIS', title='Тенис', imgUrl='https://doc-14-5o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/t18fk42kf55eiru3o5p4daq3119m1isc/1466272800000/05927223757565951184/*/0B_Ady6W8AROOMWppU09pRndEbUE')
]
# sources we collect data from
sources = [
    Source(0, 'http://vecer.mk', ['http://vecer.mk/rss.xml']),  # opposition-leaning
    Source(1, 'http://www.crnobelo.com', ['http://www.crnobelo.com/?format=feed&type=rss']),
    Source(2, 'http://sitel.mk', ['http://sitel.mk/rss.xml']),  # opposition-leaning
    Source(3, 'http://kurir.mk', ['http://kurir.mk/feed/']),  # opposition-leaning
    Source(4, 'http://republika.mk', ['http://republika.mk/?feed=rss2']),  # opposition-leaning
    #Source(5, 'http://plusinfo.mk', ['http://plusinfo.mk/rss/biznis', 'http://plusinfo.mk/rss/skopje', 'http://plusinfo.mk/rss/kultura',
    #        'http://plusinfo.mk/rss/zdravje', 'http://plusinfo.mk/rss/svet', 'http://plusinfo.mk/rss/scena']),
    Source(6, 'http://www.vest.mk', ['http://www.vest.mk/rssGenerator/']),  # opposition-leaning
    Source(7, 'http://alsat.mk', ['http://alsat.mk/RssFeed']),
    Source(8, 'http://www.mkd.mk/', ['http://www.mkd.mk/feed']),
    Source(9, 'http://www.sport.com.mk', ['http://www.sport.com.mk/rssgenerator/rss.aspx']),
    Source(10, 'http://www.dnevnik.mk', ['http://www.dnevnik.mk/rssGenerator/']),  # opposition-leaning
    Source(11, 'http://interesno.com.mk', ['http://interesno.com.mk/index.php?format=feed&type=rss']),
    Source(12, 'http://www.fakulteti.mk/', ['http://www.fakulteti.mk/rss/rss.ashx']),
    Source(13, 'http://novatv.mk/', ['http://novatv.mk/feed']),  # the feed does not load
    Source(14, 'http://kajgana.com/', ['http://kajgana.com/rss.xml']),
    Source(15, 'http://derbi.mk/', ['http://derbi.mk/feed/']),
    Source(16, 'http://www.libertas.mk/', ['http://www.libertas.mk/feed/']),
    Source(17, 'http://mkd-news.com/', ['http://mkd-news.com/feed/']),
    Source(18, 'http://www.brif.mk/', ['http://www.brif.mk/feed/']),
    Source(19, 'http://off.net.mk/', ['http://feeds.feedburner.com/offnetmk']),
    Source(20, 'http://it.mk/', ['http://it.mk/feed/']),
    Source(21, 'http://www.smartportal.mk/', ['http://www.smartportal.mk/feed/']),
    Source(22, 'http://popularno.mk/', ['http://www.popularno.mk/feed/']),
    Source(23, 'http://www.femina.mk/', ['http://www.femina.mk/rss']),
    Source(24, 'http://doktori.mk/', ['http://feeds.feedburner.com/doktorimk']),
    Source(25, 'http://tocka.com.mk/', ['http://tocka.com.mk/rss.php']),
    Source(26, 'http://koli.com.mk/', ['http://koli.com.mk/rss.ashx']),
    Source(27, 'http://a1on.mk/wordpress/', ['http://a1on.mk/wordpress/feed']),
    Source(28, 'http://reporter.mk/', ['http://reporter.mk/feed/']),
    Source(29, 'http://lokalno.mk/', ['http://lokalno.mk/feed/']),
    Source(30, 'http://automedia.mk/', ['http://automedia.mk/?feed=rss2']),
    Source(31, 'http://netpress.com.mk/', ['http://netpress.com.mk/feed/']),  # opposition-leaning
    Source(32, 'http://evesti.mk/', ['http://feeds.feedburner.com/evesti']),
    Source(33, 'http://www.novamakedonija.com.mk/', ['http://www.novamakedonija.com.mk/rssAll.asp']),
    Source(34, 'http://makfax.com.mk/', ['http://makfax.com.mk/_feeds/rss2/news']),
    Source(35, 'http://ekonomski.mk/', ['http://ekonomski.mk/feed/']),
    Source(36, 'http://www.spektra.com.mk/', ['http://www.spektra.com.mk/rss']),
    Source(37, 'http://bi.mk/', ['http://bi.mk/feed/']),
    Source(38, 'http://faktor.mk/', ['http://faktor.mk/feed/']),
    Source(39, 'http://aktuelno24.mk/', ['http://feeds.feedburner.com/aktuelno24/ywvm']),  # lists images poorly
    Source(40, 'http://pozitiv.mk/', ['http://pozitiv.mk/feed/']),
    Source(41, 'http://www.idividi.com.mk/', ['http://www.idividi.com.mk/rss.aspx']),
    Source(42, 'http://zase.mk/', ['http://zase.mk/feed/']),  # the feed does not work when fetching from localhost
    # ----------- VERIFICATION -----------------
    Source(43, 'http://www.avtomagazin.com.mk/', ['http://www.avtomagazin.com.mk/avto-magazin.rss']),  # VERIFIED, the text is extracted correctly
    Source(44, 'http://meta.mk/', ['http://meta.mk/feed/']),  # VERIFIED, the text is extracted correctly
    Source(45, 'http://slobodna.mk/', ['http://slobodna.mk/feed/']),  # VERIFIED, the text is extracted correctly (sometimes metadata is listed as well)
    Source(47, 'http://www.utrinski.mk/', ['http://www.utrinski.mk/rssgenerator/rss.aspx']),  # VERIFIED, the text is extracted correctly (only the title appears twice)
    Source(48, 'http://kapital.mk/', ['http://kapital.mk/feed/']),  # BAD FEED
    Source(49, 'http://www.akademik.mk/', ['http://www.akademik.mk/feed/']),  # VERIFIED, the text is extracted correctly
    Source(50, 'http://mk.rbth.com/', ['http://mk.rbth.com/xml/index.xml'], 'Руска реч'),  # VERIFIED, the text is extracted correctly
    Source(51, 'http://telma.com.mk/', ['http://telma.com.mk/rss.xml'], 'Телма'),  # VERIFIED, the text is extracted correctly
    Source(52, 'http://kanal77.mk/', ['http://kanal77.mk/feed/']),  # VERIFIED, the text is extracted correctly
    Source(53, 'http://24vesti.mk/', ['http://24vesti.mk/rss.xml']),  # VERIFIED, the text is extracted correctly
    Source(54, 'http://marketing365.mk/', ['http://marketing365.mk/feed/']),  # VERIFIED, the text is extracted correctly
    Source(55, 'http://maxim.mk/', ['http://maxim.mk/rss.xml']),  # VERIFIED, the text is extracted correctly
    # VERIFIED, but uses different tags, e.g. entry instead of item, etc.
    # if more such cases come up, think about how to cover them (again, possible with dictionaries)
    Source(56, 'http://prizma.birn.eu.com/', ['http://mix.chimpfeedr.com/5ee9b-Prizma'], 'Призма')
]
# the dictionary that tells us which paragraphs we need to ignore for a given source
fetch_text_specifications = {
    'http://vecer.mk/rss.xml': [0, -3],
    'http://www.crnobelo.com/?format=feed&type=rss': [0, -2],
    'http://novatv.mk/rss.xml?tip=2': [2, -2],
    'http://novatv.mk/rss.xml?tip=5': [2, -2],
    'http://novatv.mk/rss.xml?tip=7': [2, -2],
    'http://novatv.mk/rss.xml?tip=23': [2, -2],
    'http://www.vest.mk/rssGenerator/': [0, -7],
    'http://alsat.mk/RssFeed': [1, -3],
    'http://www.mkd.mk/feed': {'tag_type': 'div', 'attribute_type': 'id', 'attribute_value': 'main-wrapper',
                               'nested_tag_type': 'div', 'nested_attribute_type': 'class', 'nested_attribute_value': 'field-body',
                               },
    'http://www.sport.com.mk/rssgenerator/rss.aspx': [1],
    'http://reporter.mk/feed/': [3, -10],
    'http://tocka.com.mk/rss.php': {'tag_type': 'div', 'attribute_type': 'id', 'attribute_value': 'sodrzina_vest'},
    'http://www.femina.mk/rss': [0, -7],
    'http://feeds.feedburner.com/offnetmk': {'tag_type': 'div', 'attribute_type': 'class',
                                             'attribute_value': ['sodrzhina-full', 'voved']},
    'http://lokalno.mk/feed/': [0, -9],
    'http://it.mk/feed/': [2, -28],
    'http://netpress.com.mk/feed/': [0, -1],
    'http://feeds.feedburner.com/doktorimk': [1, -1],
    'http://feeds.feedburner.com/evesti': [0, -10],
    'http://makfax.com.mk/_feeds/rss2/news': [2, -5],
    'http://ekonomski.mk/feed/': [0, -4],
    'http://www.spektra.com.mk/rss': [0, -12],
    'http://bi.mk/feed/': [0, -23],  # check for updates!!!
    'http://faktor.mk/feed/': [0, -20],
    'http://pozitiv.mk/feed/': [0, -2],
    'http://www.motika.com.mk/feed/': [4],
    'http://www.idividi.com.mk/rss.aspx': [0, -13],
    'http://kurir.mk/feed/': [0, -5],
    'http://slobodna.mk/feed/': {'tag_type': 'div', 'attribute_type': 'class', 'attribute_value': 'entry clearfix'},
    'http://meta.mk/feed/': [1, -14],
    'http://maxim.mk/rss.xml': [0, -6],
    'http://marketing365.mk/feed/': [0, -21],
    'http://www.utrinski.mk/rssgenerator/rss.aspx': [3, -6],
    'http://telma.com.mk/rss.xml': [0, -1],
    'http://kanal77.mk/feed/': {'tag_type': 'div', 'attribute_type': 'class', 'attribute_value': 'entry', 'nested_tag_type': 'p',
                                'nested_attribute_type': 'class', 'nested_attribute_value': None},
    'http://www.akademik.mk/feed/': [0, -1],
    'http://mk.rbth.com/xml/index.xml': [0, -2],
    'http://www.avtomagazin.com.mk/avto-magazin.rss': {'tag_type': 'section', 'attribute_type': 'class', 'attribute_value': 'link-arrow'},
    'http://24vesti.mk/rss.xml': {'tag_type': 'div', 'attribute_type': 'class', 'attribute_value': 'main-content',
                                  'nested_tag_type': 'div', 'nested_attribute_type': 'class', 'nested_attribute_value': 'content',
                                  'limit': 1},
    'http://republika.mk/?feed=rss2': {'tag_type': 'div', 'attribute_type': 'id', 'attribute_value': 'article_text'}
}
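# Illustrative sketch (an assumption, not part of the original module): judging by
# the values above, a list spec reads naturally as paragraph slice bounds (e.g.
# [0, -3] keeps paragraphs[0:-3], [1] keeps paragraphs[1:]), while a dict spec
# names the HTML tag/attribute to extract with an HTML parser. The hypothetical
# helper below only demonstrates that interpretation for list-style specs.
def _apply_list_spec_sketch(paragraphs, spec):
    if len(spec) == 1:
        return paragraphs[spec[0]:]
    return paragraphs[spec[0]:spec[1]]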
str_idf_dict = 'dict_idf'
path_dataset = '/Users/igorpetrovski/Desktop/Fakultet/7 semestar/NLP/dataset-hw.txt'
# function that, for a given text (string), returns the list of words contained in that text
def getWords(line, to_lower=True):
    """
    the basic function that parses a line into words
    :param line: line that needs to be parsed
    :param to_lower: whether to lowercase the line before parsing
    :return: list of words
    """
    if to_lower:
        line = line.lower()
    out = []
    word = []
    for c in line:
        if c.isalpha():
            word.append(c)
        else:
            if word:
                newWord = ''.join(word)
                if newWord not in stop_words:
                    # truncate long words to their first six characters (crude stemming)
                    if len(newWord) > 6:
                        newWord = newWord[0:6]
                    out.append(newWord)
                word = []
    # flush the last word if the line ends with an alphabetic character
    if word:
        newWord = ''.join(word)
        if newWord not in stop_words:
            if len(newWord) > 6:
                newWord = newWord[0:6]
            out.append(newWord)
    return out
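# Illustrative example (not part of the original module): stop words such as
# 'is', 'in' and 'the' are dropped, and tokens longer than six characters are
# cut down to their first six, which acts as a crude stemmer:
#   getWords(u'Македонија is in the Balkans')  ->  [u'македо', u'balkan']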
def get_idf_dictionary():
    file_to_read = open(str_idf_dict, 'r')
    idf_dict = Unpickler(file_to_read).load()
    file_to_read.close()
    return idf_dict
def calculate_idf_dictionary(file_path):
    """
    function for calculating the idf dictionary
    only one execution needed
    :param file_path: the file we build the dictionary from
    :return: the built dictionary
    """
    # idf for a word: log(number of total documents / number of documents the word appears in)
    begin = time.time()
    num_documents = 0
    idf_dict = {}
    for line in open(file_path):
        parts = line.decode('utf-8').strip().split('\t')
        category = parts[0]  # unimportant for now..
        text = parts[1]
        words = set(getWords(text))
        for word in words:
            idf_dict[word] = 1 + idf_dict.get(word, 0)
        num_documents += 1
        if num_documents % 10000 == 0:
            print 'processed %d documents' % num_documents
    for word in idf_dict:
        idf_dict[word] = math.log(num_documents / (idf_dict[word] * 1.0))
    print 'number of words: ', len(idf_dict)
    print 'number of documents: ', num_documents
    print 'Done in %d seconds.' % (time.time() - begin)
    file_to_write = open(str_idf_dict, 'w')
    Pickler(file_to_write).dump(idf_dict)
    file_to_write.close()
    return idf_dict
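# Worked example (illustrative): with 1000 documents in total and a word that
# appears in 10 of them, its idf is log(1000 / 10.0) ~ 4.61 (natural log).
# The pickle written under str_idf_dict is what get_idf_dictionary() loads back.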
def create_files_for_categories(file_path_input, file_path_store):
    """
    function for storing the categories in a file
    the i-th line will contain the i-th category
    from this file we can build the dictionary category -> id_category and the reversed dictionary id_category -> category
    :param file_path_input: the file we read the categories from
    :param file_path_store: where we need to store the categories
    :return: nothing
    """
    categories = []
    for line in open(file_path_input):
        category = line.decode(encoding='utf-8').strip().split('\t')[0]
        categories.append(category)
    categories = set(categories)
    file_to_write = open(file_path_store, 'w')
    for category in categories:
        file_to_write.write(('%s\n' % category).encode('utf-8'))
    file_to_write.close()
def load_categories():
    """
    Loading the categories that we need for text categorization
    :return: the cat_to_ind and ind_to_cat dictionaries
    """
    cat_to_ind = {}
    ind_to_cat = []
    ind_line = 0
    for line in open(categories_file_path):
        category = line.decode(encoding='utf-8').strip()
        cat_to_ind[category] = ind_line
        # store the cleaned category name, not the raw line with its trailing newline
        ind_to_cat.append(category)
        ind_line += 1
    return cat_to_ind, ind_to_cat
def load_vocabulary(num_words=50000):
    print 'loading vocabulary..'
    vocabulary = {}
    ind = 0
    for line in open('vocabulary%d.txt' % num_words):
        line = line.decode('utf-8').strip()
        if line:
            vocabulary[line] = ind
            ind += 1
    print 'vocabulary size: %d' % len(vocabulary)
    return vocabulary
def create_vocabulary(file_path_source, file_path_dest, num_words=50000):
    """
    creates a vocabulary of the most common words
    :param file_path_source: source we should read the sentences from
    :param file_path_dest: where we should store the list
    :param num_words: how many words to take into consideration after sorting
    :return: the list of 'num_words' most common words
    """
    word_count = {}
    t0 = time.time()
    for line in open(file_path_source):
        text = line.decode(encoding='utf-8').strip().split('\t')[1]
        words = getWords(text)
        for w in words:
            word_count[w] = 1 + word_count.get(w, 0)
    print 'Processed the source in %d seconds.' % (time.time() - t0)
    list_words = sorted([(w, word_count[w]) for w in word_count], key=lambda x: -x[1])
    num_words = min(len(list_words), num_words)
    list_words = list_words[:num_words]
    t0 = time.time()
    file_to_write = open('%s%d.txt' % (file_path_dest, num_words), 'w')
    for word in list_words:
        file_to_write.write((word[0] + '\n').encode('utf-8'))
    file_to_write.close()
    print 'Wrote the words to a file in %d seconds' % (time.time() - t0)
    return list_words
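# Illustrative pairing (the destination prefix is an assumption): calling
# create_vocabulary(path_dataset, 'vocabulary') writes the top words to
# 'vocabulary50000.txt', which load_vocabulary(50000) then reads back into
# a word -> index dictionary.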
def transform_single_text_tf_idf(text_input):
    words = getWords(text_input, to_lower=True)
    word_count = {}
    for w in words:
        word_count[w] = 1 + word_count.get(w, 0)
    dict_tf_idf = {}
    idf = get_idf_dictionary()
    # we go through the words of the input text one by one
    for word in word_count:
        # we calculate the term frequency for each word
        tf = (word_count[word] * 1.0)
        # we read the inverse document frequency if it exists
        val_idf = idf.get(word, None)
        if val_idf is not None:
            if tf > 0:
                tf = 1 + math.log10(tf)
            else:
                tf = 0
            dict_tf_idf[word] = val_idf * tf
        else:
            dict_tf_idf[word] = 0
    return dict_tf_idf
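# Worked example (illustrative): a word occurring 3 times in the input whose
# stored idf is 2.0 gets weight (1 + log10(3)) * 2.0 ~ 2.95; a word missing
# from the idf dictionary gets weight 0.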
def category_count(source_file, write_to_file=False):
    counts = {}
    for line in open(source_file):
        cat = line.strip().split('\t')[0]
        counts[cat] = 1 + counts.get(cat, 0)
    categories = sorted([(category, counts[category]) for category in counts], key=lambda x: -x[1])
    if not write_to_file:
        return
    file_to_write = open('categories_count', 'w')
    for pair in categories:
        file_to_write.write('%s\t%d\n' % (pair[0], pair[1]))
    file_to_write.close()
#category_count(source_file=path_dataset, write_to_file=True)