starter-code.py
from html2text import html2text
from nlp_preproc_spark import nlp_preproc
from elasticsearch import search
from sparql import query_abstract
from sklearn.feature_extraction.text import TfidfVectorizer
import math
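
# Starter code for entity linking: read a WARC archive, extract the text of
# each record, run NLP preprocessing to find mentions, retrieve Freebase
# candidates from Elasticsearch, and rank them by Elasticsearch score plus
# the TF-IDF cosine similarity between the page text and each candidate's
# abstract (fetched over SPARQL).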
KEYNAME = "WARC-TREC-ID"
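
# Extract the WARC-TREC-ID document key from a record's header lines.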
def find_key(payload):
    for line in payload.splitlines():
        if line.startswith(KEYNAME):
            return line.split(': ')[1]
    return ''
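
# Generator splitting the input stream into individual WARC records, using
# the "WARC/1.0" marker as delimiter.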
def split_records(stream):
    payload = ''
    for line in stream:
        if line.strip() == "WARC/1.0":
            yield payload
            payload = ''
        else:
            payload += line
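
# A single shared vectorizer, refitted on each pair of texts, so scores are
# not calibrated across calls.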
vectorizer = TfidfVectorizer()

def cosine_sim(text1, text2):
    # TfidfVectorizer L2-normalises its rows by default, so the dot
    # product of the two TF-IDF vectors is their cosine similarity.
    tfidf = vectorizer.fit_transform([text1, text2])
    return (tfidf * tfidf.T).toarray()[0, 1]
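
# Crude language filter: a string counts as English only if it is pure
# ASCII (anything else makes the ascii decode raise UnicodeDecodeError).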
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

# {token: [candidate Freebase entity IDs]}
entity_dict = {}

def search_candidate(token):
    # Return cached candidates when available; otherwise query Elasticsearch
    # and cache the result. Returns (entity_id, labels) pairs where labels
    # carries 'freebase_label' and 'freebase_score'.
    if token in entity_dict and isEnglish(token):
        entities = entity_dict[token]
    else:
        entities = search(ELASTICSEARCH, token).items()
        entity_dict[token] = entities
    return entities

# {FreebaseID: abstract}
abstract_dict = {}

def query_candidate_abstract(entity):
    # Return the cached abstract when available; otherwise fetch it over
    # SPARQL and cache the result.
    if entity in abstract_dict:
        abstract = abstract_dict[entity]
    else:
        abstract = query_abstract(SPARQL, entity)
        abstract_dict[entity] = abstract
    return abstract
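
# Main pipeline: split the archive into records, keep those with a key,
# extract readable text, and link every recognised mention.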
if __name__ == '__main__':
    import sys
    try:
        _, INPUT, ELASTICSEARCH, SPARQL = sys.argv
    except Exception:
        print('Usage: python starter-code.py INPUT ELASTICSEARCH SPARQL')
        sys.exit(0)

    with open(INPUT, errors='ignore') as fo:
        for record in split_records(fo):
            key = find_key(record)
            if key != '':
                text = html2text(record)
                tagged_tokens = nlp_preproc(text)

                # Deduplicate tagged tokens before candidate lookup.
                mentions_types = []
                for token in tagged_tokens:
                    if token not in mentions_types:
                        mentions_types.append(token)

                # Link each distinct mention to its best-scoring candidate.
                entity_result_dict = {}
                for token in mentions_types:
                    tfidf_score_max = 0
                    entity_score_max = ""
                    entities = search_candidate(token[0])
                    for entity, labels in entities:
                        # An exact label match wins outright.
                        if labels['freebase_label'] == token[0]:
                            tfidf_score_max = math.inf
                            entity_score_max = entity
                            break
                        abstract = query_candidate_abstract(entity)
                        score = labels['freebase_score']
                        if abstract is not None:
                            score += cosine_sim(text, abstract)
                        # Keep the entity with the highest combined score.
                        if score > tfidf_score_max:
                            tfidf_score_max = score
                            entity_score_max = entity
                    if tfidf_score_max != 0:
                        entity_result_dict[token[0]] = entity_score_max

                # Emit one TSV line per linked mention: key, mention, entity.
                for token in tagged_tokens:
                    if token[0] in entity_result_dict:
                        print(key + '\t' + token[0] + '\t' + entity_result_dict[token[0]])
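
# Example invocation (the endpoints below are illustrative placeholders,
# not values prescribed by the assignment):
#   python starter-code.py sample.warc localhost:9200 http://localhost:8890/sparql
#
# Expected output is one tab-separated line per linked mention, e.g.
# (hypothetical document key and entity ID):
#   clueweb12-0000tw-00-00010    Obama    /m/02mjmr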