-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
256 lines (224 loc) · 7.61 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
"""
Support functions for app.py
"""
import sys
sys.path.append("C:/")
# if owlready2 installed in C:/, avoids traceback on windows machines
import json
from io import StringIO
# import owlready2
from owlready2 import *
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from PyPDF2 import PdfFileReader
import spacy
import en_core_web_sm
import boto3
# from boto3.dynamodb.types import TypeSerializer, TypeDeserializer
# Access to AWS DynamoDB cluster __________________
# Client for the hosted DynamoDB on AWS (region eu-west-3).
client = boto3.client("dynamodb", region_name="eu-west-3")
# Alternative client for a local DynamoDB endpoint:
# client = boto3.client("dynamodb", endpoint_url="http://localhost:8000")
# Name of the table passed to every DynamoDB call in this module.
dynamoTableName = "arxivTable"
# __________________________________________________
def scan_recursive(tableName, **kwargs):
    """
    Scan a DynamoDB table page by page and return the combined result.

    A single scan response is size-limited (1 MB), so the scan is repeated
    with ``ExclusiveStartKey`` for as long as the response carries a
    ``LastEvaluatedKey``.

    Args:
        tableName: name of the DynamoDB table to scan.
        **kwargs: extra keyword arguments forwarded to ``client.scan``
            (for example ``Select="COUNT"``).

    Returns:
        With ``Select="COUNT"``: the item count summed over all pages.
        Otherwise: the list of items gathered from all pages.
    """
    count_only = kwargs.get("Select") == "COUNT"
    response = client.scan(TableName=tableName, **kwargs)
    total = response.get("Count", 0)
    items = response.get("Items", [])
    while "LastEvaluatedKey" in response:
        # Resume where the previous page stopped. kwargs is a per-call dict,
        # so storing the key here is safe and also replaces any start key
        # the caller supplied for the first page.
        kwargs["ExclusiveStartKey"] = response["LastEvaluatedKey"]
        response = client.scan(TableName=tableName, **kwargs)
        total += response.get("Count", 0)
        items.extend(response.get("Items", []))
    return total if count_only else items
def AWS_db_persisted_files():
    """
    Return the ``_id`` of every item currently stored in the DB, as a list.
    """
    return [record["_id"]["S"] for record in scan_recursive(dynamoTableName)]
def AWS_db_check():
    """
    Returns number of items in DB or -1 if the count could not be obtained
    (for example when it is impossible to connect).
    """
    try:
        return scan_recursive(dynamoTableName, Select="COUNT")
    except Exception:
        # Deliberate best-effort probe: any failure is reported as -1.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being turned into -1.
        return -1
def create_onto():
    """
    Build the ontology schema, populate it from DynamoDB and save it to disk.

    The schema declares Person and Document classes, the has_title, makes and
    isReferredIn properties, and the Author and Reference classes defined
    through those properties. Every item returned by scanning the DynamoDB
    table becomes one Document, with one Person per author and per reference.

    Returns:
        The owlready2 ``default_world`` after it has been saved to
        "./ontologies/world.owl".
    """
    onto = get_ontology("http://fil_rouge/onto.owl/")  # ontology identified by this IRI
    with onto:
        class Person(Thing):
            # Person is a direct subclass of Thing
            pass
        class Document(Thing):
            # a Document is declared as not being a Person
            is_a = [Not(onto.Person)]
            pass
        class has_title(Document >> str):
            # data property: Document -> str
            pass
        AllDisjoint([Person, Document])
        # object property: a Person can make a Document
        class makes(Person >> Document):
            pass
        class Author(Person):
            # equivalent to: a Person who makes some Document
            equivalent_to = [And([Person, makes.some(Document)])]
        class isReferredIn(Person >> Document):
            # object property: a Person can be referred in a Document
            pass
        class Reference(Person):
            # equivalent to: a Person referred in some Document
            equivalent_to = [And([Person, isReferredIn.some(Document)])]
    # Populate the ontology from the items stored in AWS DynamoDB.
    instances = scan_recursive(dynamoTableName)
    for instance in instances:
        # One Document individual per DynamoDB item, named after its "_id".
        with onto:
            doc_name = str(instance["_id"]["S"])
            new_doc = Document(name=doc_name)
            new_doc.has_title.append(instance["title"]["S"])
            for author in instance["Authors"]["SS"]:
                # "%20" (URL-encoded space) is turned back into a space.
                # NOTE(review): the name is passed as a one-element list, not
                # a str; an earlier variant used a str with underscores.
                # Confirm which form downstream consumers expect.
                author_name = [str(author).replace("%20", " ")]
                new_author = Person(name=author_name)
                new_author.makes.append(new_doc)
            for reference in instance["References"]["SS"]:
                # Same name handling as for authors (see NOTE(review) above).
                ref_name = [str(reference).replace("%20", " ")]
                new_reference = Person(name=ref_name)
                new_reference.isReferredIn.append(new_doc)
    default_world.save(
        "./ontologies/world.owl"
    )  # write the populated world to the ontology file
    return default_world
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* with pdfminer and return it as a str."""
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    with open(path, "rb") as pdf_file:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # Empty page set and maxpages=0: presumably "no page restriction"
        # in pdfminer -- confirm against its documentation.
        pages = PDFPage.get_pages(
            pdf_file,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in pages:
            page_interpreter.process_page(page)
        extracted = text_buffer.getvalue()
    converter.close()
    text_buffer.close()
    return extracted
def arxiv_db_check(arxiv_id):
    """Return True when *arxiv_id* has no item in the DB, False when one exists."""
    response = client.get_item(
        TableName=dynamoTableName, Key={"_id": {"S": arxiv_id}}
    )
    return not response.get("Item")
# Substrings that disqualify a detected PERSON entity from being kept as a
# reference name (digits, "arxiv", punctuation, line breaks, stray glyphs).
_REFERENCE_NAME_EXCEPTIONS = (
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    "10",
    "arxiv",
    ",",
    ":",
    "/",
    "]",
    "[",
    "\n",
    ">",
    "<",
    "ˇ",
)


def _extract_reference_section(content):
    """Return *content* from the first "REFERENCES" (else "References") onward, or all of it."""
    for marker in ("REFERENCES", "References"):
        position = content.find(marker)
        if position != -1:
            return content[position:]
    return content


def _filter_reference_names(names):
    """Drop duplicates, names containing an excluded substring, and names without a space."""
    unique_names = dict.fromkeys(names)  # keeps first-seen order
    return [
        name
        for name in unique_names
        if " " in name
        and not any(excluded in name for excluded in _REFERENCE_NAME_EXCEPTIONS)
    ]


def process_arxiv_file(path, arxiv_id, title, authors_lst, post_trigger):
    """
    Extract metadata and referenced person names from an arXiv PDF.

    The PDF text is read with convert_pdf_to_txt; only the part starting at
    the first occurrence of "REFERENCES" (or, failing that, "References") is
    analysed, and the whole text is used when neither word is found. spaCy
    PERSON entities from that text are de-duplicated and filtered to build
    the reference list.

    Args:
        path: path of the PDF file on disk.
        arxiv_id: identifier stored as the item's "_id".
        title: title of the document.
        authors_lst: JSON-encoded list of author names; it is decoded with
            json.loads only when the item is posted.
        post_trigger: when equal to 1, the item is written to DynamoDB.

    Returns:
        A tuple (arxiv_id, title, authors_lst, producer, subject, page count,
        filtered reference names). producer and subject are None when the
        PDF carries no document information.
    """
    content_ref = _extract_reference_section(convert_pdf_to_txt(path))

    # Named-entity recognition: keep the entities labelled PERSON.
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(content_ref)
    person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    filter_data = _filter_reference_names(person_names)

    with open(path, "rb") as file:
        pdf = PdfFileReader(file)
        doc_info = pdf.getDocumentInfo()
        # getDocumentInfo() may return None for a PDF without an info
        # dictionary; fall back to None fields instead of raising
        # AttributeError.
        producer = doc_info.producer if doc_info is not None else None
        subject = doc_info.subject if doc_info is not None else None
        num_pages = pdf.getNumPages()

    params = (arxiv_id, title, authors_lst, producer, subject, num_pages, filter_data)

    if post_trigger == 1:
        # NOTE(review): DynamoDB string sets ("SS") are expected to be
        # non-empty; an empty filter_data would likely make put_item fail.
        # Confirm the desired handling before changing the stored schema.
        post = {
            "_id": {"S": arxiv_id},
            "title": {"S": title},
            "Authors": {"SS": json.loads(authors_lst)},
            "References": {"SS": filter_data},
        }
        client.put_item(TableName=dynamoTableName, Item=post)
    return params