-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrelation_extractor.py
52 lines (40 loc) · 1.63 KB
/
relation_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
import numpy as np
from string import punctuation
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from model.relation import Relation
from model.possible_relation import PossibleRelation
# Spanish stop-word list as returned by the NLTK stopwords corpus.
spanish_stopwords = stopwords.words('spanish')

# Symbols stripped from text: every character of string.punctuation, the
# Spanish inverted question/exclamation marks, and the digits '0' to '9'.
non_words = list(punctuation) + ['¿', '¡'] + [str(digit) for digit in range(10)]
def remove_stop_words(dirty_text):
    """Lower-case *dirty_text* and drop stop words and stand-alone symbols.

    The lower-cased text is split on whitespace; any token found in
    ``spanish_stopwords`` or in ``non_words`` is discarded.

    Returns:
        The kept tokens, each followed by a single space. A non-empty
        result therefore ends with a trailing space; the result is ``''``
        when no token is kept.
    """
    kept_tokens = [
        token
        for token in dirty_text.lower().split()
        if token not in spanish_stopwords and token not in non_words
    ]
    # Each token carries its own trailing space, matching the historical output.
    return ''.join(token + ' ' for token in kept_tokens)
def remove_punctuation(dirty_string):
    """Return *dirty_string* with every occurrence of each ``non_words`` entry removed.

    Entries are removed one after another, in the order they appear in
    ``non_words``; nothing is inserted in their place.
    """
    stripped = dirty_string
    for symbol in non_words:
        stripped = stripped.replace(symbol, '')
    return stripped
def get_possible_relations(tweet):
    """Build candidate relations between consecutive entities of a tweet.

    Every pair of adjacent items in ``tweet.nee_entities`` is located in
    ``tweet.full_text``. The text lying between the two mentions is kept
    both raw and cleaned (via ``remove_punctuation`` followed by
    ``remove_stop_words``) and wrapped in a ``PossibleRelation``.

    Args:
        tweet: object exposing ``full_text`` (presumably a ``str``; it is
            searched with ``index`` and sliced) and ``nee_entities`` (a
            sliceable sequence of objects with a ``text`` attribute, or a
            falsy value when there are none).

    Returns:
        A list of ``PossibleRelation`` objects, one per adjacent entity
        pair whose mentions were both found, in entity order. The list is
        empty when the tweet has no entities.
    """
    possible_relations = []
    entities = tweet.nee_entities
    if not entities:
        return possible_relations

    full_text = tweet.full_text
    for first, second in zip(entities, entities[1:]):
        try:
            start = full_text.index(first.text) + len(first.text)
            # Search for the second mention only after the first one ends.
            # Searching from the beginning could match an earlier (or
            # overlapping) occurrence and produce an inverted, empty span.
            end = full_text.index(second.text, start)
        except ValueError:
            # Expected case: str.index raises ValueError when a mention is
            # not present in the text. Skip this pair and keep going.
            continue

        text_in_between = full_text[start:end]
        cleaned_text = remove_stop_words(remove_punctuation(text_in_between))
        possible_relations.append(PossibleRelation(
            text_in_between.strip(), cleaned_text.strip(), first, second, tweet))
    return possible_relations