elmo_features.py
from allennlp.commands.elmo import ElmoEmbedder
from allennlp.data.tokenizers import word_tokenizer
import numpy as np
import sys
import pyprind

# A single module-level embedder so the (large) ELMo weights are loaded only once.
elmo = ElmoEmbedder()


def get_elmo_embs(data, mean=True, print_length=False):
    """Embed each text in `data` with ELMo and return one vector (or matrix) per text."""
    n = len(data)
    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data)
    idx = []
    bar = pyprind.ProgBar(n, stream=sys.stdout)
    for i in range(len(tokens)):
        # Keep the character offsets and reduce the Token objects to plain strings.
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]
        bar.update()

    bar = pyprind.ProgBar(n, stream=sys.stdout)
    vectors = []
    for seq in tokens:
        # Embed only the first 21 tokens of each sequence; `vector` has shape
        # (3 layers, num_tokens, 1024) for the default ELMo model.
        vector = elmo.embed_sentence(seq[0:21])
        if mean:
            # Mean-pool the top ELMo layer over tokens -> a single 1024-dim vector.
            vectors.append(np.mean(vector[2], axis=0))
        else:
            vectors.append(vector[2])
        if print_length:
            print('Length of a sequence: {} with final emb vector shape: {}'.format(len(seq), vector.shape))
        bar.update()
    return vectors
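

# Illustrative usage (a minimal sketch, not part of the original file): with the
# default ElmoEmbedder each sentence embeds to a (3, num_tokens, 1024) array, so
# the mean-pooled output should be a list of 1024-dim vectors, one per text.
#
#     texts = ["He said that Cheryl thanked her.",
#              "Bob told Alice that he would call."]
#     embs = get_elmo_embs(texts, mean=True)
#     # len(embs) == 2; embs[0].shape == (1024,)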


def get_elmo_tokens(data):
    """Tokenize each text in `data` and return the token strings (character offsets are discarded)."""
    n = len(data)
    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data)
    idx = []
    bar = pyprind.ProgBar(n, stream=sys.stdout)
    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]
        bar.update()
    return tokens
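

# Illustrative usage (sketch): returns plain token strings per input text, e.g.
#
#     get_elmo_tokens(["Bob told Alice that he would call."])
#     # -> [['Bob', 'told', 'Alice', 'that', 'he', 'would', 'call', '.']]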


def get_elmo_features(data):
    """Build span embeddings for the A, B and Pronoun mentions of a GAP-style frame.

    `data` is expected to provide the GAP columns Text, Pronoun, A, B,
    Pronoun-offset, A-offset and B-offset (offsets are character positions in Text).
    """
    def get_nearest(slot, target):
        # Walk backwards from `target` until a character offset that starts a token is found.
        for i in range(target, -1, -1):
            if i in slot:
                return i

    tk = word_tokenizer.WordTokenizer()
    # Tokenize the passages themselves (assumes a GAP-style `Text` column).
    tokens = tk.batch_tokenize(list(data.Text))
    idx = []
    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]

    vectors = elmo.embed_sentences(tokens)
    ans = []
    for i, vector in enumerate(vectors):
        P_l = data.iloc[i].Pronoun
        A_l = data.iloc[i].A.split()
        B_l = data.iloc[i].B.split()
        P_offset = data.iloc[i]['Pronoun-offset']
        A_offset = data.iloc[i]['A-offset']
        B_offset = data.iloc[i]['B-offset']
        # If an offset does not coincide with a token start (e.g. it falls inside
        # a hyphenated word), fall back to the nearest preceding token start.
        if P_offset not in idx[i]:
            P_offset = get_nearest(idx[i], P_offset)
        if A_offset not in idx[i]:
            A_offset = get_nearest(idx[i], A_offset)
        if B_offset not in idx[i]:
            B_offset = get_nearest(idx[i], B_offset)
        # Average the two LSTM layers (indices 1 and 2) at the pronoun token: shape (1, 1024).
        emb_P = np.mean(vector[1:3, idx[i].index(P_offset), :], axis=0, keepdims=True)
        # Average layers 1 and 2 over every token of mention A (and B below): shape (1, 1024).
        emb_A = np.mean(vector[1:3, idx[i].index(A_offset):idx[i].index(A_offset) + len(A_l), :], axis=(1, 0),
                        keepdims=True)
        emb_A = np.squeeze(emb_A, axis=0)
        emb_B = np.mean(vector[1:3, idx[i].index(B_offset):idx[i].index(B_offset) + len(B_l), :], axis=(1, 0),
                        keepdims=True)
        emb_B = np.squeeze(emb_B, axis=0)
        # One row per example: [A | B | Pronoun] -> (1, 3 * 1024).
        ans.append(np.concatenate([emb_A, emb_B, emb_P], axis=1))
    emb = np.concatenate(ans, axis=0)
    return emb
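

# Illustrative driver (a minimal sketch, not part of the original module): it builds
# a tiny GAP-style DataFrame by hand — the `Text`, `Pronoun`, `A`, `B` and `*-offset`
# columns mirror what get_elmo_features reads, and the example row itself is an
# assumption for demonstration — then checks that one 3 * 1024-dim row comes back.
if __name__ == '__main__':
    import pandas as pd

    text = 'Bob told Alice that he would call her later.'
    demo = pd.DataFrame([{
        'Text': text,
        'Pronoun': 'he', 'Pronoun-offset': text.index(' he ') + 1,
        'A': 'Bob', 'A-offset': text.index('Bob'),
        'B': 'Alice', 'B-offset': text.index('Alice'),
    }])
    feats = get_elmo_features(demo)
    print(feats.shape)  # expected: (1, 3072) — A, B and Pronoun embeddings side by side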