# spwsi_elmo_batch_run.py -- forked from asafamr/SymPatternWSI
import os
import logging
from time import strftime
from spwsi.spwsi import DEFAULT_PARAMS, SPWSI
import multiprocessing
import numpy as np
from tqdm import tqdm
import time
import csv
import sys
# Module-level state shared between the main process and the pool workers.

# will be set per process during worker init and persist until the end
# (worker_init stores an SPWSI instance here; worker_do calls its run()).
spwsi_runner = None
# Changing these requires a restart
# Passed to BilmElmo as batch_size when each worker builds its language model.
LM_BATCH_SIZE = 20
# Passed to BilmElmo as cutoff_elmo_vocab; the misspelt name ("VOACB") is
# kept because other code in this file refers to it.
LM_VOACB_CUTOFF = 50000
# Directory handed to SPWSI.run as debug_dir.
DEBUG_DIR = 'debug'
# Prefix for per-job run names, worker log files and the results CSV.
run_name = '' # set in main
# Queue of (worker_id, cuda_device) pairs; worker_init takes one pair from it.
cuda_device_dispatcher = None # set in main
# CUDA device indices; can be overridden by the second command line argument.
gpus = [0, 0, 1, 1, 2, 2, 3, 3] # multiple workers per gpu
# generate configurations to run
def get_configs_ablations():
    """
    generates 30 of each ablation scenario

    Each round yields six override dicts, in this order: no overrides,
    symmetric patterns disabled, lemmatization disabled, tf-idf disabled,
    symmetric patterns + lemmatization disabled, and all three disabled.
    Every yielded dict is a fresh object containing only the overridden flags.
    """
    scenarios = (
        (),
        ('disable_symmetric_patterns',),
        ('disable_lemmatization',),
        ('disable_tfidf',),
        ('disable_symmetric_patterns', 'disable_lemmatization'),
        ('disable_symmetric_patterns', 'disable_lemmatization', 'disable_tfidf'),
    )
    for _round in range(30):
        for disabled_flags in scenarios:
            # build a new dict each time so consumers never share state
            yield {flag: True for flag in disabled_flags}
def get_configs_cluster_size():
    """
    generates 10 of each cluster size

    Cluster counts run from 4 through 15; the whole sweep is repeated 10
    times, so all sizes are visited once before any size is repeated.
    """
    one_sweep = list(range(4, 16))
    for cluster_count in one_sweep * 10:
        yield {'n_clusters': cluster_count}
def get_configs_random_search():
    """
    Endless generator of randomly sampled configurations.

    used to validate our parameters are sane and ablation results are consistent
    we put some prior on the default params which seem good,
    keeping default values according to a coin flip

    Each configuration starts as a copy of DEFAULT_PARAMS. One of four
    ablation modes is drawn uniformly ('no-disable', 'nosp', 'rand', 'none')
    and sets the three disable_* flags; then each of the four numeric
    parameters is independently either kept or resampled.
    """
    flag_keys = ('disable_symmetric_patterns', 'disable_tfidf', 'disable_lemmatization')
    # modes with fixed flag values; anything else ('no-disable') disables nothing
    fixed_flags = {
        'nosp': (True, False, False),
        'none': (True, True, True),
    }

    def coin():
        return np.random.choice([True, False])

    def log_uniform_int(low, high):
        # uniform in log space, truncated to an int
        return int(np.exp(np.random.uniform(np.log(low), np.log(high))))

    # (parameter, sampler) pairs, visited in this order so the sequence of
    # random draws is fixed; note np.random.randint excludes the upper bound
    resamplers = (
        ('prediction_cutoff', lambda: log_uniform_int(20, 1000)),
        ('n_clusters', lambda: np.random.randint(5, 10)),
        ('n_represent', lambda: log_uniform_int(4, 100)),
        ('n_samples_side', lambda: np.random.randint(5, 10)),
    )

    while True:
        candidate = DEFAULT_PARAMS.copy()
        mode = np.random.choice(['no-disable', 'nosp', 'rand', 'none'])
        if mode == 'rand':
            flags = (coin(), coin(), coin())
        else:
            flags = fixed_flags.get(str(mode), (False, False, False))
        candidate.update(zip(flag_keys, flags))

        for key, sample in resamplers:
            if coin():
                candidate[key] = sample()
        yield candidate
def worker_init():
    """
    Pool initializer: build the SPWSI runner for this worker process.

    Takes one (worker_id, cuda_device) pair from the module-level
    ``cuda_device_dispatcher`` queue (blocking until a pair is available),
    seeds numpy's global RNG, attaches a per-worker log file under
    ``DEBUG_DIR`` to the root logger, and stores an ``SPWSI`` wrapping a
    ``BilmElmo`` in the module global ``spwsi_runner``.
    """
    global spwsi_runner
    from spwsi.bilm_elmo import BilmElmo  # this is intentionally here, when ELMo is imported some state is set

    worker_id, cuda_device = cuda_device_dispatcher.get()
    # time-based seed offset by worker_id so workers started at the same
    # moment still get different random streams
    np.random.seed((int(time.time() * 100) % 10000) + worker_id)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    # The file is named after worker_id, not cuda_device: several workers may
    # share one GPU and mode 'w' would make them truncate each other's log.
    # DEBUG_DIR (the module constant) is used so this does not depend on a
    # global that only exists when the script's main block has run.
    log_file_name = '%s.worker.%d.log.txt' % (run_name, worker_id)
    handler = logging.FileHandler(os.path.join(DEBUG_DIR, log_file_name), 'w', 'utf-8')
    handler.setFormatter(logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%H:%M:%S'))
    root_logger.addHandler(handler)

    elmo_vocab_path = './resources/vocab-2016-09-10.txt'
    elmo_as_lm = BilmElmo(cuda_device, './resources/elmo_2x4096_512_2048cnn_2xhighway_softmax_weights.hdf5',
                          elmo_vocab_path, batch_size=LM_BATCH_SIZE,
                          cutoff_elmo_vocab=LM_VOACB_CUTOFF
                          )
    logging.info('created ELMo on cuda device %d' % cuda_device)
    spwsi_runner = SPWSI(elmo_as_lm)
def worker_do(idx_conf):
    """
    Run SPWSI once for a single job.

    idx_conf is an (index, overrides) pair: the overrides are layered on top
    of a copy of DEFAULT_PARAMS and the index is appended to the global
    run_name to name the job.

    Returns a (job name, effective params, result of spwsi_runner.run) tuple.
    """
    job_index, overrides = idx_conf
    job_name = '%s.%d' % (run_name, job_index)

    effective = DEFAULT_PARAMS.copy()
    effective.update(overrides)

    banner = 'running with config %s' % effective
    logging.info(banner)
    print(banner)

    # only these parameters are forwarded to the runner
    forwarded = (
        'n_clusters',
        'n_represent',
        'n_samples_side',
        'disable_tfidf',
        'disable_lemmatization',
        'disable_symmetric_patterns',
        'prediction_cutoff',
    )
    run_kwargs = {key: effective[key] for key in forwarded}
    outcome = spwsi_runner.run(run_name=job_name,
                               debug_dir=DEBUG_DIR,
                               print_progress=False,
                               **run_kwargs)
    return job_name, effective, outcome
def create_lemmatized_if_needed():
    """
    Ask BilmElmo to create the lemmatized ELMo vocabulary if it is needed.

    Meant to be run in a different process to avoid polluting the global
    environment when importing everything.
    """
    # this import is intentionally here, when ELMo is imported some state is set
    from spwsi.bilm_elmo import BilmElmo as _BilmElmo
    _BilmElmo.create_lemmatized_vocabulary_if_needed('./resources/vocab-2016-09-10.txt')
if __name__ == '__main__':
    # Batch entry point.
    # usage: spwsi_elmo_batch_run.py <ablation|search|n_clusters> [gpu,gpu,...]
    # Runs every configuration produced by the chosen scenario generator on a
    # pool of worker processes and appends the scores to a CSV in the debug dir.
    print('BiLM Symmetric Patterns WSI Demo - Batch run')
    # kept as a module global: worker processes may read it
    debug_dir = DEBUG_DIR
    run_name = 'batch-' + strftime("%m%d-%H%M%S")
    os.makedirs(debug_dir, exist_ok=True)

    scenario_generators = {
        'ablation': get_configs_ablations,
        'search': get_configs_random_search,
        'n_clusters': get_configs_cluster_size,
    }
    gen_name = sys.argv[1] if len(sys.argv) > 1 else None
    target_function = scenario_generators.get(gen_name)
    if target_function is None:
        raise Exception(
            'missing valid scenario in script arguments, valid scenarios are: ablation, search, n_clusters')
    run_name += '-' + gen_name
    print('scenario: %s' % gen_name)

    if len(sys.argv) > 2:
        # cuda devices in second arguments
        gpus = [int(x) for x in sys.argv[2].split(',')]
        print('gpus set in command line arguments: %s' % gpus)

    # NOTE: vocabulary lemmatization in a separate process is currently disabled
    # lemmatizer = multiprocessing.Process(target=create_lemmatized_if_needed)
    # lemmatizer.start()
    # lemmatizer.join()

    # Each worker's initializer takes exactly one (worker_id, gpu) pair from
    # this queue. The queue must be filled before the pool starts, otherwise
    # worker_init blocks forever on get() and no job ever runs.
    cuda_device_dispatcher = multiprocessing.Queue()
    for worker_id, gpu in enumerate(gpus):
        cuda_device_dispatcher.put((worker_id, gpu))

    out_csv_path = os.path.join(debug_dir, run_name + '.data.csv')
    print('starting batch run. results will be written to %s. this might take a while...' % out_csv_path)

    conf_params_report = ['n_clusters', 'n_represent', 'n_samples_side', 'disable_lemmatization',
                          'disable_symmetric_patterns', 'disable_tfidf', 'prediction_cutoff']

    # one worker per entry in gpus (an index may repeat to share a device);
    # the with-block releases the worker processes when the run ends or fails
    with multiprocessing.Pool(len(gpus), initializer=worker_init) as pool:
        # in addition to per target scores, an "all" entry row will contain the final result for a run
        # newline='' lets the csv module control line endings (avoids blank rows on Windows)
        with open(out_csv_path, 'a', newline='') as fout:
            writer = csv.writer(fout)
            writer.writerow(
                ['run_name', 'target', 'FBC', 'FNMI', 'AVG', 'lm_batch_size', 'cutoff_lm_vocab'] + conf_params_report)
            for run_name_done, conf, scores in pool.imap_unordered(worker_do, enumerate(target_function())):
                print(len(scores))
                reported_conf = [conf[x] for x in conf_params_report]
                for target, target_scores in scores.items():
                    fbc = target_scores['FBC']
                    fnmi = target_scores['FNMI']
                    # AVG is the geometric mean of the two scores
                    writer.writerow([run_name_done, target, fbc, fnmi, np.sqrt(fbc * fnmi),
                                     LM_BATCH_SIZE, LM_VOACB_CUTOFF] + reported_conf)
                # flush after every finished run so partial results survive an interruption
                fout.flush()