-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprepare.py
49 lines (46 loc) · 1.52 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
Prepare test and training data for the experiments.
"""
from pkg.util import Trainer
from pathlib import Path
import json
from tabulate import tabulate
from tqdm import tqdm as progressbar
# Datasets to prepare, keyed by the stem of the wordlist file under "data/".
# Each value is (proto, name): `proto` is passed to the Trainer as its target
# and matched against doculect names; `name` labels the output directories
# and the row of the summary table.
datasets = {
    "wangbai": ("ProtoBai", "Bai"),
    "hillburmish": ("ProtoBurmish", "Burmish"),
    "ltkkaren": ("ProtoKaren", "Karen"),
    "yanglalo": ("ProtoLalo", "Lalo"),
    "carvalhopurus": ("ProtoPurus", "Purus"),
    "meloniromance": ("Latin", "Romance"),
}
# One summary row per dataset: [name, trn.width, cognate sets, words].
table = []
for ds, (proto, name) in progressbar(datasets.items()):
    trn = Trainer(
        str(Path("data", ds + ".tsv")),
        ref="cogids",
        fuzzy=True,
        target=proto,
    )

    # Count the cognate sets that include the proto-language and have more
    # than two entries in total, together with the entries they contain.
    # NOTE(review): the original indentation was lost; `words` is assumed to
    # be accumulated only for the qualifying sets -- confirm.
    cognates, words = 0, 0
    for idxs in trn.get_etymdict(ref="cogids").values():
        lngs = [trn[idx[0], "doculect"] for idx in idxs if idx]
        if proto in lngs and len(lngs) > 2:
            cognates += 1
            words += len(lngs)
    table.append([name, trn.width, cognates, words])

    # Make sure both output directories exist; otherwise open() below fails
    # with FileNotFoundError on a fresh checkout.
    testlists = Path("results", "testlists", name)
    testitems = Path("results", "testitems", name)
    testlists.mkdir(parents=True, exist_ok=True)
    testitems.mkdir(parents=True, exist_ok=True)

    # Write 100 splits per dataset, numbered test-1 ... test-100.
    for i in range(100):
        # Third part of the split is unused (its proportion is 0).
        # Presumably a 90/10 training/test partition -- confirm against
        # Trainer.split.
        wl, test_set, _ = trn.split(proportions=(90, 10, 0))
        wl.output(
            "tsv",
            filename=str(testlists / "test-{0}".format(i + 1)),
            ignore="all",
            prettify=False,
        )
        with open(testitems / "test-{0}.json".format(i + 1), "w") as f:
            json.dump(test_set, f)

print(tabulate(table, tablefmt="latex"))