From e2794c03eaff3435b69cdc1cf5c6eff8fc3524f0 Mon Sep 17 00:00:00 2001
From: Johann-Mattis List
Date: Thu, 7 Apr 2022 09:38:33 +0200
Subject: [PATCH] Reconstruct (#29)

* update reconstruction code
* update code, fix bug
* update
* finalize for v1.2
* pep8; extended test coverage; py3.10 support

Co-authored-by: lingulist
Co-authored-by: Robert Forkel
---
 .github/workflows/python-package.yml |   2 +-
 .zenodo.json                         |   2 +-
 README.md                            |  12 +-
 setup.py                             |   1 +
 src/lingrex/borrowing.py             |   2 +-
 src/lingrex/reconstruct.py           | 395 +++++++++++++++++++++++++++
 src/lingrex/util.py                  | 102 ++++++-
 tests/data/hillburmish.tsv           |  37 +++
 tests/test_reconstruct.py            | 100 +++++++
 tests/test_util.py                   |  59 ++++
 tox.ini                              |   2 +-
 11 files changed, 705 insertions(+), 9 deletions(-)
 create mode 100644 src/lingrex/reconstruct.py
 create mode 100644 tests/data/hillburmish.tsv
 create mode 100644 tests/test_reconstruct.py

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 701c551..fc2365e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.zenodo.json b/.zenodo.json
index a5c61ae..624979a 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -19,7 +19,7 @@
   "upload_type": "software",
   "communities": [
     {
-      "identifier": "dighl"
+      "identifier": "digling"
     },
     {
       "identifier": "calc"
diff --git a/README.md b/README.md
index 3b2e5c5..cff3f59 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,19 @@ LingRex offers the code needed for the automatic inference of sound corresponde
 
 To test this workflow, please check the workflow code example in `tests/workflows/list-2019`.
 
-When using this package in your research, please make sure to quote the paper accordingly, and quote the software package as follows:
+LingRex also offers the code needed for a baseline algorithm for automatic word prediction or automatic phonological reconstruction in a supervised fashion.
 
-> List, Johann-Mattis and Forkel, Robert (2021): LingRex: Linguistic Reconstruction with LingPy. [Computer software, Version 1.1.1]. Geneva: Zenodo. [DOI: 10.5281/zenodo.1544943](https://doi.org/10.5281/zenodo.1544943)
+> List, J.-M. and R. Forkel and N. W. Hill (forthcoming): A New Framework for Fast Automated Phonological Reconstruction Using Trimmed Alignments and Sound Correspondence Patterns. Proceedings of the 3rd International Workshop on Computational Approaches to Historical Language Change (LChange 2022). Dublin, Ireland.
+
+This algorithm is also used as a baseline for an upcoming Shared Task on the Prediction of Cognate Reflexes (https://sigtyp.github.io/st2022.html), organized as part of the SIGTYP Workshop at NAACL 2022.
+
+When using this package in your research, please make sure to quote the respective papers, depending on the algorithms you use, and quote the software package as follows:
+
+> List, J.-M. and R. Forkel (2022): LingRex: Linguistic Reconstruction with LingPy. [Computer software, Version 1.2.0]. Geneva: Zenodo. [DOI: 10.5281/zenodo.1544943](https://doi.org/10.5281/zenodo.1544943)
 
 Since this software package itself makes use of LingPy's alignment algorithms, you should also quote the LingPy package itself.
 
-> List, J.-M. and R. Forkel (2021): LingPy. A Python library for quantitative tasks in historical linguistics. Version 2.6.7. Version 2.6.7. Max Planck Institute for Evolutionary Anthropology: Leipzig. https://lingpy.org
+> List, J.-M. and R. Forkel (2021): LingPy. A Python library for quantitative tasks in historical linguistics. Version 2.6.9. Max Planck Institute for Evolutionary Anthropology: Leipzig. https://lingpy.org
 
 ## Installation
 
diff --git a/setup.py b/setup.py
index 5b5d829..6937cad 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
     ],
     version='1.1.2.dev0',
     packages=find_packages(where='src'),
diff --git a/src/lingrex/borrowing.py b/src/lingrex/borrowing.py
index c75c142..b348f7b 100644
--- a/src/lingrex/borrowing.py
+++ b/src/lingrex/borrowing.py
@@ -179,7 +179,7 @@ def external_cognates(
 
     # compare groups
     for (gA, iA), (gB, iB) in itertools.combinations(list(groups.items()), r=2):
-        if G.nodes[str(gA)][family] != G.nodes[str(gB)][family]:
+        if G.nodes[str(gA)]["family"] != G.nodes[str(gB)]["family"]:
             wpairs = [(a[2], b[2]) for a, b in itertools.product(iA, iB)]
 
             pairs = Pairwise(wpairs)
diff --git a/src/lingrex/reconstruct.py b/src/lingrex/reconstruct.py
new file mode 100644
index 0000000..734b9a1
--- /dev/null
+++ b/src/lingrex/reconstruct.py
@@ -0,0 +1,395 @@
+"""
+Module provides methods for linguistic reconstruction.
+"""
+import itertools
+import collections
+
+from lingpy.align.sca import Alignments, get_consensus, normalize_alignment
+from lingpy.sequence.sound_classes import prosodic_string, class2tokens
+from lingpy.align.multiple import Multiple
+from lingpy.align.pairwise import edit_dist, nw_align
+from lingpy.evaluate.acd import _get_bcubed_score as get_bcubed_score
+import networkx as nx
+from networkx.algorithms.clique import find_cliques
+from lingpy import log
+
+from lingrex.util import clean_sound, ungap, alm2tok
+
+
+class CorPaRClassifier(object):
+    """
+    Classifier that predicts target sounds from sound correspondence patterns.
+    """
+
+    def __init__(self, minrefs=2, missing=0, threshold=1):
+        self.G = nx.Graph()
+        self.minrefs = minrefs
+        self.missing = missing
+        self.threshold = threshold
+
+    def compatible(self, ptA, ptB):
+        """
+        Check for compatibility of two patterns.
+        """
+        res = {True: 0, False: 0}
+        for a, b in zip(ptA, ptB):
+            if a and b:
+                res[a == b] += 1
+        return res[True], res[False]
+
+    def consensus(self, nodes):
+        """
+        Create a consensus pattern of multiple alignment sites.
+        """
+        cons = []
+        for i in range(len(nodes[0])):
+            nocons = True
+            for node in nodes:
+                if node[i] != self.missing:
+                    cons += [node[i]]
+                    nocons = False
+                    break
+            if nocons:
+                cons += [self.missing]
+        return tuple(cons)
+
+    def fit(self, X, y):
+        """
+        Train the classifier to predict the values in y from the rows of X.
+
+        :param X: Two-dimensional array with observations.
+        :param y: One-dimensional array with results.
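+
+        A minimal doctest-style sketch with hypothetical toy data, in which
+        the integers 1, 2, and 5 encode sounds and 0 encodes missing data:
+
+        >>> clf = CorPaRClassifier()
+        >>> clf.fit([[1, 2], [1, 0]], [5, 5])
+        >>> clf.predict([[1, 2], [1, 0]])
+        [5, 5]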
+        """
+        # get identical patterns
+        P = collections.defaultdict(list)
+        for i, row in enumerate(X):
+            P[tuple(row + [y[i]])] += [i]
+        # make graph
+        for (pA, vA), (pB, vB) in itertools.combinations(P.items(), r=2):
+            match_, mismatch = self.compatible(pA, pB)
+            if not mismatch and match_ >= self.threshold:
+                if pA not in self.G:
+                    self.G.add_node(pA, freq=len(vA))
+                if pB not in self.G:
+                    self.G.add_node(pB, freq=len(vB))
+                self.G.add_edge(pA, pB, weight=match_)
+        self.patterns = collections.defaultdict(collections.Counter)
+        self.lookup = collections.defaultdict(collections.Counter)
+        # get cliques
+        for nodes in find_cliques(self.G):
+            cons = self.consensus(list(nodes))
+            self.patterns[cons[:-1]][cons[-1]] = len(nodes)
+            for node in nodes:
+                self.lookup[node[:-1]][cons[:-1]] += len(nodes)
+        self.predictions = {
+            ptn: counts.most_common(1)[0][0] for ptn, counts in self.patterns.items()}
+        for ptn, counts in self.lookup.items():
+            self.predictions[ptn] = self.predictions[counts.most_common(1)[0][0]]
+
+        # make index of data points for quick search based on attested data
+        self.ptnlkp = collections.defaultdict(list)
+        for ptn in self.patterns:
+            for i in range(len(ptn)):
+                if ptn[i] != self.missing:
+                    self.ptnlkp[i, ptn[i]] += [ptn]
+
+    def predict(self, matrix):
+        """
+        Predict the target values for a matrix of correspondence patterns.
+        """
+        out = []
+        for row in matrix:
+            ptn = tuple(row)
+            if ptn in self.predictions:
+                out.append(self.predictions[ptn])
+            else:
+                candidates = collections.Counter()
+                for i in range(len(ptn) - 1):
+                    if ptn[i] != self.missing:
+                        for ptnB in self.ptnlkp[i, ptn[i]]:
+                            if ptnB not in candidates:
+                                match_, mismatch = self.compatible(ptn, ptnB)
+                                if match_ and not mismatch:
+                                    candidates[ptnB] = match_ + len(ptn)
+                                elif match_ - mismatch:
+                                    candidates[ptnB] = match_ - mismatch
+                if candidates:
+                    self.predictions[tuple(row)] = self.predictions[
+                        candidates.most_common(1)[0][0]]
+                    out += [self.predictions[tuple(row)]]
+                else:
+                    out += [self.missing]
+        return out
+
+
+class ReconstructionBase(Alignments):
+    """
+    Basic class for the phonological reconstruction.
+    """
+    def __init__(
+            self, infile, target=None, ref="cogids", fuzzy=True,
+            transcription="form", missing="Ø", gap="-"):
+        Alignments.__init__(self, infile, fuzzy=fuzzy, ref=ref, transcription=transcription)
+        self.target = target
+        self.missing = missing
+        self.gap = gap
+        self.languages = [t for t in self.cols if t != target]
+        self.tgtidx = self.cols.index(target)
+        self.lngidx = {t: self.cols.index(t) for t in self.languages}
+
+    def iter_sequences(self, aligned=False):
+        """
+        Iterate over the aligned or unaligned sequences of all cognate sets
+        attested for the target language.
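+
+        :param aligned: If set to True, iterate over aligned rather than
+            unaligned sequences.
+        :returns: A generator of triples of cognate set ID, alignment, and
+            language list, in which the target language and its sequence
+            appear in final position.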
+        """
+        seq_ref = self._alignments if aligned else self._segments
+        for cogid, idxs in self.etd[self._ref].items():
+            if idxs[self.tgtidx]:
+                if self._mode == "fuzzy":
+                    target = self[idxs[self.tgtidx][0], seq_ref].n[
+                        self[idxs[self.tgtidx][0], self._ref].index(cogid)]
+                else:
+                    target = self[idxs[self.tgtidx][0], seq_ref]
+                alignment, languages = [], []
+                for j, lng in enumerate(self.languages):
+                    lidx = self.lngidx[lng]
+                    if idxs[lidx]:
+                        languages += [lng]
+                        idx = idxs[lidx][0]
+                        if self._mode == "fuzzy":
+                            alm = self[idx, seq_ref].n[self[idx, self._ref].index(cogid)]
+                        else:
+                            alm = self[idx, seq_ref]
+                        alignment.append([clean_sound(x) for x in alm])
+                alignment.append([clean_sound(x) for x in target])
+                if aligned:
+                    alignment = normalize_alignment(alignment)
+                languages.append(self.target)
+                yield cogid, alignment, languages
+
+
+class OneHot(object):
+    """
+    Create a one-hot-encoder from a matrix.
+    """
+
+    def __init__(self, matrix):
+        self.vals = []
+        for i in range(len(matrix[0])):
+            cols = [row[i] for row in matrix]
+            self.vals += [sorted(set(cols)) + ["?"]]
+
+    def __call__(self, matrix):
+        out = [[] for row in matrix]
+        for i, vals in enumerate(self.vals):
+            for j in range(len(matrix)):
+                # set the matching value to one, using the final slot for
+                # values not seen during initialization
+                template = [0 for k in vals]
+                try:
+                    template[matrix[j][i]] = 1
+                except IndexError:
+                    template[-1] = 1
+                out[j] += template
+        return out
+
+
+def transform_alignment(
+        seqs,
+        languages,
+        all_languages,
+        align=True,
+        training=True,
+        missing="Ø",
+        gap="-",
+        startend=False,
+        prosody=False,
+        position=False,
+        firstlast=False):
+    """
+    Basic alignment function used for phonological reconstruction.
+
+    Depending on the keyword arguments, each alignment site is enriched with
+    positional information, start/end markers, prosodic structure, and the
+    first and last sounds of the sequences.
+    """
+    seqs = [[s for s in seq if s != gap] for seq in seqs]
+    if align:
+        msa = Multiple(seqs)
+        msa.prog_align()
+        alms = list(msa.alm_matrix)
+    else:
+        alms = normalize_alignment(seqs)
+    if training:
+        alms = ungap(alms, languages, languages[-1])
+        these_seqs = seqs[:-1]
+    else:
+        these_seqs = seqs
+    matrix = [[missing for x in all_languages] for y in alms[0]]
+    for i in range(len(alms[0])):
+        for j, lng in enumerate(languages):
+            lidx = all_languages.index(lng)
+            snd = clean_sound(alms[j][i])
+            matrix[i][lidx] = snd
+    if position:
+        for i in range(len(matrix)):
+            matrix[i] += [i]
+    if startend:
+        matrix[0] += [0]
+        for i in range(1, len(matrix) - 1):
+            matrix[i] += [1]
+        if len(matrix) > 1:
+            matrix[-1] += [2]
+    if prosody:
+        for i, c in enumerate(
+                get_consensus(
+                    [class2tokens(prosodic_string(seqs[j], _output="CcV"), alms[j])
+                     for j in range(len(these_seqs))],
+                    gaps=True)):
+            matrix[i] += [c]
+    if firstlast:
+        if training:
+            all_seqs = len(all_languages) - 1
+        else:
+            all_seqs = len(all_languages)
+        for i, row in enumerate(matrix):
+            for j in range(all_seqs):
+                matrix[i] += [matrix[0][j], matrix[-1][j]]
+
+    # sanity check: all rows of the matrix must have the same length
+    for row in matrix:
+        assert len(row) == len(matrix[0])
+    return matrix
+
+
+class PatternReconstructor(ReconstructionBase):
+    """
+    Automatic reconstruction with correspondence patterns.
+    """
+
+    def fit(self, clf=None, onehot=False, func=None, aligned=False):
+        """
+        Fit a classifier to the data.
+
+        :param clf: a classifier with a predict function.
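+        :param onehot: If set to True, one-hot encode the feature matrix
+            before fitting (useful for classifiers that cannot handle
+            categorical integer features directly).
+        :param func: Function that turns alignments into feature matrices
+            (defaults to transform_alignment).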
+        """
+        self.patterns = collections.defaultdict(lambda: collections.defaultdict(list))
+        self.occurrences = collections.defaultdict(list)
+        self.func = func or transform_alignment
+
+        for cogid, alignment, languages in self.iter_sequences():
+            if len(alignment) >= 2:
+                matrix = self.func(
+                    alignment,
+                    languages,
+                    self.languages + [self.target],
+                    training=True)
+                for i, row in enumerate(matrix):
+                    ptn = tuple(row[:len(self.languages)] + row[len(self.languages) + 1:])
+                    self.patterns[ptn][row[len(self.languages)]] += [(cogid, i)]
+                    for j, lng in enumerate(self.languages):
+                        if row[j] not in [self.missing]:
+                            self.occurrences[lng, j, row[j]] += [(cogid, i)]
+                    for j in range(len(self.languages) + 1, len(row)):
+                        self.occurrences["feature-{0}".format(j - 1), j - 1, row[j]] += [(cogid, i)]
+
+        # create indices for the sounds, reserving 0 for missing data and
+        # 1 for gaps
+        self.snd2idx = {(i, self.missing): 0 for i in range(len(matrix[0]))}
+        for i in range(len(matrix[0])):
+            self.snd2idx[i, self.gap] = 1
+
+        idxtracker = {i: 2 for i in range(len(matrix[0]))}
+        for lng, lidx, sound in self.occurrences:
+            last_idx = idxtracker[lidx]
+            if (lidx, sound) not in self.snd2idx:
+                self.snd2idx[lidx, sound] = last_idx
+                idxtracker[lidx] += 1
+
+        self.tgt2idx = {}
+        idx = 1
+        for pattern in self.patterns:
+            for sound in self.patterns[pattern]:
+                if sound not in self.tgt2idx:
+                    self.tgt2idx[sound] = idx
+                    idx += 1
+
+        self.matrix = []
+        self.solutions = []
+        for pattern, sounds in self.patterns.items():
+            for sound, vals in sounds.items():
+                tidx = self.tgt2idx[sound]
+                row = []
+                for i in range(len(pattern)):
+                    sidx = self.snd2idx[i, pattern[i]]
+                    row += [sidx]
+                for cogid, idx in vals:
+                    self.matrix += [row]
+                    self.solutions += [tidx]
+        self.dim = len(self.matrix[0])
+        if clf is not None:
+            self.clf = clf
+        else:
+            self.clf = CorPaRClassifier()
+        log.info("fitting classifier")
+        if onehot:
+            self.onehot = OneHot(self.matrix)
+            self.clf.fit(self.onehot(self.matrix), self.solutions)
+        else:
+            self.clf.fit(self.matrix, self.solutions)
+        self.idx2tgt = {v: k for k, v in self.tgt2idx.items()}
+        log.info("fitted the classifier")
+
+    def predict(
+            self, alignment, languages, unknown="?", onehot=False,
+            desegment=True):
+        """
+        Predict a word form from an alignment.
+
+        :param desegment: If set to True, remove gaps from the predicted
+            form and split up merged segments again.
+        """
+        matrix = self.func(alignment, languages, self.languages, training=False)
+        for row in matrix:
+            assert len(row) == self.dim
+        new_matrix = [[0 for char in row] for row in matrix]
+        for i, row in enumerate(matrix):
+            for j, char in enumerate(row):
+                new_matrix[i][j] = self.snd2idx.get((j, char), 0)
+        if hasattr(self, "onehot"):
+            new_matrix = self.onehot(new_matrix)
+        out = [self.idx2tgt.get(idx, unknown) for idx in self.clf.predict(new_matrix)]
+        return alm2tok(out) if desegment else out
+
+
+def eval_by_dist(data, func=None, **kw):
+    """
+    Evaluate by measuring distances between sequences.
+
+    :param data: List of tuples with prediction and attested sequence.
+    :param func: Alignment function (defaults to edit distance)
+
+    :note: Defaults to the unnormalized edit distance.
+    """
+    func = func or edit_dist
+    scores = []
+    for seqA, seqB in data:
+        if not seqA:
+            seqA = ["?"]
+        if not seqB:
+            seqB = ["?"]
+        scores += [func(seqA, seqB, **kw)]
+    return sum(scores) / len(scores)
+
+
+def eval_by_bcubes(data, func=None, **kw):
+    """
+    Evaluate by measuring B-Cubed F-scores.
+
+    :param data: List of tuples with prediction and attested sequence.
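+        Each tuple pairs one predicted form with one attested form, e.g.,
+        (["t", "a"], ["t", "o"]).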
+    :param func: Alignment function (defaults to Needleman-Wunsch)
+    """
+    numsA, numsB = {"": 0}, {"": 0}
+    func = func or nw_align
+    almsA, almsB = [], []
+    for seqA, seqB in data:
+        if not seqA:
+            seqA = ["?"]
+        if not seqB:
+            seqB = ["?"]
+        almA, almB, score = func(seqA, seqB, **kw)
+        for a, b in zip(almA, almB):
+            if a not in numsA:
+                numsA[a] = max(numsA.values()) + 1
+            if b not in numsB:
+                numsB[b] = max(numsB.values()) + 1
+            almsA += [numsA[a]]
+            almsB += [numsB[b]]
+    p, r = get_bcubed_score(almsA, almsB), get_bcubed_score(almsB, almsA)
+    return 2 * (p * r) / (p + r)
diff --git a/src/lingrex/util.py b/src/lingrex/util.py
index ed8ac3f..831d4d6 100644
--- a/src/lingrex/util.py
+++ b/src/lingrex/util.py
@@ -1,20 +1,118 @@
 """
 Utility functions for the lingrex package.
 """
+import math
 import pathlib
 
 from lingpy import tokens2class, prosodic_string
 from lingpy.align.sca import get_consensus
 from lingpy import basictypes as bt
+from lingpy.sequence.ngrams import get_n_ngrams
 
 
 def lingrex_path(*comps):
     return str(pathlib.Path(__file__).parent.joinpath(*comps))
 
 
+def bleu_score(word, reference, n=4, weights=None, trim=True):
+    """
+    Compute the BLEU score for a predicted word and a reference.
+
+    :param word: the predicted word
+    :param reference: the reference word
+    :param n: the order of ngrams
+    :param weights: list of weights, should be the same size as n
+    :param trim: bool, decide to trim n-grams or not
+    """
+
+    if not weights:
+        weights = [1 / n for x in range(n)]
+
+    scores = []
+    for i in range(1, n + 1):
+        new_wrd = list(get_n_ngrams(word, i))
+        new_ref = list(get_n_ngrams(reference, i))
+        if trim and i > 1:
+            new_wrd = new_wrd[i - 1:-(i - 1)]
+            new_ref = new_ref[i - 1:-(i - 1)]
+
+        clipped, divide = [], []
+        for itm in set(new_wrd):
+            clipped += [new_ref.count(itm)]
+            divide += [new_wrd.count(itm)]
+        scores += [sum(clipped) / sum(divide)]
+
+    # calculate the weighted geometric mean of the ngram scores
+    out_score = 1
+    for weight, score in zip(weights, scores):
+        out_score = out_score * (score ** weight)
+
+    # apply the brevity penalty for short candidates
+    bp = 1 if len(word) > len(reference) else math.e ** (1 - (len(reference) / len(word)))
+    return bp * (out_score ** (1 / sum(weights)))
+
+
+def clean_sound(sound):
+    """
+    Get rid of "a/b" notation for sound segments, retaining the part after
+    the slash.
+    """
+    return ".".join([s.split('/')[1] if "/" in s else s for s in sound.split('.')])
+
+
+def alm2tok(seq, gap="-"):
+    """Turn an alignment into a sequence."""
+    return [clean_sound(x) for x in unjoin(seq) if x != gap]
+
+
+def unjoin(seq):
+    """
+    Turn segments joined by a dot into unjoined segments.
+    """
+    out = []
+    for itm in seq:
+        out += itm.split('.')
+    return out
+
+
+def ungap(alignment, languages, proto):
+    """
+    Trim an MSA to remove all gaps in the target sequence.
+
+    :examples:
+    >>> ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto')
+    [['a.b'], ['x'], ['y']]
+    >>> ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto')
+    [['a', 'b'], ['x', '-'], ['y', 'h']]
+    """
+    pidxs = [i for i, taxon in enumerate(languages) if taxon == proto]
+    merges = []
+    for i in range(len(alignment[0])):  # go through the columns of the alignment ...
+        col = [row[i] for row in alignment]
+        # ... looking for columns that show gaps in all non-proto languages:
+        if {site for j, site in enumerate(col) if j not in pidxs} == {'-'}:
+            merges += [i]
+    if not merges:
+        return alignment
+    new_alms = []
+    for i, row in enumerate(alignment):
+        new_alm, mergeit, started = [], False, True
+        for j, cell in enumerate(row):
+            if j in merges or mergeit:
+                mergeit = False
+                if not started:  # j != 0:
+                    if cell != "-":
+                        new_alm[-1] += '.' + cell if new_alm[-1] else cell
+                else:
+                    mergeit = True
+                    new_alm.append("" if cell == "-" else cell)
+            else:
+                started = False
+                new_alm.append(cell)
+        new_alms.append([cell or "-" for cell in new_alm])
+    return new_alms
+
+
 def add_structure(
-    wordlist, model="cv", segments="tokens", structure="structure", ref="cogid", gap="-"
-):
+        wordlist, model="cv", segments="tokens", structure="structure", ref="cogid", gap="-"):
     """Add structure to a wordlist to make sure correspondence patterns can be inferred"""
 
     if model not in ["cv", "c", "CcV", "ps", "nogap"]:
diff --git a/tests/data/hillburmish.tsv b/tests/data/hillburmish.tsv
new file mode 100644
index 0000000..4dc07c0
--- /dev/null
+++ b/tests/data/hillburmish.tsv
@@ -0,0 +1,37 @@
+ID	DOCULECT	CONCEPT	VALUE	FORM	TOKENS	NOTE	COGIDS
+1	OldBurmese	I	ṅa	ṅa	ṅ/ŋ a		665
+4	Atsi	I	ŋo⁵¹	ŋo⁵¹	ŋ o ⁵¹		665
+6	Lashi	I	ŋo³¹	ŋo³¹	ŋ o ³¹		665
+9	ProtoBurmish	I	*ŋa	ŋa¹	ŋ a ¹		665
+147	Atsi	banana (plantain)	ŋoʔ⁵⁵ mjuʔ²¹	ŋoʔ⁵⁵+mjuʔ²¹	ŋ o ʔ ⁵⁵ + m j u ʔ ²¹		681 3302
+149	Lashi	banana (plantain)	ŋɔʔ⁵⁵ mju̱k⁵⁵	ŋɔʔ⁵⁵+mju̱k⁵⁵	ŋ ɔ ʔ ⁵⁵ + m j u̱ k ⁵⁵		681 3304
+151	ProtoBurmish	banana (plantain)	*ŋak	ŋak⁴	ŋ a k ⁴		681
+167	ProtoBurmish	be (in the house)	*ŋji	ŋji¹	ŋ j i ¹		488
+283	ProtoBurmish	blue	*ŋjuŋ	ŋjuŋ¹	ŋ j u ŋ ¹		698
+284	ProtoBurmish	blue	*ŋju	ŋju¹	ŋ j u ¹		699
+665	Atsi	cooked, be (rice) / done	ŋjoʔ²¹	ŋjoʔ²¹	ŋ j o ʔ ²¹		684
+667	Lashi	cooked, be (rice) / done	ŋjɔːʔ³¹	ŋjɔːʔ³¹	ŋ j ɔː ʔ ³¹		684
+670	ProtoBurmish	cooked, be (rice) / done	*ŋjak	ŋjak⁴	ŋ j a k ⁴		684
+733	OldBurmese	cry	ṅuiw	ṅui̯	ṅ/ŋ ui̯		693
+736	Atsi	cry	ŋau⁵¹	ŋau⁵¹	ŋ au ⁵¹		693
+738	Lashi	cry	ŋaːu³¹	ŋaːu³¹	ŋ aːu ³¹		693
+741	ProtoBurmish	cry	*ŋu	ŋu¹	ŋ u ¹		693
+799	ProtoBurmish	day (time)	*ŋjiX	ŋji³	ŋ j i ³		491
+1230	Lashi	fifteen	tshĕ³³ ŋ³³	tshĕ³³+ŋ³³	tsʰ ĕ ³³ + ŋ ³³		3295 666
+1232	ProtoBurmish	fifteen	*ŋaX	ŋa³	ŋ a ³		667
+1275	OldBurmese	five	ṅaḥ	ṅaḥ	ṅ/ŋ a ḥ/⁵		666
+1278	Atsi	five	ŋo²¹	ŋo²¹	ŋ o ²¹		666
+1283	ProtoBurmish	five	*ŋaH	ŋa²	ŋ a ²		666
+2888	OldBurmese	salty	ṅan	ṅan	ṅ/ŋ a n		683
+2895	ProtoBurmish	salty	*ŋan	ŋan¹	ŋ a n ¹		683
+3109	OldBurmese	silver	ṅuy	ṅuj	ṅ/ŋ u j		696
+3112	Atsi	silver	ŋun⁵¹	ŋun⁵¹	ŋ u n ⁵¹		696
+3114	Lashi	silver	ŋə³¹	ŋə³¹	ŋ ə ³¹		696
+3117	ProtoBurmish	silver	*ŋui	ŋui¹	ŋ ui ¹		696
+3170	OldBurmese	small	ṅay	ṅai	ṅ/ŋ ai		668
+3174	Lashi	small	ŋɛː³¹	ŋɛː³¹	ŋ ɛː ³¹		668
+3177	ProtoBurmish	small	*ŋai	ŋai¹	ŋ ai ¹		668
+3619	Atsi	tongs (fire)	ŋjap²¹	ŋjap²¹	ŋ j a p ²¹		686
+3621	Lashi	tongs (fire)	ŋjap³¹ tsei⁵⁵	ŋjap³¹+tsei⁵⁵	ŋ j a p ³¹ + ts ei ⁵⁵		686 3310
+3623	ProtoBurmish	tongs (fire)	*ŋjat	ŋjat⁴	ŋ j a t ⁴		686
+4030	OldBurmese	young	ṅay	ṅai	ṅ/ŋ ai		668
diff --git a/tests/test_reconstruct.py b/tests/test_reconstruct.py
new file mode 100644
index 0000000..a7e17c8
--- /dev/null
+++ b/tests/test_reconstruct.py
@@ -0,0 +1,100 @@
+"""
+Test the reconstruction module of lingrex.
+""" +import pytest +from lingrex.reconstruct import ( + CorPaRClassifier, + OneHot, + ReconstructionBase, + PatternReconstructor, + transform_alignment, + eval_by_dist, + eval_by_bcubes + ) +from functools import partial + + + +def test_transform_alignment(): + + out = transform_alignment( + [["b", "a", "k"], ["b", "a"]], + ["a", "b"], + ["a", "b", "u"], + training=False + ) + assert len(out) == 3 + + out = transform_alignment( + [["b", "k"], ["b", "a", "k"]], + ["a", "b"], + ["a", "b", "u"], + training=True, + + ) + assert len(out) == 2 + + out = transform_alignment( + [["b", "k"], ["b", "a", "k"]], + ["a", "b"], + ["a", "b", "u"], + training=True, + firstlast=True + + ) + assert out[0][-1] == "k" + + out = transform_alignment( + [["b", "k"], ["b", "a", "k"]], + ["a", "b"], + ["a", "b", "u"], + training=True, + startend=True + ) + assert out[0][-1] == 0 + + +def test_PatternReconstructor(data): + + pt = PatternReconstructor(str(data / "hillburmish.tsv"), ref="cogids", + target="ProtoBurmish") + t1 = partial(transform_alignment, align=True, position=False, + prosody=False, startend=False, firstlast=False) + t2 = partial(transform_alignment, align=True, position=True, + prosody=True, startend=True, firstlast=True) + pt.fit(func=t1) + assert pt.predict( + pt.msa["cogids"][665]["seqs"][:3], + ["Atsi", "Lashi", "OldBurmese"], + desegment=True + ) == ['ŋ', 'a', '¹'] + pt.fit(func=t2) + assert pt.predict( + pt.msa["cogids"][665]["seqs"][:3], + ["Atsi", "Lashi", "OldBurmese"], + desegment=True + ) == ['ŋ', 'a', '¹'] + + pt.fit(func=t1, onehot=True) + assert pt.predict( + pt.msa["cogids"][665]["seqs"][:3], + ["Atsi", "Lashi", "OldBurmese"], + desegment=True + ) == ['ŋ', 'a', '¹'] + +def test_eval_by_dist(): + assert eval_by_dist([[["t", "a"], ["t", "o"]]]) == 1 + assert eval_by_dist([[["t", "a"], []]]) == 2 + + assert eval_by_dist([[["t", "a"], ["t", "o"]]], normalized=True) == 0.5 + +def test_eval_by_bcubes(): + assert eval_by_bcubes([[["t", "a"], ["t", "a"]]]) == 1 + assert eval_by_bcubes([ + [["t", "a"], ["t", "o"]] + ]) == 1.0 + assert eval_by_bcubes([ + [["t", "a"], []] + ]) == 1 + + diff --git a/tests/test_util.py b/tests/test_util.py index 08b4d02..09b9a61 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,65 @@ import pytest from lingrex.util import lingrex_path, add_structure from lingpy import Wordlist, Alignments +from lingrex.util import ungap, clean_sound, unjoin, alm2tok, bleu_score + + +def test_bleu_score(): + candidate = "this is a test".split() + reference = "this is a small test".split() + + assert round( + bleu_score( + candidate, + reference, + weights=[0.5, 0.5], + n=2, + trim=True + ), + 2) == 0.64 + + assert round( + bleu_score( + candidate, + reference, + weights=[0.5, 0.5], + n=2, + trim=False), + 2) == 0.70 + + assert round( + bleu_score( + candidate, + reference, + n=2, + trim=False), + 2) == 0.70 + + +def test_ungap(): + matrix = ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto') + assert matrix[0][0] == 'a.b' + assert matrix[1][0] == 'x' + assert matrix[2][0] == "y" + matrix2 = ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto') + assert matrix2[0][1] == ["a", "b"][1] + assert matrix2[1][1] == ["x", "-"][1] + assert matrix2[2][1] == ["y", "h"][1] + + out = ungap([["p", "-", "a"], ["p", "j", "a"]], ["German", "E"], "E") + assert out[1][0] == "p.j" + + alm = [['a', 'b'], ['-', '-'], ['-', '-']] + assert ungap(alm, ['p', 'l1', 'l2'], 'p') == alm + +def test_clean_sound(): + assert clean_sound("a/b") == 
"b" + assert clean_sound("a") == "a" + assert clean_sound("a/b.c/d") == "b.d" + + +def test_unjoin(): + assert unjoin("k.p a p u k.a/b".split())[0] == "k" def test_lingrex_path(): diff --git a/tox.ini b/tox.ini index edb96bf..f66f5a4 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py{36,37,38,39} +envlist = py{36,37,38,39,310} skip_missing_interpreters = true [testenv]