From e2794c03eaff3435b69cdc1cf5c6eff8fc3524f0 Mon Sep 17 00:00:00 2001
From: Johann-Mattis List
Date: Thu, 7 Apr 2022 09:38:33 +0200
Subject: [PATCH] Reconstruct (#29)

* update reconstruction code
* update code, fix bug
* update
* finalize for v1.2
* pep8; extended test coverage; py3.10 support

Co-authored-by: lingulist
Co-authored-by: Robert Forkel
---
 .github/workflows/python-package.yml |   2 +-
 .zenodo.json                         |   2 +-
 README.md                            |  12 +-
 setup.py                             |   1 +
 src/lingrex/borrowing.py             |   2 +-
 src/lingrex/reconstruct.py           | 395 +++++++++++++++++++++++++++
 src/lingrex/util.py                  | 102 ++++++-
 tests/data/hillburmish.tsv           |  37 +++
 tests/test_reconstruct.py            | 100 +++++++
 tests/test_util.py                   |  59 ++++
 tox.ini                              |   2 +-
 11 files changed, 705 insertions(+), 9 deletions(-)
 create mode 100644 src/lingrex/reconstruct.py
 create mode 100644 tests/data/hillburmish.tsv
 create mode 100644 tests/test_reconstruct.py

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 701c551..fc2365e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.zenodo.json b/.zenodo.json
index a5c61ae..624979a 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -19,7 +19,7 @@
   "upload_type": "software",
   "communities": [
     {
-      "identifier": "dighl"
+      "identifier": "digling"
     },
     {
       "identifier": "calc"
diff --git a/README.md b/README.md
index 3b2e5c5..cff3f59 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,19 @@ LingRex offers the code needed for the automatic inference of sound corresponde
 
 To test this workflow, please check the workflow code example in `tests/workflows/list-2019`.
 
-When using this package in your research, please make sure to quote the paper accordingly, and quote the software package as follows:
+LingRex also offers the code needed for a baseline algorithm for automatic word prediction or automatic phonological reconstruction in a supervised fashion.
 
-> List, Johann-Mattis and Forkel, Robert (2021): LingRex: Linguistic Reconstruction with LingPy. [Computer software, Version 1.1.1]. Geneva: Zenodo. [DOI: 10.5281/zenodo.1544943](https://doi.org/10.5281/zenodo.1544943)
+> List, J.-M. and R. Forkel and N. W. Hill (forthcoming): A New Framework for Fast Automated Phonological Reconstruction Using Trimmed Alignments and Sound Correspondence Patterns. Proceedings of the 3rd International Workshop on Computational Approaches to Historical Language Change (LChange 2022). Dublin, Ireland.
+
+This algorithm is also used as a baseline for an upcoming Shared Task on the Prediction of Cognate Reflexes (https://sigtyp.github.io/st2022.html), organized as part of the SIGTYP Workshop at NAACL 2022.
+
+When using this package in your research, please make sure to quote the respective papers, depending on the algorithms you use, and quote the software package as follows:
+
+> List, J.-M. and R. Forkel (2022): LingRex: Linguistic Reconstruction with LingPy. [Computer software, Version 1.2.0]. Geneva: Zenodo. [DOI: 10.5281/zenodo.1544943](https://doi.org/10.5281/zenodo.1544943)
 
 Since this software package itself makes use of LingPy's alignment algorithms, you should also quote the LingPy package itself.
 
-> List, J.-M. and R. Forkel (2021): LingPy. A Python library for quantitative tasks in historical linguistics. Version 2.6.7. Version 2.6.7. Max Planck Institute for Evolutionary Anthropology: Leipzig. https://lingpy.org
+> List, J.-M. and R. Forkel (2021): LingPy. A Python library for quantitative tasks in historical linguistics. Version 2.6.9. Max Planck Institute for Evolutionary Anthropology: Leipzig. https://lingpy.org
 
 ## Installation
 
diff --git a/setup.py b/setup.py
index 5b5d829..6937cad 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
     ],
     version='1.1.2.dev0',
     packages=find_packages(where='src'),
diff --git a/src/lingrex/borrowing.py b/src/lingrex/borrowing.py
index c75c142..b348f7b 100644
--- a/src/lingrex/borrowing.py
+++ b/src/lingrex/borrowing.py
@@ -179,7 +179,7 @@ def external_cognates(
 
     # compare groups
     for (gA, iA), (gB, iB) in itertools.combinations(list(groups.items()), r=2):
-        if G.nodes[str(gA)][family] != G.nodes[str(gB)][family]:
+        if G.nodes[str(gA)]["family"] != G.nodes[str(gB)]["family"]:
             wpairs = [(a[2], b[2]) for a, b in itertools.product(iA, iB)]
 
             pairs = Pairwise(wpairs)
diff --git a/src/lingrex/reconstruct.py b/src/lingrex/reconstruct.py
new file mode 100644
index 0000000..734b9a1
--- /dev/null
+++ b/src/lingrex/reconstruct.py
@@ -0,0 +1,395 @@
+"""
+Module provides methods for linguistic reconstruction.
+"""
+import itertools
+import collections
+
+from lingpy.align.sca import Alignments, get_consensus, normalize_alignment
+from lingpy.sequence.sound_classes import prosodic_string, class2tokens
+from lingpy.align.multiple import Multiple
+from lingpy.align.pairwise import edit_dist, nw_align
+from lingpy.evaluate.acd import _get_bcubed_score as get_bcubed_score
+import networkx as nx
+from networkx.algorithms.clique import find_cliques
+from lingpy import log
+
+from lingrex.util import clean_sound, ungap, alm2tok
+
+
+class CorPaRClassifier(object):
+    """
+    Classifier that predicts target sounds from sound correspondence patterns.
+    """
+
+    def __init__(self, minrefs=2, missing=0, threshold=1):
+        self.G = nx.Graph()
+        self.minrefs = minrefs
+        self.missing = missing
+        self.threshold = threshold
+
+    def compatible(self, ptA, ptB):
+        """
+        Check for compatibility of two patterns.
+        """
+        res = {True: 0, False: 0}
+        for a, b in zip(ptA, ptB):
+            if a and b:
+                res[a == b] += 1
+        return res[True], res[False]
+
+    def consensus(self, nodes):
+        """
+        Create a consensus pattern of multiple alignment sites.
+        """
+        cons = []
+        for i in range(len(nodes[0])):
+            nocons = True
+            for node in nodes:
+                if node[i] != self.missing:
+                    cons += [node[i]]
+                    nocons = False
+                    break
+            if nocons:
+                cons += [self.missing]
+        return tuple(cons)
+
+    def fit(self, X, y):
+        """
+        Train the classifier to predict the values in y from the rows of X.
+
+        :param X: Two-dimensional array with observations.
+        :param y: One-dimensional array with results.
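+
+        A minimal doctest-style sketch with hypothetical toy data, in which
+        the integers 1, 2, and 5 encode sounds and 0 encodes missing data:
+
+        >>> clf = CorPaRClassifier()
+        >>> clf.fit([[1, 2], [1, 0]], [5, 5])
+        >>> clf.predict([[1, 2], [1, 0]])
+        [5, 5]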
+        """
+        # get identical patterns
+        P = collections.defaultdict(list)
+        for i, row in enumerate(X):
+            P[tuple(row + [y[i]])] += [i]
+        # make graph
+        for (pA, vA), (pB, vB) in itertools.combinations(P.items(), r=2):
+            match_, mismatch = self.compatible(pA, pB)
+            if not mismatch and match_ >= self.threshold:
+                if pA not in self.G:
+                    self.G.add_node(pA, freq=len(vA))
+                if pB not in self.G:
+                    self.G.add_node(pB, freq=len(vB))
+                self.G.add_edge(pA, pB, weight=match_)
+        self.patterns = collections.defaultdict(collections.Counter)
+        self.lookup = collections.defaultdict(collections.Counter)
+        # get cliques
+        for nodes in find_cliques(self.G):
+            cons = self.consensus(list(nodes))
+            self.patterns[cons[:-1]][cons[-1]] = len(nodes)
+            for node in nodes:
+                self.lookup[node[:-1]][cons[:-1]] += len(nodes)
+        self.predictions = {
+            ptn: counts.most_common(1)[0][0] for ptn, counts in self.patterns.items()}
+        for ptn, counts in self.lookup.items():
+            self.predictions[ptn] = self.predictions[counts.most_common(1)[0][0]]
+
+        # make index of data points for quick search based on attested data
+        self.ptnlkp = collections.defaultdict(list)
+        for ptn in self.patterns:
+            for i in range(len(ptn)):
+                if ptn[i] != self.missing:
+                    self.ptnlkp[i, ptn[i]] += [ptn]
+
+    def predict(self, matrix):
+        """
+        Predict the target values for a matrix of correspondence patterns.
+        """
+        out = []
+        for row in matrix:
+            ptn = tuple(row)
+            if ptn in self.predictions:
+                out.append(self.predictions[ptn])
+            else:
+                candidates = collections.Counter()
+                for i in range(len(ptn) - 1):
+                    if ptn[i] != self.missing:
+                        for ptnB in self.ptnlkp[i, ptn[i]]:
+                            if ptnB not in candidates:
+                                match_, mismatch = self.compatible(ptn, ptnB)
+                                if match_ and not mismatch:
+                                    candidates[ptnB] = match_ + len(ptn)
+                                elif match_ - mismatch:
+                                    candidates[ptnB] = match_ - mismatch
+                if candidates:
+                    self.predictions[tuple(row)] = self.predictions[
+                        candidates.most_common(1)[0][0]]
+                    out += [self.predictions[tuple(row)]]
+                else:
+                    out += [self.missing]
+        return out
+
+
+class ReconstructionBase(Alignments):
+    """
+    Basic class for the phonological reconstruction.
+    """
+    def __init__(
+            self, infile, target=None, ref="cogids", fuzzy=True,
+            transcription="form", missing="Ø", gap="-"):
+        Alignments.__init__(self, infile, fuzzy=fuzzy, ref=ref, transcription=transcription)
+        self.target = target
+        self.missing = missing
+        self.gap = gap
+        self.languages = [t for t in self.cols if t != target]
+        self.tgtidx = self.cols.index(target)
+        self.lngidx = {t: self.cols.index(t) for t in self.languages}
+
+    def iter_sequences(self, aligned=False):
+        """
+        Iterate over the aligned or unaligned sequences of all cognate sets
+        attested for the target language.
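+
+        :param aligned: If set to True, iterate over aligned rather than
+            unaligned sequences.
+        :returns: A generator of triples of cognate set ID, alignment, and
+            language list, in which the target language and its sequence
+            appear in final position.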
+        """
+        seq_ref = self._alignments if aligned else self._segments
+        for cogid, idxs in self.etd[self._ref].items():
+            if idxs[self.tgtidx]:
+                if self._mode == "fuzzy":
+                    target = self[idxs[self.tgtidx][0], seq_ref].n[
+                        self[idxs[self.tgtidx][0], self._ref].index(cogid)]
+                else:
+                    target = self[idxs[self.tgtidx][0], seq_ref]
+                alignment, languages = [], []
+                for j, lng in enumerate(self.languages):
+                    lidx = self.lngidx[lng]
+                    if idxs[lidx]:
+                        languages += [lng]
+                        idx = idxs[lidx][0]
+                        if self._mode == "fuzzy":
+                            alm = self[idx, seq_ref].n[self[idx, self._ref].index(cogid)]
+                        else:
+                            alm = self[idx, seq_ref]
+                        alignment.append([clean_sound(x) for x in alm])
+                alignment.append([clean_sound(x) for x in target])
+                if aligned:
+                    alignment = normalize_alignment(alignment)
+                languages.append(self.target)
+                yield cogid, alignment, languages
+
+
+class OneHot(object):
+    """
+    Create a one-hot-encoder from a matrix.
+    """
+
+    def __init__(self, matrix):
+        self.vals = []
+        for i in range(len(matrix[0])):
+            cols = [row[i] for row in matrix]
+            self.vals += [sorted(set(cols)) + ["?"]]
+
+    def __call__(self, matrix):
+        out = [[] for row in matrix]
+        for i, vals in enumerate(self.vals):
+            for j in range(len(matrix)):
+                # set the matching value to one, using the final slot for
+                # values not seen during initialization
+                template = [0 for k in vals]
+                try:
+                    template[matrix[j][i]] = 1
+                except IndexError:
+                    template[-1] = 1
+                out[j] += template
+        return out
+
+
+def transform_alignment(
+        seqs,
+        languages,
+        all_languages,
+        align=True,
+        training=True,
+        missing="Ø",
+        gap="-",
+        startend=False,
+        prosody=False,
+        position=False,
+        firstlast=False):
+    """
+    Basic alignment function used for phonological reconstruction.
+
+    Depending on the keyword arguments, each alignment site is enriched with
+    positional information, start/end markers, prosodic structure, and the
+    first and last sounds of the sequences.
+    """
+    seqs = [[s for s in seq if s != gap] for seq in seqs]
+    if align:
+        msa = Multiple(seqs)
+        msa.prog_align()
+        alms = list(msa.alm_matrix)
+    else:
+        alms = normalize_alignment(seqs)
+    if training:
+        alms = ungap(alms, languages, languages[-1])
+        these_seqs = seqs[:-1]
+    else:
+        these_seqs = seqs
+    matrix = [[missing for x in all_languages] for y in alms[0]]
+    for i in range(len(alms[0])):
+        for j, lng in enumerate(languages):
+            lidx = all_languages.index(lng)
+            snd = clean_sound(alms[j][i])
+            matrix[i][lidx] = snd
+    if position:
+        for i in range(len(matrix)):
+            matrix[i] += [i]
+    if startend:
+        matrix[0] += [0]
+        for i in range(1, len(matrix) - 1):
+            matrix[i] += [1]
+        if len(matrix) > 1:
+            matrix[-1] += [2]
+    if prosody:
+        for i, c in enumerate(
+                get_consensus(
+                    [class2tokens(prosodic_string(seqs[j], _output="CcV"), alms[j])
+                     for j in range(len(these_seqs))],
+                    gaps=True)):
+            matrix[i] += [c]
+    if firstlast:
+        if training:
+            all_seqs = len(all_languages) - 1
+        else:
+            all_seqs = len(all_languages)
+        for i, row in enumerate(matrix):
+            for j in range(all_seqs):
+                matrix[i] += [matrix[0][j], matrix[-1][j]]
+
+    # sanity check: all rows of the matrix must have the same length
+    for row in matrix:
+        assert len(row) == len(matrix[0])
+    return matrix
+
+
+class PatternReconstructor(ReconstructionBase):
+    """
+    Automatic reconstruction with correspondence patterns.
+    """
+
+    def fit(self, clf=None, onehot=False, func=None, aligned=False):
+        """
+        Fit a classifier to the data.
+
+        :param clf: a classifier with a predict function.
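+        :param onehot: If set to True, one-hot encode the feature matrix
+            before fitting (useful for classifiers that cannot handle
+            categorical integer features directly).
+        :param func: Function that turns alignments into feature matrices
+            (defaults to transform_alignment).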
+        """
+        self.patterns = collections.defaultdict(lambda: collections.defaultdict(list))
+        self.occurrences = collections.defaultdict(list)
+        self.func = func or transform_alignment
+
+        for cogid, alignment, languages in self.iter_sequences():
+            if len(alignment) >= 2:
+                matrix = self.func(
+                    alignment,
+                    languages,
+                    self.languages + [self.target],
+                    training=True)
+                for i, row in enumerate(matrix):
+                    ptn = tuple(row[:len(self.languages)] + row[len(self.languages) + 1:])
+                    self.patterns[ptn][row[len(self.languages)]] += [(cogid, i)]
+                    for j, lng in enumerate(self.languages):
+                        if row[j] not in [self.missing]:
+                            self.occurrences[lng, j, row[j]] += [(cogid, i)]
+                    for j in range(len(self.languages) + 1, len(row)):
+                        self.occurrences["feature-{0}".format(j - 1), j - 1, row[j]] += [(cogid, i)]
+
+        # create indices for the sounds, reserving 0 for missing data and
+        # 1 for gaps
+        self.snd2idx = {(i, self.missing): 0 for i in range(len(matrix[0]))}
+        for i in range(len(matrix[0])):
+            self.snd2idx[i, self.gap] = 1
+
+        idxtracker = {i: 2 for i in range(len(matrix[0]))}
+        for lng, lidx, sound in self.occurrences:
+            last_idx = idxtracker[lidx]
+            if (lidx, sound) not in self.snd2idx:
+                self.snd2idx[lidx, sound] = last_idx
+                idxtracker[lidx] += 1
+
+        self.tgt2idx = {}
+        idx = 1
+        for pattern in self.patterns:
+            for sound in self.patterns[pattern]:
+                if sound not in self.tgt2idx:
+                    self.tgt2idx[sound] = idx
+                    idx += 1
+
+        self.matrix = []
+        self.solutions = []
+        for pattern, sounds in self.patterns.items():
+            for sound, vals in sounds.items():
+                tidx = self.tgt2idx[sound]
+                row = []
+                for i in range(len(pattern)):
+                    sidx = self.snd2idx[i, pattern[i]]
+                    row += [sidx]
+                for cogid, idx in vals:
+                    self.matrix += [row]
+                    self.solutions += [tidx]
+        self.dim = len(self.matrix[0])
+        if clf is not None:
+            self.clf = clf
+        else:
+            self.clf = CorPaRClassifier()
+        log.info("fitting classifier")
+        if onehot:
+            self.onehot = OneHot(self.matrix)
+            self.clf.fit(self.onehot(self.matrix), self.solutions)
+        else:
+            self.clf.fit(self.matrix, self.solutions)
+        self.idx2tgt = {v: k for k, v in self.tgt2idx.items()}
+        log.info("fitted the classifier")
+
+    def predict(
+            self, alignment, languages, unknown="?", onehot=False,
+            desegment=True):
+        """
+        Predict a word form from an alignment.
+
+        :param desegment: If set to True, remove gaps from the predicted
+            form and split up merged segments again.
+        """
+        matrix = self.func(alignment, languages, self.languages, training=False)
+        for row in matrix:
+            assert len(row) == self.dim
+        new_matrix = [[0 for char in row] for row in matrix]
+        for i, row in enumerate(matrix):
+            for j, char in enumerate(row):
+                new_matrix[i][j] = self.snd2idx.get((j, char), 0)
+        if hasattr(self, "onehot"):
+            new_matrix = self.onehot(new_matrix)
+        out = [self.idx2tgt.get(idx, unknown) for idx in self.clf.predict(new_matrix)]
+        return alm2tok(out) if desegment else out
+
+
+def eval_by_dist(data, func=None, **kw):
+    """
+    Evaluate by measuring distances between sequences.
+
+    :param data: List of tuples with prediction and attested sequence.
+    :param func: Alignment function (defaults to edit distance)
+
+    :note: Defaults to the unnormalized edit distance.
+    """
+    func = func or edit_dist
+    scores = []
+    for seqA, seqB in data:
+        if not seqA:
+            seqA = ["?"]
+        if not seqB:
+            seqB = ["?"]
+        scores += [func(seqA, seqB, **kw)]
+    return sum(scores) / len(scores)
+
+
+def eval_by_bcubes(data, func=None, **kw):
+    """
+    Evaluate by measuring B-Cubed F-scores.
+
+    :param data: List of tuples with prediction and attested sequence.
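+        Each tuple pairs one predicted form with one attested form, e.g.,
+        (["t", "a"], ["t", "o"]).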
+    :param func: Alignment function (defaults to Needleman-Wunsch)
+    """
+    numsA, numsB = {"": 0}, {"": 0}
+    func = func or nw_align
+    almsA, almsB = [], []
+    for seqA, seqB in data:
+        if not seqA:
+            seqA = ["?"]
+        if not seqB:
+            seqB = ["?"]
+        almA, almB, score = func(seqA, seqB, **kw)
+        for a, b in zip(almA, almB):
+            if a not in numsA:
+                numsA[a] = max(numsA.values()) + 1
+            if b not in numsB:
+                numsB[b] = max(numsB.values()) + 1
+            almsA += [numsA[a]]
+            almsB += [numsB[b]]
+    p, r = get_bcubed_score(almsA, almsB), get_bcubed_score(almsB, almsA)
+    return 2 * (p * r) / (p + r)
diff --git a/src/lingrex/util.py b/src/lingrex/util.py
index ed8ac3f..831d4d6 100644
--- a/src/lingrex/util.py
+++ b/src/lingrex/util.py
@@ -1,20 +1,118 @@
 """
 Utility functions for the lingrex package.
 """
+import math
 import pathlib
 
 from lingpy import tokens2class, prosodic_string
 from lingpy.align.sca import get_consensus
 from lingpy import basictypes as bt
+from lingpy.sequence.ngrams import get_n_ngrams
 
 
 def lingrex_path(*comps):
     return str(pathlib.Path(__file__).parent.joinpath(*comps))
 
 
+def bleu_score(word, reference, n=4, weights=None, trim=True):
+    """
+    Compute the BLEU score for a predicted word and a reference.
+
+    :param word: the predicted word
+    :param reference: the reference word
+    :param n: the order of ngrams
+    :param weights: list of weights, should be the same size as n
+    :param trim: bool, decide to trim n-grams or not
+    """
+
+    if not weights:
+        weights = [1 / n for x in range(n)]
+
+    scores = []
+    for i in range(1, n + 1):
+        new_wrd = list(get_n_ngrams(word, i))
+        new_ref = list(get_n_ngrams(reference, i))
+        if trim and i > 1:
+            new_wrd = new_wrd[i - 1:-(i - 1)]
+            new_ref = new_ref[i - 1:-(i - 1)]
+
+        clipped, divide = [], []
+        for itm in set(new_wrd):
+            clipped += [new_ref.count(itm)]
+            divide += [new_wrd.count(itm)]
+        scores += [sum(clipped) / sum(divide)]
+
+    # calculate the weighted geometric mean of the ngram scores
+    out_score = 1
+    for weight, score in zip(weights, scores):
+        out_score = out_score * (score ** weight)
+
+    # apply the brevity penalty for short candidates
+    bp = 1 if len(word) > len(reference) else math.e ** (1 - (len(reference) / len(word)))
+    return bp * (out_score ** (1 / sum(weights)))
+
+
+def clean_sound(sound):
+    """
+    Get rid of "a/b" notation for sound segments, retaining the part after
+    the slash.
+    """
+    return ".".join([s.split('/')[1] if "/" in s else s for s in sound.split('.')])
+
+
+def alm2tok(seq, gap="-"):
+    """Turn an alignment into a sequence."""
+    return [clean_sound(x) for x in unjoin(seq) if x != gap]
+
+
+def unjoin(seq):
+    """
+    Turn segments joined by a dot into unjoined segments.
+    """
+    out = []
+    for itm in seq:
+        out += itm.split('.')
+    return out
+
+
+def ungap(alignment, languages, proto):
+    """
+    Trim an MSA to remove all gaps in the target sequence.
+
+    :examples:
+    >>> ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto')
+    [['a.b'], ['x'], ['y']]
+    >>> ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto')
+    [['a', 'b'], ['x', '-'], ['y', 'h']]
+    """
+    pidxs = [i for i, taxon in enumerate(languages) if taxon == proto]
+    merges = []
+    for i in range(len(alignment[0])):  # go through the columns of the alignment ...
+        col = [row[i] for row in alignment]
+        # ... looking for columns that show gaps in all non-proto languages:
+        if {site for j, site in enumerate(col) if j not in pidxs} == {'-'}:
+            merges += [i]
+    if not merges:
+        return alignment
+    new_alms = []
+    for i, row in enumerate(alignment):
+        new_alm, mergeit, started = [], False, True
+        for j, cell in enumerate(row):
+            if j in merges or mergeit:
+                mergeit = False
+                if not started:  # j != 0:
+                    if cell != "-":
+                        new_alm[-1] += '.' + cell if new_alm[-1] else cell
+                else:
+                    mergeit = True
+                    new_alm.append("" if cell == "-" else cell)
+            else:
+                started = False
+                new_alm.append(cell)
+        new_alms.append([cell or "-" for cell in new_alm])
+    return new_alms
+
+
 def add_structure(
-    wordlist, model="cv", segments="tokens", structure="structure", ref="cogid", gap="-"
-):
+        wordlist, model="cv", segments="tokens", structure="structure", ref="cogid", gap="-"):
     """Add structure to a wordlist to make sure correspondence patterns can be inferred"""
 
     if model not in ["cv", "c", "CcV", "ps", "nogap"]:
diff --git a/tests/data/hillburmish.tsv b/tests/data/hillburmish.tsv
new file mode 100644
index 0000000..4dc07c0
--- /dev/null
+++ b/tests/data/hillburmish.tsv
@@ -0,0 +1,37 @@
+ID	DOCULECT	CONCEPT	VALUE	FORM	TOKENS	NOTE	COGIDS
+1	OldBurmese	I	ṅa	ṅa	ṅ/ŋ a		665
+4	Atsi	I	ŋo⁵¹	ŋo⁵¹	ŋ o ⁵¹		665
+6	Lashi	I	ŋo³¹	ŋo³¹	ŋ o ³¹		665
+9	ProtoBurmish	I	*ŋa	ŋa¹	ŋ a ¹		665
+147	Atsi	banana (plantain)	ŋoʔ⁵⁵ mjuʔ²¹	ŋoʔ⁵⁵+mjuʔ²¹	ŋ o ʔ ⁵⁵ + m j u ʔ ²¹		681 3302
+149	Lashi	banana (plantain)	ŋɔʔ⁵⁵ mju̱k⁵⁵	ŋɔʔ⁵⁵+mju̱k⁵⁵	ŋ ɔ ʔ ⁵⁵ + m j u̱ k ⁵⁵		681 3304
+151	ProtoBurmish	banana (plantain)	*ŋak	ŋak⁴	ŋ a k ⁴		681
+167	ProtoBurmish	be (in the house)	*ŋji	ŋji¹	ŋ j i ¹		488
+283	ProtoBurmish	blue	*ŋjuŋ	ŋjuŋ¹	ŋ j u ŋ ¹		698
+284	ProtoBurmish	blue	*ŋju	ŋju¹	ŋ j u ¹		699
+665	Atsi	cooked, be (rice) / done	ŋjoʔ²¹	ŋjoʔ²¹	ŋ j o ʔ ²¹		684
+667	Lashi	cooked, be (rice) / done	ŋjɔːʔ³¹	ŋjɔːʔ³¹	ŋ j ɔː ʔ ³¹		684
+670	ProtoBurmish	cooked, be (rice) / done	*ŋjak	ŋjak⁴	ŋ j a k ⁴		684
+733	OldBurmese	cry	ṅuiw	ṅui̯	ṅ/ŋ ui̯		693
+736	Atsi	cry	ŋau⁵¹	ŋau⁵¹	ŋ au ⁵¹		693
+738	Lashi	cry	ŋaːu³¹	ŋaːu³¹	ŋ aːu ³¹		693
+741	ProtoBurmish	cry	*ŋu	ŋu¹	ŋ u ¹		693
+799	ProtoBurmish	day (time)	*ŋjiX	ŋji³	ŋ j i ³		491
+1230	Lashi	fifteen	tshĕ³³ ŋ³³	tshĕ³³+ŋ³³	tsʰ ĕ ³³ + ŋ ³³		3295 666
+1232	ProtoBurmish	fifteen	*ŋaX	ŋa³	ŋ a ³		667
+1275	OldBurmese	five	ṅaḥ	ṅaḥ	ṅ/ŋ a ḥ/⁵		666
+1278	Atsi	five	ŋo²¹	ŋo²¹	ŋ o ²¹		666
+1283	ProtoBurmish	five	*ŋaH	ŋa²	ŋ a ²		666
+2888	OldBurmese	salty	ṅan	ṅan	ṅ/ŋ a n		683
+2895	ProtoBurmish	salty	*ŋan	ŋan¹	ŋ a n ¹		683
+3109	OldBurmese	silver	ṅuy	ṅuj	ṅ/ŋ u j		696
+3112	Atsi	silver	ŋun⁵¹	ŋun⁵¹	ŋ u n ⁵¹		696
+3114	Lashi	silver	ŋə³¹	ŋə³¹	ŋ ə ³¹		696
+3117	ProtoBurmish	silver	*ŋui	ŋui¹	ŋ ui ¹		696
+3170	OldBurmese	small	ṅay	ṅai	ṅ/ŋ ai		668
+3174	Lashi	small	ŋɛː³¹	ŋɛː³¹	ŋ ɛː ³¹		668
+3177	ProtoBurmish	small	*ŋai	ŋai¹	ŋ ai ¹		668
+3619	Atsi	tongs (fire)	ŋjap²¹	ŋjap²¹	ŋ j a p ²¹		686
+3621	Lashi	tongs (fire)	ŋjap³¹ tsei⁵⁵	ŋjap³¹+tsei⁵⁵	ŋ j a p ³¹ + ts ei ⁵⁵		686 3310
+3623	ProtoBurmish	tongs (fire)	*ŋjat	ŋjat⁴	ŋ j a t ⁴		686
+4030	OldBurmese	young	ṅay	ṅai	ṅ/ŋ ai		668
diff --git a/tests/test_reconstruct.py b/tests/test_reconstruct.py
new file mode 100644
index 0000000..a7e17c8
--- /dev/null
+++ b/tests/test_reconstruct.py
@@ -0,0 +1,100 @@
+"""
+Test the reconstruction module of lingrex.
+""" +import pytest +from lingrex.reconstruct import ( + CorPaRClassifier, + OneHot, + ReconstructionBase, + PatternReconstructor, + transform_alignment, + eval_by_dist, + eval_by_bcubes + ) +from functools import partial + + + +def test_transform_alignment(): + + out = transform_alignment( + [["b", "a", "k"], ["b", "a"]], + ["a", "b"], + ["a", "b", "u"], + training=False + ) + assert len(out) == 3 + + out = transform_alignment( + [["b", "k"], ["b", "a", "k"]], + ["a", "b"], + ["a", "b", "u"], + training=True, + + ) + assert len(out) == 2 + + out = transform_alignment( + [["b", "k"], ["b", "a", "k"]], + ["a", "b"], + ["a", "b", "u"], + training=True, + firstlast=True + + ) + assert out[0][-1] == "k" + + out = transform_alignment( + [["b", "k"], ["b", "a", "k"]], + ["a", "b"], + ["a", "b", "u"], + training=True, + startend=True + ) + assert out[0][-1] == 0 + + +def test_PatternReconstructor(data): + + pt = PatternReconstructor(str(data / "hillburmish.tsv"), ref="cogids", + target="ProtoBurmish") + t1 = partial(transform_alignment, align=True, position=False, + prosody=False, startend=False, firstlast=False) + t2 = partial(transform_alignment, align=True, position=True, + prosody=True, startend=True, firstlast=True) + pt.fit(func=t1) + assert pt.predict( + pt.msa["cogids"][665]["seqs"][:3], + ["Atsi", "Lashi", "OldBurmese"], + desegment=True + ) == ['ŋ', 'a', '¹'] + pt.fit(func=t2) + assert pt.predict( + pt.msa["cogids"][665]["seqs"][:3], + ["Atsi", "Lashi", "OldBurmese"], + desegment=True + ) == ['ŋ', 'a', '¹'] + + pt.fit(func=t1, onehot=True) + assert pt.predict( + pt.msa["cogids"][665]["seqs"][:3], + ["Atsi", "Lashi", "OldBurmese"], + desegment=True + ) == ['ŋ', 'a', '¹'] + +def test_eval_by_dist(): + assert eval_by_dist([[["t", "a"], ["t", "o"]]]) == 1 + assert eval_by_dist([[["t", "a"], []]]) == 2 + + assert eval_by_dist([[["t", "a"], ["t", "o"]]], normalized=True) == 0.5 + +def test_eval_by_bcubes(): + assert eval_by_bcubes([[["t", "a"], ["t", "a"]]]) == 1 + assert eval_by_bcubes([ + [["t", "a"], ["t", "o"]] + ]) == 1.0 + assert eval_by_bcubes([ + [["t", "a"], []] + ]) == 1 + + diff --git a/tests/test_util.py b/tests/test_util.py index 08b4d02..09b9a61 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,65 @@ import pytest from lingrex.util import lingrex_path, add_structure from lingpy import Wordlist, Alignments +from lingrex.util import ungap, clean_sound, unjoin, alm2tok, bleu_score + + +def test_bleu_score(): + candidate = "this is a test".split() + reference = "this is a small test".split() + + assert round( + bleu_score( + candidate, + reference, + weights=[0.5, 0.5], + n=2, + trim=True + ), + 2) == 0.64 + + assert round( + bleu_score( + candidate, + reference, + weights=[0.5, 0.5], + n=2, + trim=False), + 2) == 0.70 + + assert round( + bleu_score( + candidate, + reference, + n=2, + trim=False), + 2) == 0.70 + + +def test_ungap(): + matrix = ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto') + assert matrix[0][0] == 'a.b' + assert matrix[1][0] == 'x' + assert matrix[2][0] == "y" + matrix2 = ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto') + assert matrix2[0][1] == ["a", "b"][1] + assert matrix2[1][1] == ["x", "-"][1] + assert matrix2[2][1] == ["y", "h"][1] + + out = ungap([["p", "-", "a"], ["p", "j", "a"]], ["German", "E"], "E") + assert out[1][0] == "p.j" + + alm = [['a', 'b'], ['-', '-'], ['-', '-']] + assert ungap(alm, ['p', 'l1', 'l2'], 'p') == alm + +def test_clean_sound(): + assert clean_sound("a/b") == 
"b" + assert clean_sound("a") == "a" + assert clean_sound("a/b.c/d") == "b.d" + + +def test_unjoin(): + assert unjoin("k.p a p u k.a/b".split())[0] == "k" def test_lingrex_path(): diff --git a/tox.ini b/tox.ini index edb96bf..f66f5a4 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py{36,37,38,39} +envlist = py{36,37,38,39,310} skip_missing_interpreters = true [testenv]