Initial plink updates for missing fields

jeromekelleher · jeromekelleher · commit 9bc497975c18 · 2025-05-28T12:13:00.000Z
diff --git a/tests/test_tskit_data.py b/tests/test_tskit_data.py
@@ -3,6 +3,7 @@
 with various outputs.
 """
 
+import bio2zarr.plink as p2z
 import bio2zarr.tskit as ts2z
 import bio2zarr.vcf as v2z
 import msprime
@@ -13,6 +14,7 @@
 import tskit
 import xarray.testing as xt
 
+from vcztools.plink import write_plink
 from vcztools.vcf_writer import write_vcf
 
 
@@ -35,14 +37,126 @@ def add_mutations(ts):
 @pytest.fixture()
 def fx_diploid_msprime_sim(tmp_path):
     seed = 1234
-    ts = msprime.sim_ancestry(5, sequence_length=100, random_seed=seed)
-    ts = msprime.sim_mutations(ts, rate=0.5, random_seed=seed)
+    ts = msprime.sim_ancestry(5, sequence_length=10_000, random_seed=seed)
+    ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=seed)
     assert ts.num_mutations > 0
+    assert ts.num_mutations == ts.num_sites  # make sure we have biallelic sites
     zarr_path = tmp_path / "sim.vcz"
     ts2z.convert(ts, zarr_path)
     return zarr_path
 
 
+@pytest.fixture()
+def fx_haploid_missing_data(tmp_path):
+    """
+    Returns the path to a VCZ conversion of a three-leaf balanced tree to
+    which one extra sample node with no edges has been added, carrying two
+    sites with one mutation each. The conversion is run with
+    isolated_as_missing=True.
+    """
+    # 2.00┊   4     ┊
+    #     ┊ ┏━┻┓    ┊
+    # 1.00┊ ┃  3    ┊
+    #     ┊ ┃ ┏┻┓   ┊
+    # 0.00┊ 0 1 2 5 ┊
+    #     0        10
+    #      |      |
+    #  pos 2      9
+    #  anc A      T
+    balanced = tskit.Tree.generate_balanced(3, span=10)
+    tables = balanced.tree_sequence.dump_tables()
+    # Extra sample node that is not attached to the tree (5 in the diagram).
+    tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
+    for position, ancestral_state in [(2, "A"), (9, "T")]:
+        tables.sites.add_row(position, ancestral_state=ancestral_state)
+    for site, node, derived_state in [(0, 0, "G"), (1, 3, "C")]:
+        tables.mutations.add_row(site=site, node=node, derived_state=derived_state)
+    vcz_path = tmp_path / "sim.vcz"
+    ts2z.convert(tables.tree_sequence(), vcz_path, isolated_as_missing=True)
+    return vcz_path
+
+
+def test_haploid_missing_data(fx_haploid_missing_data):
+    """
+    Checks the genotypes stored for the haploid missing-data fixture; the
+    last sample is expected to be -1 at both sites.
+    """
+    ds = sg.load_dataset(fx_haploid_missing_data)
+    # One row per site, one single-allele call per sample.
+    expected_genotypes = [
+        [[1], [0], [0], [-1]],
+        [[0], [1], [1], [-1]],
+    ]
+    nt.assert_array_equal(ds.call_genotype.values, expected_genotypes)
+
+
+@pytest.fixture()
+def fx_diploid_missing_data(tmp_path):
+    """
+    Returns the path to a VCZ conversion of a four-leaf balanced tree to
+    which two extra sample nodes with no edges have been added, carrying two
+    sites with one mutation each. The conversion uses a ploidy=2 VCF model
+    mapping and isolated_as_missing=True.
+    """
+    # 2.00┊    6       ┊
+    #     ┊  ┏━┻━┓     ┊
+    # 1.00┊  4   5     ┊
+    #     ┊ ┏┻┓ ┏┻┓    ┊
+    # 0.00┊ 0 1 2 3 7 8┊
+    #     0            10
+    #      |         |
+    #  pos 2         9
+    #  anc A         T
+    tables = tskit.Tree.generate_balanced(4, span=10).tree_sequence.dump_tables()
+    # Two extra sample nodes that are not attached to the tree (7 and 8 in
+    # the diagram); the assertion pins the node numbering used there.
+    tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
+    last_node = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
+    assert last_node == 8
+    for position, ancestral_state in [(2, "A"), (9, "T")]:
+        tables.sites.add_row(position, ancestral_state=ancestral_state)
+    for site, node, derived_state in [(0, 0, "G"), (1, 5, "C")]:
+        tables.mutations.add_row(site=site, node=node, derived_state=derived_state)
+    vcz_path = tmp_path / "sim.vcz"
+    diploid_ts = tables.tree_sequence()
+    ts2z.convert(
+        diploid_ts,
+        vcz_path,
+        model_mapping=diploid_ts.map_to_vcf_model(ploidy=2),
+        isolated_as_missing=True,
+    )
+    return vcz_path
+
+
+def test_diploid_missing_data(fx_diploid_missing_data):
+    """
+    Checks the genotypes stored for the diploid missing-data fixture; the
+    last sample is expected to be [-1, -1] at both sites.
+    """
+    ds = sg.load_dataset(fx_diploid_missing_data)
+    # One row per site, one two-allele call per sample.
+    expected_genotypes = [
+        [[1, 0], [0, 0], [-1, -1]],
+        [[0, 0], [1, 1], [-1, -1]],
+    ]
+    nt.assert_array_equal(ds.call_genotype.values, expected_genotypes)
+
+
+@pytest.fixture()
+def fx_diploid_multi_allelic(tmp_path):
+    """
+    Returns the path to a VCZ conversion of a four-leaf balanced tree with
+    two sites, the second of which carries two mutations with different
+    derived states. The conversion uses a ploidy=2 VCF model mapping.
+    """
+    # 2.00┊    6    ┊
+    #     ┊  ┏━┻━┓  ┊
+    # 1.00┊  4   5  ┊
+    #     ┊ ┏┻┓ ┏┻┓ ┊
+    # 0.00┊ 0 1 2 3 ┊
+    #     0         10
+    #      |       |
+    #  pos 2       9
+    #  anc A       T
+    tables = tskit.Tree.generate_balanced(4, span=10).tree_sequence.dump_tables()
+    for position, ancestral_state in [(2, "A"), (9, "T")]:
+        tables.sites.add_row(position, ancestral_state=ancestral_state)
+    # Site 1 receives both a "G" and a "C" mutation.
+    for site, node, derived_state in [(0, 0, "G"), (1, 1, "G"), (1, 5, "C")]:
+        tables.mutations.add_row(site=site, node=node, derived_state=derived_state)
+    vcz_path = tmp_path / "sim.vcz"
+    diploid_ts = tables.tree_sequence()
+    ts2z.convert(
+        diploid_ts,
+        vcz_path,
+        model_mapping=diploid_ts.map_to_vcf_model(ploidy=2),
+    )
+    return vcz_path
+
+
+def test_diploid_multi_allelic(fx_diploid_multi_allelic):
+    """
+    Checks the alleles and genotypes stored for the multi-allelic fixture.
+    """
+    ds = sg.load_dataset(fx_diploid_multi_allelic)
+    # NOTE: this example is constructed so that the rarest allele is in the
+    # middle of the alleles array.
+    expected_alleles = [["A", "G", ""], ["T", "G", "C"]]
+    expected_genotypes = [
+        [[1, 0], [0, 0]],
+        [[0, 1], [2, 2]],
+    ]
+    nt.assert_array_equal(ds.variant_allele.values, expected_alleles)
+    nt.assert_array_equal(ds.call_genotype.values, expected_genotypes)
+
+
 @pytest.fixture()
 def fx_haploid_msprime_sim(tmp_path):
     seed = 12345
@@ -107,8 +221,8 @@ def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
             "variant_filter",
             "variant_quality",
         ]
-        xt.assert_equal(ds1, ds2.drop(drop_fields))
-        num_variants = ds2.dims["variants"]
+        xt.assert_equal(ds1, ds2.drop_vars(drop_fields))
+        num_variants = ds2.sizes["variants"]
         assert np.all(np.isnan(ds2["variant_quality"].values))
         nt.assert_array_equal(
             ds2["variant_filter"], np.ones((num_variants, 1), dtype=bool)
@@ -123,3 +237,66 @@ def test_haploid_msprime_sim(self, tmp_path, fx_haploid_msprime_sim):
 
     def test_simple_ts(self, tmp_path, fx_simple_ts):
         self.assert_bio2zarr_rt(tmp_path, fx_simple_ts)
+
+    def test_haploid_missing_data(self, tmp_path, fx_haploid_missing_data):
+        """Run assert_bio2zarr_rt on the haploid missing-data fixture."""
+        self.assert_bio2zarr_rt(tmp_path, fx_haploid_missing_data)
+
+    def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data):
+        """Run assert_bio2zarr_rt on the diploid missing-data fixture."""
+        self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data)
+
+    def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic):
+        """Run assert_bio2zarr_rt on the diploid multi-allelic fixture."""
+        self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic)
+
+
+def recode_plink_hets(G):
+    """
+    Returns a copy of the specified genotype matrix in which hets are all
+    in the canonical unphased plink orientation, [0, 1].
+
+    G is indexed as G[j, k, allele]. Only calls stored as [1, 0] are
+    rewritten (to [0, 1]); every other call, including missing data, is
+    copied unchanged. The input array is not modified.
+    """
+    G = G.copy()
+    # Boolean mask over the first two axes selecting the calls stored as
+    # [1, 0]; built once from the copy before either allele is overwritten.
+    flipped = (G[:, :, 0] == 1) & (G[:, :, 1] == 0)
+    G[flipped, 0] = 0
+    G[flipped, 1] = 1
+    return G
+
+
+class TestPlinkRoundTrip:
+    """
+    Round trips VCZ data through plink files (write_plink followed by the
+    bio2zarr plink conversion) and compares the result with the input.
+    """
+
+    def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
+        """
+        Writes tskit_vcz to plink files under tmp_path, converts them back
+        to VCZ and asserts that the two datasets agree, apart from phasing,
+        het orientation and the variant_id fields.
+        """
+        plink_prefix = tmp_path / "plink"
+        write_plink(tskit_vcz, plink_prefix)
+        roundtrip_path = tmp_path / "rt.vcz"
+        p2z.convert(plink_prefix, roundtrip_path)
+        source_ds = sg.load_dataset(tskit_vcz)
+        roundtrip_ds = sg.load_dataset(roundtrip_path)
+
+        # The source must be flagged phased everywhere, and the round-tripped
+        # data unphased everywhere.
+        assert np.all(source_ds["call_genotype_phased"])
+        assert np.all(~roundtrip_ds["call_genotype_phased"])
+
+        # Genotypes are compared after normalising het orientation in the
+        # source.
+        expected_genotypes = recode_plink_hets(source_ds["call_genotype"].values)
+        nt.assert_array_equal(expected_genotypes, roundtrip_ds["call_genotype"])
+
+        # Everything else must match exactly. The variant_id fields are only
+        # dropped from the round-tripped dataset.
+        genotype_fields = ["call_genotype", "call_genotype_phased"]
+        roundtrip_only = ["variant_id", "variant_id_mask"]
+        xt.assert_equal(
+            source_ds.drop_vars(genotype_fields),
+            roundtrip_ds.drop_vars(roundtrip_only + genotype_fields),
+        )
+
+    def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim):
+        """Round trip the diploid msprime simulation fixture."""
+        self.assert_bio2zarr_rt(tmp_path, fx_diploid_msprime_sim)
+
+    def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data):
+        """Round trip the diploid missing-data fixture."""
+        self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data)
+
+    def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic):
+        """The multi-allelic fixture is expected to be rejected."""
+        with pytest.raises(ValueError, match="Only biallelic VCFs supported"):
+            self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic)
diff --git a/vcztools/cli.py b/vcztools/cli.py
@@ -1,6 +1,5 @@
 import contextlib
 import os
-import pathlib
 import sys
 from functools import wraps
 
@@ -293,16 +292,7 @@ def view_plink1(path, include, exclude, out):
     -o intermediate.vcf && plink 1.9 --vcf intermediate.vcf [plink options]``
     without generating the intermediate VCF.
     """
-    out_prefix = pathlib.Path(out)
-    writer = plink.Writer(
-        path,
-        bed_path=out_prefix.with_suffix(".bed"),
-        fam_path=out_prefix.with_suffix(".fam"),
-        bim_path=out_prefix.with_suffix(".bim"),
-        include=include,
-        exclude=exclude,
-    )
-    writer.run()
+    plink.write_plink(path, out, include=include, exclude=exclude)
 
 
 @version
diff --git a/vcztools/plink.py b/vcztools/plink.py
@@ -2,6 +2,8 @@
 Convert VCZ to plink 1 binary format.
 """
 
+import pathlib
+
 import numpy as np
 import pandas as pd
 import zarr
@@ -46,10 +48,17 @@ def generate_bim(root, a12_allele):
     allele_1 = alleles[np.arange(num_variants), a12_allele[:, 0]]
     single_allele_sites = np.where(a12_allele[:, 0] == -1)
     allele_1[single_allele_sites] = "0"
+
+    num_variants = np.sum(select)
+    if "variant_id" in root:
+        variant_id = root["variant_id"][:][select]
+    else:
+        variant_id = np.array(["."] * num_variants, dtype="S")
+
     df = pd.DataFrame(
         {
             "Chrom": contig_id[root["variant_contig"][:][select]],
-            "VariantId": root["variant_id"][:][select],
+            "VariantId": variant_id,
             "GeneticPosition": np.zeros(np.sum(select), dtype=int),
             "Position": root["variant_position"][:][select],
             "Allele1": allele_1,
@@ -74,6 +83,12 @@ def _compute_alleles(self, G, alleles):
         Returns the a12 alleles for the specified chunk of data.
         """
         max_alleles = alleles.shape[1]
+        if max_alleles != 2:
+            raise ValueError(
+                "Only biallelic VCFs supported currently: "
+                "please comment on https://github.com/sgkit-dev/vcztools/issues/224 "
+                "if this limitation affects you"
+            )
         num_variants = G.shape[0]
         num_samples = G.shape[1]
         a12_allele = np.zeros((num_variants, 2), dtype=int) - 1
@@ -138,3 +153,17 @@ def run(self):
 
         with open(self.fam_path, "w") as f:
             f.write(generate_fam(self.root))
+
+
+def write_plink(vcz_path, out, include=None, exclude=None):
+    """
+    Runs a Writer on the VCZ data at vcz_path, with the .bed, .bim and .fam
+    output paths derived from the prefix ``out``.
+
+    include and exclude are passed through to the Writer unchanged.
+
+    NOTE(review): the paths are built with ``Path.with_suffix``, which
+    replaces an existing suffix, so a prefix such as "a.b" yields "a.bed"
+    rather than "a.b.bed" -- confirm this is the intended behaviour.
+    """
+    prefix = pathlib.Path(out)
+    bed_path, fam_path, bim_path = (
+        prefix.with_suffix(extension) for extension in (".bed", ".fam", ".bim")
+    )
+    Writer(
+        vcz_path,
+        bed_path=bed_path,
+        fam_path=fam_path,
+        bim_path=bim_path,
+        include=include,
+        exclude=exclude,
+    ).run()