Skip to content

Commit 9bc4979

Browse files
Initial plink updates for missing fields
1 parent 316d264 commit 9bc4979

File tree

3 files changed

+212
-16
lines changed

3 files changed

+212
-16
lines changed

tests/test_tskit_data.py

Lines changed: 181 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
with various outputs.
44
"""
55

6+
import bio2zarr.plink as p2z
67
import bio2zarr.tskit as ts2z
78
import bio2zarr.vcf as v2z
89
import msprime
@@ -13,6 +14,7 @@
1314
import tskit
1415
import xarray.testing as xt
1516

17+
from vcztools.plink import write_plink
1618
from vcztools.vcf_writer import write_vcf
1719

1820

@@ -35,14 +37,126 @@ def add_mutations(ts):
3537
@pytest.fixture()
3638
def fx_diploid_msprime_sim(tmp_path):
3739
seed = 1234
38-
ts = msprime.sim_ancestry(5, sequence_length=100, random_seed=seed)
39-
ts = msprime.sim_mutations(ts, rate=0.5, random_seed=seed)
40+
ts = msprime.sim_ancestry(5, sequence_length=10_000, random_seed=seed)
41+
ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=seed)
4042
assert ts.num_mutations > 0
43+
assert ts.num_mutations == ts.num_sites # make sure we have biallelic sites
4144
zarr_path = tmp_path / "sim.vcz"
4245
ts2z.convert(ts, zarr_path)
4346
return zarr_path
4447

4548

49+
@pytest.fixture()
50+
def fx_haploid_missing_data(tmp_path):
51+
# 2.00┊ 4 ┊
52+
# ┊ ┏━┻┓ ┊
53+
# 1.00┊ ┃ 3 ┊
54+
# ┊ ┃ ┏┻┓ ┊
55+
# 0.00┊ 0 1 2 5 ┊
56+
# 0 10
57+
# | |
58+
# pos 2 9
59+
# anc A T
60+
ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence
61+
tables = ts.dump_tables()
62+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
63+
tables.sites.add_row(2, ancestral_state="A")
64+
tables.sites.add_row(9, ancestral_state="T")
65+
tables.mutations.add_row(site=0, node=0, derived_state="G")
66+
tables.mutations.add_row(site=1, node=3, derived_state="C")
67+
zarr_path = tmp_path / "sim.vcz"
68+
ts2z.convert(tables.tree_sequence(), zarr_path, isolated_as_missing=True)
69+
return zarr_path
70+
71+
72+
def test_haploid_missing_data(fx_haploid_missing_data):
73+
ds = sg.load_dataset(fx_haploid_missing_data)
74+
nt.assert_array_equal(
75+
ds.call_genotype.values,
76+
[
77+
[[1], [0], [0], [-1]],
78+
[[0], [1], [1], [-1]],
79+
],
80+
)
81+
82+
83+
@pytest.fixture()
84+
def fx_diploid_missing_data(tmp_path):
85+
# 2.00┊ 6 ┊
86+
# ┊ ┏━┻━┓ ┊
87+
# 1.00┊ 4 5 ┊
88+
# ┊ ┏┻┓ ┏┻┓ ┊
89+
# 0.00┊ 0 1 2 3 7 8┊
90+
# 0 10
91+
# | |
92+
# pos 2 9
93+
# anc A T
94+
ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence
95+
tables = ts.dump_tables()
96+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
97+
u = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
98+
assert u == 8
99+
tables.sites.add_row(2, ancestral_state="A")
100+
tables.sites.add_row(9, ancestral_state="T")
101+
tables.mutations.add_row(site=0, node=0, derived_state="G")
102+
tables.mutations.add_row(site=1, node=5, derived_state="C")
103+
zarr_path = tmp_path / "sim.vcz"
104+
ts = tables.tree_sequence()
105+
model_map = ts.map_to_vcf_model(ploidy=2)
106+
ts2z.convert(ts, zarr_path, model_mapping=model_map, isolated_as_missing=True)
107+
return zarr_path
108+
109+
110+
def test_diploid_missing_data(fx_diploid_missing_data):
111+
ds = sg.load_dataset(fx_diploid_missing_data)
112+
nt.assert_array_equal(
113+
ds.call_genotype.values,
114+
[
115+
[[1, 0], [0, 0], [-1, -1]],
116+
[[0, 0], [1, 1], [-1, -1]],
117+
],
118+
)
119+
120+
121+
@pytest.fixture()
122+
def fx_diploid_multi_allelic(tmp_path):
123+
# 2.00┊ 6 ┊
124+
# ┊ ┏━┻━┓ ┊
125+
# 1.00┊ 4 5 ┊
126+
# ┊ ┏┻┓ ┏┻┓ ┊
127+
# 0.00┊ 0 1 2 3 ┊
128+
# 0 10
129+
# | |
130+
# pos 2 9
131+
# anc A T
132+
ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence
133+
tables = ts.dump_tables()
134+
tables.sites.add_row(2, ancestral_state="A")
135+
tables.sites.add_row(9, ancestral_state="T")
136+
tables.mutations.add_row(site=0, node=0, derived_state="G")
137+
tables.mutations.add_row(site=1, node=1, derived_state="G")
138+
tables.mutations.add_row(site=1, node=5, derived_state="C")
139+
zarr_path = tmp_path / "sim.vcz"
140+
ts = tables.tree_sequence()
141+
model_map = ts.map_to_vcf_model(ploidy=2)
142+
ts2z.convert(ts, zarr_path, model_mapping=model_map)
143+
return zarr_path
144+
145+
146+
def test_diploid_multi_allelic(fx_diploid_multi_allelic):
147+
ds = sg.load_dataset(fx_diploid_multi_allelic)
148+
# NOTE this example is constructed so that the rarest allele is in the middle
149+
# of the alleles array
150+
nt.assert_array_equal(ds.variant_allele.values, [["A", "G", ""], ["T", "G", "C"]])
151+
nt.assert_array_equal(
152+
ds.call_genotype.values,
153+
[
154+
[[1, 0], [0, 0]],
155+
[[0, 1], [2, 2]],
156+
],
157+
)
158+
159+
46160
@pytest.fixture()
47161
def fx_haploid_msprime_sim(tmp_path):
48162
seed = 12345
@@ -107,8 +221,8 @@ def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
107221
"variant_filter",
108222
"variant_quality",
109223
]
110-
xt.assert_equal(ds1, ds2.drop(drop_fields))
111-
num_variants = ds2.dims["variants"]
224+
xt.assert_equal(ds1, ds2.drop_vars(drop_fields))
225+
num_variants = ds2.sizes["variants"]
112226
assert np.all(np.isnan(ds2["variant_quality"].values))
113227
nt.assert_array_equal(
114228
ds2["variant_filter"], np.ones((num_variants, 1), dtype=bool)
@@ -123,3 +237,66 @@ def test_haploid_msprime_sim(self, tmp_path, fx_haploid_msprime_sim):
123237

124238
def test_simple_ts(self, tmp_path, fx_simple_ts):
125239
self.assert_bio2zarr_rt(tmp_path, fx_simple_ts)
240+
241+
def test_haploid_missing_data(self, tmp_path, fx_haploid_missing_data):
242+
self.assert_bio2zarr_rt(tmp_path, fx_haploid_missing_data)
243+
244+
def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data):
245+
self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data)
246+
247+
def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic):
248+
self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic)
249+
250+
251+
def recode_plink_hets(G):
252+
"""
253+
Returns a copy of the specified genotype matrix in which hets are all
254+
in the canonical unphased plink orientation, [0, 1]
255+
"""
256+
G = G.copy()
257+
for j in range(G.shape[0]):
258+
for k in range(G.shape[1]):
259+
if G[j, k, 0] == 1 and G[j, k, 1] == 0:
260+
G[j, k, 0] = 0
261+
G[j, k, 1] = 1
262+
return G
263+
264+
265+
class TestPlinkRoundTrip:
266+
def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
267+
# import pathlib
268+
# tmp_path = pathlib.Path("tmp/plink")
269+
plink_path = tmp_path / "plink"
270+
write_plink(tskit_vcz, plink_path)
271+
rt_vcz_path = tmp_path / "rt.vcz"
272+
p2z.convert(plink_path, rt_vcz_path)
273+
ds1 = sg.load_dataset(tskit_vcz)
274+
ds2 = sg.load_dataset(rt_vcz_path)
275+
276+
assert np.all(ds1["call_genotype_phased"])
277+
assert np.all(~ds2["call_genotype_phased"])
278+
279+
nt.assert_array_equal(
280+
recode_plink_hets(ds1["call_genotype"].values), ds2["call_genotype"]
281+
)
282+
283+
drop_fields = [
284+
"variant_id",
285+
"variant_id_mask",
286+
"call_genotype",
287+
"call_genotype_phased",
288+
]
289+
xt.assert_equal(
290+
ds1.drop_vars(["call_genotype", "call_genotype_phased"]),
291+
ds2.drop_vars(drop_fields),
292+
)
293+
294+
def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim):
295+
self.assert_bio2zarr_rt(tmp_path, fx_diploid_msprime_sim)
296+
297+
def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data):
298+
self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data)
299+
300+
def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic):
301+
with pytest.raises(ValueError, match="Only biallelic VCFs supported"):
302+
self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic)

vcztools/cli.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import contextlib
22
import os
3-
import pathlib
43
import sys
54
from functools import wraps
65

@@ -293,16 +292,7 @@ def view_plink1(path, include, exclude, out):
293292
-o intermediate.vcf && plink 1.9 --vcf intermediate.vcf [plink options]``
294293
without generating the intermediate VCF.
295294
"""
296-
out_prefix = pathlib.Path(out)
297-
writer = plink.Writer(
298-
path,
299-
bed_path=out_prefix.with_suffix(".bed"),
300-
fam_path=out_prefix.with_suffix(".fam"),
301-
bim_path=out_prefix.with_suffix(".bim"),
302-
include=include,
303-
exclude=exclude,
304-
)
305-
writer.run()
295+
plink.write_plink(path, out, include=include, exclude=exclude)
306296

307297

308298
@version

vcztools/plink.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
Convert VCZ to plink 1 binary format.
33
"""
44

5+
import pathlib
6+
57
import numpy as np
68
import pandas as pd
79
import zarr
@@ -46,10 +48,17 @@ def generate_bim(root, a12_allele):
4648
allele_1 = alleles[np.arange(num_variants), a12_allele[:, 0]]
4749
single_allele_sites = np.where(a12_allele[:, 0] == -1)
4850
allele_1[single_allele_sites] = "0"
51+
52+
num_variants = np.sum(select)
53+
if "variant_id" in root:
54+
variant_id = root["variant_id"][:][select]
55+
else:
56+
variant_id = np.array(["."] * num_variants, dtype="S")
57+
4958
df = pd.DataFrame(
5059
{
5160
"Chrom": contig_id[root["variant_contig"][:][select]],
52-
"VariantId": root["variant_id"][:][select],
61+
"VariantId": variant_id,
5362
"GeneticPosition": np.zeros(np.sum(select), dtype=int),
5463
"Position": root["variant_position"][:][select],
5564
"Allele1": allele_1,
@@ -74,6 +83,12 @@ def _compute_alleles(self, G, alleles):
7483
Returns the a12 alleles for the specified chunk of data.
7584
"""
7685
max_alleles = alleles.shape[1]
86+
if max_alleles != 2:
87+
raise ValueError(
88+
"Only biallelic VCFs supported currently: "
89+
"please comment on https://github.com/sgkit-dev/vcztools/issues/224 "
90+
"if this limitation affects you"
91+
)
7792
num_variants = G.shape[0]
7893
num_samples = G.shape[1]
7994
a12_allele = np.zeros((num_variants, 2), dtype=int) - 1
@@ -138,3 +153,17 @@ def run(self):
138153

139154
with open(self.fam_path, "w") as f:
140155
f.write(generate_fam(self.root))
156+
157+
158+
def write_plink(vcz_path, out, include=None, exclude=None):
159+
out_prefix = pathlib.Path(out)
160+
# out_prefix.mkdir(exist_ok=True)
161+
writer = Writer(
162+
vcz_path,
163+
bed_path=out_prefix.with_suffix(".bed"),
164+
fam_path=out_prefix.with_suffix(".fam"),
165+
bim_path=out_prefix.with_suffix(".bim"),
166+
include=include,
167+
exclude=exclude,
168+
)
169+
writer.run()

0 commit comments

Comments
 (0)