3
3
with various outputs.
4
4
"""
5
5
6
+ import bio2zarr .plink as p2z
6
7
import bio2zarr .tskit as ts2z
7
8
import bio2zarr .vcf as v2z
8
9
import msprime
13
14
import tskit
14
15
import xarray .testing as xt
15
16
17
+ from vcztools .plink import write_plink
16
18
from vcztools .vcf_writer import write_vcf
17
19
18
20
@@ -35,14 +37,126 @@ def add_mutations(ts):
35
37
@pytest.fixture()
def fx_diploid_msprime_sim(tmp_path):
    """
    Run a seeded msprime simulation, convert it with ``ts2z.convert`` and
    return the path of the resulting ``sim.vcz`` store under ``tmp_path``.
    """
    seed = 1234
    ancestry = msprime.sim_ancestry(5, sequence_length=10_000, random_seed=seed)
    mutated = msprime.sim_mutations(ancestry, rate=1e-4, random_seed=seed)
    assert mutated.num_mutations > 0
    # make sure we have biallelic sites (exactly one mutation per site)
    assert mutated.num_mutations == mutated.num_sites
    out_path = tmp_path / "sim.vcz"
    ts2z.convert(mutated, out_path)
    return out_path
44
47
45
48
49
@pytest.fixture()
def fx_haploid_missing_data(tmp_path):
    """
    Three-leaf balanced tree plus one extra sample node (5) that is added
    to the node table only, converted with ``isolated_as_missing=True``.
    Returns the path of the written ``sim.vcz`` store.
    """
    # 2.00┊   4     ┊
    #     ┊ ┏━┻┓    ┊
    # 1.00┊ ┃  3    ┊
    #     ┊ ┃ ┏┻┓   ┊
    # 0.00┊ 0 1 2 5 ┊
    #     0         10
    #       |     |
    # pos   2     9
    # anc   A     T
    base = tskit.Tree.generate_balanced(3, span=10).tree_sequence
    tables = base.dump_tables()
    # Extra sample with no edges attached to it.
    tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
    for position, ancestral in ((2, "A"), (9, "T")):
        tables.sites.add_row(position, ancestral_state=ancestral)
    for site, node, derived in ((0, 0, "G"), (1, 3, "C")):
        tables.mutations.add_row(site=site, node=node, derived_state=derived)
    ts = tables.tree_sequence()
    zarr_path = tmp_path / "sim.vcz"
    ts2z.convert(ts, zarr_path, isolated_as_missing=True)
    return zarr_path
70
+
71
+
72
def test_haploid_missing_data(fx_haploid_missing_data):
    """Check the genotypes stored for the haploid missing-data fixture."""
    ds = sg.load_dataset(fx_haploid_missing_data)
    # Two sites by four samples, one allele each; the last sample is -1
    # at both sites.
    expected = [
        [[1], [0], [0], [-1]],
        [[0], [1], [1], [-1]],
    ]
    nt.assert_array_equal(ds.call_genotype.values, expected)
81
+
82
+
83
@pytest.fixture()
def fx_diploid_missing_data(tmp_path):
    """
    Four-leaf balanced tree plus two extra sample nodes (7 and 8) that are
    added to the node table only, mapped to ploidy 2 and converted with
    ``isolated_as_missing=True``. Returns the path of the ``sim.vcz`` store.
    """
    # 2.00┊    6        ┊
    #     ┊  ┏━┻━┓      ┊
    # 1.00┊  4   5      ┊
    #     ┊ ┏┻┓ ┏┻┓     ┊
    # 0.00┊ 0 1 2 3 7 8 ┊
    #     0             10
    #       |       |
    # pos   2       9
    # anc   A       T
    base = tskit.Tree.generate_balanced(4, span=10).tree_sequence
    tables = base.dump_tables()
    tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
    last_node = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
    assert last_node == 8
    for position, ancestral in ((2, "A"), (9, "T")):
        tables.sites.add_row(position, ancestral_state=ancestral)
    for site, node, derived in ((0, 0, "G"), (1, 5, "C")):
        tables.mutations.add_row(site=site, node=node, derived_state=derived)
    ts = tables.tree_sequence()
    zarr_path = tmp_path / "sim.vcz"
    ts2z.convert(
        ts,
        zarr_path,
        model_mapping=ts.map_to_vcf_model(ploidy=2),
        isolated_as_missing=True,
    )
    return zarr_path
108
+
109
+
110
def test_diploid_missing_data(fx_diploid_missing_data):
    """Check the genotypes stored for the diploid missing-data fixture."""
    ds = sg.load_dataset(fx_diploid_missing_data)
    # Two sites by three diploid samples; the last sample is -1 on both
    # alleles at both sites.
    expected = [
        [[1, 0], [0, 0], [-1, -1]],
        [[0, 0], [1, 1], [-1, -1]],
    ]
    nt.assert_array_equal(ds.call_genotype.values, expected)
119
+
120
+
121
@pytest.fixture()
def fx_diploid_multi_allelic(tmp_path):
    """
    Four-leaf balanced tree with two sites, the second carrying two
    different derived states, mapped to ploidy 2 and converted to
    ``sim.vcz`` under ``tmp_path``. Returns the store path.
    """
    # 2.00┊    6    ┊
    #     ┊  ┏━┻━┓  ┊
    # 1.00┊  4   5  ┊
    #     ┊ ┏┻┓ ┏┻┓ ┊
    # 0.00┊ 0 1 2 3 ┊
    #     0         10
    #       |     |
    # pos   2     9
    # anc   A     T
    base = tskit.Tree.generate_balanced(4, span=10).tree_sequence
    tables = base.dump_tables()
    for position, ancestral in ((2, "A"), (9, "T")):
        tables.sites.add_row(position, ancestral_state=ancestral)
    for site, node, derived in ((0, 0, "G"), (1, 1, "G"), (1, 5, "C")):
        tables.mutations.add_row(site=site, node=node, derived_state=derived)
    ts = tables.tree_sequence()
    zarr_path = tmp_path / "sim.vcz"
    ts2z.convert(ts, zarr_path, model_mapping=ts.map_to_vcf_model(ploidy=2))
    return zarr_path
144
+
145
+
146
def test_diploid_multi_allelic(fx_diploid_multi_allelic):
    """Check alleles and genotypes stored for the multi-allelic fixture."""
    ds = sg.load_dataset(fx_diploid_multi_allelic)
    # NOTE this example is constructed so that the rarest allele is in the
    # middle of the alleles array
    expected_alleles = [["A", "G", ""], ["T", "G", "C"]]
    expected_genotypes = [
        [[1, 0], [0, 0]],
        [[0, 1], [2, 2]],
    ]
    nt.assert_array_equal(ds.variant_allele.values, expected_alleles)
    nt.assert_array_equal(ds.call_genotype.values, expected_genotypes)
158
+
159
+
46
160
@pytest .fixture ()
47
161
def fx_haploid_msprime_sim (tmp_path ):
48
162
seed = 12345
@@ -107,8 +221,8 @@ def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
107
221
"variant_filter" ,
108
222
"variant_quality" ,
109
223
]
110
- xt .assert_equal (ds1 , ds2 .drop (drop_fields ))
111
- num_variants = ds2 .dims ["variants" ]
224
+ xt .assert_equal (ds1 , ds2 .drop_vars (drop_fields ))
225
+ num_variants = ds2 .sizes ["variants" ]
112
226
assert np .all (np .isnan (ds2 ["variant_quality" ].values ))
113
227
nt .assert_array_equal (
114
228
ds2 ["variant_filter" ], np .ones ((num_variants , 1 ), dtype = bool )
@@ -123,3 +237,66 @@ def test_haploid_msprime_sim(self, tmp_path, fx_haploid_msprime_sim):
123
237
124
238
def test_simple_ts(self, tmp_path, fx_simple_ts):
    """Run ``assert_bio2zarr_rt`` on the ``fx_simple_ts`` fixture."""
    self.assert_bio2zarr_rt(tmp_path=tmp_path, tskit_vcz=fx_simple_ts)
240
+
241
def test_haploid_missing_data(self, tmp_path, fx_haploid_missing_data):
    """Run ``assert_bio2zarr_rt`` on the haploid missing-data fixture."""
    self.assert_bio2zarr_rt(tmp_path=tmp_path, tskit_vcz=fx_haploid_missing_data)
243
+
244
def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data):
    """Run ``assert_bio2zarr_rt`` on the diploid missing-data fixture."""
    self.assert_bio2zarr_rt(tmp_path=tmp_path, tskit_vcz=fx_diploid_missing_data)
246
+
247
def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic):
    """Run ``assert_bio2zarr_rt`` on the diploid multi-allelic fixture."""
    self.assert_bio2zarr_rt(tmp_path=tmp_path, tskit_vcz=fx_diploid_multi_allelic)
249
+
250
+
251
def recode_plink_hets(G):
    """
    Returns a copy of the specified genotype matrix in which hets are all
    in the canonical unphased plink orientation, [0, 1]

    ``G`` is a 3-D array whose last axis holds at least two allele entries
    per call. Every call whose first two entries are ``[1, 0]`` is rewritten
    to ``[0, 1]``; all other calls (including ``-1`` missing values) are
    left as they are. The input array itself is not modified.
    """
    recoded = G.copy()
    # Calls stored as [1, 0], found for the whole array in one vectorized
    # comparison rather than a Python loop over the first two axes.
    flipped = (recoded[:, :, 0] == 1) & (recoded[:, :, 1] == 0)
    recoded[flipped, 0] = 0
    recoded[flipped, 1] = 1
    return recoded
263
+
264
+
265
class TestPlinkRoundTrip:
    """
    Round-trip tests: a tskit-derived store is written out with
    ``write_plink``, converted back with ``p2z.convert`` and compared with
    the store it started from.
    """

    def assert_bio2zarr_rt(self, tmp_path, tskit_vcz):
        """
        Write ``tskit_vcz`` to plink under ``tmp_path``, convert that back
        to ``rt.vcz`` and assert that the two datasets agree.
        """
        plink_out = tmp_path / "plink"
        write_plink(tskit_vcz, plink_out)
        roundtrip_path = tmp_path / "rt.vcz"
        p2z.convert(plink_out, roundtrip_path)
        original = sg.load_dataset(tskit_vcz)
        roundtrip = sg.load_dataset(roundtrip_path)

        # The source store must be fully phased, the round-tripped one
        # fully unphased.
        assert np.all(original["call_genotype_phased"])
        assert np.all(~roundtrip["call_genotype_phased"])

        # Compare genotypes after putting the source hets in [0, 1] order.
        canonical = recode_plink_hets(original["call_genotype"].values)
        nt.assert_array_equal(canonical, roundtrip["call_genotype"])

        # Everything else must match once the genotype fields (checked
        # above) and the id fields are removed from the round-trip side.
        genotype_fields = ["call_genotype", "call_genotype_phased"]
        id_fields = ["variant_id", "variant_id_mask"]
        xt.assert_equal(
            original.drop_vars(genotype_fields),
            roundtrip.drop_vars(id_fields + genotype_fields),
        )

    def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim):
        """Round-trip the diploid msprime simulation fixture."""
        self.assert_bio2zarr_rt(tmp_path=tmp_path, tskit_vcz=fx_diploid_msprime_sim)

    def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data):
        """Round-trip the diploid missing-data fixture."""
        self.assert_bio2zarr_rt(tmp_path=tmp_path, tskit_vcz=fx_diploid_missing_data)

    def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic):
        """The multi-allelic fixture must be rejected with a ValueError."""
        with pytest.raises(ValueError, match="Only biallelic VCFs supported"):
            self.assert_bio2zarr_rt(
                tmp_path=tmp_path, tskit_vcz=fx_diploid_multi_allelic
            )
0 commit comments