5
5
from typing import MutableMapping , Optional , TextIO , Union
6
6
7
7
import numpy as np
8
+ from vcztools .regions import parse_targets_string , pslice_to_slice
8
9
import zarr
9
10
10
11
from . import _vcztools
@@ -80,7 +81,7 @@ def dims(arr):
80
81
81
82
82
83
def write_vcf (
83
- vcz , output , * , vcf_header : Optional [str ] = None , implementation = "numba"
84
+ vcz , output , * , vcf_header : Optional [str ] = None , variant_targets = None , implementation = "numba"
84
85
) -> None :
85
86
"""Convert a dataset to a VCF file.
86
87
@@ -163,7 +164,19 @@ def write_vcf(
163
164
contigs = root ["contig_id" ][:].astype ("S" )
164
165
filters = root ["filter_id" ][:].astype ("S" )
165
166
167
+ if variant_targets is None :
168
+ variant_mask = np .ones (pos .shape [0 ], dtype = bool )
169
+ else :
170
+ contig , start , end = parse_targets_string (variant_targets )
171
+ variant_slice = pslice_to_slice (root ["contig_id" ][:].astype ("U" ).tolist (), root ["variant_contig" ], pos , contig , start , end )
172
+ variant_mask = np .zeros (pos .shape [0 ], dtype = bool )
173
+ variant_mask [variant_slice ] = 1
174
+ # Use zarr arrays to get mask chunks aligned with the main data
175
+ # for convenience.
176
+ z_variant_mask = zarr .array (variant_mask , chunks = pos .chunks [0 ])
177
+
166
178
for v_chunk in range (pos .cdata_shape [0 ]):
179
+ v_mask_chunk = z_variant_mask .blocks [v_chunk ]
167
180
if implementation == "numba" :
168
181
numba_chunk_to_vcf (
169
182
root ,
@@ -178,22 +191,27 @@ def write_vcf(
178
191
c_chunk_to_vcf (
179
192
root ,
180
193
v_chunk ,
194
+ v_mask_chunk ,
181
195
contigs ,
182
196
filters ,
183
197
output ,
184
198
)
185
199
186
200
187
- def c_chunk_to_vcf (root , v_chunk , contigs , filters , output ):
188
- chrom = contigs [root .variant_contig .blocks [v_chunk ]]
201
def get_block_selection(zarray, key, mask):
    """Return the entries of one block of ``zarray`` picked out by ``mask``.

    The block is looked up as ``zarray.blocks[key]`` and the result is then
    indexed with ``mask``; whatever that indexing yields is returned as-is.
    """
    block = zarray.blocks[key]
    return block[mask]
203
+
204
+
205
+ def c_chunk_to_vcf (root , v_chunk , v_mask_chunk , contigs , filters , output ):
206
+ chrom = contigs [get_block_selection (root .variant_contig , v_chunk , v_mask_chunk )]
189
207
# TODO check we don't truncate silently by doing this
190
- pos = root .variant_position . blocks [ v_chunk ] .astype (np .int32 )
191
- id = root .variant_id . blocks [ v_chunk ] .astype ("S" )
192
- alleles = root .variant_allele . blocks [ v_chunk ]
208
+ pos = get_block_selection ( root .variant_position , v_chunk , v_mask_chunk ) .astype (np .int32 )
209
+ id = get_block_selection ( root .variant_id , v_chunk , v_mask_chunk ) .astype ("S" )
210
+ alleles = get_block_selection ( root .variant_allele , v_chunk , v_mask_chunk )
193
211
ref = alleles [:, 0 ].astype ("S" )
194
212
alt = alleles [:, 1 :].astype ("S" )
195
- qual = root .variant_quality . blocks [ v_chunk ]
196
- filter_ = root .variant_filter . blocks [ v_chunk ]
213
+ qual = get_block_selection ( root .variant_quality , v_chunk , v_mask_chunk )
214
+ filter_ = get_block_selection ( root .variant_filter , v_chunk , v_mask_chunk )
197
215
198
216
num_variants = len (pos )
199
217
if len (id .shape ) == 1 :
@@ -207,21 +225,21 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output):
207
225
for name , array in root .items ():
208
226
if name .startswith ("call_" ) and not name .startswith ("call_genotype" ):
209
227
vcf_name = name [len ("call_" ) :]
210
- format_fields [vcf_name ] = array . blocks [ v_chunk ]
228
+ format_fields [vcf_name ] = get_block_selection ( array , v_chunk , v_mask_chunk )
211
229
if num_samples is None :
212
230
num_samples = array .shape [1 ]
213
231
elif name .startswith ("variant_" ) and name not in RESERVED_VARIABLE_NAMES :
214
232
vcf_name = name [len ("variant_" ) :]
215
- info_fields [vcf_name ] = array . blocks [ v_chunk ]
233
+ info_fields [vcf_name ] = get_block_selection ( array , v_chunk , v_mask_chunk )
216
234
217
235
gt = None
218
236
gt_phased = None
219
237
if "call_genotype" in root :
220
238
array = root ["call_genotype" ]
221
- gt = array . blocks [ v_chunk ]
239
+ gt = get_block_selection ( array , v_chunk , v_mask_chunk )
222
240
if "call_genotype_phased" in root :
223
241
array = root ["call_genotype_phased" ]
224
- gt_phased = array . blocks [ v_chunk ]
242
+ gt_phased = get_block_selection ( array , v_chunk , v_mask_chunk )
225
243
else :
226
244
gt_phased = np .zeros_like (gt , dtype = bool )
227
245
0 commit comments