32
32
# [Table 1: Reserved INFO keys]
33
33
RESERVED_INFO_KEY_DESCRIPTIONS = {
34
34
"AA" : "Ancestral allele" ,
35
- "AC" : "Allele count in genotypes, for each ALT allele, in the same order as listed " ,
35
+ "AC" : "Allele count in genotypes" ,
36
36
"AD" : "Total read depth for each allele" ,
37
37
"ADF" : "Read depth for each allele on the forward strand" ,
38
38
"ADR" : "Read depth for each allele on the reverse strand" ,
@@ -136,6 +136,7 @@ def write_vcf(
136
136
root = zarr .open (vcz , mode = "r" )
137
137
138
138
with open_file_like (output ) as output :
139
+ force_ac_an_header = False
139
140
if samples and drop_genotypes :
140
141
raise ValueError ("Cannot select samples and drop genotypes." )
141
142
elif drop_genotypes :
@@ -145,6 +146,7 @@ def write_vcf(
145
146
sample_ids = root ["sample_id" ][:]
146
147
samples_selection = None
147
148
else :
149
+ force_ac_an_header = True
148
150
all_samples = root ["sample_id" ][:]
149
151
exclude_samples = samples .startswith ("^" )
150
152
samples = samples .lstrip ("^" )
@@ -157,15 +159,15 @@ def write_vcf(
157
159
if force_samples :
158
160
# remove unknown samples from sample_ids
159
161
logger .warning (
160
- ' subset called for sample(s) not in header: '
162
+ " subset called for sample(s) not in header: "
161
163
f'{ "," .join (unknown_samples )} .'
162
164
)
163
165
sample_ids = np .delete (
164
166
sample_ids , search (sample_ids , unknown_samples )
165
167
)
166
168
else :
167
169
raise ValueError (
168
- ' subset called for sample(s) not in header: '
170
+ " subset called for sample(s) not in header: "
169
171
f'{ "," .join (unknown_samples )} . '
170
172
'Use "--force-samples" to ignore this error.'
171
173
)
@@ -180,7 +182,11 @@ def write_vcf(
180
182
if not no_header :
181
183
original_header = root .attrs .get ("vcf_header" , None )
182
184
vcf_header = _generate_header (
183
- root , original_header , sample_ids , no_version = no_version
185
+ root ,
186
+ original_header ,
187
+ sample_ids ,
188
+ no_version = no_version ,
189
+ force_ac_an = force_ac_an_header ,
184
190
)
185
191
print (vcf_header , end = "" , file = output )
186
192
@@ -453,7 +459,14 @@ def c_chunk_to_vcf(
453
459
print (line , file = output )
454
460
455
461
456
- def _generate_header (ds , original_header , sample_ids , * , no_version : bool = False ):
462
+ def _generate_header (
463
+ ds ,
464
+ original_header ,
465
+ sample_ids ,
466
+ * ,
467
+ no_version : bool = False ,
468
+ force_ac_an : bool = False ,
469
+ ):
457
470
output = io .StringIO ()
458
471
459
472
contigs = list (ds ["contig_id" ][:])
@@ -488,7 +501,6 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals
488
501
if key in ("genotype" , "genotype_phased" ):
489
502
continue
490
503
format_fields .append (key )
491
-
492
504
if original_header is None : # generate entire header
493
505
# [1.4.1 File format]
494
506
print ("##fileformat=VCFv4.3" , file = output )
@@ -543,6 +555,17 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals
543
555
file = output ,
544
556
)
545
557
558
+ if force_ac_an :
559
+ # bcftools always recomputes the AC and AN fields when samples are specified,
560
+ # even if these fields don't exist before
561
+ for key , number in [("AC" , "A" ), ("AN" , "1" )]:
562
+ if key not in info_fields :
563
+ print (
564
+ f"##INFO=<ID={ key } ,Number={ number } ,Type=Integer,"
565
+ f'Description="{ RESERVED_INFO_KEY_DESCRIPTIONS [key ]} ">' ,
566
+ file = output ,
567
+ )
568
+
546
569
# [1.4.3 Filter field format]
547
570
for filter in filters :
548
571
print (f'##FILTER=<ID={ filter } ,Description="">' , file = output )
0 commit comments