Skip to content

Commit da807cf

Browse files
Add AC/AN to the header as required with -s
Closes #146
1 parent aacd32c commit da807cf

File tree

2 files changed

+31
-6
lines changed

2 files changed

+31
-6
lines changed

tests/test_bcftools_validation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]:
7474
("view --no-version -s NA00001,NA00003", "sample.vcf.gz"),
7575
("view --no-version -s HG00096", "1kg_2020_chrM.vcf.gz"),
7676
("view --no-version -s tsk_0,tsk_1", "msprime_diploid.vcf.gz"),
77+
("view --no-version -s tsk_0,tsk_1,tsk_2", "msprime_diploid.vcf.gz"),
78+
("view --no-version -s ^tsk_0,tsk_1,tsk_2", "msprime_diploid.vcf.gz"),
7779
("view --no-version -s '' --force-samples", "sample.vcf.gz"),
7880
("view --no-version -s 'NO_SAMPLE' --force-samples", "sample.vcf.gz"),
7981
("view --no-version -s 'NO_SAMPLE,NA00001' --force-samples", "sample.vcf.gz"),

vcztools/vcf_writer.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
# [Table 1: Reserved INFO keys]
3333
RESERVED_INFO_KEY_DESCRIPTIONS = {
3434
"AA": "Ancestral allele",
35-
"AC": "Allele count in genotypes, for each ALT allele, in the same order as listed",
35+
"AC": "Allele count in genotypes",
3636
"AD": "Total read depth for each allele",
3737
"ADF": "Read depth for each allele on the forward strand",
3838
"ADR": "Read depth for each allele on the reverse strand",
@@ -136,6 +136,7 @@ def write_vcf(
136136
root = zarr.open(vcz, mode="r")
137137

138138
with open_file_like(output) as output:
139+
force_ac_an_header = False
139140
if samples and drop_genotypes:
140141
raise ValueError("Cannot select samples and drop genotypes.")
141142
elif drop_genotypes:
@@ -145,6 +146,7 @@ def write_vcf(
145146
sample_ids = root["sample_id"][:]
146147
samples_selection = None
147148
else:
149+
force_ac_an_header = True
148150
all_samples = root["sample_id"][:]
149151
exclude_samples = samples.startswith("^")
150152
samples = samples.lstrip("^")
@@ -157,15 +159,15 @@ def write_vcf(
157159
if force_samples:
158160
# remove unknown samples from sample_ids
159161
logger.warning(
160-
'subset called for sample(s) not in header: '
162+
"subset called for sample(s) not in header: "
161163
f'{",".join(unknown_samples)}.'
162164
)
163165
sample_ids = np.delete(
164166
sample_ids, search(sample_ids, unknown_samples)
165167
)
166168
else:
167169
raise ValueError(
168-
'subset called for sample(s) not in header: '
170+
"subset called for sample(s) not in header: "
169171
f'{",".join(unknown_samples)}. '
170172
'Use "--force-samples" to ignore this error.'
171173
)
@@ -180,7 +182,11 @@ def write_vcf(
180182
if not no_header:
181183
original_header = root.attrs.get("vcf_header", None)
182184
vcf_header = _generate_header(
183-
root, original_header, sample_ids, no_version=no_version
185+
root,
186+
original_header,
187+
sample_ids,
188+
no_version=no_version,
189+
force_ac_an=force_ac_an_header,
184190
)
185191
print(vcf_header, end="", file=output)
186192

@@ -453,7 +459,14 @@ def c_chunk_to_vcf(
453459
print(line, file=output)
454460

455461

456-
def _generate_header(ds, original_header, sample_ids, *, no_version: bool = False):
462+
def _generate_header(
463+
ds,
464+
original_header,
465+
sample_ids,
466+
*,
467+
no_version: bool = False,
468+
force_ac_an: bool = False,
469+
):
457470
output = io.StringIO()
458471

459472
contigs = list(ds["contig_id"][:])
@@ -488,7 +501,6 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals
488501
if key in ("genotype", "genotype_phased"):
489502
continue
490503
format_fields.append(key)
491-
492504
if original_header is None: # generate entire header
493505
# [1.4.1 File format]
494506
print("##fileformat=VCFv4.3", file=output)
@@ -543,6 +555,17 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals
543555
file=output,
544556
)
545557

558+
if force_ac_an:
559+
# bcftools always recomputes the AC and AN fields when samples are specified,
560+
# even if these fields don't exist before
561+
for key, number in [("AC", "A"), ("AN", "1")]:
562+
if key not in info_fields:
563+
print(
564+
f"##INFO=<ID={key},Number={number},Type=Integer,"
565+
f'Description="{RESERVED_INFO_KEY_DESCRIPTIONS[key]}">',
566+
file=output,
567+
)
568+
546569
# [1.4.3 Filter field format]
547570
for filter in filters:
548571
print(f'##FILTER=<ID={filter},Description="">', file=output)

0 commit comments

Comments
 (0)