Skip to content

Commit 809028c

Browse files
committed
Loosen test for checking if VCF headers are the same
1 parent 9af47f7 commit 809028c

File tree

2 files changed

+46
-13
lines changed

2 files changed

+46
-13
lines changed

tests/test_vcf_writer.py

+6-12
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import numpy as np
77
import pytest
88
import zarr
9-
from bio2zarr import icf
109
from cyvcf2 import VCF
1110
from numpy.testing import assert_array_equal
1211

@@ -303,15 +302,9 @@ def test_write_vcf__header_flags(tmp_path):
303302
assert_vcfs_close(original, output)
304303

305304

306-
def test_write_vcf__generate_header(tmp_path):
305+
def test_write_vcf__generate_header():
307306
original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
308-
# don't use cache here since we mutate the vcz
309-
vcz = tmp_path.joinpath("intermediate.vcz")
310-
icf.convert([original], vcz, worker_processes=0, local_alleles=False)
311-
312-
# remove vcf_header
313-
root = zarr.open(vcz, mode="r+")
314-
del root.attrs["vcf_header"]
307+
vcz = vcz_path_cache(original)
315308

316309
output_header = StringIO()
317310
write_vcf(vcz, output_header, header_only=True, no_version=True)
@@ -326,9 +319,9 @@ def test_write_vcf__generate_header(tmp_path):
326319
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
327320
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
328321
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
329-
##FILTER=<ID=PASS,Description="">
330-
##FILTER=<ID=s50,Description="">
331-
##FILTER=<ID=q10,Description="">
322+
##FILTER=<ID=PASS,Description="All filters passed">
323+
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
324+
##FILTER=<ID=q10,Description="Quality below 10">
332325
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
333326
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
334327
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
@@ -340,6 +333,7 @@ def test_write_vcf__generate_header(tmp_path):
340333
""" # noqa: E501
341334

342335
# substitute value of source
336+
root = zarr.open(vcz, mode="r+")
343337
expected_vcf_header = expected_vcf_header.format(root.attrs["source"])
344338

345339
assert output_header.getvalue() == expected_vcf_header

tests/utils.py

+40-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,45 @@ def normalise_info_missingness(info_dict, key):
2929
return value
3030

3131

32+
def _get_headers(vcf, header_type):
    """Collect the header records of one type from an open VCF, keyed by ID.

    Parameters
    ----------
    vcf
        Object exposing ``header_iter()``; each yielded record must support
        ``record["ID"]``, ``record["HeaderType"]`` and ``record.info(extra=True)``
        (presumably a ``cyvcf2.VCF`` -- confirm against callers).
    header_type
        Value compared against each record's ``"HeaderType"`` entry,
        e.g. ``"INFO"`` or ``"FILTER"``.

    Returns
    -------
    dict
        Maps each matching record's ID to its cleaned attribute dict.
    """

    def to_dict(header_field):
        attrs = header_field.info(extra=True)

        # IDX is dropped because we don't care about ordering. pop() with a
        # default is used so a record without IDX, or with a str-keyed IDX,
        # neither raises KeyError nor leaks ordering into the comparison.
        attrs.pop(b"IDX", None)
        attrs.pop("IDX", None)

        # cyvcf2 duplicates some keys as both str and bytes; keep the str copy
        # and discard the bytes key whenever its decoded form is also present.
        return {
            key: value
            for key, value in attrs.items()
            if not (isinstance(key, bytes) and key.decode("utf-8") in attrs)
        }

    return {
        field["ID"]: to_dict(field)
        for field in vcf.header_iter()
        if field["HeaderType"] == header_type
    }
48+
49+
50+
def _assert_vcf_headers_equivalent(vcf1, vcf2):
    """Assert that two open VCFs declare equivalent structured header records.

    Only the INFO, FORMAT, FILTER and CONTIG records are compared, and their
    order within the header is ignored. Other header fields are ignored.

    Parameters
    ----------
    vcf1, vcf2
        Open VCF objects accepted by ``_get_headers``.

    Raises
    ------
    AssertionError
        If the records of any compared header type differ between the files.
    """
    for header_type in ("INFO", "FORMAT", "FILTER", "CONTIG"):
        headers1 = _get_headers(vcf1, header_type)
        headers2 = _get_headers(vcf2, header_type)
        # Name the header type in the message: with a single shared assert the
        # failing line number no longer identifies which type differed.
        assert headers1 == headers2, f"{header_type} header records differ"
69+
70+
3271
def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03):
3372
"""Like :py:func:`numpy.testing.assert_allclose()`, but for VCF files.
3473
@@ -48,7 +87,7 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03):
4887
Absolute tolerance.
4988
"""
5089
with open_vcf(f1) as vcf1, open_vcf(f2) as vcf2:
51-
assert vcf1.raw_header == vcf2.raw_header
90+
_assert_vcf_headers_equivalent(vcf1, vcf2)
5291
assert vcf1.samples == vcf2.samples
5392

5493
for v1, v2 in zip_longest(vcf1, vcf2):

0 commit comments

Comments
 (0)