Skip to content

Commit 316d264

Browse files
tomwhite authored and jeromekelleher committed
Support filtering by CHROM
1 parent 7526549 commit 316d264

File tree

3 files changed: 55 additions and 10 deletions

tests/test_bcftools_validation.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,8 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]:
4646
("view --no-version", "sample.vcf.gz"),
4747
("view --no-version", "chr22.vcf.gz"),
4848
("view --no-version", "msprime_diploid.vcf.gz"),
49+
("view --no-version -i 'CHROM == \"20\"'", "sample.vcf.gz"),
50+
("view --no-version -i 'CHROM != \"Z\"'", "sample.vcf.gz"),
4951
("view --no-version -i 'ID == \"rs6054257\"'", "sample.vcf.gz"),
5052
("view --no-version -i 'DB=0'", "sample.vcf.gz"),
5153
("view --no-version -i 'DB=1'", "sample.vcf.gz"),

tests/test_filter.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,6 @@ def test_invalid_expressions(self, parser, expression):
3535
# generic string issue. Can fix this later when we've gotten
3636
# some partial string handling implemented
3737
("INFO/HAYSTACK ~ 0", filter_mod.UnsupportedRegexError),
38-
('CHROM="1"', filter_mod.UnsupportedChromFieldError),
3938
('DP="."', filter_mod.UnsupportedMissingDataError),
4039
("ID!=@~/file", filter_mod.UnsupportedFileReferenceError),
4140
("INFO/TAG=@file", filter_mod.UnsupportedFileReferenceError),
@@ -68,6 +67,7 @@ class TestFilterExpressionSample:
6867
@pytest.mark.parametrize(
6968
("expression", "expected_result"),
7069
[
70+
('CHROM = "20"', [0, 0, 1, 1, 1, 1, 1, 1, 0]),
7171
("POS < 1000", [1, 1, 0, 0, 0, 0, 0, 0, 1]),
7272
("INFO/DP > 10", [0, 0, 1, 1, 0, 1, 0, 0, 0]),
7373
(

vcztools/filter.py

Lines changed: 52 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -52,11 +52,6 @@ class UnsupportedFileReferenceError(UnsupportedFilteringFeatureError):
5252
feature = "File references"
5353

5454

55-
class UnsupportedChromFieldError(UnsupportedFilteringFeatureError):
56-
issue = "178"
57-
feature = "CHROM field"
58-
59-
6055
class UnsupportedFunctionsError(UnsupportedFilteringFeatureError):
6156
issue = "190"
6257
feature = "Function evaluation"
@@ -117,9 +112,7 @@ def __init__(self, tokens):
117112
class Identifier(EvaluationNode):
118113
def __init__(self, mapper, tokens):
119114
token = tokens[0]
120-
if token == "CHROM":
121-
raise UnsupportedChromFieldError()
122-
elif token == "GT":
115+
if token == "GT":
123116
raise UnsupportedGenotypeValuesError()
124117
self.field_name = mapper(token)
125118
logger.debug(f"Mapped {token} to {self.field_name}")
@@ -292,6 +285,47 @@ def referenced_fields(self):
292285
return self.op1.referenced_fields() | self.op2.referenced_fields()
293286

294287

288+
# CHROM field expressions are translated to contig IDs to avoid string
289+
# comparisons for every variant site
290+
291+
292+
class ChromString(Constant):
293+
def __init__(self, tokens):
294+
super().__init__(tokens)
295+
296+
def eval(self, data):
297+
contig_ids = list(data["contig_id"])
298+
try:
299+
return contig_ids.index(self.tokens)
300+
except ValueError:
301+
return -1 # won't match anything
302+
303+
def referenced_fields(self):
304+
return frozenset(["contig_id"])
305+
306+
307+
class ChromFieldOperator(EvaluationNode):
308+
op_map = {
309+
"=": operator.eq,
310+
"==": operator.eq,
311+
"!=": operator.ne,
312+
}
313+
314+
def __init__(self, tokens):
315+
super().__init__(tokens)
316+
self.op1, self.op, self.op2 = tokens # not self.tokens
317+
self.comparison_fn = self.op_map[self.op]
318+
319+
def eval(self, data):
320+
return self.comparison_fn(self.op1.eval(data), self.op2.eval(data))
321+
322+
def __repr__(self):
323+
return f"({repr(self.op1)}){self.op}({repr(self.op2)})"
324+
325+
def referenced_fields(self):
326+
return self.op1.referenced_fields() | self.op2.referenced_fields()
327+
328+
295329
# FILTER field expressions have special set-like semantics
296330
# so they are handled by dedicated operators.
297331

@@ -382,6 +416,14 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
382416
if map_vcf_identifiers:
383417
name_mapper = functools.partial(vcf_name_to_vcz_name, all_fields)
384418

419+
chrom_field_identifier = pp.Literal("CHROM")
420+
chrom_field_identifier = chrom_field_identifier.set_parse_action(
421+
functools.partial(Identifier, name_mapper)
422+
)
423+
chrom_string = pp.QuotedString('"').set_parse_action(ChromString)
424+
chrom_field_expr = chrom_field_identifier + pp.one_of("= == !=") + chrom_string
425+
chrom_field_expr = chrom_field_expr.set_parse_action(ChromFieldOperator)
426+
385427
filter_field_identifier = pp.Literal("FILTER")
386428
filter_field_identifier = filter_field_identifier.set_parse_action(
387429
functools.partial(Identifier, name_mapper)
@@ -416,7 +458,8 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
416458

417459
comp_op = pp.oneOf("< = == > >= <= !=")
418460
filter_expression = pp.infix_notation(
419-
filter_field_expr
461+
chrom_field_expr
462+
| filter_field_expr
420463
| function
421464
| constant
422465
| indexed_identifier

0 commit comments

Comments
 (0)