From 3c6cde517647656bd95c7eaed4d66bfc6d8f3bb3 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 6 Feb 2024 13:23:06 -0800 Subject: [PATCH 1/2] fix (parseVEP): indels can be start-inclusion, too --- CHANGELOG.md | 4 +++- moPepGen/parser/VEPParser.py | 15 +++++++-------- test/unit/test_vep_parser.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6af3805d..0c1cd230 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Updated `splitFasta` and `summarizeFasta` to accept source combinations in `--order-source`. -- Fixed parseCIRCexplorer so the exon/intron indices in variant IDs are sorted correctly. +- Fixed `parseCIRCexplorer` so the exon/intron indices in variant IDs are sorted correctly. + +- Fixed `parseVEP` to handle insertions in start-inclusion. #840 ## [1.2.1] - 2023-10-05 diff --git a/moPepGen/parser/VEPParser.py b/moPepGen/parser/VEPParser.py index d8ed41bd..e0d3ce5e 100644 --- a/moPepGen/parser/VEPParser.py +++ b/moPepGen/parser/VEPParser.py @@ -105,7 +105,7 @@ def __init__( def __repr__(self)->str: """Return representation of the VEP record.""" consequences = '|'.join(self.consequences) - return f"< {self.feature}, {consequences}, {self.location} >" + return f"< {self.feature}, {consequences}, {self.location}, {self.allele} >" def convert_to_variant_record(self, anno:gtf.GenomicAnnotation, genome:dna.DNASeqDict) -> seqvar.VariantRecord: @@ -174,18 +174,17 @@ def convert_to_variant_record(self, anno:gtf.GenomicAnnotation, # Sometimes insertions are reported by VEP in the end-inclusion # way (e.g., C -> TACC), which needs to be converted into # start-inclusion (A -> ATAC) for variants on + strand genes. - if strand == 1: - if seq.seq[alt_start] != allele[-1]: - raise ValueError(f"Don't know how to process this variant: {self}") + ref = str(seq.seq[alt_start]) + if ref == allele[-1]: alt_start -= 1 alt_end = alt_start + 1 ref = str(seq.seq[alt_start]) alt = ref + allele[:-1] - else: - if seq.seq[alt_start] != allele[0]: - raise ValueError(f"Don't know how to process this variant: {self}") - ref = str(seq.seq[alt_start]) + elif ref == allele[0]: + ref = str(ref) alt = allele + else: + raise ValueError(f"Don't know how to process this variant: {self}") else: # SNV ref = str(seq.seq[alt_start]) alt = allele diff --git a/test/unit/test_vep_parser.py b/test/unit/test_vep_parser.py index 0d4763b3..321532ac 100644 --- a/test/unit/test_vep_parser.py +++ b/test/unit/test_vep_parser.py @@ -507,6 +507,34 @@ def test_vep_to_variant_record_case16_insertion(self): record = vep_record.convert_to_variant_record(anno, genome) self.assertEqual(record.ref, 'G') self.assertEqual(record.alt, 'GTCA') + + def test_vep_to_variant_record_case17_insertion_start_inclusion(self): + """ Insertion when the location is a single spot. + In this test case, the variant is represented by VEP in a start-inclusion + format as C -> CTCA + """ + genome = create_dna_record_dict(GENOME_DATA) + anno = create_genomic_annotation(ANNOTATION_DATA) + + vep_record = VEPParser.VEPRecord( + uploaded_variation='rs55971985', + location='chr1:18', + allele='CTCA', + gene='ENSG0001', + feature='ENST0001.1', + feature_type='Transcript', + consequences=['missense_variant'], + cdna_position='11', + cds_position='11', + protein_position=3, + amino_acids=('S', 'T'), + codons=('aTa', 'aCa'), + existing_variation='-', + extra={} + ) + record = vep_record.convert_to_variant_record(anno, genome) + self.assertEqual(record.ref, 'C') + self.assertEqual(record.alt, 'CTCA') def test_vep_to_variant_mnv(self): """ error is raised for MNV """ From e9a35275e5ca4c5e74cd845dc6d0597d4de177a0 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Wed, 7 Feb 2024 10:06:47 -0800 Subject: [PATCH 2/2] style: trailing white space --- test/unit/test_vep_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_vep_parser.py b/test/unit/test_vep_parser.py index 321532ac..09c55c5a 100644 --- a/test/unit/test_vep_parser.py +++ b/test/unit/test_vep_parser.py @@ -507,7 +507,7 @@ def test_vep_to_variant_record_case16_insertion(self): record = vep_record.convert_to_variant_record(anno, genome) self.assertEqual(record.ref, 'G') self.assertEqual(record.alt, 'GTCA') - + def test_vep_to_variant_record_case17_insertion_start_inclusion(self): """ Insertion when the location is a single spot. In this test case, the variant is represented by VEP in a start-inclusion