Merge pull request #903 from uclahs-cds/czhu-fix-call-variant

Bundle Update: callVariant & parseVEP
uclahs-cds · Mar 5, 2025 · e6fb717 · e6fb717
2 parents 2ae1012 + 80bb218
commit e6fb717
Show file tree

Hide file tree

Showing 26 changed files with 1,185 additions and 34 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,22 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.4.6-rc2] - 2025-03-03
+
+### Fixed
+
+- Fixed callVariant that variant bubble not identified correctly.
+
+- Fixed parseVEP that failed records will now be skipped with --skip-failed. #902
+
+- Fixed callVariant with variant bubble finding error starting from a out-bridge node.
+
+- Fixed callVariant that peptide annotation not created correctly with SEC
+
+## Changed
+
+- A check added when converting a genomic location to gene coordinate to ensure they overlap.
+
 ## [1.4.6-rc1] - 2025-02-24
 
 ### Fixed

diff --git a/docs/files/fuzz_test_history.tsv b/docs/files/fuzz_test_history.tsv
@@ -38,3 +38,6 @@ v1.4.3	64a1cd7	2025-02-12	comprehensive	6985	0	0	0:00:00.372239	0.80905269209725
 v1.4.5	f46742d	2025-02-24	snv	3189	0	0	0:00:00.144238	0.35400948008943256	0:00:54.012856	109.64929228073993
 v1.4.5	f46742d	2025-02-24	indel	3228	0	0	0:00:00.202602	0.9296003929444859	0:00:41.938361	99.86673318517921
 v1.4.5	f46742d	2025-02-24	comprehensive	5902	0	0	0:00:00.375135	1.0157547122885704	0:00:40.352927	181.73796673959964
+v1.4.6-rc1	78e971d	2025-03-04	snv	2710	0	0	0:00:00.163103	0.38221870483830245	0:00:56.301104	116.21712282069569
+v1.4.6-rc1	78e971d	2025-03-04	indel	2850	0	0	0:00:00.191917	0.3938244401664319	0:00:41.244215	95.36212234507254
+v1.4.6-rc1	78e971d	2025-03-04	comprehensive	5310	0	0	0:00:00.395482	1.0924741762245143	0:00:40.031276	179.44714992445952
diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py
@@ -158,6 +158,14 @@ def add_args_decoy(parser:argparse.ArgumentParser):
         metavar='<value>'
     )
 
+def add_args_skip_failed(parser:argparse.ArgumentParser):
+    """ add --skip-failed """
+    parser.add_argument(
+        '--skip-failed',
+        action='store_true',
+        help='When set, the failed records will be skipped.'
+    )
+
 def add_args_debug_level(parser:argparse.ArgumentParser):
     """ add debug level """
     parser.add_argument(

diff --git a/moPepGen/cli/parse_vep.py b/moPepGen/cli/parse_vep.py
@@ -5,20 +5,23 @@
 variant peptide sequences.
 """
 from __future__ import annotations
+from typing import TYPE_CHECKING
 import argparse
 import gzip
-from typing import Dict, List
 from pathlib import Path
 from moPepGen.parser import VEPParser
-from moPepGen.err import MNVParsingError, TranscriptionStopSiteMutationError, \
-    TranscriptionStartSiteMutationError, warning
+from moPepGen.err import TranscriptionStopSiteMutationError, TranscriptionStartSiteMutationError
 from moPepGen import seqvar, get_logger
 from moPepGen.cli import common
 
 
 INPUT_FILE_FORMATS = ['.tsv', '.txt', '.tsv.gz', '.txt.gz']
 OUTPUT_FILE_FORMATS = ['.gvf']
 
+if TYPE_CHECKING:
+    from typing import Dict, List
+    from logging import Logger
+
 # pylint: disable=W0212
 def add_subparser_parse_vep(subparsers:argparse._SubParsersAction):
     """ CLI for moPepGen parseVEP """
@@ -38,12 +41,39 @@ def add_subparser_parse_vep(subparsers:argparse._SubParsersAction):
     )
     common.add_args_output_path(p, OUTPUT_FILE_FORMATS)
     common.add_args_source(p)
+    common.add_args_skip_failed(p)
     common.add_args_reference(p, proteome=False)
     common.add_args_debug_level(p)
     p.set_defaults(func=parse_vep)
     common.print_help_if_missing_args(p)
     return p
 
+class TallyTable():
+    """ Tally table """
+    def __init__(self, logger:Logger):
+        """ Constructor """
+        self.total:int = 0
+        self.succeed:int = 0
+        self.failed:TallyTableFailed = TallyTableFailed()
+        self.logger = logger
+
+    def log(self):
+        """ Show tally results """
+        self.logger.info("Records successfully processed: %i", self.total)
+        self.logger.info("Records failed: %i", self.failed.total)
+        if self.failed.total > 0:
+            self.logger.info("Out of those failed,")
+            self.logger.info("Start codon mutation: %i", self.failed.start_site_mutation)
+            self.logger.info("Stop codon mutation: %i", self.failed.stop_site_mutation)
+
+class TallyTableFailed():
+    """ Tally table for failed ones """
+    def __init__(self):
+        """ constructor """
+        self.start_site_mutation:int = 0
+        self.stop_site_mutation:int = 0
+        self.total:int = 0
+
 def parse_vep(args:argparse.Namespace) -> None:
     """ Main entry point for the VEP parser. """
     logger = get_logger()
@@ -64,31 +94,45 @@ def parse_vep(args:argparse.Namespace) -> None:
 
     vep_records:Dict[str, List[seqvar.VariantRecord]] = {}
 
+    tally = TallyTable(logger)
+
     for vep_file in vep_files:
         opener = gzip.open if vep_file.suffix == '.gz' else open
         with opener(vep_file, 'rt') as handle:
             for record in VEPParser.parse(handle):
+                tally.total += 1
                 transcript_id = record.feature
 
                 if transcript_id not in vep_records:
                     vep_records[transcript_id] = []
 
+
                 try:
                     record = record.convert_to_variant_record(anno, genome)
+                    tally.succeed += 1
                 except TranscriptionStopSiteMutationError:
+                    tally.failed.total += 1
+                    tally.failed.stop_site_mutation += 1
                     continue
                 except TranscriptionStartSiteMutationError:
+                    tally.failed.total += 1
+                    tally.failed.start_site_mutation += 1
                     continue
-                except MNVParsingError:
-                    warning(
-                        f"MNVs are not currently supported. Skipping record: {record}"
-                    )
-                    continue
+                except:
+                    if args.skip_failed:
+                        logger.warning(
+                            "VEP record failed to convert: %s", str(record)
+                        )
+                        tally.failed.total += 1
+                        continue
+                    raise
 
                 vep_records[transcript_id].append(record)
 
         logger.info('VEP file %s loaded.', vep_file)
 
+    tally.log()
+
     if not vep_records:
         logger.warning('No variant record is saved.')
         return

diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py
@@ -179,6 +179,12 @@ def coordinate_gene_to_transcript(self, index:int, gene:str,
     def coordinate_genomic_to_gene(self, index:int, gene:str) -> int:
         """ Get the gene coordinate from genomic coordinate. """
         gene_location = self.genes[gene].location
+        if not gene_location.start <= index < gene_location.end:
+            loc = f"{self.genes[gene].chrom}:{gene_location.start}-{gene_location.end}"
+            raise ValueError(
+                f"The position does not overlap with the gene. index: {index}, "
+                f"gene: {gene}, {loc}"
+            )
         if gene_location.strand == 1:
             return index - gene_location.start
         if gene_location.strand == -1:

diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py
@@ -1580,10 +1580,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
 
         Args:
             node (TVGNode): The node of which the outbound nodes will
-                be aligned.
-            branch_out_size (int): The size limit that if a variant is larger
-                that it, it will branch out even if it's not a frameshifting
-                mutation.
+                be aligned
         Returns:
             The original input node.
         """
@@ -1609,29 +1606,25 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
         if not end_node:
             raise err.FailedToFindVariantBubbleError()
 
+        end_nodes = {end_node.id}
         if not self.is_circ_rna() and end_node.is_subgraph_end():
             subgraph_ends = {end_node}
             subgraph_ends.update(self.find_other_subgraph_end_nodes(end_node, members))
-            end_nodes = set()
             if len(subgraph_ends) > 1:
                 for subgraph_end in subgraph_ends:
-                    end_nodes.update(subgraph_end.get_out_nodes())
-            else:
-                end_nodes = {end_node}
-        else:
-            end_nodes = {end_node}
+                    end_nodes.update([x.id for x in subgraph_end.get_out_nodes()])
 
         bridges = self.find_bridge_nodes_between(start_node, end_node, members)
         bridge_ins, bridge_outs, subgraph_ins, subgraph_outs = bridges
 
         for bridge in bridge_outs:
             for e in bridge.out_edges:
-                end_nodes.add(e.out_node)
+                end_nodes.add(e.out_node.id)
 
         for subgraph in subgraph_outs:
             for e in subgraph.out_edges:
                 if e.out_node not in members:
-                    end_nodes.add(e.out_node)
+                    end_nodes.add(e.out_node.id)
 
         new_nodes:Set[TVGNode] = set()
         queue = deque()
@@ -1645,7 +1638,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
                 # In-bridge nodes should not be merged with their outgoing nodes
                 # that do not belong to the bubble.
                 if edge.out_node not in members:
-                    end_nodes.add(edge.out_node)
+                    end_nodes.add(edge.out_node.id)
             bridge_map[new_bridge] = bridge_in
             trash.add(bridge_in)
 
@@ -1664,7 +1657,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
 
         while queue:
             cur:TVGNode = queue.pop()
-            if cur in end_nodes or not cur.out_edges:
+            if cur.id in end_nodes or not cur.out_edges:
                 if cur not in bridge_map:
                     new_nodes.add(cur)
                 else:
@@ -1673,7 +1666,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
 
             # When all out nodes are in `end_nodes`. This is to avoid the `cur`
             # to be replicated.
-            if all(x in end_nodes for x in cur.get_out_nodes()):
+            if all(x.id in end_nodes for x in cur.get_out_nodes()):
                 new_node = cur.copy()
                 trash.add(cur)
                 for edge in cur.out_edges:
@@ -1690,7 +1683,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
 
                 # So this is the case that some of the out_nodes are in end_nodes
                 # but not the others.
-                if out_node in end_nodes:
+                if out_node.id in end_nodes:
                     new_node = cur.copy()
                     trash.add(cur)
                     for edge in cur.out_edges:
@@ -1723,7 +1716,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
                         else 'variant_end'
                     self.add_edge(new_node, edge.out_node, _type=edge_type)
 
-                if out_node not in end_nodes:
+                if out_node.id not in end_nodes:
                     queue.appendleft(new_node)
 
                 if cur in bridge_map:
@@ -1749,8 +1742,6 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
         for trash_node in trash:
             self.remove_node(trash_node)
 
-        return start_node, end_node
-
     def expand_alignments(self, start:TVGNode) -> List[TVGNode]:
         r""" Expand the aligned variants into the range of codons. For
         frameshifting mutations, a copy of each downstream node will be
@@ -1911,7 +1902,10 @@ def fit_into_codons(self) -> None:
                 queue.appendleft(cur)
                 continue
 
-            self.align_variants(cur)
+            try:
+                self.align_variants(cur)
+            except err.FailedToFindVariantBubbleError:
+                continue
 
             self.collapse_equivalent_nodes(cur)
             if cur.out_edges:

diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
@@ -466,19 +466,20 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
                 cur_metadata.has_variants = bool(cur_variants)
 
                 if is_valid or is_valid_start:
-                    cur_nodes = []
+                    cur_nodes:List[PVGNode] = []
                     cut_offset = sec.location.start
                     for node in nodes:
                         if cut_offset == 0:
                             cur_nodes.append(node)
                             continue
                         if len(node.seq.seq) > cut_offset:
                             node = node.copy()
-                            node.truncate_left(cut_offset)
+                            node.truncate_right(cut_offset)
                             cur_nodes.append(node)
                             cut_offset = 0
                         else:
                             cut_offset = max(0, cut_offset - len(node.seq.seq))
+                            cur_nodes.append(node)
                             continue
                     if is_valid:
                         cur_metadata_2 = copy.copy(cur_metadata)

diff --git a/test/files/circRNA/CIRCexplorer3_circularRNA_known.txt b/test/files/circRNA/CIRCexplorer3_circularRNA_known.txt
@@ -1,3 +1,3 @@
 chr22	0	464	circular_RNA/1	0	+	0	0	0,0,0	2	323,82	0,323	1	circRNA	RIBC2	ENST00000614167.2	1,2	chr22:0-130787|chr22:131505-160911	1	1	1
-chr22	4980	5177	circular_RNA/1	0	-	4980	4980	0,0,0	2	78,42	0,98	1	circRNA	LZTR1	ENST00000642151.1	1,2	chr22:427119-435769|chr22:437511-472574	1	1	1
+chr22	4980	5120	circular_RNA/1	0	-	4980	4980	0,0,0	2	78,42	0,98	1	circRNA	LZTR1	ENST00000642151.1	1,2	chr22:427119-435769|chr22:437511-472574	1	1	1
 chr22	5058	5078	circular_RNA/2	0	-	5058	5058	0,0,0	1	20	0	2	ciRNA	LZTR1	ENST00000642151.1	1	chr22:868685-870710|chr22:884086-888846	1	1	1
diff --git a/test/files/circRNA/CIRCexplorer_circularRNA_known.txt b/test/files/circRNA/CIRCexplorer_circularRNA_known.txt
@@ -1,3 +1,3 @@
 chr22	0	464	circular_RNA/1	0	+	0	0	0,0,0	2	323,82	0,323	1	circRNA	RIBC2	ENST00000614167.2	1,2	chr22:0-130787|chr22:131505-160911
-chr22	4980	5177	circular_RNA/1	0	-	4980	4980	0,0,0	2	78,42	0,98	1	circRNA	LZTR1	ENST00000642151.1	1,2	chr22:427119-435769|chr22:437511-472574
+chr22	4980	5120	circular_RNA/1	0	-	4980	4980	0,0,0	2	78,42	0,98	1	circRNA	LZTR1	ENST00000642151.1	1,2	chr22:427119-435769|chr22:437511-472574
 chr22	5058	5078	circular_RNA/2	0	-	5058	5058	0,0,0	1	20	0	2	ciRNA	LZTR1	ENST00000642151.1	1	chr22:868685-870710|chr22:884086-888846
diff --git a/test/files/comb/case_87/AltSplice.gvf b/test/files/comb/case_87/AltSplice.gvf
@@ -0,0 +1,21 @@
+##fileformat=VCFv4.2
+##mopepgen_version=1.4.6-rc1
+##parser=parseRMATS
+##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1
+##genome_fasta=
+##annotation_gtf=
+##source=AltSplice
+##CHROM=<Description="Gene ID">
+##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
+##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
+##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
+##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
+##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
+##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
+##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+ENSG00000143727.16	7800	RI_7800-7897	T	<INS>	.	.	TRANSCRIPT_ID=ENST00000453390.5;DONOR_START=7801;DONOR_END=7897;DONOR_GENE_ID=ENSG00000143727.16;COORDINATE=gene;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:271939-272036
+ENSG00000143727.16	7898	SE_7799-8052-8166-11001	T	<DEL>	.	.	TRANSCRIPT_ID=ENST00000453390.5;START=7898;END=7926;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272037:272065
+ENSG00000143727.16	8053	SE_7799-7897-8011-11001	T	<SUB>	.	.	TRANSCRIPT_ID=ENST00000453390.5;START=8053;END=8166;DONOR_START=7927;DONOR_END=8011;DONOR_GENE_ID=ENSG00000143727.16;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272192:272305
+ENSG00000143727.16	8053	A5SS_8011-8007-12840	T	<SUB>	.	.	TRANSCRIPT_ID=ENST00000453390.5;START=8053;END=11062;DONOR_START=7927;DONOR_END=8007;DONOR_GENE_ID=ENSG00000143727.16;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272192:275201
diff --git a/test/files/comb/case_87/annotation.gtf b/test/files/comb/case_87/annotation.gtf
@@ -0,0 +1,17 @@
+chr1	.	gene	1	14144	.	+	.	 gene_id ENSG00000143727.16; gene_type protein_coding; gene_name ACP1;
+chr1	.	transcript	826	13236	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	CDS	826	868	.	+	0	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	exon	826	868	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	CDS	7727	7800	.	+	2	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	exon	7727	7800	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	CDS	7898	7926	.	+	0	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	exon	7898	7926	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	CDS	8053	8116	.	+	1	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	exon	8053	8166	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	exon	11001	11062	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	exon	12841	12946	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	exon	13088	13236	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	UTR	8117	8166	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	UTR	11001	11062	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	UTR	12841	12946	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
+chr1	.	UTR	13088	13236	.	+	.	 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;