Skip to content

Commit

Permalink
Merge pull request #903 from uclahs-cds/czhu-fix-call-variant
Browse files Browse the repository at this point in the history
Bundle Update: callVariant & parseVEP
  • Loading branch information
zhuchcn authored Mar 5, 2025
2 parents 2ae1012 + 80bb218 commit e6fb717
Show file tree
Hide file tree
Showing 26 changed files with 1,185 additions and 34 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.4.6-rc2] - 2025-03-03

### Fixed

- Fixed callVariant that variant bubble not identified correctly.

- Fixed parseVEP that failed records will now be skipped with --skip-failed. #902

- Fixed callVariant with variant bubble finding error starting from a out-bridge node.

- Fixed callVariant that peptide annotation not created correctly with SEC

## Changed

- A check added when converting a genomic location to gene coordinate to ensure they overlap.

## [1.4.6-rc1] - 2025-02-24

### Fixed
Expand Down
3 changes: 3 additions & 0 deletions docs/files/fuzz_test_history.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,6 @@ v1.4.3 64a1cd7 2025-02-12 comprehensive 6985 0 0 0:00:00.372239 0.80905269209725
v1.4.5 f46742d 2025-02-24 snv 3189 0 0 0:00:00.144238 0.35400948008943256 0:00:54.012856 109.64929228073993
v1.4.5 f46742d 2025-02-24 indel 3228 0 0 0:00:00.202602 0.9296003929444859 0:00:41.938361 99.86673318517921
v1.4.5 f46742d 2025-02-24 comprehensive 5902 0 0 0:00:00.375135 1.0157547122885704 0:00:40.352927 181.73796673959964
v1.4.6-rc1 78e971d 2025-03-04 snv 2710 0 0 0:00:00.163103 0.38221870483830245 0:00:56.301104 116.21712282069569
v1.4.6-rc1 78e971d 2025-03-04 indel 2850 0 0 0:00:00.191917 0.3938244401664319 0:00:41.244215 95.36212234507254
v1.4.6-rc1 78e971d 2025-03-04 comprehensive 5310 0 0 0:00:00.395482 1.0924741762245143 0:00:40.031276 179.44714992445952
8 changes: 8 additions & 0 deletions moPepGen/cli/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ def add_args_decoy(parser:argparse.ArgumentParser):
metavar='<value>'
)

def add_args_skip_failed(parser:argparse.ArgumentParser):
""" add --skip-failed """
parser.add_argument(
'--skip-failed',
action='store_true',
help='When set, the failed records will be skipped.'
)

def add_args_debug_level(parser:argparse.ArgumentParser):
""" add debug level """
parser.add_argument(
Expand Down
60 changes: 52 additions & 8 deletions moPepGen/cli/parse_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,23 @@
variant peptide sequences.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import argparse
import gzip
from typing import Dict, List
from pathlib import Path
from moPepGen.parser import VEPParser
from moPepGen.err import MNVParsingError, TranscriptionStopSiteMutationError, \
TranscriptionStartSiteMutationError, warning
from moPepGen.err import TranscriptionStopSiteMutationError, TranscriptionStartSiteMutationError
from moPepGen import seqvar, get_logger
from moPepGen.cli import common


INPUT_FILE_FORMATS = ['.tsv', '.txt', '.tsv.gz', '.txt.gz']
OUTPUT_FILE_FORMATS = ['.gvf']

if TYPE_CHECKING:
from typing import Dict, List
from logging import Logger

# pylint: disable=W0212
def add_subparser_parse_vep(subparsers:argparse._SubParsersAction):
""" CLI for moPepGen parseVEP """
Expand All @@ -38,12 +41,39 @@ def add_subparser_parse_vep(subparsers:argparse._SubParsersAction):
)
common.add_args_output_path(p, OUTPUT_FILE_FORMATS)
common.add_args_source(p)
common.add_args_skip_failed(p)
common.add_args_reference(p, proteome=False)
common.add_args_debug_level(p)
p.set_defaults(func=parse_vep)
common.print_help_if_missing_args(p)
return p

class TallyTable():
""" Tally table """
def __init__(self, logger:Logger):
""" Constructor """
self.total:int = 0
self.succeed:int = 0
self.failed:TallyTableFailed = TallyTableFailed()
self.logger = logger

def log(self):
""" Show tally results """
self.logger.info("Records successfully processed: %i", self.total)
self.logger.info("Records failed: %i", self.failed.total)
if self.failed.total > 0:
self.logger.info("Out of those failed,")
self.logger.info("Start codon mutation: %i", self.failed.start_site_mutation)
self.logger.info("Stop codon mutation: %i", self.failed.stop_site_mutation)

class TallyTableFailed():
""" Tally table for failed ones """
def __init__(self):
""" constructor """
self.start_site_mutation:int = 0
self.stop_site_mutation:int = 0
self.total:int = 0

def parse_vep(args:argparse.Namespace) -> None:
""" Main entry point for the VEP parser. """
logger = get_logger()
Expand All @@ -64,31 +94,45 @@ def parse_vep(args:argparse.Namespace) -> None:

vep_records:Dict[str, List[seqvar.VariantRecord]] = {}

tally = TallyTable(logger)

for vep_file in vep_files:
opener = gzip.open if vep_file.suffix == '.gz' else open
with opener(vep_file, 'rt') as handle:
for record in VEPParser.parse(handle):
tally.total += 1
transcript_id = record.feature

if transcript_id not in vep_records:
vep_records[transcript_id] = []


try:
record = record.convert_to_variant_record(anno, genome)
tally.succeed += 1
except TranscriptionStopSiteMutationError:
tally.failed.total += 1
tally.failed.stop_site_mutation += 1
continue
except TranscriptionStartSiteMutationError:
tally.failed.total += 1
tally.failed.start_site_mutation += 1
continue
except MNVParsingError:
warning(
f"MNVs are not currently supported. Skipping record: {record}"
)
continue
except:
if args.skip_failed:
logger.warning(
"VEP record failed to convert: %s", str(record)
)
tally.failed.total += 1
continue
raise

vep_records[transcript_id].append(record)

logger.info('VEP file %s loaded.', vep_file)

tally.log()

if not vep_records:
logger.warning('No variant record is saved.')
return
Expand Down
6 changes: 6 additions & 0 deletions moPepGen/gtf/GenomicAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,12 @@ def coordinate_gene_to_transcript(self, index:int, gene:str,
def coordinate_genomic_to_gene(self, index:int, gene:str) -> int:
""" Get the gene coordinate from genomic coordinate. """
gene_location = self.genes[gene].location
if not gene_location.start <= index < gene_location.end:
loc = f"{self.genes[gene].chrom}:{gene_location.start}-{gene_location.end}"
raise ValueError(
f"The position does not overlap with the gene. index: {index}, "
f"gene: {gene}, {loc}"
)
if gene_location.strand == 1:
return index - gene_location.start
if gene_location.strand == -1:
Expand Down
34 changes: 14 additions & 20 deletions moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -1580,10 +1580,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
Args:
node (TVGNode): The node of which the outbound nodes will
be aligned.
branch_out_size (int): The size limit that if a variant is larger
that it, it will branch out even if it's not a frameshifting
mutation.
be aligned
Returns:
The original input node.
"""
Expand All @@ -1609,29 +1606,25 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
if not end_node:
raise err.FailedToFindVariantBubbleError()

end_nodes = {end_node.id}
if not self.is_circ_rna() and end_node.is_subgraph_end():
subgraph_ends = {end_node}
subgraph_ends.update(self.find_other_subgraph_end_nodes(end_node, members))
end_nodes = set()
if len(subgraph_ends) > 1:
for subgraph_end in subgraph_ends:
end_nodes.update(subgraph_end.get_out_nodes())
else:
end_nodes = {end_node}
else:
end_nodes = {end_node}
end_nodes.update([x.id for x in subgraph_end.get_out_nodes()])

bridges = self.find_bridge_nodes_between(start_node, end_node, members)
bridge_ins, bridge_outs, subgraph_ins, subgraph_outs = bridges

for bridge in bridge_outs:
for e in bridge.out_edges:
end_nodes.add(e.out_node)
end_nodes.add(e.out_node.id)

for subgraph in subgraph_outs:
for e in subgraph.out_edges:
if e.out_node not in members:
end_nodes.add(e.out_node)
end_nodes.add(e.out_node.id)

new_nodes:Set[TVGNode] = set()
queue = deque()
Expand All @@ -1645,7 +1638,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
# In-bridge nodes should not be merged with their outgoing nodes
# that do not belong to the bubble.
if edge.out_node not in members:
end_nodes.add(edge.out_node)
end_nodes.add(edge.out_node.id)
bridge_map[new_bridge] = bridge_in
trash.add(bridge_in)

Expand All @@ -1664,7 +1657,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:

while queue:
cur:TVGNode = queue.pop()
if cur in end_nodes or not cur.out_edges:
if cur.id in end_nodes or not cur.out_edges:
if cur not in bridge_map:
new_nodes.add(cur)
else:
Expand All @@ -1673,7 +1666,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:

# When all out nodes are in `end_nodes`. This is to avoid the `cur`
# to be replicated.
if all(x in end_nodes for x in cur.get_out_nodes()):
if all(x.id in end_nodes for x in cur.get_out_nodes()):
new_node = cur.copy()
trash.add(cur)
for edge in cur.out_edges:
Expand All @@ -1690,7 +1683,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:

# So this is the case that some of the out_nodes are in end_nodes
# but not the others.
if out_node in end_nodes:
if out_node.id in end_nodes:
new_node = cur.copy()
trash.add(cur)
for edge in cur.out_edges:
Expand Down Expand Up @@ -1723,7 +1716,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
else 'variant_end'
self.add_edge(new_node, edge.out_node, _type=edge_type)

if out_node not in end_nodes:
if out_node.id not in end_nodes:
queue.appendleft(new_node)

if cur in bridge_map:
Expand All @@ -1749,8 +1742,6 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
for trash_node in trash:
self.remove_node(trash_node)

return start_node, end_node

def expand_alignments(self, start:TVGNode) -> List[TVGNode]:
r""" Expand the aligned variants into the range of codons. For
frameshifting mutations, a copy of each downstream node will be
Expand Down Expand Up @@ -1911,7 +1902,10 @@ def fit_into_codons(self) -> None:
queue.appendleft(cur)
continue

self.align_variants(cur)
try:
self.align_variants(cur)
except err.FailedToFindVariantBubbleError:
continue

self.collapse_equivalent_nodes(cur)
if cur.out_edges:
Expand Down
5 changes: 3 additions & 2 deletions moPepGen/svgraph/VariantPeptideDict.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,19 +466,20 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
cur_metadata.has_variants = bool(cur_variants)

if is_valid or is_valid_start:
cur_nodes = []
cur_nodes:List[PVGNode] = []
cut_offset = sec.location.start
for node in nodes:
if cut_offset == 0:
cur_nodes.append(node)
continue
if len(node.seq.seq) > cut_offset:
node = node.copy()
node.truncate_left(cut_offset)
node.truncate_right(cut_offset)
cur_nodes.append(node)
cut_offset = 0
else:
cut_offset = max(0, cut_offset - len(node.seq.seq))
cur_nodes.append(node)
continue
if is_valid:
cur_metadata_2 = copy.copy(cur_metadata)
Expand Down
2 changes: 1 addition & 1 deletion test/files/circRNA/CIRCexplorer3_circularRNA_known.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
chr22 0 464 circular_RNA/1 0 + 0 0 0,0,0 2 323,82 0,323 1 circRNA RIBC2 ENST00000614167.2 1,2 chr22:0-130787|chr22:131505-160911 1 1 1
chr22 4980 5177 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574 1 1 1
chr22 4980 5120 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574 1 1 1
chr22 5058 5078 circular_RNA/2 0 - 5058 5058 0,0,0 1 20 0 2 ciRNA LZTR1 ENST00000642151.1 1 chr22:868685-870710|chr22:884086-888846 1 1 1
2 changes: 1 addition & 1 deletion test/files/circRNA/CIRCexplorer_circularRNA_known.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
chr22 0 464 circular_RNA/1 0 + 0 0 0,0,0 2 323,82 0,323 1 circRNA RIBC2 ENST00000614167.2 1,2 chr22:0-130787|chr22:131505-160911
chr22 4980 5177 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574
chr22 4980 5120 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574
chr22 5058 5078 circular_RNA/2 0 - 5058 5058 0,0,0 1 20 0 2 ciRNA LZTR1 ENST00000642151.1 1 chr22:868685-870710|chr22:884086-888846
21 changes: 21 additions & 0 deletions test/files/comb/case_87/AltSplice.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.2
##mopepgen_version=1.4.6-rc1
##parser=parseRMATS
##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1
##genome_fasta=
##annotation_gtf=
##source=AltSplice
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
#CHROM POS ID REF ALT QUAL FILTER INFO
ENSG00000143727.16 7800 RI_7800-7897 T <INS> . . TRANSCRIPT_ID=ENST00000453390.5;DONOR_START=7801;DONOR_END=7897;DONOR_GENE_ID=ENSG00000143727.16;COORDINATE=gene;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:271939-272036
ENSG00000143727.16 7898 SE_7799-8052-8166-11001 T <DEL> . . TRANSCRIPT_ID=ENST00000453390.5;START=7898;END=7926;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272037:272065
ENSG00000143727.16 8053 SE_7799-7897-8011-11001 T <SUB> . . TRANSCRIPT_ID=ENST00000453390.5;START=8053;END=8166;DONOR_START=7927;DONOR_END=8011;DONOR_GENE_ID=ENSG00000143727.16;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272192:272305
ENSG00000143727.16 8053 A5SS_8011-8007-12840 T <SUB> . . TRANSCRIPT_ID=ENST00000453390.5;START=8053;END=11062;DONOR_START=7927;DONOR_END=8007;DONOR_GENE_ID=ENSG00000143727.16;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272192:275201
17 changes: 17 additions & 0 deletions test/files/comb/case_87/annotation.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
chr1 . gene 1 14144 . + . gene_id ENSG00000143727.16; gene_type protein_coding; gene_name ACP1;
chr1 . transcript 826 13236 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . CDS 826 868 . + 0 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . exon 826 868 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . CDS 7727 7800 . + 2 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . exon 7727 7800 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . CDS 7898 7926 . + 0 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . exon 7898 7926 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . CDS 8053 8116 . + 1 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . exon 8053 8166 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . exon 11001 11062 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . exon 12841 12946 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . exon 13088 13236 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . UTR 8117 8166 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . UTR 11001 11062 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . UTR 12841 12946 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
chr1 . UTR 13088 13236 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1;
Loading

0 comments on commit e6fb717

Please sign in to comment.