From 770ac0ac0a2162d9b9c4171a449865b2f02a2542 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Mon, 3 Mar 2025 13:32:17 -0800 Subject: [PATCH 1/9] fix (callVariant): in-member end node not treated as an end node when aligning the variant bubble --- CHANGELOG.md | 4 + moPepGen/__init__.py | 2 +- moPepGen/svgraph/ThreeFrameTVG.py | 29 +-- test/files/comb/case_87/AltSplice.gvf | 21 ++ test/files/comb/case_87/annotation.gtf | 17 ++ test/files/comb/case_87/brute_force.txt | 53 ++++ test/files/comb/case_87/gSNP.gvf | 13 + test/files/comb/case_87/genome.fasta | 237 ++++++++++++++++++ test/files/comb/case_87/proteome.fasta | 3 + .../integration/test_call_variant_peptides.py | 14 ++ 10 files changed, 374 insertions(+), 19 deletions(-) create mode 100644 test/files/comb/case_87/AltSplice.gvf create mode 100644 test/files/comb/case_87/annotation.gtf create mode 100644 test/files/comb/case_87/brute_force.txt create mode 100644 test/files/comb/case_87/gSNP.gvf create mode 100644 test/files/comb/case_87/genome.fasta create mode 100644 test/files/comb/case_87/proteome.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index 9063ca03..feba44bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +## [1.4.6-rc2] - 2025-03-03 + +- Fixed callVariant that variant bubble not identified correctly. + ## [1.4.6-rc1] - 2025-02-24 ### Fixed diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py index 2521938b..d0a27169 100644 --- a/moPepGen/__init__.py +++ b/moPepGen/__init__.py @@ -8,7 +8,7 @@ from . 
import constant -__version__ = '1.4.6-rc1' +__version__ = '1.4.6-rc2' ## Error messages ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron' diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py index 6c4f1acb..fd3ffaa9 100644 --- a/moPepGen/svgraph/ThreeFrameTVG.py +++ b/moPepGen/svgraph/ThreeFrameTVG.py @@ -1580,10 +1580,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: Args: node (TVGNode): The node of which the outbound nodes will - be aligned. - branch_out_size (int): The size limit that if a variant is larger - that it, it will branch out even if it's not a frameshifting - mutation. + be aligned Returns: The original input node. """ @@ -1609,29 +1606,25 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: if not end_node: raise err.FailedToFindVariantBubbleError() + end_nodes = {end_node.id} if not self.is_circ_rna() and end_node.is_subgraph_end(): subgraph_ends = {end_node} subgraph_ends.update(self.find_other_subgraph_end_nodes(end_node, members)) - end_nodes = set() if len(subgraph_ends) > 1: for subgraph_end in subgraph_ends: - end_nodes.update(subgraph_end.get_out_nodes()) - else: - end_nodes = {end_node} - else: - end_nodes = {end_node} + end_nodes.update([x.id for x in subgraph_end.get_out_nodes()]) bridges = self.find_bridge_nodes_between(start_node, end_node, members) bridge_ins, bridge_outs, subgraph_ins, subgraph_outs = bridges for bridge in bridge_outs: for e in bridge.out_edges: - end_nodes.add(e.out_node) + end_nodes.add(e.out_node.id) for subgraph in subgraph_outs: for e in subgraph.out_edges: if e.out_node not in members: - end_nodes.add(e.out_node) + end_nodes.add(e.out_node.id) new_nodes:Set[TVGNode] = set() queue = deque() @@ -1645,7 +1638,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: # In-bridge nodes should not be merged with their outgoing nodes # that do not belong to the bubble. 
if edge.out_node not in members: - end_nodes.add(edge.out_node) + end_nodes.add(edge.out_node.id) bridge_map[new_bridge] = bridge_in trash.add(bridge_in) @@ -1664,7 +1657,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: while queue: cur:TVGNode = queue.pop() - if cur in end_nodes or not cur.out_edges: + if cur.id in end_nodes or not cur.out_edges: if cur not in bridge_map: new_nodes.add(cur) else: @@ -1673,7 +1666,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: # When all out nodes are in `end_nodes`. This is to avoid the `cur` # to be replicated. - if all(x in end_nodes for x in cur.get_out_nodes()): + if all(x.id in end_nodes for x in cur.get_out_nodes()): new_node = cur.copy() trash.add(cur) for edge in cur.out_edges: @@ -1690,7 +1683,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: # So this is the case that some of the out_nodes are in end_nodes # but not the others. - if out_node in end_nodes: + if out_node.id in end_nodes: new_node = cur.copy() trash.add(cur) for edge in cur.out_edges: @@ -1723,7 +1716,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: else 'variant_end' self.add_edge(new_node, edge.out_node, _type=edge_type) - if out_node not in end_nodes: + if out_node.id not in end_nodes: queue.appendleft(new_node) if cur in bridge_map: @@ -1910,7 +1903,7 @@ def fit_into_codons(self) -> None: cur = self.merge_with_outbonds(cur)[0] queue.appendleft(cur) continue - + self.align_variants(cur) self.collapse_equivalent_nodes(cur) diff --git a/test/files/comb/case_87/AltSplice.gvf b/test/files/comb/case_87/AltSplice.gvf new file mode 100644 index 00000000..b957eb9f --- /dev/null +++ b/test/files/comb/case_87/AltSplice.gvf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.2 +##mopepgen_version=1.4.6-rc1 +##parser=parseRMATS +##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1 +##genome_fasta= 
+##annotation_gtf= +##source=AltSplice +##CHROM= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +ENSG00000143727.16 7800 RI_7800-7897 T . . TRANSCRIPT_ID=ENST00000453390.5;DONOR_START=7801;DONOR_END=7897;DONOR_GENE_ID=ENSG00000143727.16;COORDINATE=gene;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:271939-272036 +ENSG00000143727.16 7898 SE_7799-8052-8166-11001 T . . TRANSCRIPT_ID=ENST00000453390.5;START=7898;END=7926;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272037:272065 +ENSG00000143727.16 8053 SE_7799-7897-8011-11001 T . . TRANSCRIPT_ID=ENST00000453390.5;START=8053;END=8166;DONOR_START=7927;DONOR_END=8011;DONOR_GENE_ID=ENSG00000143727.16;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272192:272305 +ENSG00000143727.16 8053 A5SS_8011-8007-12840 T . . TRANSCRIPT_ID=ENST00000453390.5;START=8053;END=11062;DONOR_START=7927;DONOR_END=8007;DONOR_GENE_ID=ENSG00000143727.16;GENE_SYMBOL=ACP1;GENOMIC_POSITION=chr2:272192:275201 diff --git a/test/files/comb/case_87/annotation.gtf b/test/files/comb/case_87/annotation.gtf new file mode 100644 index 00000000..ddea1b61 --- /dev/null +++ b/test/files/comb/case_87/annotation.gtf @@ -0,0 +1,17 @@ +chr1 . gene 1 14144 . + . gene_id ENSG00000143727.16; gene_type protein_coding; gene_name ACP1; +chr1 . transcript 826 13236 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . CDS 826 868 . + 0 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . exon 826 868 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . CDS 7727 7800 . + 2 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . exon 7727 7800 . + . 
gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . CDS 7898 7926 . + 0 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . exon 7898 7926 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . CDS 8053 8116 . + 1 gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . exon 8053 8166 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . exon 11001 11062 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . exon 12841 12946 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . exon 13088 13236 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . UTR 8117 8166 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . UTR 11001 11062 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . UTR 12841 12946 . + . gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; +chr1 . UTR 13088 13236 . + . 
gene_id ENSG00000143727.16; transcript_id ENST00000453390.5; gene_type protein_coding; gene_name ACP1; protein_id ENSP00000411121.1; diff --git a/test/files/comb/case_87/brute_force.txt b/test/files/comb/case_87/brute_force.txt new file mode 100644 index 00000000..e5336779 --- /dev/null +++ b/test/files/comb/case_87/brute_force.txt @@ -0,0 +1,53 @@ +AFLEKAH +AKIELLGSYDPQK +ARQITKEDFATFDYILCMDESNLR +AVSCLRNHGIHTAHK +AVSCLRNHGIHTAHKAR +CCRAFLEK +CCRAFLEKAH +DLNRKSNQVK +EANLNSSGQEIAK +EANLNSSGQEIAKK +EANLNSSGQEIAKKK +EDFATFDYILCMDESNLR +EDFATFDYILCMDESNLRDLNR +EDFATFDYILCMDESNLRDLNRK +FHVSSPAVEGR +FHVSSPAVEGRQR +FHVSSPAVEGRQRGNFR +GNFRWVIDSGAVSDWNVGR +GNFRWVIDSGAVSDWNVGRSPDPR +GQSCMKR +GQSCMKRHGIPMSHVAR +HGIPMSHVAR +HGIPMSHVARDLNR +HGIPMSHVARDLNRK +HGIPMSHVARQITK +IELLGSYDPQK +KFHVSSPAVEGR +KFHVSSPAVEGRQR +KKFHVSSPAVEGR +KLVTDQNISENVSTIHYLK +KSNQVKTCK +LVTDQNISENVSTIHYLK +LVTDQNISENWVIDSGAVSDWNVGR +NHGIHTAHK +NHGIHTAHKAR +NHGIHTAHKARQITK +QITKEDFATFDYILCMDESNLR +QLIIEDPYYGNDSDFETVYQQCVR +QRGNFRV +QRGNFRWVIDSGAVSDWNVGR +RHGIPMSHVAR +RHGIPMSHVARDLNR +RHGIPMSHVARQITK +SNQVKTCK +SNQVKTCKAK +SPDPRAVSCLR +SPDPRAVSCLRNHGIHTAHK +TCKAKIELLGSYDPQK +VDSAATSGYEIGNPPDYR +VDSAATSGYEIGNPPDYRGQSCMK +VDSAATSGYEIGNPPDYRGQSCMKR +WVIDSGAVSDWNVGR +WVIDSGAVSDWNVGRSPDPR diff --git a/test/files/comb/case_87/gSNP.gvf b/test/files/comb/case_87/gSNP.gvf new file mode 100644 index 00000000..64e96a78 --- /dev/null +++ b/test/files/comb/case_87/gSNP.gvf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##mopepgen_version=1.4.6-rc1 +##parser=parseVEP +##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1 +##genome_fasta= +##annotation_gtf= +##source=gSNP +##CHROM= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +ENSG00000143727.16 5213 SNV-5213-G-A G A . . 
TRANSCRIPT_ID=ENST00000453390.5;GENOMIC_POSITION=chr2:269352;GENE_SYMBOL=ACP1 diff --git a/test/files/comb/case_87/genome.fasta b/test/files/comb/case_87/genome.fasta new file mode 100644 index 00000000..1c7275cd --- /dev/null +++ b/test/files/comb/case_87/genome.fasta @@ -0,0 +1,237 @@ +>chr1 +ACAAAAACCACGCGCCCGCCGGGCCGCGCTCAGGCCTTCGCCCTCAGGGACTTCGGAACC +GCCCCGTCCTCAAGATCGAAAAGCCCAGAGCCCCGCGGCGGCTCCAAGCACGGTGTTGGG +GGTGGGGGTCTCAGGGAGCGCCCAGGCCCAAGGCCGCCCTGGTCCGGCGTGGACCCCGCG +GGGCTCAAGGCAGGTTCCCCGCGTGACCCGCCCAGCCCCTCTATGCGAACTCGAACGACA +GGCACCACAGCCCGCCACGTGCGCGAGACTCGCGCTGTGCCCCAACCCAGGTGGGCGGCC +CGCGGAGCCGCGAGGCCTGAGCCCGCCCTGCAGGTGACCCGCGGCCCTTCCTCCTCCAGG +TACCCCTCTCCTGCGGCCCCGTCCCCTATAGGTAACCTTTACCTCCCGCGGCTCCTTCCC +CTCCAAGCGACCCGCAGCCCCGCCCCCTCCAGGTGACTCCCCCCCGCCAACCCCCGCCAC +CACACACACACACCCCCTCGCCTTCCGCGGCCCTACTCCCTCCAGGTGACCCCACCCCCG +CAGCTCCTCCTCCTCCGGACAACCTGCAGCCCCGCCCCTGCAGGTGAACGGCGGCCCAGT +CCCTGCAGGTGACCCGCGGACCCTCCCCGCCCGTCCTACTACCGTCATAGGACCGCCTCC +GCAGGCGCACTGGAGCCGATTGCGCAGGCGTGGCTCTCACACGCGCTGCCCTGTTGGCGT +TGGTGCGGGACTGCGCAGGCGCGCGGGGCAAGAGGGTGGCAGTGCGCCTGCGCCGCGTCG +GCGTGCGGAACGCCGCGGTGTCTCGGCGCCTCTGCGCGCGGGAAGATGGCGGAACAGGCT +ACCAAGTCCGTGCTGTTTGTGTGTCTGGGTAAGAGGGCGCCGACTTACTCATGTTCTGAC +GTCCTCTGGAGAGTTGGATCGGGCTTGTGCGCTGTAGGTTGTGCCGCCGGCCTAGGAACC +ATGAGGGGGAGGAGGCCAGGGACTGGGAGGCCTAGGGTGTTCTAGGAGTGTGCCGCAGCG +CCCCTGTTCCCCATCCGCCCCGTGCACCCGCCCAGCCTGCCCGCTAAACCTGGGTCCCCT +CGCGCCTGCCATATATCCGGGGCTCTTGGCATCTGCAGCCACAGCCCCTACAAGCCAAGG +TACTTTCTTTGCGACTACTAGAATCCCTCTCGTTCCCCTCCAATGGGCCCAGGTAGGCCT +GGAGGTTGTCGGACTGATGTGGTCTCTTAGGGGCCTCCTGAGTCCTATGACAACATTGTT +TTGTTTGCTAGTTTCCTTAGCCCATGCTTAGATAGGGATCTGAGTTAAAGGGAGATGAAG +TTTTACGGTTTCGGGTTTCTTGATTGTAAAAGCCAAGCAAAGGAGACCTGGAGGTCCCTA +TATAACTTTGGCCCATGAAAAATGAAAGTGTATTTTATTAATGGCTTTTCAGGAACTTTC +CAGCTTAAATTATTTTCACTCCTAGAATTTCAACACTTAGCTTGCCTTACTTCAAACATA +ATTTATTTTCCGTTTGGTCTTCAGGGATCTTCCTTTGGTGTAGCACTACAAGGTTATATT +CCTCGGTGCCTCCACCCTGAGAGGAAGGCCTAAATTGTCAGTAGTCCGGTTACTTAAAAC 
+CTTCTGTGAGTTTGCATTTCCTGCAGTTTCTGTTTATTTTTACAGACTCCTCACTTTATC +TCCTGTACCTTTTCCTGCTCACACCCTCGGGACACTGCCTTCCAGAACCTTTCGTTGTTC +GGTAGCACTTCACATGCTTACTGAGCCTTCATCGCTGGGTGCCCATTATGCATTTTGCCT +GGAAAGCCTTTTTATTTTATAAAGTCAAACCTAAATATTTCCTCCTTTGCTTTCCCAGAT +CATGAGTTGCTTCCTTATCGACATCTTTTATTACACAAATTTTACTATAATTTGTGTAAT +ATAGTAAAAGATGTAATACAGAAAAGGTGTGATTCCTTATCGACATCTTTTATTACACAA +ATTTTACTATAATTATTTGATAGTCTGTTGAAATCCCAGCTGAGTTTAATTTTGTCGATG +GCAATAACTGTGGCTTATTTTTCATTTTGTTTTCAGCTTTAGGACAATATATGATAAACG +CTGACTAGGTTTTGTAGAAGTTAACTTCTGTGAAAGACGAATTAGAAGGCTTACCTTGTT +TCACATTGGTAGTGGGCATCATTTGATTTTTAAGACTCATTGAAGGCTGAATGATGAAAT +CAATAAGTTTATTAAACAGTAGTTTGGTTAAGGATGGTTACTAAATAATATTCCATTCCC +TTGTATAGTTTTGGTTGGTATTGTCGATTATGCTGGCATAATGCATCATGGTTGCTGTTA +ATGGGGAAGTGTATCCTTGCATATACAGTAGTCCCACTTTAGCTGTAGGGGATACATTCC +AAGACTCCCCAGTGGATCCCGAAACTGTGGGTAGTACCAAACCCAATTGCTGTCAATTGG +AACATGTGTCTGCTCGTTTCTTACACCCACAAATGTAATACCGTTGCCACCTTAACTAAG +CATTTAAAACTGTGGTCGCAACTTTTGCAGTTTGAGGCAAGACAGCAGACGATCAGGAAT +TTCTTTTTCCTTGTTTACATTTTCATGGATAGAAGATTTGTTCGTACTGTAGATCTTAGT +AACCTCGGTATAAGTTTTTTTTTTCTTTCCATAAGAAGTCAAAAACTTTCACCTTTTTAC +TTAAGGGTAGTGCTTATGGCTTCTCTTTGCCGTATCTGAATTGCTAGCTTCACTACTCTT +ATGATTTCGGGCCATTATTAGTAAACTAAGGGTGACTCGAACTCAGGCACTGTGATAGCA +TCACAATGGATCTGATAAGCTAGAGGGCAATAGGTGGATAGACAGCATGGATATGCTGGA +CAAAGACGTGTTTCACTTCTGGGGCAGGACGGAATGGGATGGCATGAGAATTTATCCTGC +TACTCAGAACAGTGCACAATTTAAAACTTAGGAATTATTTGTGGAATTCTTCATTTAATA +TTTTCCAACTGAAGCTGACCATGGGGTAACCATGGAAAGCAAAACCTTGGAAAAGAGGGG +ACTATTGTATTTCAATACAATTCAGTGATTTTTTGTTGTTGTTGAGCATCTACTATATGG +CATGCATTCCTCTAAGTGGTGTGGTGAACACAACAGACAAAAATCCCAGCCTCTCTGGAG +CTCACGTTCTCTCAGTCACCAGCAAGTATTAATTGATGTGCTCAATCGGTTGATGTGAAG +ATTAAGAAAAATAGAGCAGAAAAGGGGAATGGGAGTGGAGTGCAGTTTTGAGTGGGATGG +TCAGGGAAGGCCCCACCTGATAGATGACTTTTGAGTAAAGACTTGAAGGAGGTGAGGAAG +CAAGCCTAGGATATTTTGGAGAAGCATCCTGAACCCAGAGGAAAGCAGGTAAAAGTTTTG +AAGGGAGAATACTGGCCATGGTCATGAACCGCAGGGAAGCTGCAGTGGCTGGGCCTAGTG +GCCAGATAGTTTTTCTCAGTCAGGATTCCTCATCTATACCTGAAAATTTAGCAGATAATT 
+TGAGCGCCTGTTTTCTCAGTGCTTCCATTGTAGATGTATAGGGAGAAGTTAATTCCTTAT +ATACACTGGTCCTTTAGGGCAGGAGTCTGCAAACTCTTTGTTTAAAGAACCAGATTATAA +TTATTTTAGATTGGCTTCACATGACATACAGAAAGGGTGTAGCACAAAGTAGCCATCCAT +ATGATGAACCTGGCTGTGTTCTAGTGGATCTTCATTTGTGGACATCAGAATTGGATGGAA +CTCTGTATAATTTTCACATGTCACAAGGTACTCTACTTTTAGATTTTTTCCAACTACTCA +ACAATGTAAAAATTATTCTTACCGCTGTGGGCTGTACATAAAGGCAGTGGACCATGAGCC +GATTTTCAGACCCCTGTTTTAGGTCCAGATCTTGGGAGGCCATGTGAAGACTTGGGCTTA +CTCTGAATAAACATTGGAGGGTTTTGAGCAGAGGTGGGAAGTGATCAAATTTAAGTTTAA +AAAAGAAGAGTTTCAGCTGTTGTTTTGAGATGAAACTGGAGGTGGAGCATGCCTTTACTG +GTTGCATTCTGAGGGAATTTGAGCGGTAAATGGCATTTTACTTGGTAGCCATATAGAGGA +CAGTAAGAGTTGTATGCTTTTGAAATGTAAAAAACTTTGTTCATTCAAAGCATTTGTTTG +GACCCTATTGAGATTGAAGTGTAGAAAAGCATCAGAGGTTTTTCATTGGTGAAGCTAATG +ATGGAAAATAATTGAGCTCCTTACTACAGCACAATTATGAAGGGTGCTTTATGTTTTTGC +AATAGAGACTTATTAGTGAAGATGATGCAGGAGCCATGACCCAGCCAGCATGGTTGGTGG +AGATAGCACATGCTGGTGGAGTCACTTTACTTCTGGGCTACACCCATACTGGGTGCTGGG +TCTCTTACCAGGTATAGCAGACACAGCCCCACCCTCCCAGAGCCTATCTGCCTTTTATGA +GTCTCCATCCTTGAATTGAAGATGCTGAAGAAGGGTATAAAATAGTCTGCCTAATTTGGA +AAAGACAAACTTGAGACTTTACAGATTGAAACATTTTTTAAAGGCATTTGAAGATGGAAT +GTTCTGTTTTTAGAACTTGTTTGATAGGCGAAACCTGAAGTACATGCAGTGTTTCACTGA +TGACCGGGGTGCTCCATTTTTTCATAACAAAATTTGGTGGCATCAACTTTCGAAATCAAC +AGGGCAAGCTACTGCTAGCCTTTAGGACAGTTAATAATGCAAGTTTTCATATTGTTGTGG +GGAAAGTAAGCTGTTATTTTACAGTTAAAGATAATTATTTGTAAACGTGTGGGGCTAGTC +TTTATTATATTTTTAGATGCTGAGATACTAGGCATCTGTTGTTCTACAGATCCAAAATAA +TAATTGTCTTAATTTGGAATACAATGTTTGCTGTTAAAAATATTCCCAGGGTATATGGTG +ACAAGTAAAAGAGTTACACACCCATGGGAAGCCTTTCAATAATTAAAATAGTGCTGATGA +CAATACCTGTTTAGAAAACTAAACAAAATTTAGAATGTCGAAAGGAAAAACTTCACAACA +CAAATTAGCATATAGAGGGTATTCAAGGCTAATTTGTTGTCTTTCCCTTTATGGTTTGTA +GGTAAGATTTTTTACTGCTGTGGAAACTACAGTCTCTGTGGGAAAAAAGTACGTAATATG +CTTAGGTCTTACAATGTAGAAACTGAAGGCTGTAGTAAGGCAGTGAGTTGGTTTTCTGAA +CTAGGGAATTAGCTGTGTTTTACAACACTAGAAGATCCTCATTGTATTTGTTTATCATAA +ACAGAAGACCCTAATAGTGATAAAGAGTAAACTCTTAGTTAGTGCTGCCTATTCTCACAG +TTCCTCTTTTTCGTATCTGTTTCCTAAATTCAGCTCCTAATGTCTGGACTCAGTGTGGTG 
+CTTTGGCATTAAGGAGACTGGAAGGCCTCAGAGCAGCCCCAGAAGCAAGGTCGTAGAAAC +CATCATTTATCTTCCCCAGCGCAGATCATAGAAACTAGAACTCCTCTCCTGCAAAGCAAG +CCATAAAACCTAGAAAGATCACTCTCTCCCTTTTGCCTTCTCTTGAAGACAGCATTTCAG +AGGGGCCCTGCCTCATTCCTGGGGGAGGAAATGCTGTGGCCCTGCTGGGTTTCCCTCAGT +CTGCTACCATTGGATCATTTCCTTTTGTCCAGTGCCATTTCTACACAGCTGTGCATTGGT +CATCTAAGCATCAAAACCGTTTCCCTGGGTCTTTGGGTCTTCATTTCTGAAGGCTCTTGT +TAACCTGTCTTTTGTTACAGGAGTATCTTCCATGACCTTTATGATGGGTGAGGAAAGCTG +CCACACCTTTCCACTCTGACACTGTCTTGTATTTGCTCCTTCATACGAGCCTCTTACAAG +GTCTCTCCAATTCAGTCATGTGCTCTTCTAGTGTTTCCTTAGCTGTGCCGGTAATTTTGG +TAACTTGTAGGAAGGGTCTGTGTTTTAAACAACTTTCTGTCTTCAGGTTAGGGCACAGTG +CTTGGCACATAGAGTTGGCATACAAAATATGTTTGTTGAATAAGTCATATTACCCTTTTC +CATAGGACTTTCTATTTCTCCTCATAGTGGCTACGCAAAGTTCAAATGTTGGCCCAGTGT +TTTAGGGCTCCTGTATACCAGCCCGTCAGTTTCCAGACACTGGCTCTCTGTACCTGAAGC +ATCGCAGCCTCACCTGCTTAGTGCCACAATCCCTCTGAGTTAAATGCCTTTCCCTTCCTC +TGTCCATTAAGATCCTGATCATCCTTTTAATATCATCTCTAATCTTTCTTTTTTCTGGTA +AGAAATTTTCTTATATCTTGTTTTTGCTTTGTAGCATCGTTCTTTTTCTGTGCTGCTGTA +GTGTATTGTTTATCAGCATTCATTTTCCCACTTTTATTATAATAAATATAATAAATAAAG +TGGAGATAGAGTTCAGCTGAATGGCCAGTGATTCAACCCATTGTGTCTCCCTCCGAAAAC +CCCAATAAAACCTCAGAGCACTATGGCTCTGAGGAGCTTCCTGGGTTGGCAGTGCTCTTT +GAATATTGTCTCATGTTGATGACAGGAGAGTAAGGTGTCCCAGAGGACAGTGGAGCTACC +TGTTAGGGAACCTCCCAGAATGTGCCCTATAGGTCACTTGCTTTGGCTGTTTTGTTTGTT +TGTTTGTTTTTGTTTTTGTTGCTGTTACTATAATGAAATTGTAATGGTAAGTGTTCTGCT +TTCCTGAGTTTGGGGAGTAATTCTAGGGAATTATCAAACCTGAGGGAGCCCTGGGGAACC +TGGATTTCTCACTTGCTGGTCTAAAGTAAGGATGGCCCTGATGTCGACATCGTGGTCTGG +AGAGTGGTGTCCCTGACCTTGGGCTGCTAATGCCTGGATCTGTATCCCTGGGCTCAGGGC +TTTGCATGTGATCGCTCAGTAAATGTTGGTGTTGAATCGCTGTTTATGCTTAAATGCATG +TTTGTGTGTTTGTGTATTGTTTTGTGAAACAGTAGTCATCCTTATGTGTGATATACTTGA +AAATACATACAGTTGACCCTTCAGCCACAGGGGTTTGAACTTTGAAGGATTTATTCGAGG +AATTTTTTTAATCAAGCACAGATGGAAAATACCGTGTTGTCTGAATGAGAAACCCATCTG +TATGGAGGGCTGACTTTTCCTATACTTGGGCTCCACATTCTCAGGGGCCAACTATAGGAC +CTGAGTATGCACGTTTTGGTATACTGGGTAGGGGTGGGAGGGTGTCCTGAACCAGCTCCT +GAGTCTACTGAGGGACAGACCACTGTATTTTATTTTTATTCTGTTGTATTTCATTTTAAA 
+ATAAGCTGGTTGCAACCCACTTCATTTTCGTGATTCACTCCTGGATCATTAGTAGCAACT +CTTTAAACAATAATTTTTGTTGAAATTGTTTTTCTGATATCTTTGTCATCTGTACTTGGA +GTACTTGGCTGGTTGGTGTCAAATCACTGAACTTGAGCCTGATAGGCTGTTCTCCAGCAG +GGTCTACCCTCACTGGTTGGACAAAGGCGTCAAAGGATAGTGTTGGCCTTTGTTTTTCCT +GAGGGATAGCACAGCCCCCGTGTGTCAGGTGTGTAAAGAAGGGGAGCTGGCATGTGCCCT +TCCATCCAACCTAACCCTGTTTCCCCACCCCTCCCTTTTTTTAAAGGTAACATTTGTCGA +TCACCCATTGCAGAAGCAGTTTTCAGGAAACTTGTAACCGATCAAAACATCTCAGAGAAT +GTAAGTACCATTCATTATCTTAAAGAGGCCAACCTGAACTCCTCTGGGCAGGAAATTGCA +AAAAAAAAAAAAAAATTCCATGTTTCTTCCCCTGCAGTGGAGGGTAGACAGCGCGGCAAC +TTCCGGGTATGAGATAGGGAACCCCCCTGACTACCGAGGGCAGAGCTGCATGAAGAGGCA +CGGCATTCCCATGAGCCACGTTGCCCGGCAGGTACCGTCCTTGGACTTGAAGTTGTGTGT +TTTGTGTTTCAGTGGGTCATTGACAGCGGTGCTGTTTCTGACTGGAACGTGGGCCGGTCC +CCAGACCCAAGAGCTGTGAGCTGCCTAAGAAATCATGGCATTCACACAGCCCATAAAGCA +AGACAGGTAGACAAGCTCTTGTTCAATTTCTAATATATAGAGTCCAGTAACTTGAGAAGT +AGCGAAAGGATTAACCAGACTTGTATATTAATGAATGTGTTTATTTAGGGTGAGCTTAAC +CAGCTATGGTGTGTCCATTTTGTTTCACTTCTGGTTGCACGGTGTTGAAAGACTTGCCTG +ACTTTGGAATTTACTTATTAAAATGCACATAAAAGCTAGGTAATTTATAATGAGAGAGCC +TGACTGTGAGCTGGGGCTGAGCGGTGCTCTGTCTTCTGTTCCTTCCTGCATAATTTTTAT +TAAACATTTAGGCCATAGTAATCATCCTGCTGATATTGCAAGTTTGTTGCTAGAATGAGG +TTATATAATATATACAAAAACATTTTTTCAACTGTAAAGTGCCTTAGTAATATAGGGTAA +TACCAGCAACATTATGGATATATAATTATAGTCTATTGGGCCACACTTAAGTTTGGAGTC +TAATAAAGTCACAATCAAATTCTGCAATTTCAATTGAAGATAACCTTGTCTTTATATTAT +GAATTAGAAGCTAAAGTTGATTTTTCTAAGAGTTCTTTATTTAAATGAAGTACTCTGGGA +CTGACCTTTTCGGAAATGGAATCTTCATTGGTCAGGTGATTCAACATTTTTATACAATTT +ATCCATCCTCATCTCTTCAGGATTTGCATACCTTGCCAGTTTCTACTGGCCATTGTTGAA +AATACATTTATTTGGAGAAGTCCAAAGCCAAGGGGCTCATGGGGCTGTGAAGTCCTTCTT +GCTGCATCGTCCTGTGGTAGAAGGTGGAGGAGTCAAGAGAGTGCCCCAGAGTGAGTGAGA +GCGAGAACTAGAAAAACGGGAAGAGGGAAACAGAGGAGAGAGAGAGAGAGGACCCATCAG +TGTCAGGAGCCCACTCCCAAGATAGTGGCATTAATAAAGATCCTGCGTCTCACTATTGTT +GCATTGGGGATGAAGTTTCCAACAAAGGAACTTTGGGGGACACATCCAAACCATAACATA +GGATTTAAATAATTTTACAGAGTTCAAGAGTTCTGCTACTGAACCGTTTGAGATCCCTGT +TCTGAGGTCTCATCACTTTCCAGTTTTAGCAGGAAGAGAAGTGGCAAGTGGCAGGAGTCT 
+GCAGATTGGGGCCTGCACCTTTTTTTGAGGCACCTTTTTTATGAACAGTGTTTGTTGGAA +CACAGCCAGACTCACTCATTCACCTGTAGTTCGCGGCTGCTTTTGTGCTAGAGCAGCACA +GCCAGAGCGGCCTGGTGGCCACAAAGCCCGACTGTCTCTGCAGCACTGGTTTTTTATATG +GCGTTTCTAGTTGTTTTTAGTAGTAGGGGTGATTTGCCACAAGCTGTTTCATTTATAGCT +GCAAGTGGAAATCCTTTATTGTGCATTTGTTACATAAGTTATAGCTGTTTCTTTCTCAAT +TATTGTAACATCTAAAAATTATGTAACAGTTACTAAGTCTTTTTTTATGAAAAATTTACA +ATTAGTCCAGAATTTAAAACAAGTTTAGTATTAAATCCTAGGGAATTTCTCACATAACTA +ATTTGGAGAAATAATATAATACTGTTCAGGCATGGTGGCTCACACTTATAATCCTAATAA +CACTTTGGGAGGCTCAACATTGCTTGAGCCTAGGAGGTCAAGACCAGCCTGAGCAACATA +GTGAGACCCATCTCTACCAAAAAAATTTTAAAAATTAGCTGGGTGCTATGACATATGCCT +GTAGTCCCAGCTGTTTGGGAGGCTGAGGTAGGAGGTTGGCTTGAACTCAAGAAGTTGAGG +CTGCAGTGAGCTATGATCACACCACTGTACCCCAGTGTTGGTGACAGAGTGAGACCTTTT +CTCTAAAAACAAAAAGAAAAAATTCAGTATTGTACTTATTCTCAATTTTGAAATAAGTCA +ATGTCATGGGAGTGTTGACGAAATACAGTCTTTTCTAGCTACTGGAAGTGCATACTAAAA +GCCAAAGTTTCTCATTTTTTAGAAGAACAGGTATACACTTTCCACCTTTGTTCCGCAGTA +CGTATGTTTATGTTTCTTATTCTGCCATTATTTATAGTAGTCGAAATTCACAAAATAATC +TTTTCATTTGTATTTTAAGTAGGTGTGTTTATAGATTATATCAGAAAACAATCATAAGCT +TTCAGGTTACAATCAGTAGAAATACTGAGTGGCATTCTCTTCTGTGTGAGGTTTAAATTA +TGGATAGTCAATCAAAATAAAGGAGGACTGTCAGTTATCACTCCTGTGTTTCTTTCTTAG +TGACCACCTCTGACCACCCAACTTAATACTTTTACTCCTCCCTGCCCCAAAGCTGTATGT +ATCTCCTCTCTGCTTTATTTCCTCCAGGGCATCTGTGCCTCCTGTGTGTTGCCTGCCCCC +ATGTGGAAGGGCAGCTCCATGGGACGCGGCTCTGTCCACTTAGCTTGGTGTTTGTTACTG +GGGCCCAGGATGTTCAGTGCTTGTGGAATGAAAAATAGAGTAGAATAATGAAGGGGATTA +AATATGTGACATTTTAGTATGTTGACTGTATTATACACTGCTACTAAGGAATTGAAGCCG +ATGTATAAACATTGTGTCTACACATTATACTATTTTATGATTATTGTATGGAACTTTATA +ATACAAAATTTCTTTGCAGTACAATTTTGAGAAATAGGTTAACTCTATTTTAACTTAAAA +GTACCCTAAAAATTATTTAATTGTTTATTAGTGAGTAACAGGCTCAAATACAGCAGTTTT +TTTTTTCTTTTAGTATTGCTTTGCATCCTCTAGGCTTGAATGGTATAAACACTGTGTTTT +GACTTCTTATTCAATTTTAGATTACCAAAGAAGATTTTGCCACATTTGATTATATACTAT +GTATGGATGAAAGCAATCTGAGGTAATCCTGTTTTTGAAGAATATTTCTGTTCAACTCTC +AGTTCAGCAGTGGGCCAAGTAATTTGTTGTCCAGATTTACTTTTTCTATTTTAAAGGTTT +TAATAGTCAGTGATGGTCACCATATGTAGAATTATTTTATTTAGAACAGCAAAAAACTTA 
+GTAATCTAGAATTGTCTCTTAGGTATTTACAAGGAAATACACCTTAGAAAAGGGAAAGTC +TAGTTGTTAATAGCATGATTTAATTGGAAAACTAGCTTGGCTGTTATAGTTTTGTATCAA +GCATATTGTTCTGCATGAGAACGATTTTGATATTTTTGCTGTAATGATTCCTGGGCAGGG +CAGTGTTTTATTGATTCTCAGGTCAAAGCTCTGCAACCATAGGTTCCTATTATTATCCCA +TCTTGTAAAAGAAGGAAGTGAGAAACGGTAATGTTAGGTTATGTCCCCGAGTTCACACAG +CTAGCATGTGGTGGATGTCATGGGATTTAAGCCTAGGCTCAGTGAGGAAGGGCCCTGCTG +TATGCACCCCTGCCTCAGTCACCACCCTCCTGACCTGCCCCAGTCCCATGTTCAAGGCGT +CATCGGGAACGGATCATAATTTAATGTCACTCCAGGATTCTGACTTGCATCACTACTTAT +TCATAAATATCTCATGTTTTTGATGAAGATAAACTATTTGTGCTTTTAATTACTTTTGTA +CAAATGAACAGATAGGGATGCACCAGTTTCAATTATTTTTTATTAATTAGGACCTTCATT +AAAATGCATGTATTTTTTAATTTTTATTTTTCAGTCCTTATTCTGTTCTCATTGTTTTTG +CTGACACCATAGCCCATAGGCATCGTAAAAACACCTTTTTAGGAGACCCCATTTGCTTTG +ACTGGCACTGAGCACCGTGTTTCTTTGCGTGTGGGGCAAGTGGAGTTCCTGCAGCCTCAC +ATTTGCACGTGCTGTTCCTCTGCCTGGAAGCTCCTCCTTGCTTCTTGGGTCATTCGGCTC +CTATTTGCCCCTCGTGGGTCAGCTTAAATTCATTTCTTGATAGAGGTATTCTGCTGCCCT +GTCATTAGGCCAGATTGTGCATTAGACTTAGGCCTGCCTGCAGTTCTCATTCCTGCACCT +AATCCTGAGCTGTAAAACTTCGTGCATGGGAGCTCTTGGCTGTTCCGTTCCCACATAAAG +CACGTACCTGACTCATCAACTCATCTGTCGCAGGTGCTCAGTCGAAATTTGTTAAATAAA +TGGAGGCGCTTAGTTAAGTTTGCCTTTTTCTTTTGTAGCAATGTGAAAGCTAAGGGTGGA +AGTCTAGAGTGAAGCTGAGTTTTCAGCTTGGGCAGAGGCTTAAGGAATAAAAGATGGAAG +AGTAATTAAGGAGTGGGTCGTGCTGTGGGAAGTGCTGTGTCAAGACACCTGAGGATGGAA +TTGGTAGCCCTCTGTTTTGGGACTGGTTCAGATGGTGTTTTGGGTTCCTTTCTATGACCT +TTATCTCTGAAACTTGATTGTCTTAAAATGTATTTTGTGGGAGAAAATATAATTATTGTA +TATTTTGTGTAACAGAATCAGTGAGAATAAGCTGCTGTCGCAAACTGTCTTGCCTAGAGA +GAGGGCAGTGGCATACAGCTGCAGAAGGGGCCACACGGGCCTGCTGAGTGTTCCGTTTCA +TTTCAAATTAGGTCATTCTGTCTTGATTTTGATATGGATGTTTCAGAACACCCTAGCAGA +TGTCCCTGTTTAACTTGAAACCATAGATCAGAAAACTAAGTTCATATTTCAATTTTACAG +AGATTTGAATAGAAAAAGTAATCAAGTTAAAACCTGCAAAGCTAAAATTGAACTACTTGG +GAGCTATGATCCACAAAAACAACTTATTATTGAAGATCCCTATTATGTAAGTACAGTTCA +CGTTTTAGGGCTAATATGAAGACCCAACACATTTGTATCCTGCCATATTAAATAACAGAT +GAGATTGTGTTAAGGATGTTTTTGTTATGCAGGTTTTGCCATTTTCTTCTTTTTCCTGTC +CATTTAGGGGAATGACTCTGACTTTGAGACGGTGTACCAGCAGTGTGTCAGGTGCTGCAG 
+AGCGTTCTTGGAGAAGGCCCACTGAGGCAGGTTCGTGCCCTGCTGCGGCCAGCCTGACTA +GACCCCACCCTGAGGTCCTGCATTTCTCAGTCGGTGTGTAATCACGTTCCAGGGCCCAAA +GCCCAGCTCTTTGTTCAGTTGACTTACTGTTTCTTACCTTAAAAAGTAATTGTAGATGGA +AATCAGTTGTGTTTGGCAGGAGAATCAATAAAAATCTTTGATTCAGACAGCTTATGGGGT +ATTTTAAGCATTCTTAGACTAGTTGAACATCTCACTTTGCCCCAGTTACAAAAATAGTAG +AACAAGCAACATAAAACAATGAAGGAAAACCTCACTTGAAGGCCCAGGTCAACATCTAAG +CCTGTTGAGACTTAGATAATCGAGTCTACCTCTTCAGTAGGTTTGTGTGGATGGCCTGGA +GGGCAGGTGCCCTCTGCTCCCCAGTGCTACCTCTCTCTTCCCTAGGGCCTTTTGTGGATT +GACAGTAGTCCCCTCCGTAGGAGCTCACAGTCTAGATTAGAAGTGTTTTAATTTCTACAC +ACCCATAGTGCACACTTGTATATTGAAAAGATAGGGAAGAGAGAAACATTTATGGAATCA +GTCGTTGGCACCTTCAATACTTCATGATTTTTGTCGAGTTTACTTCATGAGGAGGTCAGC +CCATTGGCTCCCATCTGAACCACTTTGCCTCTGAAACTTAATTACATCCAGAAAGAAGGA +CACTTGTATGCTAGTCTATGGTCAGTTGAGGAATATGACTGTTTTTATATGCACATGTAA +CCCAAATGTCCAATATAAATTGGCTTATTTTTTAAAATAATTTTAAAAGTTGGGAAAAGT +GTTATTATTTGGCATGCTTAAATATTGAATAAGTATTCTTCATCAGCATTTAATAAATGT +ATAGGCAGATGTAAGGTAATTTCTGTGTATTTTGAGATAATGTCAAAATCATGAATATTT +CAAAATAAACTGGGGAGTTATAAAAATACAACTAGAGATATAAA diff --git a/test/files/comb/case_87/proteome.fasta b/test/files/comb/case_87/proteome.fasta new file mode 100644 index 00000000..21cf6229 --- /dev/null +++ b/test/files/comb/case_87/proteome.fasta @@ -0,0 +1,3 @@ +>ENSP00000411121.1|ENST00000453390.5|ENSG00000143727.16|OTTHUMG00000086933.6|OTTHUMT00000195864.4|ACP1-209|ACP1|70 +MAEQATKSVLFVCLGNICRSPIAEAVFRKLVTDQNISENWRVDSAATSGGSLTAVLFLTG +TWAGPQTQEL diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py index 182bd1e8..e6fb886e 100644 --- a/test/integration/test_call_variant_peptides.py +++ b/test/integration/test_call_variant_peptides.py @@ -1309,3 +1309,17 @@ def test_call_variant_peptide_case86(self): expected = test_dir/'brute_force.txt' reference = test_dir self.default_test_case(gvf, reference, expected) + + def test_call_variant_peptide_case87(self): + """ + Issue that the in-member end node not treated as a end node when aligning + the 
variant bubble. + """ + test_dir = self.data_dir/'comb/case_87' + gvf = [ + test_dir/'gSNP.gvf', + test_dir/'AltSplice.gvf' + ] + expected = test_dir/'brute_force.txt' + reference = test_dir + self.default_test_case(gvf, reference, expected) From c85ef4512580206c526fc689b37516a5fc1f9e12 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Mon, 3 Mar 2025 13:37:24 -0800 Subject: [PATCH 2/9] style: remove trailing white spaces --- moPepGen/svgraph/ThreeFrameTVG.py | 2 +- test/integration/test_call_variant_peptides.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py index fd3ffaa9..21de01fd 100644 --- a/moPepGen/svgraph/ThreeFrameTVG.py +++ b/moPepGen/svgraph/ThreeFrameTVG.py @@ -1903,7 +1903,7 @@ def fit_into_codons(self) -> None: cur = self.merge_with_outbonds(cur)[0] queue.appendleft(cur) continue - + self.align_variants(cur) self.collapse_equivalent_nodes(cur) diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py index e6fb886e..40148fc4 100644 --- a/test/integration/test_call_variant_peptides.py +++ b/test/integration/test_call_variant_peptides.py @@ -1309,10 +1309,10 @@ def test_call_variant_peptide_case87(self): expected = test_dir/'brute_force.txt' reference = test_dir self.default_test_case(gvf, reference, expected) - + def test_call_variant_peptide_case87(self): """ - Issue that the in-member end node not treated as a end node when aligning + Issue that the in-member end node not treated as an end node when aligning the variant bubble. 
""" test_dir = self.data_dir/'comb/case_87' From 9d1d6faf8ee331d9a7e835599c1df775fe899657 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Mon, 3 Mar 2025 15:27:48 -0800 Subject: [PATCH 3/9] fix (parseVEP): --skip-failed added to skip VEP records failed to be converted to GVF --- moPepGen/cli/common.py | 8 +++++ moPepGen/cli/parse_vep.py | 58 ++++++++++++++++++++++++++---- test/files/vep/vep_indel2.txt | 4 +++ test/integration/test_parse_vep.py | 13 +++++++ 4 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 test/files/vep/vep_indel2.txt diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py index 2ff458ca..0c32f42a 100644 --- a/moPepGen/cli/common.py +++ b/moPepGen/cli/common.py @@ -158,6 +158,14 @@ def add_args_decoy(parser:argparse.ArgumentParser): metavar='' ) +def add_args_skip_failed(parser:argparse.ArgumentParser): + """ add --skip-failed """ + parser.add_argument( + '--skip-failed', + action='store_true', + help='When set, the failed records will be skipped.' + ) + def add_args_debug_level(parser:argparse.ArgumentParser): """ add debug level """ parser.add_argument( diff --git a/moPepGen/cli/parse_vep.py b/moPepGen/cli/parse_vep.py index 5c4bb5bd..5077a690 100644 --- a/moPepGen/cli/parse_vep.py +++ b/moPepGen/cli/parse_vep.py @@ -5,13 +5,13 @@ variant peptide sequences. 
""" from __future__ import annotations +from typing import TYPE_CHECKING import argparse import gzip from typing import Dict, List from pathlib import Path from moPepGen.parser import VEPParser -from moPepGen.err import MNVParsingError, TranscriptionStopSiteMutationError, \ - TranscriptionStartSiteMutationError, warning +from moPepGen.err import TranscriptionStopSiteMutationError, TranscriptionStartSiteMutationError from moPepGen import seqvar, get_logger from moPepGen.cli import common @@ -19,6 +19,9 @@ INPUT_FILE_FORMATS = ['.tsv', '.txt', '.tsv.gz', '.txt.gz'] OUTPUT_FILE_FORMATS = ['.gvf'] +if TYPE_CHECKING: + from logging import Logger + # pylint: disable=W0212 def add_subparser_parse_vep(subparsers:argparse._SubParsersAction): """ CLI for moPepGen parseVEP """ @@ -38,12 +41,39 @@ def add_subparser_parse_vep(subparsers:argparse._SubParsersAction): ) common.add_args_output_path(p, OUTPUT_FILE_FORMATS) common.add_args_source(p) + common.add_args_skip_failed(p) common.add_args_reference(p, proteome=False) common.add_args_debug_level(p) p.set_defaults(func=parse_vep) common.print_help_if_missing_args(p) return p +class TallyTable(): + """ Tally table """ + def __init__(self, logger:Logger): + """ Constructor """ + self.total:int = 0 + self.succeed:int = 0 + self.failed:TallyTableFailed = TallyTableFailed() + self.logger = logger + + def log(self): + """ Show tally results """ + self.logger.info("Records successfully processed: %i", self.total) + self.logger.info("Records failed: %i", self.failed.total) + if self.failed.total > 0: + self.logger.info("Out of those failed,") + self.logger.info("Start codon mutation: %i", self.failed.start_site_mutation) + self.logger.info("Stop codon mutation: %i", self.failed.stop_site_mutation) + +class TallyTableFailed(): + """ Tally table for failed ones """ + def __init__(self): + """ constructor """ + self.start_site_mutation:int = 0 + self.stop_site_mutation:int = 0 + self.total:int = 0 + def parse_vep(args:argparse.Namespace) 
-> None: """ Main entry point for the VEP parser. """ logger = get_logger() @@ -64,31 +94,45 @@ def parse_vep(args:argparse.Namespace) -> None: vep_records:Dict[str, List[seqvar.VariantRecord]] = {} + tally = TallyTable(logger) + for vep_file in vep_files: opener = gzip.open if vep_file.suffix == '.gz' else open with opener(vep_file, 'rt') as handle: for record in VEPParser.parse(handle): + tally.total += 1 transcript_id = record.feature if transcript_id not in vep_records: vep_records[transcript_id] = [] + try: record = record.convert_to_variant_record(anno, genome) + tally.succeed += 1 except TranscriptionStopSiteMutationError: + tally.failed.total += 1 + tally.failed.stop_site_mutation += 1 continue except TranscriptionStartSiteMutationError: + tally.failed.total += 1 + tally.failed.start_site_mutation += 1 continue - except MNVParsingError: - warning( - f"MNVs are not currently supported. Skipping record: {record}" - ) - continue + except: + if args.skip_failed: + logger.warning( + f"VEP record failed to convert: {record}" + ) + tally.failed.total += 1 + continue + raise vep_records[transcript_id].append(record) logger.info('VEP file %s loaded.', vep_file) + tally.log() + if not vep_records: logger.warning('No variant record is saved.') return diff --git a/test/files/vep/vep_indel2.txt b/test/files/vep/vep_indel2.txt new file mode 100644 index 00000000..5f326081 --- /dev/null +++ b/test/files/vep/vep_indel2.txt @@ -0,0 +1,4 @@ +rs59404993 chr22:399-400 C ENSG00000128408.9 ENST00000614167.2 Transcript frameshift_variant 399-400 205-206 69 T/TX acc/aCcc - IMPACT=HIGH;STRAND=1;SOURCE=GENCODEv34 +rs59404993 chr22:399-401 TAA ENSG00000128408.9 ENST00000614167.2 Transcript frameshift_variant 399-400 205-206 69 T/TX acc/aCcc - IMPACT=HIGH;STRAND=1;SOURCE=GENCODEv34 +rs59404993 chr22:399 TTT ENSG00000128408.9 ENST00000614167.2 Transcript intron_variant 399-400 205-206 69 - - - IMPACT=MODIFIER;STRAND=1 +rs59404993 chr22:4981-4983 TT ENSG00000128408.9 ENST00000614167.2 
Transcript start_lost,inframe_deletion 1-3 1-3 1 M/- ATG/- - IMPACT=HIGH;STRAND=1;SOURCE=GENCODEv34 diff --git a/test/integration/test_parse_vep.py b/test/integration/test_parse_vep.py index 2bc3f61a..4b82db1d 100644 --- a/test/integration/test_parse_vep.py +++ b/test/integration/test_parse_vep.py @@ -55,6 +55,19 @@ def test_parse_vep(self): self.assertEqual(files, expected) self.assert_gvf_order(args.output_path, args.annotation_gtf) + def test_parse_vep2(self): + """ Failed records are skipped """ + args = self.create_base_args() + args.input_path = [ + self.data_dir/'vep/vep_indel2.txt' + ] + args.skip_failed = True + cli.parse_vep(args) + files = {str(file.name) for file in self.work_dir.glob('*')} + expected = {'vep.gvf'} + self.assertEqual(files, expected) + self.assert_gvf_order(args.output_path, args.annotation_gtf) + def test_parse_vep_gz(self): """ Test parsing gzipped VEP output into GVF """ args = self.create_base_args() From 1e04233f0199a061d0f4247f51d45f8855639bc0 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Mon, 3 Mar 2025 19:45:17 -0800 Subject: [PATCH 4/9] fix (callVariant): variant bubble finding error starting from a out-bridge node. 
--- CHANGELOG.md | 6 + moPepGen/svgraph/ThreeFrameTVG.py | 8 +- test/files/comb/case_88/AltSplice.gvf | 20 + test/files/comb/case_88/annotation.gtf | 10 + test/files/comb/case_88/brute_force.txt | 39 ++ test/files/comb/case_88/gSNP.gvf | 14 + test/files/comb/case_88/genome.fasta | 439 ++++++++++++++++++ test/files/comb/case_88/proteome.fasta | 162 +++++++ .../integration/test_call_variant_peptides.py | 13 + 9 files changed, 708 insertions(+), 3 deletions(-) create mode 100644 test/files/comb/case_88/AltSplice.gvf create mode 100644 test/files/comb/case_88/annotation.gtf create mode 100644 test/files/comb/case_88/brute_force.txt create mode 100644 test/files/comb/case_88/gSNP.gvf create mode 100644 test/files/comb/case_88/genome.fasta create mode 100644 test/files/comb/case_88/proteome.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index feba44bd..e6068231 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [1.4.6-rc2] - 2025-03-03 +### Fixed + - Fixed callVariant that variant bubble not identified correctly. +- Fixed parseVEP that failed records will now be skipped with --skip-failed. #902 + +- Fixed callVariant with variant bubble finding error starting from a out-bridge node. + ## [1.4.6-rc1] - 2025-02-24 ### Fixed diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py index 21de01fd..76f3ff28 100644 --- a/moPepGen/svgraph/ThreeFrameTVG.py +++ b/moPepGen/svgraph/ThreeFrameTVG.py @@ -1742,8 +1742,6 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: for trash_node in trash: self.remove_node(trash_node) - return start_node, end_node - def expand_alignments(self, start:TVGNode) -> List[TVGNode]: r""" Expand the aligned variants into the range of codons. 
For frameshifting mutations, a copy of each downstream node will be @@ -1904,7 +1902,11 @@ def fit_into_codons(self) -> None: queue.appendleft(cur) continue - self.align_variants(cur) + cur_copy = str(cur.seq.seq) + try: + self.align_variants(cur) + except err.FailedToFindVariantBubbleError: + continue self.collapse_equivalent_nodes(cur) if cur.out_edges: diff --git a/test/files/comb/case_88/AltSplice.gvf b/test/files/comb/case_88/AltSplice.gvf new file mode 100644 index 00000000..1c39a4c6 --- /dev/null +++ b/test/files/comb/case_88/AltSplice.gvf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.2 +##mopepgen_version=1.4.6-rc2 +##parser=parseRMATS +##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1 +##genome_fasta= +##annotation_gtf= +##source=AltSplice +##CHROM= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +ENSG00000137959.17 8005 SE_669-1042-1070-9050 A . . TRANSCRIPT_ID=ENST00000476876.6;START=8005;END=9049;GENE_SYMBOL=IFI44L;GENOMIC_POSITION=chr1:78627906:78628950 +ENSG00000137959.17 8493 A5SS_9098-8492-9818 G . . TRANSCRIPT_ID=ENST00000476876.6;START=8493;END=9098;GENE_SYMBOL=IFI44L;GENOMIC_POSITION=chr1:78628394:78628999 +ENSG00000137959.17 9819 SE_9097-9818-10014-10092 G . . TRANSCRIPT_ID=ENST00000476876.6;START=9819;END=10014;DONOR_START=10092;DONOR_END=10286;DONOR_GENE_ID=ENSG00000137959.17;GENE_SYMBOL=IFI44L;GENOMIC_POSITION=chr1:78629720:78629915 diff --git a/test/files/comb/case_88/annotation.gtf b/test/files/comb/case_88/annotation.gtf new file mode 100644 index 00000000..15d59ec1 --- /dev/null +++ b/test/files/comb/case_88/annotation.gtf @@ -0,0 +1,10 @@ +chr1 . gene 1 26244 . + . gene_id ENSG00000137959.17; gene_type protein_coding; gene_name IFI44L; +chr1 . transcript 630 22461 . + . 
gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 630 670 . + . gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 8005 9098 . + . gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 9819 10014 . + . gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 15436 15588 . + . gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 17131 17302 . + . gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 21120 21220 . + . gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 21534 21708 . + . gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; +chr1 . exon 21874 22461 . + . 
gene_id ENSG00000137959.17; transcript_id ENST00000476876.6; gene_type protein_coding; gene_name IFI44L; tag RNA_Seq_supported_only; diff --git a/test/files/comb/case_88/brute_force.txt b/test/files/comb/case_88/brute_force.txt new file mode 100644 index 00000000..bec9b98e --- /dev/null +++ b/test/files/comb/case_88/brute_force.txt @@ -0,0 +1,39 @@ +AREAILIYN +ATVSIWNVKFFELKAQK +DGRSSYIK +DGRSSYIKNTF +EAILIYN +ENRSKILIR +FFELKAQK +FYGHRQYLECEVFRVEGM +FYGHRQYLECEVFRVEGTEIGF +IFIGYILLK +IFIGYILLKMEK +IIKAREAILIYN +ILIRTTSIVLYYTEK +ILIRTTSIVLYYTEKLSNFMDGR +KIFIGYILLK +KIFIGYILLKMEK +LNQMENR +LNQMENRSK +LNQMENRSKILIR +LSNFMDGR +LSNFMDGRSSYIK +LSNFMDGRSSYIKNTF +MATVSIWNVKFFELKAQK +MDGRSSYIK +MDGRSSYIKNTF +MENRSKILIR +MKIFIGYILLK +MKIFIGYILLKMEK +MLNQMENR +MLNQMENRSK +MLNQMENRSKILIR +QYLECEVFRVEGM +QYLECEVFRVEGTEIGF +SKILIRTTSIVLYYTEK +SSYIKNTF +TTSIVLYYTEK +TTSIVLYYTEKLSNFMDGR +TTSIVLYYTEKLSNFMDGRSSYIK +VEGTEIGF diff --git a/test/files/comb/case_88/gSNP.gvf b/test/files/comb/case_88/gSNP.gvf new file mode 100644 index 00000000..f0bef17f --- /dev/null +++ b/test/files/comb/case_88/gSNP.gvf @@ -0,0 +1,14 @@ +##fileformat=VCFv4.2 +##mopepgen_version=1.4.6-rc2 +##parser=parseVEP +##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1 +##genome_fasta= +##annotation_gtf= +##source=gSNP +##CHROM= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +ENSG00000137959.17 8495 SNV-8495-T-A T A . . TRANSCRIPT_ID=ENST00000476876.6;GENOMIC_POSITION=chr1:78628396;GENE_SYMBOL=IFI44L +ENSG00000137959.17 10248 INDEL-10248-T-TA T TA . . 
TRANSCRIPT_ID=ENST00000476876.6;GENOMIC_POSITION=chr1:78630149-78630150;GENE_SYMBOL=IFI44L diff --git a/test/files/comb/case_88/genome.fasta b/test/files/comb/case_88/genome.fasta new file mode 100644 index 00000000..98431e32 --- /dev/null +++ b/test/files/comb/case_88/genome.fasta @@ -0,0 +1,439 @@ +>chr1 +GAATCATCATCTGTTTGTTCCAGTGCAGACAGGCTTGGCTTTGAACAGCATCCAGCTCCA +CGATTCTATCAGCTAAATTTTTTTTTTGTACTTTAAGAAATAGGGATAGTCATAGTGTTT +ACTGTACGCAGTTGTTCTGAGGAGAAATTAGGAAAAGGTATGCAGTTACACAATGCCACG +TACAACTCTAAATTCCAATTAATAAATGTAGGACAAACGAGGGAAGTAGAAAAATGCACA +AGAAATGTTAACTATTAGCTTTACTGCAGTTTCTTCTTGGTAGTTATCATTTCATGCCTG +CCTACATACATACAAGGGGACCAGTGATAGTTTTTATGTGCTCAGCAAGATTTTTTTTTT +CTTTCTTTCTCTTCCTAGTGAGGAAAAAGAAAGTTAGTGGCAGTTGGCATGCTGCCAGCT +GAGTTTTTTTGCTGCTTTGAGTTTCAGGTTTCTTTCTTTCTCTTCCTAGTGAGGAAAAAG +AAAGTTAGTGGCAGTTGGCATGCTGCCAGCTGAGTTTTTTTGCTGCTTTGAGTCTCAGTT +TTCTTTCTTTCCTAGAGTCTCTGAAGCCACAGATCTCTTAAGAACTTTCTGTCTCCAAAC +CGTGGCTGCTCGATAAATCAGACAGAACAGTTAATCCTCAATTTAAGCCTGATCTAACCC +CTAGAAACAGGTAAGCGACTTTTTAATTGAAACATAGTATTTGTACGTATTTATGGGGTT +ATGTGTGATATTTTGATACAAGCATACAATGTAATGACCAAATCAGGATCATTGGGAAAT +CCATCACCTCAACCATTTATCATTTCTTTGTATTTGGAACATTCCAAATTTTCTCTTCTA +GCTATTTCAAAATAGACAATAAATTATTGTTAACTATAGTTAAACTGTTGTGCTATGAAC +ACAAGAACTTATTCCTTCTATGTAGCTGTATTTTTGTAACCATTAACCAACCTCTTTTCA +TCATCTCCTTCCTCTGACTTTCCCAACCTCTGGTAACCACCATGCTACTCTCTGCCTCCA +TGAAATCAACTTTTATTTTAAGCTTCCACGTGTGAGTGAGAACATCACAGGTAAGTGACT +TCTTGCCATCCAATTTTGCTAGCTGTGTGTGAAGAAAAAAGCTTGCTTTCTTTTTTTCTA +AAAAGGAGTTTCAGAGTGGAATTGCTGCTAATACTTTGCTCTTTCATTTGTCTTTTATTT +TAATGAAAATTTCACACACAGATAACTAGAGAGTATAAATGAACCATACTGTAATCTGAT +ATAGTTTTATCCAATTTTAAAAATGATTTGCTTTTTAAATTAGAATAGTTTTCTTTTACT +TAAATACAAAAGCATTACAAATAAAGTTGAAGAAATCTATGACTCTCTACTATATATTGT +TTTCTTATGAGATGGAGTCTCACTCTATTGCCCAGGCTGGAGAGCAGTGTCTCGATGTCG +GCTCACTGCAACCTCAGCCTCCTGGGTTCAAGGGATTCTTGTGCCTCAGCCTCCCAAGTA +CCTGGGATTACAGGCGTGCACCAGCATGCCTGGCTAATTTTTGTATTTTTAGTAGAGACG +AGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGACCGCAAGTGATCTGCCAGCC 
+TCAGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCACCACACCTAGCCTCTCCTATAT +TCTTTACCTTTCTTTTTATTTCCAGAAGGCAACATTATCTAGACTTTGGTATAAATTATT +GTCATAATTGTTTTTATGGTTTTCTGTATGTGTGTATATATTTATCTAAATATGATCTCT +AGTATAATTTGTTAAAATTACTTCTGTTTTTATATACCAGCAAAAATGGTTTTACACAAT +AACTTAAAAAATTAATATACTTATATTAATTTTATTTTTATTGATACATAATACATTACA +TATTTATGGGGTACAGGTGATATTTGGTGATGCACTTAGAATGTGTAATGATCAAGTCAG +GGTATTTGTGGTTTCCATCGCTTTGAGTATTTATTATTTCCATGTGCTGGGAACAACCCA +AGTCTTCTAGCTACTTTGAAATAGACAATACATTGCTTTTGGCCATAGTCACTCTACTTT +GTTGTTGACTAATAGAACTTCTACCTTCTATTTAACTGTATATTTGTACTCATTAACCTA +CATCTCTTCATCTCTCCCTCCTAGACACCCACCCACCCTTCCTAGCCTCCAGTATTTATC +ATTATACTATCCACATTTATAAGATCAACATTTTTAGCTGATCTAAAATGATGAGTGAGA +ACGTGTGATATTTCCTGTGCCTAGGTTATTTCACTTAATATTCTCCAGTTCCGTCCATGT +TGCTGCAAATGACATGATTTTACTGTTTTTATAGCTAAATAGTATTCCATTGTGTAATAT +ACCACATTTTCTTTATCCATCCACTTATGGACAGTTAGGTTGATTCCATATCTTTTCTAT +TGTTAATAGAAATTCACAATAGAAAGGTTGCTGTGATAAACCTGAGAGACACCAGAAGGA +ATTTTAACATATTGACCATACTAAACAATATTTAACATATTGATTGTATTCAGCTATCTT +TTACAATGTGGTGTACAGACTCTAAAATAGCTCCTAATTGTCCCTGTCCTATTTTTTTTT +TAATACTCATGTCCGTGTATAATCCCCCCTTGTCTGTGGATGGAAACTGTGACTTAACGG +TAACAAATAGAATTTGGCAAATGTGATATGTTAAGATTATAATGTCTGCTATGCTAGGAG +ATTCTCTCCCTTGCTGGTTTCAATGAAGTAAGTGGTCATGTTGGGGAGGCCCATTTGGCA +AAGAACTAAGGGTGACTTCAGGCCAATGACTAACTGAGAACTGAGGATAACCTCTGATCA +TTAGCTAGCAAGAAATGGAGTTCTCATATCTACAACCATAAGGAACTGAATTCTGCTAAC +AACCATAGGAGCTTGGAAGCATGTCCTTCTCCCGTTAAACATTCAGAAGAGTCTGCAGCT +GTGGCTCGCACTTTGATAACTGCCTCATGAAAAATTCTGAAGCAGAGAACCAAACTGGAT +TTCTGACCCAGAGAAACTGTGAGACTATAAATGTGTGTTGTTTTAAGTCTCTAAACGTAT +GAGAAATTGCTATGCAGCAATAGATAAATAGCACACTAAATTAGTTAATACTATTACTTT +ATCGGTAGATGCTTTAAGGTTTTCCACATATAAGATAATATCATCTGTGAGCAGAGATGA +TTATACTTCCTTTATAATTCAGGTACCTTTTCCTTTTTCACTTTCTTTTTTATTTTGGCC +TACTTGTTCTGGCTAGGACATTCAGTATTATGTTGAATAGAAGTGGTAAAAGTGGATAAT +CTTGTCTTGTACTTTATCTTAGAGGAAAAGCTGTCAGTTTTTCACTGCTGAATATGATGT +TAACTATGAACTTTTTATACATGTATTTACTATGTTGAGGTAATTTCCTTCTACTCCTGG +TTTAAGTGTTTTTTGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAATCATGGAAGGAC 
+TTGGGTTTTATCAAATGTCTTTTCTGTATCTATTGAGATGACCAATTTGTATTAGTCAGC +GTTCTTCAGAGAAACTGAACCAACATAAAAAAAATAAAATTAAAAAAAAACTAAGGAAAT +TTGTTATGGGAGGTGGCTCATGTGGTGTTGGAGGGCGAGAAGTCTCACTATCTGCCACCT +GCCAGATGTAAAGCCAGGAAAGCTGGTGGTGTAATTCAGTCTGAATCCAAATGCCTGAGA +ACTGGTGGAGCCAGTGGTGGAACTCCCAGTCTGAAATCAAAGTCCTGGGAACTGTGGGAA +CTGAGAGAGCTGGAGGTGTAAGTCCCAGAATCCGAATGTTTGAGAACCAGGAGCTCAGAT +ATCTGAGAGCAGAAGAAGATGGATACTCCAGCTCAAGAATAGAGAATTTGCTCTTCCTCT +GCCTTTTTGCTCTATTTGGGCCCTCAATGGATTGGATGATGTCAGCTCATGTTGGATCTT +TTTTATGCAGTTTACTAATTCAAATGCTAATCTAAAATGTGTTTGTATATTCATTTGCCT +CAACATTTCTTTTCTTTTCTTTTTTTCCTTTCTTTTCTTTTTTTGAGATGGAGTGTCTCT +CTGTCACCCAGGCTGGAGTGCAGTGGTGCAATCTCTGCCTCCCAGGTTCAAGCGATTCTC +CTACTTCAGCCTTCTGAGTAGCTGGGACTACAGGTGCACACCACCACACCCAATTTTTGT +ATTTTTGTATTTTTGTTAGAGACGGGGCTTTTCCGTATTGGCCAGGCTGGTCTCAGACTC +ATGACCTCAAGTGTTCCGCCGAACTCGGCCTCCCAAAGTACTGGGTTTACAGGTGTGAGT +CACCACGCCTGGCCTCAAGATATTTTTAAAATTCCCTTTTGATTTCATTTTGATCCACTG +CTTATTCCAGAGGGTGTAGTTTAATTCTCACATCCTTGTTAATTTTCTAGATTTTCTGAT +ATTGATTTCTAGTTTCATTACATGGTTATGAGGAAAGACTTGATATGGTTTCAATCTTCT +TAGATTTGTTAAGACTTGTTTTTTGACCTAACATGTAATCTATCCTGGAGAACGCTCCTA +TAATACACTTGAGAAGAATGTGTATTTTGTTGTTGCTGGGTGGAATGCCCTGTATATGCC +TATTAGGTCCATTGGGACTATAGTGTTCAAGTTTTCTGTTTTCCTATTGATCTTCTGTTT +GGATATTTTATAATGGATAAAGTATTGAAAGTGGGCTATCAAAATCTCCTTCTCTTATTG +TGTTGCTGTTTCTCCCTTCAATTCTTTCAGTGTTTGCTTCATTTATTTAGGTGCTCTGAT +GTTGGATGAACATATATTTATAACGGTTATATTTTTCTGGAGAATTGACCCTTTTCTTAT +TATATGATGTCCTTCATAGCCTGTTATGACAGGTTTTGATTTAATGTCTATTTTGTGTGG +GATAAGGACACACACCCCTTGCGGGTTATCTTGCGGTTCTTGCAGTTACCATTTGCAAAG +GTACCCTTATTATCACTTGATTTTCATCCTATGTGTGTTCTTAAGTCTAAGTGAGTTCTT +GGGGACAGCATGCTGTTGTATTTTGTCATGACATGCTTTGACTCAGTATTATCCTTCTTT +CTTTCCTTCCTTTTTTTTTTTTGGTTTGCTAATATTTTGCTGAGGATCATTCCATCTATA +TTTATAAAAGAGATTGGCCTGTAATTTTTCTTGCTTATAACGCCATTAACAGGTTTTGGT +ATCTAAGTTATTCTCTGGAAGAACATGTGTAAGATTGTGTATTATTTATTCTTCAAAATT +TGGTAGAATTCAGCAGCTAGACTCTGATCCTTGAGTTTTCTTGGGGAAGTCTTAAATTAA +TTATGAATTTAGTTTTTAAAAATAGCTATGAGATAGTCAAATTTTGTATTTACTTTTATG 
+TCAGCTTTGGTAAGTTGTATTTTTCTAGAAATTTGTCCACTTATTCTCAAATTTAATTTT +ACAAAGTTGTTTATCTTCAGACCATTTTATCGTTTATCATTTATTGCATCTGGACTGATG +TTTTCTCTTACGTTTTTGATTTTAGGACATTCTTTCCCTTTTTTCCCTTTATCAGTGTCA +GCAGTGTTTTACAAATTTTGTTAATCCTTTTGAAAACTGGGTTTTGGTTTTGTTAATCTT +CTCTATTGTATACTTGATTTGTCCATTAATATTTCTGTCCATTATTTCTTGCTTTTATTT +TTATTGTTGATTTCTTTCTACTTACTTTGAGTTTAATTGGTATTTTTTCTGACTTTTTGA +AGTGAAAGTCTATATAATTAATTTTCAGCCTTTCTCCATTTCTAATACATATGTTTAAAC +TTATATGATTTTTGTAACATTTTATCTTCATTGTATAAGTTTTAACAAGTATCTTTTGTC +ATTTAATTAAATTTCATTATGATTTCTTATGTGAATAATGGATTATTTCAAATTATATTT +TTTATTTTCCAATGTATGAGGGGATCTACTAGTTATCTTCTTGTTAAAAAAATTCTGGCA +TATACTGTCTGTTCAATATTATAGTACTTTGAAAATTTTTGAGACTTGCTTGCTTTTTAA +GGTTTGTAGAATATCTCTTCTATATTCTTTTTCTTTTATTCTTGTTTTCTGGTTCATATA +TCTTAACTAATTTTACTGTATCCTTTAATCCCTTTTGCCCTTTTCTCTTTTTATTCTTTT +TTTGTCTCAATACTTCAGTTGGTAAATTTTCTTCTAATTCTTTATGCCTACCTTGCAGTT +AAATGTATTCATTTAATTATTAATTTCAGATATATATTTTTACATAGCAAATCTCTTTTT +GACTTCTTATTGTATTCATTTTACTGTAAATCATTCAATCTTACCTTTTAGCTGTTTGAA +CATATTAAACATAGTAATTTAAAAATCTCTATGGGTAACTCCAGTATAAGTATGTGTTTG +TTTCTAGTGTGTGCTTTTTCCTCTTGGTTTTCTATCATGTTGTTTCATTTACCTCCAAAC +ATGAATGTGTTTATGATGAATGAGAGTCACTGAATATTTAAAAAATCATAGAGATATTTT +GAAGCTGGAAGAATGTTCTTTTCATCCAGACAAGATTTATTTTGTTTCTGCTATGCAGCT +ATGGACACTAGCAATTCTCATTTACCTTAATTCAATCAAGGACTGAGAATACTGAAGTAT +GAAATTAACTCTGTCTGCTTCCAGTTCATACTTCTTCCCTGTACCTCTTTTCTTAAATTT +TTTTTAAAATTTTATTTCAACAGCATTTGGAGTATAAGTGGTTTTTTTTTACGTGGATGA +ATTATATAGTGGTGAATTCTGAGATTTTAGTGCATCCATCACTCAAGTAGTGTACATTGT +ATCTAATATGTAGTTTTTTTTAATCCCTAGCTCCTCTTCCATCCTCTCCTTTCTGAGTCT +CTAAAGTCCATCATATCAGTCTGTATGCCTTTGTGTACTCATAGCTTAGCTCCCACTTAT +AAGTGAGAGCATATGGTTTTTGGTTTTCTCTGTAGCTCTTTAGAATCCCAAACTGTAGCC +TAAGGGAAAGGTTTCCAAGGTGCTCTCCTTATTGCACTCCAAAGTATGATTTTTATCTCC +CTAACTCTATGAATCTAGCAAACACTTTGCTTAACTTTTCAGCATCTCAAGTGCCTTTTC +TGAATTTGACAGATACACGTCAGAAATGTGCTCCCAAATGCCTAGCTTATCTCTCTTTAT +TAACTTCTCCAAATCTTTGCCAATAGCTATATGTCTTAACTGCCTTGTTTGCTCTTTAGA +GCTTTGTAACACATGCATTTTTATAGTTTATTCTGCTTTTCAAGTTTTTTTCATTGAAAT 
+AATTCTTGTAATGACCAATAGTCTAGTATGCTGCTGTAAGACTTCTATAGCTGTTAACTC +CTTTCATATTTCAATTTCTTTATGTGCTGTTTTTTGCAAGAATTCCTAGGCTCCATTTTC +TAATTCATACGTTGGCAAATGATGAATGTGATATGACAATGATGACCTCTGTTCCAGCCA +TATACTTTTTTAAAGAAAATTTTATTGGAACACAGTCATGCTCTCTTGTTTACATATTAT +CTATGACTTCTTTTGTATTACAGTGGCAAAGTTGAGTAATTGTGGAAGAGACCATATGGC +CTACAAAGGCTAAAATATTTACTATCTTGTTCATTAAGAAAAAATTTGCTGACTCTTATT +GAGATTTATTTTTCCTTTATTCAACTTTCTGAAGTTTGTCAATTGATTTAAAAATTTTAA +TGACTTTTCCTTTCTAAATTATATATTTTTATATCTAACATTATTGTGCCATTTCTATAT +TTTTGCTATATTTTTTGCCTTAACCAAACAATGTGTTTTAATTGAGTCAGTGAATAAAGA +ACTCACTGAAAGTAGAAGTGGAAACCTAAGCTATAAGCTACTATATTATATAAGCTTCTC +AACCTATAATTGTTTTTCCATTAGATATAGAACAATGGAAGTGACAACAAGATTGACATG +GAATGATGAAAATCATCTGCGCAAGCTGCTTGGAAATGTTTCTTTGAGTCTTCTCTATAA +GTCTAGTGTTCATGGAGGTAGCATTGAAGATATGGTTGAAAGATGCAGCCGTCAGGGATG +TACTATAACAATGGCTTACATTGATTACAATATGATTGTAGCCTTTATGCTTGGAAATTA +TATTAATTTACATGAAAGTTCTACAGAGCCAAATGATTCCCTATGGTTTTCACTTCAAAA +GAAAAATGACACCACTGAAATAGAAACTTTACTCTTAAATACAGCACCAAAAATTATTGA +TGAGCAACTGGTGTGTCGTTTATCGAAAACGGATATTTTCATTATATGTCGAGATAATAA +AATTTATCTAGATAAAATGATAACAAGAAACTTGAAACTAAGGTTTTATGGCCACCGTCA +GTATTTGGAATGTGAAGTTTTTCGAGTTGAAGGTTTGTAAAATTAGATAATCCTACATCT +CACATTATGATTCTAATCCTTGTGTTCGGTAGGTAATAATTGATCTACAAGAGCAAGGAA +AGTGAAAGGAACAGTTAGATTCATGCCATACTCTTTGATAAATTCTGAAATGAAACGTGG +GGAACATAAGGATTATGAGGTTTTTATCCAGAACGTATAGGCTGCTTTGAGCAAAGCAAG +TTGAGAAATATATGTGCAGAGATAGAGATGACAAAGGAGATGGGGATGTGGAGAGAGAGA +GATCTTATCATCAGAGCTGCTCTGTTTTGACATATGACCCAAAGATCAGACCCTTATATG +ACCCTTAGAACTTCTAGGTGTAATTATGTGAGAGCTTCCAGAGCAAGTTAATGTGCAGAG +AGGATAATTTTTCCTCCTCTTCCTCTCATGTTGCCAACTCTGACTCCTAACTGAGCAGCT +CAGCCCTGGGTTATGCAAGAAAACTCCATGTCTTATTTCAAAAGTAGACTTGGCCACATT +TAACCAAACAAGTTTTAAAAATGTATTATGCTATGTTTATTTCCTATAGGAATTAAGGAT +AACCTAGACGACATAAAGAGGATAATTAAAGCCAGAGAGTAAGTTGGATTCTTGGGCTAT +CTATTAATCATTTTATTTAGACCAATTGTATAAAGAATGCTGACAAATATGTGGTAAAGA +TAAAAATCATGCCACTGGAATGATTAAAAAAGAGTTATAATGTTATTGACATAAAATAAC +AAAAGCACTACTAACTATGAAACTTTAAATATCCACCTGATGACATTGACCAATTTGACT 
+TCATTTCCACTTTAGACCAAAATTTTCCAGGTAGGGTCTGGATAACCATTTTAGGTAATT +TCTCTGAATCCTTTCCTTGAGAAACAGCTTTTGAGAATCCTCAGTAAATGTCAAAAACAA +ACTTTCTTTTTAAATTTATAGATAGCTACATTGCATATATTTTATTTTGATTTCTCCTTA +ATGTACTTTTAGTGAAAATAATAATTTATGTTCAATGTCACTGATATTTCTTATTACATT +CATGTGATCTAAGACTAATAGTTGAAAAATGGTTTGGTTGCAGGTTAATATTTAACTTTA +GTCAGATATAAAATAATGCATTTTCTTTAAGCACAAAATGTATAACTTTAGATATTTCAA +ATCATATTATAGAAGATCTCATTATCAAAATAATAATTAAGTATTGAAAAAATACCCTTC +TCAGGGAAGAGAAAGAAGCTCTAGGTGACTTGAGATGTTCTTTATGGTATTCTTTATTGG +CTGTTCTGATGCTGTATTTAACCACTATATTTTAACAGGCACAGAAATAGGCTTCTAGCA +GACATCAGAGACTATAGGCCCTATGCAGACTTGGTTTCAGAAATTCGTATTCTTTTGGTG +GGTCCAGTTGGGTCTGGAAAGTCCAGTTTTTTCAATTCAGTCAAGTCTATTTTTCATGGC +CATGTGACTGGCCAAGCCGTAGTGGGGTCTGATATCACCAGCATAACCGAGCGGGTAAGT +TATTTCCCTGAGGATTTTATTTTATAGATTACAATTATTATATTGCTTATGTCTTTGCCT +TTTTCTCTTAGGGCAATCCTAATATACAATTAAATGCTAAATCAAATGGAAAATAGGAGT +AAAATTTTGATCAGAACCACCTCCATTGTGCTATATTATACAGAGAAGCTTTCAAATTTT +ATGGATGGAAGAAGCAGTTATATAAAGAATACTTTTTAGTATTTGTATAAAAAACAACAA +AAAATTAAAAACCCTATGAAAATCTTTGACCATTATATATGATGTTCATTTTACTCATGT +ACTACATGATTTTACCTGGTTTAAAATGTTTTTCTTTTTTTGTAATAAAATAATGAGAAT +TATGTAAGGTAATTTTTTTTAATTACGAGACCATTCTCTAAGACATGGTCATGAAAAGCA +AAATAGCAATTCACTTTGGTGGGTCAACTTTGAAGGAGAACTTGAAGTGTAAATCTTACA +AAAACTATCTTATTTACAAAAAGATATACATAAGGAAATAAATAGTAATATAGATTTCTT +AATTGTTGCTCTAAAGAACTACAAATTATGATGCAGCACTTGTCTAGAAAATTCTTTCTT +GTTTATGCTTTTCAATGTTATCATGCACATGCAATCAAACAAAAAATAATAAAGGGAAAG +AATAGAAAATTTTAGAATCAGGCTTTTTTAATGATTACCACAAATTAATATTGTCACTTG +TTAGGGAATATGCTGTGTATTATCAATGCAAAATTTAGCATAAGACAGCATAATAAGACA +GTAATGGTTCATTTTAGCTTTTGTGTCACATTCAATAACTGGAAGAATTTTCTAGGACTT +TCAAGAATCACATCTTATTTGTTGATGTTGGCCAGATACTGTGCTCTTGAGTGGCTTTAT +TTTAATATATTAGTGTAGTCTCAGAAGCCGTGGCAAGGTACGGCAGCACAAATTAAATTA +ATTAGGCATTGATTGAAATGTTTCTGTTTGCTAAATATTTATTCAGACATAGAGACCTAA +ATACAAATAATGCTTGGTCCCATTACCCAGATATCTCAGAGACCAATGCAGATAATAAAT +CTGTATCATTCATTGTGATATATGCTATAATAGAGATACTACTCAGGACAAAATTTCTCT +TTCTTCCTTTTAAGTTTTAAATTCCAAAGGTCTTATTTTAAGAGAAGATGTAGATACTGA 
+ACTCCAATGAAATGCAAATGTTATTTAAATTGAAGAAATATAGGGCAGACTTTTATGAGG +GAAATTGAATATTTTATTGCTTAAAAGACTCTTTCCTTTGAATAATTGAAGTTGTAAAGA +AAGCAAATTAATGGGTAAGGAAACTAGGAAGATGCATATTGGTGTTCCTGAAATTTTACA +ATGTACTTTAACTCTAGTGCTTAGTATAATTTCTTTCATGTTGAGGCACTGGTAAGTCCT +ATTCTCCTAGATAAAAATCTTCAGTGGGATTACGCCTCTGCGTTTCATGGCCCAATGAGA +GCTTTTGATTTTATTACAGATTACAATAAGTCATAGACTGTTCTCTCCCTTCTGGTTTGA +GTGATACTACACCCTTGCCAGCACCACAGATGGAAAAATTGTAATTAAAGTGTCTCTGAG +AAATCCTTTTCTTTGCAATCTTCCCAAATCGTCCATATTTTTTTGTGTATGTTGAATGTA +TATCTGACCATCAAATAAAATACTTTCAGAATTGTTCTAGAAAATATCCTTGCAATAAGA +TTGAATGTGTTTATCTAGGAAAGAAAGTTCTGTTGTGCTTCTCAAAGTGAAATTTGTGGA +CCCCTTGGGTTCCCGGAGACATTTTAAGGGTATCTCTGTGTTCAAAACTATTTTTAGAAT +TATTCTAAGTTTTTTTTGTCTTTTCTAGTGTATTGACACTTGAATTAATTACACAACATA +GTTTTACCTGTTGGCACATTAGTATGAATCAAACGGTGGCACCAAACACTACTAGTAGTC +ACTGTATTTCTCCCAATTACTGCACTTACAGGAAAAAAAATGCCAGTTTTACTCCTTGAT +GAATAGTAACAGTTATTGGTTTTATTTAAACTCCACTTTGGAATACACATCATTTTGATA +TTCAGTGTGATAAAATGGGAAGTGCTCATAAAGTACTTCTAATGCATATTGAAAATTGGC +AGTTGTCTTTAGAAAAAGCACTGGAGCAGCTGAGTTGTTAACTGAACTTTTTTTAAATGT +GACATCATTTTTACTGGACAGCACAGACTTACTCATTCATACTTGGATATTTGGCAGGTA +TCTTAATGCAAATGCTCAAGTGACCCTACTAATAAAGAACAAAAACAAAAACTGACAGTG +TTTGTTGCCAATAACAAAATTTGAGCTTTTGAGAAAAATCAAACTTTGAGAAACCTGTAT +TCACTATTCTGAGCCTGACAGCTTTTCAACACTTAAAGACTTATCCAATGAAATTAATGG +TGGTATTAACAAATGTGATGTTTTAATATTGCATGATGAAATGTGTCAACATTAGGAAGA +TCTGGATAACCCAGTGGACGAATGTTTTAAGATGATCAATAAATTAGGTTAAAAAATCAT +GCATGAGTAAAGGATATATTCAAAGTATAAGACAAACCAATGGATTTAATGTGGCAGCAT +ATAAAAATGTCGCAGATTCCACATTGCAACTAACCTTTAAGAAGTTATTACTTGTTGAGC +TTTGCTATATTACATTGAGGAATATTCACAATTACCTGAAAGCTATACAGTGAAATATTA +CCTCCTTTTCATATTATACATACATGTGAGAGGCCATATTTTCTTCACATACTTCAATTG +AAACAACATATCAAGAGATTGAATTGGAGCCACATATGAGATTACAGTTATATTCTTTAA +TCCAGACATTAAAGAGAATTTTACTAATAAACCAATGATTTCTGTTTTCTGTAAATTTTT +AAAAAAATGCTTTTTAAGTAAAAAATGTTATTTGATGAACATGTGACAGGTTTATAATTG +AATTTTTAATGATTAAAACAACACTTAAGAGCAGGAGGGGCAGAGCAAGATGGTCACATA +GGTCTCTCCAGCAATCATTCCTCCACTGAAACATCAATTTGAATAATTATTAGCACACAA 
+AATATCACCTCAGTAAACCAGGTGAAAGATTACTGTACCTAGTTCTAGCACAATAATAAC +AAAAGATACATTGAAGAGGGTAGGAAAGATAGTTTTACGTTATCTGCATCACCCCCCTCA +ACCCCAGGGAGCATAGTGTGGAGAGAACTGTCATCTGCTTTGGGGAAAAGAGGGAAGTGA +GCACAGGACTTTGCCTGGACTCCAATATCAGGCCTACTGCAGTAAAATCCAGCACTGGAT +AGAACCTCATGGCCCCAGACTTTAGGCTGTTATTTGACGGCTGAGCCTCTAGCCCCATGC +TGGTGCCAAGTGAGACCATGTGGCCCTAGGCTTCAAGCTTGCATGGTGGACTCAGTCCCT +GGACCACACCATGGATGGGTTATTAGTGGCCCAACCTCTGAACAGCACTTAGTGACACGG +ATGCCGCAGTGGCCCTGAGCTTTACGCATGCCTCAGTGCTGCACCAGTCACAGTGGTCCC +AGGCTTTGGGACTATGCAAGGCAGCCTGCCCAGAATTTTTGGATAGGCTGATTGTTGAAG +GACTTTTCCAGACAAAGGCAATCTGCAAAGATTGGAATAAGTACCTACTTCCAGAGATGT +TTATACATTGACACATTGTCACAAGAGTCAAAAGCAATCAGGGTAACATGACGTCAGTAA +AGAGAGAAAATAACGTGCCAGTGGCCAATCCTAAAGAAATGGAGATGTATGAACCACCCG +AAGAATTCAAAATAATTATTTTAAGGAAGCACAGCAAACTTCAAGAAAATACAGAGAAAC +AATTTAACAAAATGAGAAAAATGATGAGTGATCAGAATAAGAAATTTAATAGAAAGGTTG +TAATAATTATAAAAAAGAAACAAATTTCAGAGATGAAAAATACAATGGGTGAAATGAGAA +ATGAAATAGAGAGCATCAACAGCAGAATTGATCAAGTAGAAGAGTCTGTGAACTTAAAGG +TAGGTTATTTGAAAATATAGATAGAGAATAAAAAAACAGAATGAAGAGGAAGGAAGAAAG +CTTATGGGATTTATGGGACATCATCAAAAGGACAAATATTCAAGTTACAAGGGGGAAAGG +AGACAAGAAATTAAAAGTAATAGAAATCTTAAGTAAATAAATAGTAGCAAAAAACTTTTA +AAACCTGGAGGATGATGTAAATATCCAGGTAAAACTGAGAGAAAAGTCCCCAGTCAGATT +TAATCCAAAGAAGATTAACCCAAGATATATTAAAATCAAACTGTAAAATATCAAAGACAA +AGAGAGGATTCTGAAAACTGGAAGATAAAAAAGCAAATAACATATTAGAAAGTTTTAATA +TGACTAGCAGCAGACTTAGCAGAAACTCTGCAGGCCAAGAGAGTAGGATGAAATATTCAA +AGTGCTAAAAGAAAAAACAAAGAAACAAACAAACCCATCCAATTGAGAATATTATACCCA +GTAAAGCTGTGCTTCAGAAATAAAGGAGGGATAAAGATTTTCCTGGATGAACAAAAGCTG +AGGGAGTTCATTACAATTATACCTGTCTTACAAGAATGTTAAAGCGAGTTTTTCAAGCTG +AAAGAAAAGGATGCTAATAAGTAATATAAAAATATCTGAAAGTATAAAACTCACTGGTAA +AACTACAGTAAAATTCAGAATACTCATACTATAATGATGGTGATGTTTAAAAGACAAAAC +TGCTAAAATAATAATAGCTATAAGATTTTGTTAAAGGACATGCAATATGGAAAGATGTAA +ATTGTGGTATCAAAGGTGCAAACTGTGGGGAGGAGTGGAGTAAAAATGTGGAGTGTTTTT +TTTTTCTCTGTAAGTGGAGCTGTTATTAACTTAAAATAACCAGTTACAAATATGTTTATT +TTTTGTAAGCATCATGGTTAAGCACAAAACAAATACTGATAGTAGATACACAAAATATAA 
+AAAGCATGGAGTCAAAACATACCATTATATATATATATATGTGTGTGTGTGTGTGTGTGT +ATGTGTATATATATATATATATACACACACGTTTTACATGATAAATTACATAGAAATTTC +TTTTTTAATTAAAAAGGAATATTTACATTTTTTTCAGTTGTAATTTCCACTATTTTAAAT +ATCCATAGATATAACCCACATAAACAACAGATCTTTGGGATTCTCAATAACTTTTAAGAG +TCTAAAGGGGTCCTGAGACCAAACTGTTTGAGGACCATGGTGTTCAAATATTAACCAATC +TCTTTTCTTCTTATTGTCATGTCTTGAATGCTGGGTGATGAATGAACCTTTACACTTAAT +GTATTTTTTTAACAGTATAGGATATATTCTGTTAAAGATGGAAAAAATGGAAAATCTCTG +CCATTTATGTTGTGTGACACTATGGGGCTAGATGGGGCAGAAGGAGCAGGACTGTGCATG +GATGACATTCCCCACATCTTAAAAGGTTGTATGCCAGACAGATATCAGGTAAGATTTCTT +CAATATCCAAAACATTTCAAATCATTTTCTTCAGTATTTTTCATTGAATTTTTAACCACA +TAAGGCAATTTCAACTCCCATAATGTGGTTTCAACATCAATCAACACTATTCTACTTTGA +ATGAATAAGAATTCATATGAAAAAGTAGAGTGACTATTGTTCTTTTCTGTAACATTGAGA +AATAAAGCCAAAAGGCAGGATTCTACTCATTTGCATTCTAGTAAAAGATCAAGGAGATAG +TGTGTTTTTATCACTCTTTACAATAATGTAAACTAATGATAGTCACTTAATCCCTTGCAC +TTTTTGGGTTACATAGACAAAATATTGGAACAAGTAACTATTTTGTGTTGAAATGTCATT +TTATACTGTAAATGCTTAGATTTCACCACTAAATACCTAGATTCAAAATTTGGCTTTGCT +ACTAACAGATGTTTACCCGTAAGTTGCACACTTCTCTCTCTCGGTCTTTTTTTTTTTTTT +TTCAGGTGTGAAATATTGCTATTAATAGTGTCTATCTCATAGGGTGGCTAAGATGATTAA +ATGAGATGATATAGTTTAAGTTCTCAAAAAAGCCAACAATGATTGTTACCTTTCTTTTAG +GTGTTGGGTATCAGCCTCAATTTCTAAAGAAGAATCTCTTTTGCCTCCCAATTGTGCTGA +ATTGGAAACCTTACCTATGTATTTATTTATTCATTCAAAAACCATCTATTATAAACCATA +TTTGTGCCAAACTCTAAGCCAGTTACTGAAAACACATAAGTCAATAACAGCACAATCCTT +ATCCTCCAGGAGTTCATAATCCAGTCAGGGAAAAGGAGACACACAGATAAAGAGAGGAGG +GATAGAGCGAAGACATTGGAGAAATCAGGAGCATACCAACACCCTAATTTTTCACAGCAG +GGAGCCAGTCAATTCAGTCCAACACTGAAATACATAGCTTAATAACTACATGTTTCCAAA +CCATTGTTTCATAATCTATTTCCTTAATCTTAGAAAGATTACTGTCTTGTGGTAAGAAGA +CATTAACTTCAAATTTGGTATTTCTTTTATTTCACTTTAGTTTATTTTTCTTCTATAAAA +TCAAAATAAATTACAGTCTTTTAATTTAAATAGCATTTTCAACATATTTTTCTTTCTTTT +ATTCTCTCTATCCAGTTCATTTAAGCATGCAAAATGCTTAGGGAGGAGTCTTAAATTGTA +TTTAACAAACGTTAATGGTGTTTATTTCTGGATACTAGGATTTGAAATTAGCTTTTCAAA +AATATTATTTAAACTTTCCTGCAATACTTACAATGGATAATTATCATTTTTACCAAAGCA +ATAAAGTGTTTTTTTTCCTATAAGAGAGATATAATGGCATTATGCTCACAGTGGAAAGGA 
+AGTAAAGAGGAGGTGCTACTGTTTGGGGAAAGAGTAGCTGAGGAGAAAGAGGGAGATCAG +TAGTTACAGTATTAGAAGCATTAATTGTGTCTTTAAGGGTTAAACAGAGGCTCTCTGATT +TCTGAATGTTACCTTATGTTTTTATAACAGTTTAATTCCCGTAAACCAATTACACCTGAG +CATTCTACTTTTATCACCTCTCCATCTCTGAAGGACAGGATTCACTGTGTGGCTTATGTC +TTAGACATCAACTCTATTGACAATCTCTACTCTAAAATGTTGGCAAAAGTGAAGCAAGTT +CACAAAGAAGTATTAAACTGTGGTGAGTCTCACTGAACTTATAAAAAAATTTACTTTGAA +ATAATTATAGATTTGTAGGAAGTTGCAAAGAGAGTACAGAGATTTCCCATGTACCTTTCA +TCCTATTTCCCTGGAGTGGTGGCCATCGGAGGGAGAGATCATTCACAGGCATGACTGAAG +AAAGACCCCAAATCCTAATGTTCAGTCTTTAAAGAATAGACTGGCCAACCATTTTTTTTT +CGTGTGTGATTTTCCTGAGGCTTTAGATAGGGCTAAAACCTATGTGGGTAGACAGGGAAG +ATCGAAACCAGAAAATTGATTATTGTTAACAATGTGTGTTTATAGTTCTATGTCATTTTA +TCACATGTGTAAGTTTATGTACCCATCACTGCAACCAAATATTACATCACTTTGTAGATT +TCCCTCATGGCACCCACTATTCCTCCACCATCCTTAACTCTTGGCAACCACTAATATTTT +CTTCATTTCCATAATTTTGTCATTTCGTGAATGTTATACAAATGGAATCATACAGTATGT +CACCTTTTGAGACTGGCTTTTTTGACTCAACCTAATATCCTTTGACATCCATCCAAGATG +TATGTATGTATTTATTGTTCATTTAAAAAATTGTATAGTATTCCATGGTATAGATATACC +ACAGTTTAACCATGCTCTATCAAAAAATATTCCAGTTTTTAGCTATTGCAAACAAAACTC +CTATCAAGTTTTGTGTGGATATAAGTTTTTATTCCTCTGGGGTACATCTCCAGAAATGTA +ATGCTGGATCATAAAGTAACTGTATGTTTAGCTTTGAAAGAGATCACCAAACTATACTCC +AGAGGGACTGTACCATTTTACAATCCCACCAGCAATACATGAGAGATCTATTTTTTCTGC +ATTCTTGCTAGTATTTGGTATCGTTGCTATTAAAAATTCCTTAACTCTTCTAATAGATGT +ATGCTGATATTTCATCAGGGTCTTAATTTGCATTGCTTAATGACTAATGACATTGAACAT +CTTCTCATGTGCTTATTTGCCTTCTGTATAGCTTGTTTAGTGAAATGCCTCTTCACGTCT +TTTGATTATTTTCTGATTGGATGATTTGTGTTTTTTAGAAGTTGAATGTTAAAAGATCTA +TAAATCCTAAATATGAGTCAATTGTCAGATACATTGTTTGCGGATAGTTTCTCCCAGTCT +GTAGTTGTCTTACTATTATGATGATTTCCATCCCTGCCATCTCTCATCTCTCCTTTTGGA +ACTCTGGGACATGAATTCTAGATATTTTGTTATAGTCTCACAATTTATTTATTTATTTTT +TATATCTATTTGCTCTTTATTTTTCAGAGTGGGTAATTTCTATTGTTATATCTTTGAATT +CACTTGTTCATCACTTTGCCCTCTCCATTCTGCTTTTGAGCATAGCTATTGTTTTTCCTA +TCTTGGGTATTATGGTTTTTTAGCTCTAAAATTTCAAGTTGATTCTTTTTCAGGTCTTCT +ATTTCTTTGCTGAGACATCCCACTTCCTTCATTTGTTTCAAGAATGTTCCTACTTGGTTC +TTAAGCATTTTTATGATGGTTGCTTTAGAATCCTTGCCAAATAATTCTAACATCTTTGTC 
+ATCTAATTAGGGTCATGATCTTCTGATTAGCTTTTCTCATTCAAGTTGAGAATTTCCTGG +GTTTTGATGTGACAAATGATTTTTAATTGCATCTTCGATATTTTAGGTACTTTATTTTAT +GACTCTGGATCTTACATCTTGTGTTTTAGCAAACATATTCTGATACTGGTAAGATGGAAA +GTACAGGTTTCCCATAAGGCCTTGTTTGACACCCTTGGAGAAAGGGCACTTTCTTACTGT +GCCCCATATAGGATACCCTGACATCACAGGGGCTAGAAGCCTCCTTGCATCGAAGGTTAA +TGAAGGCTAAAGTTCCAGCTTCTCACTTGTTCTCCTCTGACACTTCCAGAGAGGGAAAAG +AGCACCTCATTGCTGCTGATTCGGTGGAAGTCTAGGATCCCCAAGTAGCCTCCCTTGACA +CCATCAAGAGGGACCTTGTTAGTATAGTGCAGGTATAAATGTCCTGGCTCCCTACATTTT +TTTCTTCTCTTATACTACCTTGGCTAAGGGTGAAAGTTGGGAAAGGGAAGGACATATCCA +TGGATAGGGCTAAGGTGAATTCTGTGGTGTTTGACTGGGGTAGTGTGGCTATTTTCTGAA +AGTTTTCTGTCTTGCTACACTGTCCTTTTTCTCATGCTTTGACTAGATGAAACTAGTCAA +CAGCAAAAAAAATTTTCCTTTTTGCTGGCTCTAATTTGTATTTTCCAGTTGTGAGTGTCT +CCAAAAGCCAATCTGGGACATATGAAGTAAAAAGAAAACCTAGAGAGCTCATCTCTGTAT +TGTTTCTTGGGTCTCAAGGTCTCTAGCCAAATTGCATTATCTGCACTTTTCAATCTTTAT +GTTTGATTTATGCATAATGTCCAGAGATTTTAGCTACATTTAGTGTAAAGAAGAGGGAAA +AGTGTGTCTACTCCATTTGGTCCAGAACTGGAAGTACTCTCACTGAACTTCCAAAAGATT +TTAATATAAAGAATTGTTGGGGCATTTATACTCCAAAGCCTCGTTGCTTACTTTATCAGT +AGAAAGATCACATATTTAGGTTCCGAAAAGTGCTATGCCGTTTAATGTATAGAGCATGGT +CTTGCTTAAGTGAACCTAGTTTTTAAATAGTAGTAAACTTTGACCTCGAGAAAGGTAATT +TAATTGCTCTGAACTTTATCTCCTGTTCTTCTAAAATGGGGATAATAAGTTTTGCATCTG +AACTTAATAAACTGTCTAACAAAAGTCTGAGTAGTGAACATTAGATATACACAATGTAAA +TATATAAGCTTATATAAGTACTTAACAGCTCTAAAATTATGGAGGGTTTGGGGTAAGACA +GAAATGCTAAGTTTTAAAATGACTTTTCAAGGATGTATGGCAACACATTGTCCTTGGCGT +GCTGGATGGCAATGCTTAATGCAACTTAGTGTGCTGACTTAGGACCTCTAAAAGGACATT +GTATTAATTTTGAAACAGAAGAATTACATTGCAACAGAGGTTGAAAAAAATTATAGGTGT +CAACTGAGAACTGCTCTCTACCAGAACCTAGGTATTAATTGGCCAACTTTGCTAGTTATT +CCTCTGAAACCTGAATGAAGAGAAGGTAGTAAAGGAAATCATTTAGTAATGGAAAAAATA +AACACAAGATTTTAGTAGGAAGAAGGTACTAAAAGCCCTGTTAAAAGAGTCCTGGGAGCT +TCCTTGACTGGCATTTGAAATTAATCTTATATTTTCCTCATAAGCAGTTGTTAGCATAGC +AGAATTTTGATTTGTTTAGAAGTCAAATGTGATTTACTAATTCAATCCCTGTGGTTTGAG +GGAGAAAGGAGTAGGTCCCTATTAGAACTGTGAAATCAGTACAGTTTGCTCTCATAAATG +CTCATACACAACCCTCCTCACTTCCTCTTTTTCTTGTGGTTCTTTTTAGTCCCCAAAGTC 
+CAGCAACCCCTTTTTTCCCCTTGACCAATTTAAAACAAACCATTTGAATGGAATCGTAGA +TGATTGCAATTTAGTGTGGATTTAGTTGCATTGAAAAATGCTGTCACATGTTGGAAAAAA +AAGATCAAAGATCATTTCTCCTCTAGAAAAGCTTCAGGCTCCATATTGAGATGTATTGGG +GATATTTCATTCTTCAGCTCTCAAAATCACCCTATAGAGTTGCCTTCACACATATTTAAT +AACTTGTTGACACATTGCTATTTATGATATAAAATAACTGATTTATCTATTTGATATAGG +TATAGCATATGTGGCCTTGCTTACTAAAGTGGATGATTGCAGTGAGGTTCTTCAAGACAA +CTTTTTAAACATGAGTAGATCTATGACTTCTCAAAGCCGGGTAAAAAATGCTGATCATAA +CCAGATATTATTGTAATAGTATCACAATCATACGTGTGTGTGTTTGTGTGTGTCTGTACG +TGTATATGTGCTTACAGAGTGTATTTGCAGGGAAATGAAGAATGGGATCAATTGCTTGAT +TAATTCAAATTTATATCACACTTTAAAACTGCAATGATCTGAATTATTGTATGTTCCGAA +GATAACTTATTAAGCCATTGCTTTTTTGGTTTCTTTCTTAAATTGTATTTAAAATTGGTC +AAATTTCAAATATTAAATACAGGACTTACACTTTTTAAATATACTTTCCACAGGTCATGA +ATGTCCATAAAATGCTAGGCATTCCTATTTCCAATATTTTGATGGTTGGAAATTATGCTT +CAGATTTGGAACTGGACCCCATGAAGGATATTCTCATCCTCTCTGCACTGAGGCAGATGC +TGCGGGCTGCAGATGATTTTTTAGAAGATTTGCCTCTTGAGGAAACTGGTAATCTGGCCC +TTTTCTCCCCCTGTCATAGATCACTGGTGCCTTTTGAAAAATCCTAAGTTATACATCAGT +TATTAGATGACTGGGGCCCACCTGCATGCCCTAGTCCTGAAAGCTACATTAGGATATATG +TTTCATTTCCACTCTTGTTTGTTTCATTTTCAGGTGCAATTGAGAGAGCGTTACAGCCCT +GCATTTGAGATAAGTTGCCTTGATTCTGACATTTGGCCCAGCCTGTACTGGTGTGCCGCA +ATGAGAGTCAATCTCTATTGACAGCCTGCTTCAGATTTTGCTTTTGTTCGTTTTGCCTTC +TGTCCTTGGAACAGTCATATCTCAAGTTCAAAGGCCAAAACCTGAGAAGCGGTGGGCTAA +GATAGGTCCTACTGCAAACCACCCCTCCATATTTCCGTACCATTTACAATTCAGTTTCTG +TGACATCTTTTTAAACCACTGGAGGAAAAATGAGATATTCTCTAATTTATTCTTCTATAA +CACTCTATATAGAGCTATGTGAGTACTAATCACATTGAATAATAGTTATAAAATTATTGT +ATAGACATCTGCTTCTTAAACAGATTGTGAGTTCTTTGAGAAACAGCGTGGATTTTACTT +ATCTGTGTATTCACAGAGCTTAGCACAGTGCCTGGTAATGAGCAAGCATACTTGCCATTA +CTTTTCCTTCCCACTCTCTCCAACATCACATTCACTTTAAATTTTTCTGTATATAGAAAG +GAAAACTAGCCTGGGCAACATGATGAAACCCCATCTCCACTGCAAAAAAAAAAAAAAAAA +ATAAGAAAGAACAAAACAAACCCCACAAAAATTAGCTGGGTATGATGGCACGTGCCTGTA +GTCCCAGTTACTCAGGATGATTGATTGAGCCTTGGAGGTGGAGGCTACAGTGAGCTGAGA +TTGTGCCACTGTACTCTAGCCAGGGAGAAAGAGTGAGATCCTGGCTCAAAAAAACCAAAT +AAAACAAAACAAACAAACGAAAAACAGAAAGGAAGACTGAAAGAGAATGAAAAGCTGGGG 
+AGAGGAAATAAAAATAAAGAAGGAAGAGTGTTTCATTTATATCTGAATGAAAATATGAAT +GACTCTAAGTAATTGAATTAATTAAAATGAGCCAACTTTTTTTTAACAATTTACATTTTA +TTTCTATGGGAAAAAATAAATATTCCTCTTCTAACAAACCCATGCTTGATTTTCATTAAT +TGAATTCCAAATCATCCTAGCCATGTGTCCTTCCATTTAGGTTACTGGGGCAAATCAGTA +AGAAAGTTCTTATATTTATGCTCCAAATAATTCTGAAGTCCTCTTACTAGCTGTGAAAGC +TAGTACTATTAAGAAAGAAAACAAAATTCCCAAAAGATAGCTTTCACTTTTTTTTTTCCT +TAAAGACTTCCTAATTCTCTTCTCCAAATTCTTAGTCTTCTTCAAAATAATATGCTTTGG +TTCAATAGTTATCCACATTCTGACAGTCTAATTTAGTTTTAATCAGAATTATACTCATCT +TTTGGGTAGTCATAGATATTAAGAAAGCAAGAGTTTCTTATGTCCAGTTATGGAATATTT +CCTAAAGCAAGGCTGCAGGTGAAGTTGTGCTCAAGTGAATGTTCAGGAGACACAATTCAG +TGGAAGAAATTAAGTCTTTAAAAAAGACCTAGGAATAGGAGAACCATGGAAATTGAGGAG +GTAGGCCTACAAGTAGATATTGGGAACAAAATTAGAGAGGCAACCAGAAAAAGTTATTTT +AGGCTCACCAGAGTTGTTCTTATTGCACAGTAACACACCAATATACCAAAACAGCAGGTA +TTGCAGTAGAGAAAGAGTTTAATAATTGAATGGCAGAAAAATGAGGAAGGTTGAGGAAAC +CTCAAATCTACCTCCCTGCTGAGTCTAAGTTTAGGATTTTTAAGAGAAAGGCAGGTAAGG +TGCTGAAGGTCTGGAGCTGCTGATTTGTTGGGGTATAGGGAATGAAATGAAACATACAGA +GATGAAAACTGGAAGTTTTTTTTTGTTTGTTTTGTTTTTTTTTTGTTGTTGTTTTTTTTT +TTTTTTGTTTTTTTGCTGAGTCAATTCCTTGGAGGGGGTCTTCAGACTGACTGGTGTCAG +CAGACCCATGGGATTCCAAGATCTGGAAAACTTTTTAGATAGAAACTTGATGTTTCTTAA +CGTTACATATATTATCTTATAGAAATAACTAAGGGAAGTTAGTGCCTTGTGACCACATCT +ATGTGACTTTTAGGCAGTAAGAAACTATAAGGAAAGGAGCTAACAGTCATGCTGTAAGTA +GCTACAGGGAATTGGCTTAAAGGGCAAGTTGGTTAGTACTTAGCTGTGTTTTTATTCAAA +GTCTACATTTTATGTAGTGGTTAATGTTTGCTGTTCATTAGGATGGTTTCACAGTTACCA +TACAAATGTAGAAGCAACAGGTCCAAAAAGTAGGGCATGATTTTCTCCATGTAATCCAGG +GAGAAAACAAGCCATGACCATTGTTGGTTGGGAGACTGAAGGTGATTGAAGGTTCACCAT +CATCCTCACCAACTTTTGGGCCATAATTCACCCAACCCTTTGGTGGAGCCTGAAAAAAAT +CTGGGCAGAATGTAGGACTTCTTTATTTTGTTTAAAGGGGTAACACAGAGTGCCCTTATG +AAGGAGTTGGAGATCCTGCAAGGAAGAGAAGGAGTGAAGGAGAGATCAAGAGAGAGAAAC +AATGAGGAACATTTCATTTGACCCAACATCCTTTAGGAGCATAAATGTTGACACTAAGTT +ATCCCTTTTGTGCTAAAATGGACAGTATTGGCAAAATGATACCACAACTTCTTATTCTCT +GGCTCTATATTGCTTTGGAAACACTTAAACATCAAATGGAGTTAAATACATATTTGAAAT +TTAGGTTAGGAAATATTGGTGAGGAGGCCTCAAAAAGGGGGAAACATCTTTTGTCTGGGA 
+GGATATTTTCCATTTTGTGGATTTCCCTGATCTTTTTCTACCACCCTGAGGGGTGGTGGG +AATTATCATTTTGCTACATTTTAGAGGTCATCCAGGATTTTTGAAACTTTACATTCTTTA +CGGTTAAGCAAGATGTACAGCTCAGTCAAAGACACTAAATTCTTCTTAGAAAAATAGTGC +TAAGGAGTATAGCAGATGACCTATATGTGTGTTGGCTGGGAGAATATCATCTTAAAGTGA +GAGTGATGTTGTGGAGACAGTTGAAATGTCAATGCTAGAGCCTCTGTGGTGTGAATGGGC +ACGTTAGGTTGTTGCATTAGAAAGTGACTGTTTCTGACAGAAATTTGTAGCTTTGTGCAA +ACTCACCCACCATCTACCTCAATAAAATATAGAGAAAAGAAAAATAGAGCAGTTTGAGTT +CTATGAGGTATGCAGGCCCAGAGAGACATAAGTATGTTCCTTTAGTCTTGCTTCCTGTGT +GCCACACTGCCCCTCCACAACCATAGCTGGGGGCAATTGTTTAAAGTCATTTTGTTCCCG +ACTAGCTGCCTTGCACATTATCTTCATTTTCCTGGAATTTGATACAGAGAGCAATTTATA +GCCAATTGATAGCTTATGCTGTTTCAATGTAAATTCGTGGTAAATAACTTAGGAACTGCC +TCTTCTTTTTCTTTGAAAACCTACTTATAACTGTTGCTAATAAGAATGTGTATTGTTCAG +GACAACTTGTCTCCATACAGTTGGGTTGTAACCCTCATGCTTGGCCCAAATAAACTCTCT +ACTTATATCAGTTTTTCCTACACTTCTTCCTTTTAGGTCAACAATACCAAGAGGGGTTAC +TGTGCTGGGTAATGTGTAAACTTGTGTCTTGTTTAGAAAGATAAATTTAAAGACTATCAC +ATTGCTTTTTCATAAAACAAGACAGGTCTACAATTAATTTATTTTGACGCAAATTGATAG +GGGGGCCAAGTAAGCCCCATATGCTTAATGATCAGCTGATGAATAATCATCTCCTAGCAA +CATAACTCAATCTAATGCTAAGGTACCCACAAGATGGCAAGGCTGATCAAAGTCGTCATG +GAATCCTGCAACCAAAAGCCATGGGAATTTGGAAGCCCTCAAATCCCATTCCTAATCTGA +TGAGTCTATGGACCAATTTGTGGAGGACAGTAGATTAAATAGATCTGATTTTTGCCATCA +ATGTAAGGAGGATAAAAACTTGCATACCAATTGTACACCCTTGCAAAATCTTTCTCTGAT +GTTGGAGAAAATGGGCCAGTGAGATCATGGATATAGAAGTACAGTCAATGTTCAGCTGTA +CCCTCCCACAATCCCACTTCCTTCCTCAACACAATTCAAACAAATAGACTCAGACTGTTT +CAGGCTCCAGGACAGGAAGTGCAGTGTAGGCAAAATTGCAAAAATTGAGGGCACAGGGGT +GGAGGTGGGGGGGTTGAATAACAAGCTGTGCTAAATAATTACGTGTAAATATATTTTTTC +ATTTTTAAAAATTGATTTCTTTTGCACATTCCATGACAATATATGTCACATTTTTAAAAT +AAATGCAAAGAAGCATACATCCAA diff --git a/test/files/comb/case_88/proteome.fasta b/test/files/comb/case_88/proteome.fasta new file mode 100644 index 00000000..4f6e49e4 --- /dev/null +++ b/test/files/comb/case_88/proteome.fasta @@ -0,0 +1,162 @@ +>ENSP00000476876.6-ORF51:534|ENST00000476876.6-ORF51:534|ENSG00000137959.17|- +MEVTTRLTWNDENHLRKLLGNVSLSLLYKSSVHGGSIEDMVERCSRQGCTITMAYIDYNM 
+IVAFMLGNYINLHESSTEPNDSLWFSLQKKNDTTEIETLLLNTAPKIIDEQLVCRLSKTD +IFIICRDNKIYLDKMITRNLKLRFYGHRQYLECEVFRVEGL +>ENSP00000476876.6-ORF74:80|ENST00000476876.6-ORF74:80|ENSG00000137959.17|- +ME +>ENSP00000476876.6-ORF79:121|ENST00000476876.6-ORF79:121|ENSG00000137959.17|- +MMKIICASCLEMFL +>ENSP00000476876.6-ORF82:121|ENST00000476876.6-ORF82:121|ENSG00000137959.17|- +MKIICASCLEMFL +>ENSP00000476876.6-ORF112:121|ENST00000476876.6-ORF112:121|ENSG00000137959.17|- +MFL +>ENSP00000476876.6-ORF148:202|ENST00000476876.6-ORF148:202|ENSG00000137959.17|- +MEVALKIWLKDAAVRDVL +>ENSP00000476876.6-ORF168:534|ENST00000476876.6-ORF168:534|ENSG00000137959.17|- +MVERCSRQGCTITMAYIDYNMIVAFMLGNYINLHESSTEPNDSLWFSLQKKNDTTEIETL +LLNTAPKIIDEQLVCRLSKTDIFIICRDNKIYLDKMITRNLKLRFYGHRQYLECEVFRVE +GL +>ENSP00000476876.6-ORF179:218|ENST00000476876.6-ORF179:218|ENSG00000137959.17|- +MQPSGMYYNNGLH +>ENSP00000476876.6-ORF194:218|ENST00000476876.6-ORF194:218|ENSG00000137959.17|- +MYYNNGLH +>ENSP00000476876.6-ORF207:534|ENST00000476876.6-ORF207:534|ENSG00000137959.17|- +MAYIDYNMIVAFMLGNYINLHESSTEPNDSLWFSLQKKNDTTEIETLLLNTAPKIIDEQL +VCRLSKTDIFIICRDNKIYLDKMITRNLKLRFYGHRQYLECEVFRVEGL +>ENSP00000476876.6-ORF228:534|ENST00000476876.6-ORF228:534|ENSG00000137959.17|- +MIVAFMLGNYINLHESSTEPNDSLWFSLQKKNDTTEIETLLLNTAPKIIDEQLVCRLSKT +DIFIICRDNKIYLDKMITRNLKLRFYGHRQYLECEVFRVEGL +>ENSP00000476876.6-ORF243:534|ENST00000476876.6-ORF243:534|ENSG00000137959.17|- +MLGNYINLHESSTEPNDSLWFSLQKKNDTTEIETLLLNTAPKIIDEQLVCRLSKTDIFII +CRDNKIYLDKMITRNLKLRFYGHRQYLECEVFRVEGL +>ENSP00000476876.6-ORF268:337|ENST00000476876.6-ORF268:337|ENSG00000137959.17|- +MKVLQSQMIPYGFHFKRKMTPLK +>ENSP00000476876.6-ORF289:337|ENST00000476876.6-ORF289:337|ENSG00000137959.17|- +MIPYGFHFKRKMTPLK +>ENSP00000476876.6-ORF299:323|ENST00000476876.6-ORF299:323|ENSG00000137959.17|- +MVFTSKEK +>ENSP00000476876.6-ORF322:337|ENST00000476876.6-ORF322:337|ENSG00000137959.17|- +MTPLK +>ENSP00000476876.6-ORF376:445|ENST00000476876.6-ORF376:445|ENSG00000137959.17|- 
+MSNWCVVYRKRIFSLYVEIIKFI +>ENSP00000476876.6-ORF422:431|ENST00000476876.6-ORF422:431|ENSG00000137959.17|- +MSR +>ENSP00000476876.6-ORF453:534|ENST00000476876.6-ORF453:534|ENSG00000137959.17|- +MITRNLKLRFYGHRQYLECEVFRVEGL +>ENSP00000476876.6-ORF484:544|ENST00000476876.6-ORF484:544|ENSG00000137959.17|- +MATVSIWNVKFFELKVCKIR +>ENSP00000476876.6-ORF506:509|ENST00000476876.6-ORF506:509|ENSG00000137959.17|- +M +>ENSP00000476876.6-ORF563:590|ENST00000476876.6-ORF563:590|ENSG00000137959.17|- +MILILVFGR +>ENSP00000476876.6-ORF639:714|ENST00000476876.6-ORF639:714|ENSG00000137959.17|- +MPYSLINSEMKRGEHKDYEVFIQNV +>ENSP00000476876.6-ORF666:714|ENST00000476876.6-ORF666:714|ENSG00000137959.17|- +MKRGEHKDYEVFIQNV +>ENSP00000476876.6-ORF691:724|ENST00000476876.6-ORF691:724|ENSG00000137959.17|- +MRFLSRTYRLL +>ENSP00000476876.6-ORF748:862|ENST00000476876.6-ORF748:862|ENSG00000137959.17|- +MCRDRDDKGDGDVERERSYHQSCSVLTYDPKIRPLYDP +>ENSP00000476876.6-ORF764:824|ENST00000476876.6-ORF764:824|ENSG00000137959.17|- +MTKEMGMWRERDLIIRAALF +>ENSP00000476876.6-ORF776:824|ENST00000476876.6-ORF776:824|ENSG00000137959.17|- +MGMWRERDLIIRAALF +>ENSP00000476876.6-ORF782:824|ENST00000476876.6-ORF782:824|ENSG00000137959.17|- +MWRERDLIIRAALF +>ENSP00000476876.6-ORF830:905|ENST00000476876.6-ORF830:905|ENSG00000137959.17|- +MTQRSDPYMTLRTSRCNYVRASRAS +>ENSP00000476876.6-ORF854:905|ENST00000476876.6-ORF854:905|ENSG00000137959.17|- +MTLRTSRCNYVRASRAS +>ENSP00000476876.6-ORF882:885|ENST00000476876.6-ORF882:885|ENSG00000137959.17|- +M +>ENSP00000476876.6-ORF907:964|ENST00000476876.6-ORF907:964|ENSG00000137959.17|- +MCREDNFSSSSSHVANSDS +>ENSP00000476876.6-ORF944:968|ENST00000476876.6-ORF944:968|ENSG00000137959.17|- +MLPTLTPN +>ENSP00000476876.6-ORF989:1037|ENST00000476876.6-ORF989:1037|ENSG00000137959.17|- +MQENSMSYFKSRLGHI +>ENSP00000476876.6-ORF1004:1037|ENST00000476876.6-ORF1004:1037|ENSG00000137959.17|- +MSYFKSRLGHI +>ENSP00000476876.6-ORF1057:1090|ENST00000476876.6-ORF1057:1090|ENSG00000137959.17|- 
+MYYAMFISYRN +>ENSP00000476876.6-ORF1064:1964|ENST00000476876.6-ORF1064:1964|ENSG00000137959.17|- +MLCLFPIGIKDNLDDIKRIIKAREHRNRLLADIRDYRPYADLVSEIRILLVGPVGSGKSS +FFNSVKSIFHGHVTGQAVVGSDITSITERYRIYSVKDGKNGKSLPFMLCDTMGLDGAEGA +GLCMDDIPHILKGCMPDRYQFNSRKPITPEHSTFITSPSLKDRIHCVAYVLDINSIDNLY +SKMLAKVKQVHKEVLNCGIAYVALLTKVDDCSEVLQDNFLNMSRSMTSQSRVMNVHKMLG +IPISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPLEETGAIERALQPCI +>ENSP00000476876.6-ORF1069:1090|ENST00000476876.6-ORF1069:1090|ENSG00000137959.17|- +MFISYRN +>ENSP00000476876.6-ORF1179:1281|ENST00000476876.6-ORF1179:1281|ENSG00000137959.17|- +MQTWFQKFVFFWWVQLGLESPVFSIQSSLFFMAM +>ENSP00000476876.6-ORF1272:1281|ENST00000476876.6-ORF1272:1281|ENSG00000137959.17|- +MAM +>ENSP00000476876.6-ORF1278:1281|ENST00000476876.6-ORF1278:1281|ENSG00000137959.17|- +M +>ENSP00000476876.6-ORF1353:1404|ENST00000476876.6-ORF1353:1404|ENSG00000137959.17|- +MEKMENLCHLCCVTLWG +>ENSP00000476876.6-ORF1362:1404|ENST00000476876.6-ORF1362:1404|ENSG00000137959.17|- +MENLCHLCCVTLWG +>ENSP00000476876.6-ORF1382:1964|ENST00000476876.6-ORF1382:1964|ENSG00000137959.17|- +MLCDTMGLDGAEGAGLCMDDIPHILKGCMPDRYQFNSRKPITPEHSTFITSPSLKDRIHC +VAYVLDINSIDNLYSKMLAKVKQVHKEVLNCGIAYVALLTKVDDCSEVLQDNFLNMSRSM +TSQSRVMNVHKMLGIPISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPL +EETGAIERALQPCI +>ENSP00000476876.6-ORF1397:1964|ENST00000476876.6-ORF1397:1964|ENSG00000137959.17|- +MGLDGAEGAGLCMDDIPHILKGCMPDRYQFNSRKPITPEHSTFITSPSLKDRIHCVAYVL +DINSIDNLYSKMLAKVKQVHKEVLNCGIAYVALLTKVDDCSEVLQDNFLNMSRSMTSQSR +VMNVHKMLGIPISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPLEETGA +IERALQPCI +>ENSP00000476876.6-ORF1407:1455|ENST00000476876.6-ORF1407:1455|ENSG00000137959.17|- +MGQKEQDCAWMTFPTS +>ENSP00000476876.6-ORF1433:1964|ENST00000476876.6-ORF1433:1964|ENSG00000137959.17|- +MDDIPHILKGCMPDRYQFNSRKPITPEHSTFITSPSLKDRIHCVAYVLDINSIDNLYSKM +LAKVKQVHKEVLNCGIAYVALLTKVDDCSEVLQDNFLNMSRSMTSQSRVMNVHKMLGIPI +SNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPLEETGAIERALQPCI 
+>ENSP00000476876.6-ORF1437:1455|ENST00000476876.6-ORF1437:1455|ENSG00000137959.17|- +MTFPTS +>ENSP00000476876.6-ORF1466:1964|ENST00000476876.6-ORF1466:1964|ENSG00000137959.17|- +MPDRYQFNSRKPITPEHSTFITSPSLKDRIHCVAYVLDINSIDNLYSKMLAKVKQVHKEV +LNCGIAYVALLTKVDDCSEVLQDNFLNMSRSMTSQSRVMNVHKMLGIPISNILMVGNYAS +DLELDPMKDILILSALRQMLRAADDFLEDLPLEETGAIERALQPCI +>ENSP00000476876.6-ORF1569:1575|ENST00000476876.6-ORF1569:1575|ENSG00000137959.17|- +MS +>ENSP00000476876.6-ORF1610:1964|ENST00000476876.6-ORF1610:1964|ENSG00000137959.17|- +MLAKVKQVHKEVLNCGIAYVALLTKVDDCSEVLQDNFLNMSRSMTSQSRVMNVHKMLGIP +ISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPLEETGAIERALQPCI +>ENSP00000476876.6-ORF1665:1722|ENST00000476876.6-ORF1665:1722|ENSG00000137959.17|- +MWPCLLKWMIAVRFFKTTF +>ENSP00000476876.6-ORF1689:1722|ENST00000476876.6-ORF1689:1722|ENSG00000137959.17|- +MIAVRFFKTTF +>ENSP00000476876.6-ORF1727:1964|ENST00000476876.6-ORF1727:1964|ENSG00000137959.17|- +MSRSMTSQSRVMNVHKMLGIPISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFL +EDLPLEETGAIERALQPCI +>ENSP00000476876.6-ORF1739:1964|ENST00000476876.6-ORF1739:1964|ENSG00000137959.17|- +MTSQSRVMNVHKMLGIPISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLP +LEETGAIERALQPCI +>ENSP00000476876.6-ORF1760:1964|ENST00000476876.6-ORF1760:1964|ENSG00000137959.17|- +MNVHKMLGIPISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPLEETGAI +ERALQPCI +>ENSP00000476876.6-ORF1764:1779|ENST00000476876.6-ORF1764:1779|ENSG00000137959.17|- +MSIKC +>ENSP00000476876.6-ORF1775:1964|ENST00000476876.6-ORF1775:1964|ENSG00000137959.17|- +MLGIPISNILMVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPLEETGAIERALQ +PCI +>ENSP00000476876.6-ORF1805:1964|ENST00000476876.6-ORF1805:1964|ENSG00000137959.17|- +MVGNYASDLELDPMKDILILSALRQMLRAADDFLEDLPLEETGAIERALQPCI +>ENSP00000476876.6-ORF1818:1845|ENST00000476876.6-ORF1818:1845|ENSG00000137959.17|- +MLQIWNWTP +>ENSP00000476876.6-ORF1844:1964|ENST00000476876.6-ORF1844:1964|ENSG00000137959.17|- +MKDILILSALRQMLRAADDFLEDLPLEETGAIERALQPCI 
+>ENSP00000476876.6-ORF1880:1964|ENST00000476876.6-ORF1880:1964|ENSG00000137959.17|- +MLRAADDFLEDLPLEETGAIERALQPCI +>ENSP00000476876.6-ORF1896:1905|ENST00000476876.6-ORF1896:1905|ENSG00000137959.17|- +MIF +>ENSP00000476876.6-ORF2019:2037|ENST00000476876.6-ORF2019:2037|ENSG00000137959.17|- +MRVNLY +>ENSP00000476876.6-ORF2228:2285|ENST00000476876.6-ORF2228:2285|ENSG00000137959.17|- +MRYSLIYSSITLYIELCEY +>ENSP00000476876.6-ORF2275:2278|ENST00000476876.6-ORF2275:2278|ENSG00000137959.17|- +M +>ENSP00000476876.6-ORF2416:2476|ENST00000476876.6-ORF2416:2476|ENSG00000137959.17|- +MSKHTCHYFSFPLSPTSHSL diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py index 40148fc4..c02cb336 100644 --- a/test/integration/test_call_variant_peptides.py +++ b/test/integration/test_call_variant_peptides.py @@ -1323,3 +1323,16 @@ def test_call_variant_peptide_case87(self): expected = test_dir/'brute_force.txt' reference = test_dir self.default_test_case(gvf, reference, expected) + + def test_call_variant_peptide_case88(self): + """ + Variant bubble finding error when starting from an out-bridge node.
+ """ + test_dir = self.data_dir/'comb/case_88' + gvf = [ + test_dir/'gSNP.gvf', + test_dir/'AltSplice.gvf' + ] + expected = test_dir/'brute_force.txt' + reference = test_dir + self.default_test_case(gvf, reference, expected) From 1df4e32ae21b7986d18ac7de673417d8e126c6f9 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Mon, 3 Mar 2025 21:32:30 -0800 Subject: [PATCH 5/9] fix (callVariant): peptide annotation not created correctly with SEC --- CHANGELOG.md | 2 ++ moPepGen/svgraph/VariantPeptideDict.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6068231..f91be4ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Fixed callVariant with variant bubble finding error starting from a out-bridge node. +- Fixed callVariant that peptide annotation not created correctly with SEC + ## [1.4.6-rc1] - 2025-02-24 ### Fixed diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 64970df6..640ad9d8 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -466,7 +466,7 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, cur_metadata.has_variants = bool(cur_variants) if is_valid or is_valid_start: - cur_nodes = [] + cur_nodes:List[PVGNode] = [] cut_offset = sec.location.start for node in nodes: if cut_offset == 0: @@ -474,7 +474,7 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, continue if len(node.seq.seq) > cut_offset: node = node.copy() - node.truncate_left(cut_offset) + node.truncate_right(cut_offset) cur_nodes.append(node) cut_offset = 0 else: From 69e9b25c0b3447f593472e04a02b57dc1c4901dd Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Mon, 3 Mar 2025 21:40:15 -0800 Subject: [PATCH 6/9] style: fix --- moPepGen/cli/parse_vep.py | 4 ++-- moPepGen/svgraph/ThreeFrameTVG.py | 1 - 2 files 
changed, 2 insertions(+), 3 deletions(-) diff --git a/moPepGen/cli/parse_vep.py b/moPepGen/cli/parse_vep.py index 5077a690..eb7329d5 100644 --- a/moPepGen/cli/parse_vep.py +++ b/moPepGen/cli/parse_vep.py @@ -8,7 +8,6 @@ from typing import TYPE_CHECKING import argparse import gzip -from typing import Dict, List from pathlib import Path from moPepGen.parser import VEPParser from moPepGen.err import TranscriptionStopSiteMutationError, TranscriptionStartSiteMutationError @@ -20,6 +19,7 @@ OUTPUT_FILE_FORMATS = ['.gvf'] if TYPE_CHECKING: + from typing import Dict, List from logging import Logger # pylint: disable=W0212 @@ -121,7 +121,7 @@ def parse_vep(args:argparse.Namespace) -> None: except: if args.skip_failed: logger.warning( - f"VEP record failed to convert: {record}" + "VEP record failed to convert: %s", str(record) ) tally.failed.total += 1 continue diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py index 76f3ff28..8537f44e 100644 --- a/moPepGen/svgraph/ThreeFrameTVG.py +++ b/moPepGen/svgraph/ThreeFrameTVG.py @@ -1902,7 +1902,6 @@ def fit_into_codons(self) -> None: queue.appendleft(cur) continue - cur_copy = str(cur.seq.seq) try: self.align_variants(cur) except err.FailedToFindVariantBubbleError: From 1ae368088efc8d881600532034486c74aa707077 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 4 Mar 2025 00:38:43 -0800 Subject: [PATCH 7/9] fix (callVariant): sect peptide not annotated correctly if the sec is not in the first node --- moPepGen/svgraph/VariantPeptideDict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 640ad9d8..b77abc7f 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -479,6 +479,7 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, cut_offset = 0 else: cut_offset = max(0, cut_offset - len(node.seq.seq)) + cur_nodes.append(node) continue if is_valid: 
cur_metadata_2 = copy.copy(cur_metadata) From 78e971dfca650def321b800af36097b22cc4431a Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 4 Mar 2025 15:03:25 -0800 Subject: [PATCH 8/9] fix (gtf): check added when converting a genomic location to gene coordinate to ensure they overlap --- CHANGELOG.md | 4 ++++ moPepGen/gtf/GenomicAnnotation.py | 6 ++++++ test/files/circRNA/CIRCexplorer3_circularRNA_known.txt | 2 +- test/files/circRNA/CIRCexplorer_circularRNA_known.txt | 2 +- test/unit/test_gtf.py | 7 +++++++ test/unit/test_vep_parser.py | 4 ++-- 6 files changed, 21 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f91be4ce..bd6bb75f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Fixed callVariant that peptide annotation not created correctly with SEC +### Changed + +- Added a check when converting a genomic location to a gene coordinate to ensure they overlap. + ## [1.4.6-rc1] - 2025-02-24 ### Fixed diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py index 863ff6fe..6145bb5d 100644 --- a/moPepGen/gtf/GenomicAnnotation.py +++ b/moPepGen/gtf/GenomicAnnotation.py @@ -179,6 +179,12 @@ def coordinate_gene_to_transcript(self, index:int, gene:str, def coordinate_genomic_to_gene(self, index:int, gene:str) -> int: """ Get the gene coordinate from genomic coordinate. """ gene_location = self.genes[gene].location + if not gene_location.start <= index < gene_location.end: + loc = f"{self.genes[gene].chrom}:{gene_location.start}-{gene_location.end}" + raise ValueError( + f"The position does not overlap with the gene.
index: {index}, " + f"gene: {gene}, {loc}" + ) if gene_location.strand == 1: return index - gene_location.start if gene_location.strand == -1: diff --git a/test/files/circRNA/CIRCexplorer3_circularRNA_known.txt b/test/files/circRNA/CIRCexplorer3_circularRNA_known.txt index 1ff6802b..af582298 100644 --- a/test/files/circRNA/CIRCexplorer3_circularRNA_known.txt +++ b/test/files/circRNA/CIRCexplorer3_circularRNA_known.txt @@ -1,3 +1,3 @@ chr22 0 464 circular_RNA/1 0 + 0 0 0,0,0 2 323,82 0,323 1 circRNA RIBC2 ENST00000614167.2 1,2 chr22:0-130787|chr22:131505-160911 1 1 1 -chr22 4980 5177 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574 1 1 1 +chr22 4980 5120 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574 1 1 1 chr22 5058 5078 circular_RNA/2 0 - 5058 5058 0,0,0 1 20 0 2 ciRNA LZTR1 ENST00000642151.1 1 chr22:868685-870710|chr22:884086-888846 1 1 1 diff --git a/test/files/circRNA/CIRCexplorer_circularRNA_known.txt b/test/files/circRNA/CIRCexplorer_circularRNA_known.txt index 9e770512..fc9b53fc 100644 --- a/test/files/circRNA/CIRCexplorer_circularRNA_known.txt +++ b/test/files/circRNA/CIRCexplorer_circularRNA_known.txt @@ -1,3 +1,3 @@ chr22 0 464 circular_RNA/1 0 + 0 0 0,0,0 2 323,82 0,323 1 circRNA RIBC2 ENST00000614167.2 1,2 chr22:0-130787|chr22:131505-160911 -chr22 4980 5177 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574 +chr22 4980 5120 circular_RNA/1 0 - 4980 4980 0,0,0 2 78,42 0,98 1 circRNA LZTR1 ENST00000642151.1 1,2 chr22:427119-435769|chr22:437511-472574 chr22 5058 5078 circular_RNA/2 0 - 5058 5058 0,0,0 1 20 0 2 ciRNA LZTR1 ENST00000642151.1 1 chr22:868685-870710|chr22:884086-888846 diff --git a/test/unit/test_gtf.py b/test/unit/test_gtf.py index 3e506b6f..49810eb5 100644 --- a/test/unit/test_gtf.py +++ b/test/unit/test_gtf.py @@ 
-445,6 +445,13 @@ def test_coordinate_convert(self): x = anno.coordinate_gene_to_transcript(19, gene_id, tx_id) self.assertEqual(x, 10) + anno = create_genomic_annotation(ANNOTATION_DATA) + gene_id = ANNOTATION_ATTRS[0]['gene_id'] + tx_id = ANNOTATION_DATA['genes'][0]['transcripts'][0] + with self.assertRaises(ValueError): + ## This gene is 0-40, so 50 is out of the gene. + anno.coordinate_gene_to_transcript(50, gene_id, tx_id) + def test_coordinate_convert_tx_exon_start(self): """ Convert coodinate from transcript to genomic when the index is the start of an exon""" diff --git a/test/unit/test_vep_parser.py b/test/unit/test_vep_parser.py index 09c55c5a..03c0accf 100644 --- a/test/unit/test_vep_parser.py +++ b/test/unit/test_vep_parser.py @@ -325,7 +325,7 @@ def test_vep_to_variant_record_case11(self): vep_record = VEPParser.VEPRecord( uploaded_variation='rs55971985', - location='chr1:38-50', + location='chr1:38-40', allele='-', gene='ENSG0001', feature='ENST0001.1', @@ -437,7 +437,7 @@ def test_vep_to_variant_record_case15(self): vep_record = VEPParser.VEPRecord( uploaded_variation='rs55971985', - location='chr1:2-7', + location='chr1:6-7', allele='-', gene='ENSG0001', feature='ENST0001.1', From 80bb218238cc9f7bcc94b5160e932f6abd4ab0ae Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 4 Mar 2025 17:51:37 -0800 Subject: [PATCH 9/9] doc: fuzz test results added --- docs/files/fuzz_test_history.tsv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/files/fuzz_test_history.tsv b/docs/files/fuzz_test_history.tsv index fe4a97fb..7ecfc082 100644 --- a/docs/files/fuzz_test_history.tsv +++ b/docs/files/fuzz_test_history.tsv @@ -38,3 +38,6 @@ v1.4.3 64a1cd7 2025-02-12 comprehensive 6985 0 0 0:00:00.372239 0.80905269209725 v1.4.5 f46742d 2025-02-24 snv 3189 0 0 0:00:00.144238 0.35400948008943256 0:00:54.012856 109.64929228073993 v1.4.5 f46742d 2025-02-24 indel 3228 0 0 0:00:00.202602 0.9296003929444859 0:00:41.938361 99.86673318517921 v1.4.5 f46742d 2025-02-24 
comprehensive 5902 0 0 0:00:00.375135 1.0157547122885704 0:00:40.352927 181.73796673959964 +v1.4.6-rc1 78e971d 2025-03-04 snv 2710 0 0 0:00:00.163103 0.38221870483830245 0:00:56.301104 116.21712282069569 +v1.4.6-rc1 78e971d 2025-03-04 indel 2850 0 0 0:00:00.191917 0.3938244401664319 0:00:41.244215 95.36212234507254 +v1.4.6-rc1 78e971d 2025-03-04 comprehensive 5310 0 0 0:00:00.395482 1.0924741762245143 0:00:40.031276 179.44714992445952