diff --git a/.github/workflows/CICD-base.yaml b/.github/workflows/CICD-base.yaml index 953c181f..d0c5f5f9 100644 --- a/.github/workflows/CICD-base.yaml +++ b/.github/workflows/CICD-base.yaml @@ -21,6 +21,6 @@ jobs: # Run CICD-base - name: CICD-base - uses: docker://blcdsdockerregistry/cicd-base:latest + uses: docker://ghcr.io/uclahs-cds/cicd-base:latest env: PYTHON_PYLINT_CONFIG_FILE: true diff --git a/.pylintrc b/.pylintrc index 8e8e72a8..f30d3456 100644 --- a/.pylintrc +++ b/.pylintrc @@ -60,17 +60,7 @@ confidence= # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". -disable=print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, - old-ne-operator, - old-octal-literal, - import-star-module-level, - non-ascii-bytes-literal, - raw-checker-failed, +disable=raw-checker-failed, bad-inline-option, locally-disabled, file-ignored, @@ -78,68 +68,12 @@ disable=print-statement, useless-suppression, deprecated-pragma, use-symbolic-message-instead, - apply-builtin, - basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - deprecated-str-translate-call, - deprecated-itertools-function, - deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, - import-error + import-error, + superfluous-parens, + unnecessary-lambda-assignment, + unnecessary-dunder-call, + unspecified-encoding, + cyclic-import # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/CHANGELOG.md b/CHANGELOG.md index 875babcc..8880974b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -510,9 +510,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Enable `filterFasta` to filter by number of miscleavages per peptide. #382 -- Added CLI command `mergeFasta` to merge multiple variant peptide database Fasta files into one. This could be useful when working with multiplexed proteomic experiments such as TMT. [#380](https://github.com/uclahs-cds/private-moPepGen/issues/380) +- Added CLI command `mergeFasta` to merge multiple variant peptide database Fasta files into one. This could be useful when working with multiplexed proteomic experiments such as TMT. [#380](https://github.com/uclahs-cds/package-moPepGen/issues/380) -- Added CLI command `decoyFasta` to generate decoy database by shuffling or reversing each sequence. [#386](https://github.com/uclahs-cds/private-moPepGen/issues/386) +- Added CLI command `decoyFasta` to generate decoy database by shuffling or reversing each sequence. [#386](https://github.com/uclahs-cds/package-moPepGen/issues/386) - Added parameter `--min-coverage-rna` to `parseREDItools` to filter by total RNA reads at a given position. #392 @@ -552,7 +552,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - The command line arguments are standardized across all commands, for example '-i/--input-path' for inputs and '-o/--output-path' for outputs. -- `generateIndex` is changed to use compressed text format to store genomic annotation, because for some reason that we are not sure, when loading the pickled genomic annotation, the memory usage is almost doubled. [#394](https://github.com/uclahs-cds/private-moPepGen/issues/394) +- `generateIndex` is changed to use compressed text format to store genomic annotation, because for some reason that we are not sure, when loading the pickled genomic annotation, the memory usage is almost doubled. [#394](https://github.com/uclahs-cds/package-moPepGen/issues/394) --- diff --git a/README.md b/README.md index 44c1c4cc..c464d220 100644 --- a/README.md +++ b/README.md @@ -3,25 +3,25 @@ [![Lifecycle:Maturing](https://img.shields.io/badge/Lifecycle-Maturing-007EC6)](https://img.shields.io/badge/Lifecycle-Maturing-007EC6) -[![Tests](https://github.com/uclahs-cds/private-moPepGen/actions/workflows/tests.yaml/badge.svg)](https://github.com/uclahs-cds/private-moPepGen/actions/workflows/tests.yaml) -[![Docker](https://img.shields.io/badge/docker-%230db7ed.svg?style=plastic&logo=docker&logoColor=white)](https://github.com/uclahs-cds/private-moPepGen/pkgs/container/mopepgen) -[![Documentation](https://img.shields.io/static/v1?style=plastic&message=ReadTheDocs&color=2C4AA8&logo=ReadTheDocs&logoColor=FFFFFF&label=Documentation)](https://uclahs-cds.github.io/private-moPepGen/) +[![Tests](https://github.com/uclahs-cds/package-moPepGen/actions/workflows/tests.yaml/badge.svg)](https://github.com/uclahs-cds/package-moPepGen/actions/workflows/tests.yaml) +[![Docker](https://img.shields.io/badge/docker-%230db7ed.svg?style=plastic&logo=docker&logoColor=white)](https://github.com/uclahs-cds/package-moPepGen/pkgs/container/mopepgen) +[![Documentation](https://img.shields.io/static/v1?style=plastic&message=ReadTheDocs&color=2C4AA8&logo=ReadTheDocs&logoColor=FFFFFF&label=Documentation)](https://uclahs-cds.github.io/package-moPepGen/) [![License](https://img.shields.io/badge/License-GPL_v2-blue)](./LICENSE.txt) -moPepGen (multi-omics peptide generator) uses data from multiple -omics experiments and calls variant peptides as custom database for proteogenomics library search. +moPepGen (multi-omics peptide generator) uses data from one or more omics experiments and calls variant peptides as custom databases for proteogenomic library search. -moPepGen takes genomic variants such as single nucleotide variants (SNP or SNV), insertion/deletion (INDEL), gene fusion, and post transcriptional modifications such as RNA editing and alternative splicing, and detects variated peptides affected. +moPepGen takes genomic and transcriptomic variants such as single nucleotide variants (SNPs or SNVs), small insertions/deletions (indels), gene fusion, alternative splicing, RNA circularization and RNA editing events, and generates noncanonical peptides affected by the variants. ## Installation -Install directly from github +Install directly from GitHub ```shell -pip install git+ssh://git@github.com/uclahs-cds/private-moPepGen.git +pip install git+ssh://git@github.com/uclahs-cds/package-moPepGen.git ``` ## Documentation -See [here](https://uclahs-cds.github.io/private-moPepGen/index.html) +See [here](https://uclahs-cds.github.io/package-moPepGen/index.html) diff --git a/docs/README.md b/docs/README.md index 5b46c484..435ba2d0 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,14 +1,14 @@ ## MKDocs -Documentations are written in markdown and converted to html by [MKDocs](https://www.mkdocs.org/). +Documentations are written in markdown and converted to HTML by [MKDocs](https://www.mkdocs.org/). ## Adding Pages -To add a new documentation page, first create a '.md' file under this directory. Next go to the 'mkdocs.yml' file under the root directory of the repo, and add a key-value pair under `nav`, and after rendering, a link to the new added page will show up in the navbar. +To add a new documentation page, first create a '.md' file under this directory. Next, go to the 'mkdocs.yml' file under the root directory of the repo, and add a key-value pair under `nav`, and after rendering, a link to the newly added page will show up in the navbar. ## Render locally -The online documentation is rendered automatically by github actions, it is still useful to see the changes in real time. This can be done by running `mkdocs` locally. moPepGen, mkdocs and several dependencies need to be installed first. +While the online documentation is rendered automatically by GitHub actions, it may be still useful to see the changes in real time. This can be done by running `mkdocs` locally. `moPepGen`, `mkdocs` and several dependencies need to be installed first. ```bash conda env create --name moPepGen python=3.8.11 @@ -17,7 +17,7 @@ pip install mkdocs==1.2.3 jinja2==3.0.0 mkdocstrings==0.16.2 mkdocs-macros-plugi pip install . ``` -Use the command below to start a development server and open http://127.0.0.1:8000/ to see the change in real time. +Use the command below to start a development server and open http://127.0.0.1:8000/ to see the changes in real time. ```bash mkdocs serve diff --git a/docs/call-alt-translation.md b/docs/call-alt-translation.md index 30af948a..1a970497 100644 --- a/docs/call-alt-translation.md +++ b/docs/call-alt-translation.md @@ -25,12 +25,12 @@ ## Alternative translation -Alternative translation is when a different peptide is generated from the same transcript without any change of the genetic code from the genome. +Alternative translation is when a different peptide is generated from the same transcript without any changes in the nucleotide sequence of the transcript. ### Selenocysteine Termination -In eukaryotes, the UGA on some mRNAs can be decoded into selenocysteine instead of being recognized as a stop codon, and those proteins are called selenoproteins. However the decoding of UGA is regulated by complex signals including mRNA and sec-tRNA abundance, which could result two isoforms: one with UGA read through and one being truncated. Selenocysteine termination is used to represent the later situation. Selenocysteine terminations are not written into any GVF file but they are represented in the format of `SECT-` where `pos` is the position of the selenocysteine UGA being recognized as a stop codon in the **gene**. +In eukaryotes, the UGA on some mRNAs can be decoded into selenocysteine instead of being recognized as a stop codon, and these proteins are called selenoproteins. However, the decoding of UGA is regulated by complex signals including mRNA and sec-tRNA abundance, which could result in two proteoforms: one with UGA read through and one with termination at the stop codon. Selenocysteine termination is used to represent the later situation. Selenocysteine terminations are not written into any GVF files but are represented in the format of `SECT-` where `pos` is the position of the selenocysteine UGA being recognized as a stop codon in the **gene**. ### Tryptophan > Phenylalanine Codon Reassignment -Tryptophan > Phenylalanine substitutants, described in [Patasker, et al.](https://pubmed.ncbi.nlm.nih.gov/35264796/), happens when cellular tryptophan is depleted and phenylalanine is reassigned to tryptophan codons to have protein synthesis continue. The process largely exists in tumor cells. Similar to selenocysteine termination, W > F substitutants are also not written into GVFs, but is represented in the format of `W2F-`. Noted that the `pos` is at peptide coordinate (*i.e.,* zeroed at the beginning of the peptide). +Tryptophan > Phenylalanine substitutants, described in [Patasker, et [al.](https://pubmed.ncbi.nlm.nih.gov/35264796/), happen when cellular tryptophan is depleted and phenylalanine is reassigned to tryptophan codons to continue protein synthesis. The process largely exists in tumor cells. Similar to selenocysteine termination, W > F substitutants are not written in GVFs, but are represented in the format of `W2F-`. Noted that the `pos` is a peptide coordinate (*i.e.,* zeroed at the beginning of the peptide). diff --git a/docs/file-format.md b/docs/file-format.md index da8bde23..6a317b07 100644 --- a/docs/file-format.md +++ b/docs/file-format.md @@ -12,21 +12,21 @@ ## 1 Gene Variant Format -In moPepGen we are interested in finding varianted peptide caused by combination of different types of variants, including single nucleotide substitution, INDEL, RNA editing site, gene fusion and alternative splicing. We are also interested in non-coding RNA and circRNA with unreported ORF, or start codon gained from mutation. +In moPepGen we are interested in finding variant peptides caused by combinations of different types of variants, including single nucleotide substitution, INDEL, RNA editing site, gene fusion and alternative splicing. We are also interested in non-coding RNA and circRNA with unreported ORF or start codon gained from mutation. -The different mutation events are called by different programs, and those files have different format. In moPepGen, for each type of data, we use data type and tool specific parsers to convert variant data from different sources to a standardized VCF-like format that moPepGen can use to create the transcript variant graph. They are then collected by `moPepGen callPeptide` command to call variant peptides. +The different mutation events are called by different algorithms with varying output formats. In moPepGen, data type and tool-specific parsers convert variant data from different sources to a standardized VCF-like format. They are used in the `moPepGen callVariant` command to create the transcript variant graph and call variant peptides. -In moPepGen, we define the GVF (Gene Variant Format) file format, that extended and modified from the [VCF](https://samtools.github.io/hts-specs/VCFv4.2.pdf) file format to represent the variant records. In a GVF file, each entry represents a variant associated with a transcript. The `CHROM` column is used to hold the gene ID, and the `POS` column indicates the position of the corresponding transcript. +In moPepGen, we define the GVF (Gene Variant Format) file format, extended and modified from the [VCF](https://samtools.github.io/hts-specs/VCFv4.2.pdf) file format, to represent the variant records. In a GVF file, each entry represents a variant associated with a transcript. The `CHROM` column is used to hold the gene ID, and the `POS` column indicates the variant position in reference to the transcript gene. ### 1.1 File Metadata -Each variant file should contain a metadata section that each line starts with a double hashtag. The first line of the metadta must be the 'fileformat' field for VCF, to be consistant with VCF's standards. The link should read: +Each variant GVF file contains a metadata section, where each line starts with a double hashtag. The first line of the metadata must be the `fileformat` field in consistency with VCF standards. The line should read: ``` ##fileformat=VCFv4.2 ``` -Starting from the second line should be moPepGen's metadata. Each line should be a key value pair separated by an equal sign ('='). Keys should follow the `snake_case`. See example below. +moPepGen-specific GVF metadata starts on the second line. Each line is a key-value pair separated by an equal sign (`=`). Keys follow the `snake_case`. See examples below. ``` ##moPepGen_version=0.0.1 @@ -105,24 +105,24 @@ ENSG0021 500 FUSION-ENSG0021:500-ENSG0031:1000 C . . T The `Info` column must contain the following fields: + `TRANSCRIPT_ID`: the transcript ID of the donor (upstream) transcript. -+ `ACCEPTER_GENE_ID`: the accepter (downstream) transcript's gene ID. -+ `ACCEPTER_TRANSCRIPT_ID`: the accepter (downstream) transcript's transcript ID. -+ `ACCEPTER_POSITION`: the position of the break point of the ACCEPTER (downstream) transcript. -+ `GENOMIC_POSITION`: the genomic position of the donor (upstream) transcript, in the format of `::`. ++ `ACCEPTER_GENE_ID`: the accepter (downstream) transcript gene ID. ++ `ACCEPTER_TRANSCRIPT_ID`: the accepter (downstream) transcript transcript ID. ++ `ACCEPTER_POSITION`: the position of the breakpoint of the ACCEPTER (downstream) transcript. ++ `GENOMIC_POSITION`: the genomic position of the donor (upstream) transcript, in the format of `:-`. + `ACCEPTER_GENOMIC_POSITION`: the genomic position of the accepter (downstream) transcript, in the format of `:-`. ### 1.4 Alternative Splicing Site -Alternative splicing site called by [rMATS](http://rnaseq-mats.sourceforge.net/) has five types, e.g. skipped exon (SE), alternative 5' splice site (A5SS), alternative 3' splice site (A3SS), mutually exclusive exons (MXE), and retained intron (RI). Each alternative splicing event can be represented as a deletion, insertion or a substitution. +Alternative splicing sites called by [rMATS](http://rnaseq-mats.sourceforge.net/) have five types, *i.e.* skipped exon (SE), alternative 5' splice site (A5SS), alternative 3' splice site (A3SS), mutually exclusive exons (MXE), and retained intron (RI). Each alternative splicing event can be represented as a deletion, insertion or substitution. -SE is when a exon is skipped given its upstream and downstream exon. It is represented as a **insertion** when the target transcript from the GTF file contains the exon. And it is represented as a **deletion** when the target transcript is annotated without the exon. +SE is called when an exon is skipped given its upstream and downstream exons. It is represented as an **insertion** when the target transcript from the GTF file contains the exon, or is represented as a **deletion** when the target transcript is annotated without the exon. -A5SS and A3SS are when an exon has two splicing sites that can generate a longer and a short version. When the longer version is annotated in the given transcript, the variant is represented as a deletion, and a insertion when the shorter version is annotated. +A5SS and A3SS are called when an exon has two splicing sites that can generate either a long or a short version of the exon. When the longer version is annotated in the given transcript, the variant is represented as a **deletion**, and an **insertion** is used when the shorter version is annotated. -MXE is represented as substitution of one exon with another exon. +MXE is represented as the substitution of one exon with another exon. -RI is represented as an insertion or the intron sequence. +RI is represented as an insertion of the intron sequence. ``` ##fileformat=VCFv4.2 @@ -153,29 +153,29 @@ ENSG0004 277 MXE-477-1103 T . . TRANSCRIPT_ID=ENST00041;D ENSG0001 110 SE-300 C . . TRANSCRIPT_ID=ENST0001;DONOR_START=300;DONOR_END=400;GENE_SYMBOL=TP53;GENOMIC_POSITION=chr1:1000-1001 ``` -The line above represents an SE (skipped exon), that the sequence of 300-400 of the gene ENSG0001 is inserted to the t ranscript of ENST0001 at position 110. In this case, all transcripts of the gene in the annotation GTF don't contain this exon. +The line above represents an SE (skipped exon), that the sequence of 300-400 of the gene ENSG0001 is inserted into the transcript of ENST0001 at position 110. In this case, all transcripts of the gene in the annotation GTF don't contain this exon. ``` ENSG0002 210 A5SS-210 T . . TRANSCRIPT_ID=ENST0002;START=210;END=400;GENE_SYMBOL=EGFR;GENOMIC_POSITION=chr1:1000-1001 ``` -The line above represents a A5SS (alternative 5' splicing site), that the sequence from 210 to 400 of the transcript ENST0002 is deleted. In this case, all transcripts of the gene in the annotation GTF have the longer version of the exon. +The line above represents an A5SS (alternative 5' splicing site), where the nucleotides from positions 210 to 400 of the transcript ENST0002 are deleted. This A5SS is represented as a **deletion** because all transcripts of the gene in the annotation GTF have the longer version of the exon. ``` ENSG0003 115 MXE-320 T . . TRANSCRIPT_ID=ENST0003;START=320;END=380;GENE_SYMBOL=EGFR;GENOMIC_POSITION=chr1:1000-1001 ``` -The line above represents a MXE (mutually exclusive exon), that the exon of 320-380 of the gene ENSG0003 is retained in the transcript ENST0003 and resulted as an insertion at position 115 of the transcript. In this case, none of the transcripts of this gene has the first exon retained and second spliced at the same time. And this transcript has both exons retained. +The line above represents an MXE (mutually exclusive exon), where the exon at position 320-380 of the gene ENSG0003 is retained in the transcript ENST0003 and resulted as an insertion at position 115 of the transcript. This MXE is represented as an **insertion** because none of the transcripts of this gene has the first exon retained and the second spliced in the GTF, but this transcript has both exons retained. ``` ENSG0004 277 MXE-477-1103 T . . TRANSCRIPT_ID=ENST0004;START=477;END=582;DONOR_START=1103;DONOR_END=1228;GENE_SYMBOL=EGFR;GENOMIC_POSITION=chr1:1000-1001 ``` -This line above represents a MXE that the exon 447-582 (transcript ENST0004 position 277) is replaced with exon 1103-1228 of the gene. +This line above represents an MXE where the exon at position 447-582 in the gene (transcript ENST0004 position 277) is replaced with the exon at position 1103-1228 of the gene. ### 1.5 CircRNA -Circular RNAs are derived from back-spliced exons. They exist as individual RNA molecules and have the potential to be translated to proteins. We are then interested in finding the possible peptide sequences translated from circRNAs with and without variants (SNP, INDEL, etc). In this case, circRNAs per se are rather new transcripts than variants. Here we define a TSV file format to represent the circRNA molecules. In this TSV format, each row represent a circRNA, with the gene ID it is associated with, the start position at the gene, the offset and length of each segment, and IDs. Normally ach segment is an exon, but with intron retained alternative splicing, there could be introns. +Circular RNAs are derived from back-spliced exons and introns. They exist as independent RNA molecules and have the potential to be translated into proteins. We are interested in finding the possible peptide sequences that could be the result of circRNA translation, with and without additional variants (SNP, INDEL, etc). In this case, circRNAs per se are rather new transcripts backbones than variants and are also recorded in GVF files in moPepGen. In such a GVF file, each row represents a circRNA, with the gene ID it is associated with, the start position at the gene coordinate, the offset and length of each segment, and their exon or intron indices. Normally each segment is an exon, but with intron-retained alternative splicing, they could be introns. ``` ##fileformat=VCFv4.2 @@ -200,21 +200,21 @@ ENSG0003 77 CIRC-ENST0003-E2-I2-E3-I3-E4 . . . . OFFSET=0,10 ENSG0004 789 CI-ENST0004-I3 . . . . OFFSET=0;LENGTH=112;INTRON=1;TRNASCRIPT=ENST0004;GENE_SYMBOL=SYMB4 ``` -Technically, circRNAs are not variants that alters the gene/transcript sequence. We here still use the GVF file format to tr The `Info` column must contain the following fields: +circRNAs are not variants that are added to the transcript variant graph, thus the `REF` and `ALT` columns should be kept empty as ".". The `INFO` column must contain the following fields. -+ **`OFFSET`**: The offset of each fragment after the `start` position of the gene. Each segment can be either an exon or intron. -+ **`LENGTH`**: The length of each fragmet. ++ **`OFFSET`**: The offset of each fragment after the `start` position of the gene. Each segment can be either an exon or an intron. ++ **`LENGTH`**: The length of each fragment. + **`INTRON`**: The indices of fragments that are introns. -+ **`TRANSCRIPT`** The transcript ID that are able to generate this circRNA (e.g. contains all exons and introns of the circRNA.) ++ **`TRANSCRIPT`** The transcript ID of a transcript that is able to generate this circRNA (e.g. contains all exons and introns of the circRNA). + **`GENE_SYMBOL`** The name of the gene. -The ID of circRNAs consist of two components. They all start with \-circRNA or \-ciRNA where `transcript_id` is the value from the `CHROM` column. Following that is the information for each fragment including E (exon) or I (intron) and the index of the fragment. For example,ENSG0001-circRNA-E2-I2-E3 is made up of the second exon, second intron, and the third exon of the gene ENSG0001. +The ID of circRNAs consists of two components. They all start with \-circRNA or \-ciRNA where `transcript_id` is the value from the `CHROM` column. Following that is the information for each fragment, indicating whether it is an exon (E) or intron (I) and the index of the fragment. For example, `ENSG0001-circRNA-E2-I2-E3` is made up of the second exon, the second intron, and the third exon of the gene ENSG0001. ## 2 Variant Peptide FASTA -In moPepGen, the headers of the final output variant peptide FASTA contains the transcript IDs and variants associated with this variant peptide. The header of a peptide record starts with the transcript ID, followed by the gene ID and gene symbol, and the variant IDs that it is associated with, separated by '|'. The Variant IDs are defined in the GVF files. In some cases, several non-canonical peptides from the same transcript may share the same variants. This is most common in cases of peptide miscleavages. In addition, a frameshifting variant may cause multiple non-canonical peptides. A integer index is thus always added to the end to resolve redundancies. +In moPepGen, the headers of the final output variant peptide FASTA contain the transcript IDs and variants associated with this variant peptide. The header of a peptide record starts with the transcript backbone ID and is followed by the variant IDs that it is associated with, separated by '|'. The variant IDs are defined in the GVF files. In some cases, several non-canonical peptides from the same transcript may share the same variants. This is most common in cases of peptide miscleavages. In addition, a frameshifting variant may cause multiple non-canonical peptides. An integer index is thus always added to the end of each header entry to resolve redundancies. -If the same peptide is found in multiple transcripts, the annotation is separated by space. +If the same peptide is found in multiple transcripts, all are documented as separate entries in the fasta header, separated by space. ``` >ENST0001|SNV-110-C-A|1 diff --git a/docs/filter-fasta.md b/docs/filter-fasta.md index ebf918eb..7d377a10 100644 --- a/docs/filter-fasta.md +++ b/docs/filter-fasta.md @@ -25,7 +25,7 @@ ### Filter by Expression -The example below filters the variant peptide sequences based on their expression level. The expression table is given as TSV file, with the first column being the transcript ID, and the forth column being the expression level. Peptides are removed if the transcript it is associated has the expression level smaller than 2. Any transcript quantitation value can be used, including read count, TPM, and FPKM. +The example below filters the variant peptide sequences based on their expression level. The expression table is given as TSV file, with the first column being the transcript ID, and the fourth column being the expression level. Peptides are removed if the transcript it is associated has the expression level smaller than 2. Any transcript quantitation value can be used, including read count, TPM, and FPKM. ```bash moPepGen fitlerFasta \ @@ -40,7 +40,7 @@ moPepGen fitlerFasta \ ### Filter by Expression and Miscleavages -This example is the same as above with the addition of a filter by miscleavages. Any peptides with the miscleavages larger than 3 will be dropped. +This example is the same as above with the addition of a filter by miscleavages. Any peptides with more than 3 miscleavages will be dropped. ```bash moPepGen fitlerFasta \ @@ -57,7 +57,7 @@ moPepGen fitlerFasta \ ### Filter Variant and Noncoding Peptides -This example takes both a variant peptide and noncoding peptide FASTA file and filter the sequences based on the expression level of the transcripts that they are associated with. +This example takes both the variant peptide FASTA and the noncoding peptide FASTA and filters the peptides based on the expression level of the transcripts they are associated with. ```bash moPepGen fitlerFasta \ @@ -74,10 +74,10 @@ moPepGen fitlerFasta \ ### Filter by Denylist -This example here removes any peptide sequences if they appear in the given denylist. +This example here removes any peptide sequences that appear in the given denylist. !!! warning: -When using noncoding peptides as denylist, do not also pass it as a input FASTA, because all peptides will be removed. +When using noncoding peptides in a denylist, do not also pass the noncoding peptide FASTA as an input FASTA, because all peptides will be removed. ```bash moPepGen fitlerFasta \ @@ -98,7 +98,7 @@ moPepGen filterFasta \ ### Complex Filtering -Sometimes we want more complex filtering strategy. In the example below, we want to first remove any variant peptides that overlap with any noncoding peptide, and next we filter again based on the expression level. +Sometimes we want a more complex filtering strategy. In the example below, we want to first remove any variant peptides that overlap with any noncoding peptides, and then filter again based on the expression level. Remove variant peptides if they overlap with any noncoding peptide. diff --git a/docs/generate-index.md b/docs/generate-index.md index 980a24ba..dda6425c 100644 --- a/docs/generate-index.md +++ b/docs/generate-index.md @@ -39,7 +39,7 @@ Similarly, for ENSEMBL, we recommend using the primary genome assembly and its a ## Output -Users usually don't need to worry about the output files of this command. As long as the correct path is provided to subsequent moPepGen commands, the correct index files will be recognized. +Users usually don't need to worry about the output files of this command. As long as the correct path is provided to subsequent moPepGen commands, the correct index files will be recognized. Files are created by this command: diff --git a/docs/index.md b/docs/index.md index a24fd00b..3127a39e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,17 +1,17 @@ # moPepGen: Multi-Omics Peptide Generator -moPepGen (multi-omics peptide generator) uses data from multiple -omics experiments and calls variant peptides as custom database for proteogenomics library search. +moPepGen (multi-omics peptide generator) uses data from one or more omics experiments and calls variant peptides as custom databases for proteogenomic library search. -moPepGen takes genomic variants such as single nucleotide variants (SNP or SNV), insertion/deletion (INDEL), gene fusion, and post transcriptional modifications such as RNA editing and alternative splicing, and detects variated peptides affected. +moPepGen takes genomic and transcriptomic variants such as single nucleotide variants (SNPs or SNVs), small insertions/deletions (indels), gene fusion, alternative splicing, RNA circularization and RNA editing events, and generates noncanonical peptides affected by the variants. ## Installation -Install directly from github +Install directly from GitHub ``` -pip install git+ssh://git@github.com:uclahs-cds/private-moPepGen.git +pip install git+ssh://git@github.com:uclahs-cds/package-moPepGen.git ``` ## Overview -![graphic-abstract](img/graphic-abstract.png) \ No newline at end of file +![graphic-abstract](img/graphic-abstract.png) diff --git a/docs/quick-start.md b/docs/quick-start.md index e3153c2c..4daf10d4 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -2,9 +2,9 @@ ### Downloading Reference Files -moPepGen requires a coherent set of reference genome, proteome and Gene transfer format (GTF) files to run. The key is to ensure that the genome annotation version is consistent between the GTF and proteome FASTA, and that the reference genome build version is the same between all three. +moPepGen requires a coherent set of reference genome, proteome and Gene transfer format (GTF) files to run. The key is to ensure that the genome annotation version is consistent between the GTF and proteome FASTA and that the reference genome build version is the same between all three. -To ensure consistency, we recommend downloading the reference genome FASTA, reference proteome FASTA and genome annotation GTF from the same source. At this time reference files from GENCODE and Ensembl are supported. We do not forsee support for UniProt proteomes in the near future since there is no fail-safe method to map all IDs to formats found in commonly used genome annotation GTFs. +To ensure consistency, we recommend downloading the reference genome FASTA, reference proteome FASTA and genome annotation GTF from the same source. At this time reference files from GENCODE and Ensembl are supported. We do not foresee support for UniProt proteomes in the near future since there is no fail-safe method to map all IDs to formats found in commonly used genome annotation GTFs. #### GENCODE @@ -27,7 +27,7 @@ At the time of writing, the [Ensembl](https://www.ensembl.org/index.html) Releas ### Build Reference Index -Creating index files for genome, proteome and GTF, and create a canonical peptide pool to be used in further steps. By default, the `generateIndex` subcommand uses trypsin, up to 2 miscleavages, and minimal molecular weight of 500. Da. However those settings can be adjusted using command line arguments. See `moPepGen generateIndex --help`. +Creating index files for the genome, proteome and GTF annotations, as well as a canonical peptide pool to be used in further steps. By default, the `generateIndex` subcommand uses trypsin as the protease, considering up to 2 miscleavages, and a minimal peptide molecular weight of 500 Da. However, those settings can be adjusted using command line arguments. See `moPepGen generateIndex --help`. ``` moPepGen generateIndex \ @@ -50,7 +50,7 @@ moPepGen parseVEP \ ### Call Variant Peptides -Call variant peptides using the parsed variant table generated by `parseVEP`. `callPeptide` takes multiple variant tables. They must all be parsed by one of the parsers of `moPepGen`. Cleavage rule, miscleavage and minimal molecular weights are also set to be default as the values below. +Call variant peptides using the parsed variant GVF file generated by `parseVEP`. `callVariant` takes multiple variant GVF files to combine variant types. They must all be parsed by `moPepGen` parsers. Cleavage rule, miscleavage and minimal molecular weight are also set to default values as below. ``` moPepGen callPeptide \ diff --git a/docs/split-fasta.md b/docs/split-fasta.md index 37f12baa..9bdda9d7 100644 --- a/docs/split-fasta.md +++ b/docs/split-fasta.md @@ -53,11 +53,11 @@ moPepGen splitFasta \ --output-prefix path/to/split ``` -This example outputs three split FASTA filese, `split_Coding.fasta`, `split_RNAEditing.fasta`, and `split_Remaining.fasta`. +This example outputs three split FASTA files, `split_Coding.fasta`, `split_RNAEditing.fasta`, and `split_Remaining.fasta`. ### Additional split -Additional split allows you split the records with the source group specified that would otherwise be placed in to the remaining FASTA. See example below. +Additional split allows you to split the records with the source group specified that would otherwise be placed along with all other records exceeding `--max-source-groups` into `remaining.fasta`. See the example below. ```bash moPepGen splitFasta \ @@ -72,7 +72,7 @@ moPepGen splitFasta \ --output-prefix path/to/split ``` -As result, `split_gSNP.fasta` and `split_gINDLE.fasta` will be written. `split_gSNP-gINDEL.fasta` and `split_gSNP-RNAEditign.fasta` will also be written although the number of variant sources (2) are larger than the value specified through `--max-source-groups`. +As a result, `split_gSNP.fasta` and `split_gINDEL.fasta` will be written. `split_gSNP-gINDEL.fasta` and `split_gSNP-RNAEditing.fasta` will also be written although the number of variant sources (2) is larger than the value specified through `--max-source-groups`. ## Arguments diff --git a/docs/vignette.md b/docs/vignette.md index 2bdb9bd1..22fb8a7f 100644 --- a/docs/vignette.md +++ b/docs/vignette.md @@ -4,23 +4,23 @@ Welcome to the vignette for moPepGen, a powerful Python package designed for gen ## Installation -moPepGen is a command line tool designated to execute in a unix-like environment. For MacOS and Linux users, moPepGen can be installed using the command below. For Windows users, we recommend installing and running moPepGen from WSL (Windows Subsystem for Linux). +moPepGen is a command line tool designated to execute in a Unix-like environment. For MacOS and Linux users, moPepGen can be installed using the command below. For Windows users, we recommend installing and running moPepGen from WSL (Windows Subsystem for Linux). ```shell -pip install git+ssh://git@github.com/uclahs-cds/private-moPepGen.git +pip install git+ssh://git@github.com/uclahs-cds/package-moPepGen.git ``` Install a specific version. ```shell -pip install git+ssh://git@github.com/uclahs-cds/private-moPepGen.git@v0.11.3 +pip install git+ssh://git@github.com/uclahs-cds/package-moPepGen.git@v0.11.3 ``` -You can also clone the repo and install it directly from source code. +You can also clone the repo and install it directly from the source code. ```shell -git clone git@github.com:uclahs-cds/private-moPepGen.git -cd private-moPepGen +git clone git@github.com:uclahs-cds/package-moPepGen.git +cd package-moPepGen pip install . --use-feature=in-tree-build ``` @@ -34,9 +34,9 @@ A simulated reference set is provided for demonstration. The demo reference set cd ~ mkdir -p moPepGen-demo cd moPepGen-demo -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/genome.fasta -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/annotation.gtf -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/translate.fasta +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/genome.fasta +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/annotation.gtf +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/translate.fasta ``` Convert reference set into index files for quick access by moPepGen. @@ -61,7 +61,7 @@ Note that, the VEP cache files must be downloaded prior to running VEP (see [her !!! warning - If you use `--chr` to limit the chromosomes to annotate, make sure the style matches with your VCF/BED file. For example, if the chromosomes have the 'chr' prefix in your VCF file (*i.e.*, chr1, chr2, ...), you must include the prefix as well (*i.e.*, `--chr chr1,chr2,chr3`). + If you use `--chr` to limit the chromosomes to annotate, make sure the style matches your VCF/BED file. For example, if the chromosomes have the 'chr' prefix in your VCF file (*i.e.*, chr1, chr2, ...), you must include the prefix as well (*i.e.*, `--chr chr1,chr2,chr3`). The example data does not work for VEP. @@ -96,7 +96,7 @@ filter_vep \ For demonstration, we provide the following VEP output file in TSV format, to be used by moPepGen. ```shell -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/vep/vep_snp.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/vep/vep_snp.txt ``` The output VEP TSV file must be parsed by `parseVEP` into GVF format. @@ -118,7 +118,7 @@ moPepGen provides parsers to three fusion callers, [STAR-Fusion](https://github. As an example, we provide a STAR-Fusion TSV output file for demonstration. ```shell -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/fusion/star_fusion.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/fusion/star_fusion.txt ``` Parse it into GVF format. @@ -131,7 +131,7 @@ moPepGen parseSTARFusion \ -o star_fusion.gvf ``` -Be default, `parseSTARFusion` only keeps fusion events with minimal `est_J` value of 5. This can be altered by the `--min-est-j` argument. +By default, `parseSTARFusion` only keeps fusion events with minimal `est_J` value of 5. This can be altered by the `--min-est-j` argument. ### Alternative Splicing @@ -140,32 +140,19 @@ moPepGen accepts alternative splicing (AS) events estimated by [rMATS](https://r Example data: ```shell -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/alternative_splicing/rmats_se_case_1.SE.JC.txt -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/alternative_splicing/rmats_a3ss_case_1.A3SS.JC.txt -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/alternative_splicing/rmats_a5ss_case_1.A5SS.JC.txt -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/alternative_splicing/rmats_mxe_case_1.MXE.JC.txt -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/alternative_splicing/rmats_ri_case_1.RI.JC.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/alternative_splicing/rmats_se_case_1.SE.JC.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/alternative_splicing/rmats_a3ss_case_1.A3SS.JC.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/alternative_splicing/rmats_a5ss_case_1.A5SS.JC.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/alternative_splicing/rmats_mxe_case_1.MXE.JC.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/alternative_splicing/rmats_ri_case_1.RI.JC.txt ``` Parse AS events output by rMATS into GVF format with `parseRMATS`. Note that you don't have to provide all 5 AS files to `parseRMATS`. - -```shell -moPepGen parseRMATS \ - --se rmats_se_case_1.SE.JC.txt \ - --a3ss rmats_a3ss_case_1.A3SS.JC.txt \ - --a5ss rmats_a5ss_case_1.A5SS.JC.txt \ - --mxe rmats_mxe_case_1.MXE.JC.txt \ - --ri rmats_ri_case_1.RI.JC.txt \ - --index-dir index \ - -o alt_splice_rmats.gvf \ - --source AltSplice \ -``` - -By default `parseRMATS` only accepts AS events with inclusion and exclusion junction counts of at least 1. These cutoffs can be set by `--min-ijc` and `--min-sjc`. See [here](./parse-rmats) for a complete list of arguments. +By default, `parseRMATS` only accepts AS events with inclusion and exclusion junction counts of at least 1. These cutoffs can be set by `--min-ijc` and `--min-sjc`. See [here](./parse-rmats) for a complete list of arguments. ### RNA Editing Sites -RNA editing sites are specific positions within mRNA molecules where nucleotides undergo post-transcriptional modifications. moPepGen supports RNA editing sites called by [REDItools](https://github.com/BioinfoUNIBA/REDItools). Noted that the REDItools output must be annotated by the `AnnotateTable.py` from the REDItools package prior to being passed to `parseREDItools`. Below is the command that can be used to perform the annotation. Note that the `${ANNOTATION_GTF}` must be the same file later used in `parseREDItools` and `callVariant`. `${PREFIX}` is the prefix of column names for gene and transcript IDs. +RNA editing sites are specific positions within mRNA molecules where nucleotides undergo post-transcriptional modifications. moPepGen supports RNA editing sites called by [REDItools](https://github.com/BioinfoUNIBA/REDItools). Note that the REDItools output must be annotated by the `AnnotateTable.py` from the REDItools package prior to being passed to `parseREDItools`. Below is the command that can be used to perform the annotation. Note that the `${ANNOTATION_GTF}` must be the same file later used in `parseREDItools` and `callVariant`. `${PREFIX}` is the prefix of column names for gene and transcript IDs. ```shell AnnotateTable.py \ @@ -180,20 +167,11 @@ AnnotateTable.py \ Example data: ```shell -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/reditools/reditools_annotated.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/reditools/reditools_annotated.txt ``` Parse REDItools output to GVF: - -```shell -moPepGen parseREDItools \ - -i reditools_annotated.txt \ - -o rna_editing_reditools.gvf \ - --index-dir index \ - --source RNAEditing -``` - -By default `parseREDItools` looks for the transcript ID in column 17. This can be changed with `--transcript-id-column`, which takes a 1-based column number. See [here](./parse-reditools) for a complete list of arguments. +By default, `parseREDItools` looks for the transcript ID in column 17. This can be changed with `--transcript-id-column`, which takes a 1-based column number. See [here](./parse-reditools) for a complete list of arguments. ### CircRNA @@ -202,7 +180,7 @@ CircRNAs are commonly recognized as noncoding RNAs, but evidence has shown that Download demo data: ```shell -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/circRNA/CIRCexplorer_circularRNA_known.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/circRNA/CIRCexplorer_circularRNA_known.txt ``` Parse it into GVF format. @@ -215,7 +193,7 @@ moPepGen parseCIRCexplorer \ --source CircRNA ``` -By default `parseCIRCexplorer` accepts the text file output by CIRCexplorer2, however CIRCexplorer3 is also supported with the `--circexplorer3` flag. We also provide a series of filtering parameters that can be found [here](./parse-circexplorer). +By default `parseCIRCexplorer` accepts the text file output by CIRCexplorer2, however, CIRCexplorer3 is also supported with the `--circexplorer3` flag. We also provide a series of filtering parameters that can be found [here](./parse-circexplorer). ## Non-canonical Peptides Calling @@ -249,7 +227,7 @@ Similar to `callVariant`, trypsin is the default enzyme and the default maximum ### Alternative Translation Peptides -Alternative translation peptides are those that harbor special events during translation, such as selenocysteine termination and W > F substitutants, where the genetic code are not altered but a different polypeptide is produced (see [here](./call-alt-translation) for more details). Similar to noncoding peptides, `callAltTranslation` only calls peptides using reference transcripts. +Alternative translation peptides are those that harbor special events during translation, such as selenocysteine termination and W > F substitutants, where the genetic code is not altered but a different polypeptide is produced (see [here](./call-alt-translation) for more details). Similar to noncoding peptides, `callAltTranslation` only calls peptides using reference transcripts. ```shell moPepGen callAltTranslation \ @@ -261,7 +239,7 @@ And again, `callAltTranslation` also uses trypsin as the default enzyme, and up ## Post-processing -moPepGen provides a series of post-processing commands that aims to deliver FASTA files ready for database searching. The post-processing tasks include summarization of a non-canonical database, filtering a database by transcript abundance, splitting a detabase to separate databases, creating decoy databases, shortening fasta headers for search engines to handle, and merging multiple database files for multiplexed proteomic experiments. +moPepGen provides a series of post-processing commands that aim to deliver FASTA files ready for database searching. The post-processing tasks include generating summary statistics of a non-canonical database, filtering a database by transcript abundance, splitting a database to separate tiered databases, creating decoy databases, shortening fasta headers for easy handling by search engines, and merging multiple database files for multiplexed proteomic experiments. ### Summarizing @@ -285,18 +263,18 @@ Because moPepGen calls enzymatically cleaved peptides, there is the possibility HETLFLLTFPR ``` -To resolve the issue of collapsed peptides like the example above, we use the `--order-source` argument that takes the priority order of sources considered. It takes the source names in a comma separated format. For example `--order-source gSNP,RNAEditing` will prioritize gSNP over RNA editing events, thus the example peptide above will be assigned to the gSNP category. Note that the values passed into `--order-source` must match the values used in `--source` in the corresponding parser calls. If `--order-source` is not provided, the source priority order will be inferred from the order of input GVF files. +To resolve the issue of collapsed peptides like the example above, we use the `--order-source` argument that takes the priority order of sources considered. It takes the source names in a comma-separated format. For example `--order-source gSNP,RNAEditing` will prioritize gSNP over RNA editing events, thus the example peptide above will be assigned to the gSNP category. Note that the values passed into `--order-source` must match the values used in `--source` in the corresponding parser calls. If `--order-source` is not provided, the source priority order will be inferred from the order of input GVF files. Besides variant peptides called by `callVariant`, noncoding peptides and alternative translation peptides can also be passed to `summarizeFasta` with `--noncoding-peptides` and `--alt-translation-peptides`. ### Filtering -The `fitlerFasta` command is provided to take a RNA abundance matrix and filter the non-canonical peptides based on the abundances of their corresponding transcripts. +The `fitlerFasta` module is designed to take an RNA abundance matrix and filter the non-canonical peptides based on the abundances of their corresponding transcripts. We provide an example RSEM table for demonstration. ```shell -wget https://github.com/uclahs-cds/private-moPepGen/raw/main/test/files/rsem/rsem.txt +wget https://github.com/uclahs-cds/package-moPepGen/raw/main/test/files/rsem/rsem.txt ``` ```shell @@ -315,7 +293,7 @@ moPepGen filterFasta \ ### Splitting -The `splitFasta` command is provided to split a variant peptide database into several separate databases for tiered database searching, particularly for the purpose of database-specific false disocvery rate control. +The `splitFasta` command is provided to split a variant peptide database into several separate databases for tiered database searching, particularly for the purpose of database-specific false discovery rate control. ```shell mkdir -p split @@ -326,9 +304,9 @@ moPepGen splitFasta \ --index-dir index ``` -Similar to `summarizeFasta`, `splitFasta` also takes a `--order-source` to specify the priority order of which category a peptide should be assigned to, and will be inferred from the input GVFs if not specified. `--group-source` is used to group sources as a super category. For example, `--group-source Germline:gSNP,gINDEL Somatic:sSNV,sINDEL` will group sources of `gSNP` and `gINDEL` together as `Germline`, and `sSNV` and `sINDEL` as `Somatic`. +Similar to `summarizeFasta`, `splitFasta` also takes a `--order-source` to specify the priority order of which category a peptide should be assigned to, which will be inferred from the input GVFs if not specified. `--group-source` is used to group sources as a super category. For example, `--group-source Germline:gSNP,gINDEL Somatic:sSNV,sINDEL` will group sources of `gSNP` and `gINDEL` together as `Germline`, and `sSNV` and `sINDEL` as `Somatic`. -Note that, when assigning a peptide to a source category, it must carry exclusively the desired type(s) of variants. For example, a peptide of 'ENST00000622235.5|SNV-100-G-T|SNV-110-C-A|2' is assigned to `SNV`, while a peptide of 'ENST00000622235.5|SNV-100-G-T|RES-110-C-A|2' will be assigned to the category of `SNV-RNAEditing` but not `SNV`. `--max-source-groups` is used to specify the maximum number of source groups that should be split into individual FASTA files. The default value is 1, which means all peptides that contains two or more types of variants will not be written into their own FASTA file, but kept in the '\_Remaining.fasta' file. +Note that, when assigning a peptide to a source category, it must carry exclusively the desired type(s) of variants. For example, a peptide of 'ENST00000622235.5|SNV-100-G-T|SNV-110-C-A|2' is assigned to `SNV`, while a peptide of 'ENST00000622235.5|SNV-100-G-T|RES-110-C-A|2' will be assigned to the category of `SNV-RNAEditing` but not `SNV`. `--max-source-groups` is used to specify the maximum number of source groups that should be split into individual FASTA files. The default value is 1, which means all peptides that contain two or more types of variants will not be written into their own FASTA file, but kept in the '\_Remaining.fasta' file. Similar to `summarizeFasta`, noncoding and alternative translation peptides can be passed to `splitFasta` via `--noncoding-peptides` and `--alt-translation-peptides`. @@ -336,7 +314,7 @@ See [here](./split-fasta) for a complete list of arguments. ### Target-Decoy Database -Most search engines expect a target-decoy database as input to estimate false discovery rate (FDR). We provide a `decoyFasta` command, that takes a variant peptide database and adds decoy sequences with either the `reverse` or `shuffle` algorithm. +Most search engines expect a target-decoy database as input to estimate the false discovery rate (FDR). We provide a `decoyFasta` command, that takes a variant peptide database and adds decoy sequences with either the `reverse` or `shuffle` algorithm. ```shell moPepGen decoyFasta \ diff --git a/main.py b/main.py index 0698e92e..fd384c10 100644 --- a/main.py +++ b/main.py @@ -42,7 +42,7 @@ def get_arg_data(command:str): def get_arg_usage(command:str): stream = io.StringIO() with redirect_stdout(stream): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(prog='moPepGen') subparsers = parser.add_subparsers() add_parser = COMMAND_MAPPER[command] p = add_parser(subparsers) diff --git a/mkdocs.yml b/mkdocs.yml index 7ce25c3e..4b52d562 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,7 +1,7 @@ --- site_name: moPepGen docs_dir: docs/ -repo_url: https://github.com/uclahs-cds/private-moPepGen +repo_url: https://github.com/uclahs-cds/package-moPepGen site_author: Chenghao Zhu nav: - Home: index.md diff --git a/moPepGen/circ/io.py b/moPepGen/circ/io.py index fbaf30db..8ab9f7f6 100644 --- a/moPepGen/circ/io.py +++ b/moPepGen/circ/io.py @@ -1,11 +1,14 @@ """ IO """ -from typing import Iterable, List, IO +from __future__ import annotations +from typing import Iterable, List, IO, TYPE_CHECKING from moPepGen import GVF_HEADER from moPepGen.SeqFeature import FeatureLocation, SeqFeature -from moPepGen.seqvar import GVFMetadata from .CircRNA import CircRNAModel +if TYPE_CHECKING: + from moPepGen.seqvar import GVFMetadata + def parse(handle:IO) -> Iterable[CircRNAModel]: """ Parse a circRNA TSV file and returns an iterable of CircRNAModel object """ diff --git a/moPepGen/cli/parse_vep.py b/moPepGen/cli/parse_vep.py index 6a857e26..2e81c336 100644 --- a/moPepGen/cli/parse_vep.py +++ b/moPepGen/cli/parse_vep.py @@ -69,7 +69,7 @@ def parse_vep(args:argparse.Namespace) -> None: for record in VEPParser.parse(handle): transcript_id = record.feature - if transcript_id not in vep_records.keys(): + if transcript_id not in vep_records: vep_records[transcript_id] = [] try: diff --git a/moPepGen/gtf/GtfIO.py b/moPepGen/gtf/GtfIO.py index 41b4810f..252cf09d 100644 --- a/moPepGen/gtf/GtfIO.py +++ b/moPepGen/gtf/GtfIO.py @@ -1,11 +1,14 @@ """ Module for GTF IO """ -from typing import IO, Union, Iterable +from __future__ import annotations +from typing import IO, Union, Iterable, TYPE_CHECKING from Bio.SeqIO.Interfaces import SequenceIterator from moPepGen.SeqFeature import FeatureLocation -from moPepGen.gtf import GenomicAnnotation from .GTFSeqFeature import GTFSeqFeature +if TYPE_CHECKING: + from moPepGen.gtf import GenomicAnnotation + class GtfIterator(SequenceIterator): """ GTF Iterator """ def __init__(self, source:Union[IO, str], mode='t'): diff --git a/moPepGen/seqvar/VariantRecordPoolOnDisk.py b/moPepGen/seqvar/VariantRecordPoolOnDisk.py index 1fb9d933..1cc5b40e 100644 --- a/moPepGen/seqvar/VariantRecordPoolOnDisk.py +++ b/moPepGen/seqvar/VariantRecordPoolOnDisk.py @@ -3,8 +3,7 @@ import copy from typing import Dict, IO, Iterable, List, TYPE_CHECKING, Union from pathlib import Path -from moPepGen import ERROR_INDEX_IN_INTRON, check_sha512 -from moPepGen.circ.CircRNA import CircRNAModel +from moPepGen import ERROR_INDEX_IN_INTRON, check_sha512, circ from moPepGen.seqvar.GVFIndex import GVFPointer, iterate_pointer from moPepGen.seqvar.GVFMetadata import GVFMetadata from moPepGen.seqvar.VariantRecord import ALTERNATIVE_SPLICING_TYPES @@ -21,7 +20,7 @@ class TranscriptionalVariantSeries(): """ Variants associated with a particular transcript """ def __init__(self, transcriptional:List[VariantRecord]=None, intronic:List[VariantRecord]=None, fusion:List[VariantRecord]=None, - circ_rna:List[CircRNAModel]=None + circ_rna:List[circ.CircRNAModel]=None ) -> None: """ constructor """ self.transcriptional = transcriptional or [] @@ -113,14 +112,14 @@ def __iter__(self) -> Iterable[str]: def __getitem__(self, key:str) -> TranscriptionalVariantSeries: """ Load variants and return as a TranscriptVariants object """ - records:List[Union[VariantRecord, CircRNAModel]] = [] + records:List[Union[VariantRecord, circ.CircRNAModel]] = [] for pointer in self.pointers[key]: records += pointer.load() records = set(records) series = TranscriptionalVariantSeries() cached_seqs:Dict[str, DNASeqRecordWithCoordinates] = {} for record in records: - if isinstance(record, CircRNAModel): + if isinstance(record, circ.CircRNAModel): series.circ_rna.append(record) continue diff --git a/moPepGen/svgraph/PVGNode.py b/moPepGen/svgraph/PVGNode.py index f922b3f6..cd348f12 100644 --- a/moPepGen/svgraph/PVGNode.py +++ b/moPepGen/svgraph/PVGNode.py @@ -450,7 +450,7 @@ def get_stop_lost_variants(self, stop_index:int) -> List[seqvar.VariantRecord]: def frames_shifted(self) -> int: """ Get total frames shifted of all variants """ - return sum([v.variant.frames_shifted() for v in self.variants]) % 3 + return sum(v.variant.frames_shifted() for v in self.variants) % 3 def find_reference_next(self) -> PVGNode: """ Find and return the next reference node. The next reference node diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 1e9ea9c8..e5aefdec 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -43,7 +43,7 @@ def __init__(self, nodes:List[PVGNode], additional_variants:Set[VariantRecord]): def __len__(self) -> int: """ Get length of node sequences """ if self._len is None: - self._len = sum([len(x.seq.seq) for x in self.nodes]) + self._len = sum(len(x.seq.seq) for x in self.nodes) return self._len def is_too_short(self, param:params.CleavageParams) -> bool: diff --git a/setup.cfg b/setup.cfg index 047d5d1a..9b1d5799 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,9 +7,9 @@ author_email = ChenghaoZhu@mednet.ucla.edu, yiyangliu@mednet.ucla.edu description = Multi-omic peptide generator long_description = file: README.md long_description_content_type = text/markdown -url = https://github.com/uclahs-cds/private-moPepGen +url = https://github.com/uclahs-cds/package-moPepGen project_urls = - Bug Tracker = https://github.com/uclahs-cds/private-moPepGen/issues + Bug Tracker = https://github.com/uclahs-cds/package-moPepGen/issues classifiers = Programming Language :: Python :: 3 License :: OSI Approved :: GNU General Public License v2 (GPLv2) diff --git a/test/integration/test_brute_force.py b/test/integration/test_brute_force.py index b07d40c0..68665b13 100644 --- a/test/integration/test_brute_force.py +++ b/test/integration/test_brute_force.py @@ -7,6 +7,7 @@ from test.integration import TestCaseIntegration from moPepGen import util +# pylint: disable=duplicate-code def create_base_args() -> argparse.Namespace: """ Create base args """ diff --git a/test/integration/test_split_fasta.py b/test/integration/test_split_fasta.py index 708d808c..3734b814 100644 --- a/test/integration/test_split_fasta.py +++ b/test/integration/test_split_fasta.py @@ -66,7 +66,7 @@ def test_split_fasta_case1(self): expected = { 'test_gINDEL.fasta','test_gSNP.fasta', 'test_Fusion.fasta', 'test_RNAEditingSite.fasta', 'test_circRNA.fasta', - 'test_Remaining.fasta', 'test_circRNA.fasta', 'test_Noncoding.fasta', + 'test_Remaining.fasta', 'test_Noncoding.fasta', 'test_CodonReassign.fasta', 'test_SECT.fasta' } self.assertEqual(files, expected) @@ -91,7 +91,7 @@ def test_split_fasta_case2(self): files = {str(file.name) for file in self.work_dir.glob('*')} expected = { 'test_coding.fasta', 'test_RNAEditingSite.fasta', 'test_Fusion.fasta', - 'test_circRNA.fasta', 'test_Remaining.fasta', 'test_circRNA.fasta', + 'test_circRNA.fasta', 'test_Remaining.fasta', 'test_Noncoding.fasta', 'test_CodonReassign.fasta', 'test_SECT.fasta' } self.assertEqual(files, expected) @@ -147,7 +147,7 @@ def test_split_fasta_case4(self): expected = { 'test_gINDEL.fasta','test_gSNP.fasta', 'test_Fusion.fasta', 'test_RNAEditingSite.fasta', 'test_circRNA.fasta', - 'test_Remaining.fasta', 'test_circRNA.fasta', 'test_Noncoding.fasta', + 'test_Remaining.fasta', 'test_Noncoding.fasta', 'test_CodonReassign.fasta', 'test_SECT.fasta' } self.assertEqual(files, expected) @@ -172,7 +172,7 @@ def test_split_fasta_case5(self): files = {str(file.name) for file in self.work_dir.glob('*')} expected = { 'test_coding.fasta', 'test_RNAEditingSite.fasta', 'test_Fusion.fasta', - 'test_circRNA.fasta', 'test_Remaining.fasta', 'test_circRNA.fasta', + 'test_circRNA.fasta', 'test_Remaining.fasta', 'test_Noncoding.fasta', 'test_ALT.fasta' } self.assertEqual(files, expected) diff --git a/test/unit/test_peptide_variant_graph.py b/test/unit/test_peptide_variant_graph.py index 1a94a370..f5f3e0d1 100644 --- a/test/unit/test_peptide_variant_graph.py +++ b/test/unit/test_peptide_variant_graph.py @@ -359,7 +359,7 @@ def test_merge_join_with_cleave(self): graph.rule = 'trypsin' branches,_ = graph.merge_join(nodes[4]) seqs = {str(node.seq.seq) for node in nodes[1].out_nodes} - seqs_expect = {'NCWHSTQV', 'NCWR', 'NCWHSTQQ', 'NCWR'} + seqs_expect = {'NCWHSTQV', 'NCWR', 'NCWHSTQQ'} self.assertEqual(seqs, seqs_expect) self.assertEqual(len(nodes[4].in_nodes), 0) self.assertEqual(len(nodes[4].out_nodes), 0)