diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..b290e09 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,20 @@ +{ + "name": "nfcore", + "image": "nfcore/gitpod:latest", + "remoteUser": "gitpod", + "runArgs": ["--privileged"], + + // Configure tool-specific properties. + "customizations": { + // Configure properties specific to VS Code. + "vscode": { + // Set *default* container specific settings.json values on container create. + "settings": { + "python.defaultInterpreterPath": "/opt/conda/bin/python" + }, + + // Add the IDs of extensions you want installed when the container is created. + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..cc6bf3b --- /dev/null +++ b/.editorconfig @@ -0,0 +1,38 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_size = 4 +indent_style = space + +[*.{md,yml,yaml,html,css,scss,js}] +indent_size = 2 + +# These files are edited and tested upstream in nf-core/modules +[/modules/nf-core/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +[/subworkflows/nf-core/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset + +[/assets/email*] +indent_size = unset + +# ignore python and markdown +[*.{py,md}] +indent_style = unset + +# ignore tests +[*.test] +indent_style = unset +end_of_line = unset diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7a2dabc --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +*.config linguist-language=nextflow +*.nf.test linguist-language=nextflow +modules/nf-core/** linguist-generated +subworkflows/nf-core/** linguist-generated diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..6d3bdbc --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,22 @@ + + +## PR checklist + +- [ ] This comment contains a description of changes (with reason). +- [ ] If you've fixed a bug or added code that should be tested, add tests! +- [ ] Make sure your code lints (`nf-core pipelines lint`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). +- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). +- [ ] Usage Documentation in `docs/usage.md` is updated. +- [ ] Output Documentation in `docs/output.md` is updated. +- [ ] `CHANGELOG.md` is updated. +- [ ] `README.md` is updated (including new tool citations and authors/contributors). 
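For convenience, the lint and test commands from this checklist can be run locally before opening a PR. A minimal sketch (the `results` output directory is illustrative; substitute any writable path):

```bash
# Lint the pipeline against the nf-core guidelines (assumes nf-core/tools is installed)
nf-core pipelines lint

# Run the minimal test profile with Docker
nextflow run . -profile test,docker --outdir results

# Re-run in debug mode to check for unexpected warnings
nextflow run . -profile debug,test,docker --outdir results
```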
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml new file mode 100644 index 0000000..9677924 --- /dev/null +++ b/.github/workflows/branch.yml @@ -0,0 +1,44 @@ +name: nf-core branch protection +# This workflow is triggered on PRs to main branch on the repository +# It fails when someone tries to make a PR against the nf-core `main` branch instead of `dev` +on: + pull_request_target: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + # PRs to the nf-core repo main branch are only ok if coming from the nf-core repo `dev` or any `patch` branches + - name: Check PRs + if: github.repository == 'phac-nml/legiovue' + run: | + { [[ ${{github.event.pull_request.head.repo.full_name }} == phac-nml/legiovue ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + + # If the above check failed, post a comment on the PR explaining the failure + # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2 + with: + message: | + ## This PR is against the `main` branch :x: + + * Do not close this PR + * Click _Edit_ and change the `base` to `dev` + * This CI test will remain failed until you push a new commit + + --- + + Hi @${{ github.event.pull_request.user.login }}, + + It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `main` branch. + The `main` branch on nf-core repositories should always contain code from the latest release. + Because of this, PRs to `main` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. + + You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. + Note that even after this, the test will continue to show as failing until you push a new commit. + + Thanks again for your contribution! 
+ repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f1bfa83 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,58 @@ +name: nf-core CI +# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: + push: + branches: + - dev + pull_request: + release: + types: [published] + workflow_dispatch: + +env: + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true + +jobs: + test: + name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | docker)" + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'phac-nml/legiovue') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "23.10.1" + - "latest-everything" + test_name: + - "test_full" + - "test" + steps: + - name: Check out pipeline code + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Set up Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Clean up Disk space + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - name: Install nf-test + run: | + wget -qO- https://get.nf-test.com | bash + sudo mv nf-test /usr/local/bin/ + + - name: Run nf-test + run: | + nf-test test --verbose + + - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | docker" + run: | + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},docker --outdir ./results diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 0000000..6bfe937 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,83 @@ +name: nf-core linting +# This workflow is triggered on pushes and PRs to the repository. +# It runs the `nf-core pipelines lint` and markdown lint tests to ensure +# that the code meets the nf-core guidelines. 
+on: + push: + branches: + - dev + pull_request: + release: + types: [published] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + with: + python-version: "3.12" + + - name: Install pre-commit + run: pip install pre-commit + + - name: Run pre-commit + run: pre-commit run --all-files + + nf-core: + runs-on: ubuntu-latest + steps: + - name: Check out pipeline code + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v2 + + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + with: + python-version: "3.12" + architecture: "x64" + + - name: read .nf-core.yml + uses: pietrobolcato/action-read-yaml@1.1.0 + id: read_yml + with: + config: ${{ github.workspace }}/.nf-core.yml + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }} + + - name: Run nf-core pipelines lint + if: ${{ github.base_ref != 'main' }} + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt pipelines lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md + + - name: Run nf-core pipelines lint --release + if: ${{ github.base_ref == 'main' }} + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt pipelines lint --release --dir ${GITHUB_WORKSPACE} --markdown lint_results.md + + - name: Save PR number + if: ${{ always() }} + run: echo ${{ github.event.pull_request.number }} > PR_number.txt + + - name: Upload linting log file artifact + if: ${{ always() }} + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 + with: + name: linting-logs + path: | + lint_log.txt + lint_results.md + PR_number.txt diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml new file mode 100644 index 0000000..42e519b --- /dev/null +++ b/.github/workflows/linting_comment.yml @@ -0,0 +1,28 @@ +name: nf-core linting comment +# This workflow is triggered after the linting action is complete +# It posts an automated comment to the PR, even if the PR is coming from a fork + +on: + workflow_run: + workflows: ["nf-core linting"] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Download lint results + uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6 + with: + workflow: linting.yml + workflow_conclusion: completed + + - name: Get PR number + id: pr_number + run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT + + - name: Post PR comment + uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + number: ${{ steps.pr_number.outputs.pr_number }} + path: linting-logs/lint_results.md diff --git a/.gitignore b/.gitignore index bd66163..af8e8e4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,7 @@ results/ testing/ testing* *.pyc +null/ +slurm* +.nf-test.log +.nf-test/ diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc8..ad3fecf 100644 --- a/.nf-core.yml +++ 
b/.nf-core.yml @@ -1 +1,38 @@ repository_type: pipeline +nf_core_version: "3.0.2" +bump_version: null +lint: + files_exist: + - CODE_OF_CONDUCT.md + - .github/.dockstore.yml + - .github/CONTRIBUTING.md + - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/ISSUE_TEMPLATE/config.yml + - .github/ISSUE_TEMPLATE/feature_request.yml + - .github/workflows/awstest.yml + - .github/workflows/awsfulltest.yml + - assets/email_template.html + - assets/email_template.txt + - assets/multiqc_config.yml + - assets/sendmail_template.txt + - assets/nf-core-LegioVue_logo_light.png + - conf/igenomes.config + - conf/igenomes_ignored.config + - docs/images/nf-core-LegioVue_logo_light.png + - docs/images/nf-core-LegioVue_logo_dark.png + files_unchanged: + - .github/PULL_REQUEST_TEMPLATE.md + - .github/workflows/branch.yml + - .github/workflows/linting.yml + - docs/README.md + - LICENSE + - .gitignore + multiqc_config: False + pipeline_name_conventions: False + actions_awsfulltest: False + actions_awstest: False + readme: False + nextflow_config: False + +template: + prefix: phac-nml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9e9f0e1 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,13 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v3.1.0" + hooks: + - id: prettier + additional_dependencies: + - prettier@3.2.5 + + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: "3.0.3" + hooks: + - id: editorconfig-checker + alias: ec diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..483f90c --- /dev/null +++ b/.prettierignore @@ -0,0 +1,12 @@ +.nextflow* +work/ +data/ +results/ +.DS_Store +testing/ +testing* +*.pyc +bin/ +adaptivecard.json +slackreport.json +tests/ diff --git a/.prettierrc.yml b/.prettierrc.yml new file mode 100644 index 0000000..c81f9a7 --- /dev/null +++ b/.prettierrc.yml @@ -0,0 +1 @@ +printWidth: 120 diff --git a/CHANGELOG.md b/CHANGELOG.md index c0ef23a..5d5877a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,47 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## v0.1.0 - [Beta-Test-2024-11-29] +## [0.2.0] - 2025-01-24 ### `Added` + +- `nf-schema` plugin and associated functions + - Schemas + - Param summary, param help, version + - samplesheetToList +- `params.input ` to allow input samplesheets +- `iridanext` plugin +- `nf-prov` plugin +- Required nf-core files +- CI tests and linting +- Added in quality parameters to allow more user freedom: + - max_contigs + - min_align_percent + - min_reads_warn + - min_n50_score + - max_n50_score + +### `Changed` + +- Final quality metrics output is a CSV now to work with IRIDA next +- Logic for input data +- Logic for skipping specific modules + - Allowed to skip el_gato ST + - Allowed to skip el_gato allele plotting +- All process publishDir now in the `modules.conf` file +- Container for allele plotting +- Adjusted default warn and fail parameters for quality module based on testing + - `min_reads` to 60,000 from 150,000 + +### `Updated` + +- Usage and README docs for the input adjustments + +## [0.1.0] - Beta-Test-2024-11-29 + +### `Added` + - LegioVue pipeline created and initial beta code added + +[0.1.0]: https://github.com/phac-nml/legiovue/releases/tag/0.1.0 +[0.2.0]: https://github.com/phac-nml/legiovue/releases/tag/0.2.0 diff --git a/CITATIONS.md b/CITATIONS.md index 229e01a..489f01e 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -8,42 +8,42 @@ [Kraken2](https://github.com/DerrickWood/kraken2) ->Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0 +> Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0 [Bracken](https://github.com/jenniferlu717/Bracken) ->Lu, J., Breitwieser, F. P., Thielen, P., and Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ Comput. Sci. 3, e104. doi: 10.7717/peerj-cs.104 +> Lu, J., Breitwieser, F. P., Thielen, P., and Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ Comput. Sci. 3, e104. doi: 10.7717/peerj-cs.104 [Trimmomatic](https://github.com/usadellab/Trimmomatic) ->Bolger, A. M., Lohse, M., and Usadel, B. (2014). Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics 30, 2114–2120. doi: 10.1093/bioinformatics/btu170 +> Bolger, A. M., Lohse, M., and Usadel, B. (2014). Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics 30, 2114–2120. doi: 10.1093/bioinformatics/btu170 [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) ->Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: http://www.bioinformatics.babraham.ac.uk/projects/fastqc +> Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: http://www.bioinformatics.babraham.ac.uk/projects/fastqc [SPAdes](https://github.com/ablab/spades) ->Prjibelski, A., Antipov, D., Meleshko, D., Lapidus, A., and Korobeynikov, A. (2020). Using SPAdes De Novo Assembler. Current Protocols in Bioinformatics 70, e102. doi: 10.1002/cpbi.102 +> Prjibelski, A., Antipov, D., Meleshko, D., Lapidus, A., and Korobeynikov, A. (2020). Using SPAdes De Novo Assembler. Current Protocols in Bioinformatics 70, e102. 
doi: 10.1002/cpbi.102 [QUAST](https://github.com/ablab/quast) ->Alla Mikheenko, Vladislav Saveliev, Pascal Hirsch, Alexey Gurevich, -WebQUAST: online evaluation of genome assemblies, -Nucleic Acids Research (2023) 51 (W1): W601–W606. doi: 10.1093/nar/gkad406 +> Alla Mikheenko, Vladislav Saveliev, Pascal Hirsch, Alexey Gurevich, +> WebQUAST: online evaluation of genome assemblies, +> Nucleic Acids Research (2023) 51 (W1): W601–W606. doi: 10.1093/nar/gkad406 [el_gato](https://github.com/appliedbinf/el_gato) ->Alan Collins, Dev Mashruwala, Andrew Conley, Lavanya Rishishwar, Emily T. Norris, Anna Gaines, Will Overholt. Epidemiology of Legionella : Genome-bAsed Typing. Available online at: https://github.com/appliedbinf/el_gato +> Alan Collins, Dev Mashruwala, Andrew Conley, Lavanya Rishishwar, Emily T. Norris, Anna Gaines, Will Overholt. Epidemiology of Legionella : Genome-bAsed Typing. Available online at: https://github.com/appliedbinf/el_gato [chewBBACA](https://github.com/B-UMMI/chewBBACA) ->Silva M, Machado MP, Silva DN, Rossi M, Moran-Gilad J, Santos S, Ramirez M, Carriço JA. 2018. chewBBACA: A complete suite for gene-by-gene schema creation and strain identification. Microb Genom 4:000166. doi:10.1099/mgen.0.000166 +> Silva M, Machado MP, Silva DN, Rossi M, Moran-Gilad J, Santos S, Ramirez M, Carriço JA. 2018. chewBBACA: A complete suite for gene-by-gene schema creation and strain identification. Microb Genom 4:000166. doi:10.1099/mgen.0.000166 [ReporTree](https://github.com/insapathogenomics/ReporTree) ->Mixão V, Pinto M, Sobral D, Di Pasquale A, Gomes JP, Borges V (2023) ReporTree: a surveillance-oriented tool to strengthen the linkage between pathogen genetic clusters and epidemiological data. Genome Medicine. doi: 10.1186/s13073-023-01196-1 +> Mixão V, Pinto M, Sobral D, Di Pasquale A, Gomes JP, Borges V (2023) ReporTree: a surveillance-oriented tool to strengthen the linkage between pathogen genetic clusters and epidemiological data. Genome Medicine. doi: 10.1186/s13073-023-01196-1 [GrapeTree](https://github.com/achtman-lab/GrapeTree) ->Zhou, Z., Alikhan, N.-F., Sergeant, M. J., Luhmann, N., Vaz, C., Francisco, A. P., et al. (2018). GrapeTree: visualization of core genomic relationships among 100,000 bacterial pathogens. Genome Res. 28, 1395–1404. doi: 10.1101/gr.232397.117 \ No newline at end of file +> Zhou, Z., Alikhan, N.-F., Sergeant, M. J., Luhmann, N., Vaz, C., Francisco, A. P., et al. (2018). GrapeTree: visualization of core genomic relationships among 100,000 bacterial pathogens. Genome Res. 28, 1395–1404. doi: 10.1101/gr.232397.117 diff --git a/README.md b/README.md index b834492..79f035a 100644 --- a/README.md +++ b/README.md @@ -1,168 +1,211 @@ -# legiovue -LegioVue is a nextflow pipeline for whole-genome analysis of *Legionella pneumophila*. It performs *in silico* sequence typing, genome assembly, and core-genome analysis. It also provides detailed information about the quality of *L. pneumophila* genomes. The name is an homage to the Bellevue-Stratford hotel, site of the first known outbreak of Legionnaire's Disease. - -This project serves as a repository for tools, notes, and informtation regarding the LegioVue pipeline. This project is a GRDI funded research project surrounding the **assessment and implementation of a whole genome sequencing scheme for rapid resolution of _Legionella pneumophila_ outbreaks within Canada to better protect vulnerable populations**. The goal is to generate and nationally deploy a standardized pipeline that will shift _L. 
pneumophila_ analysis from conventional sequence based typing to whole genome sequence-based typing and clustering, for rapid detection and response to Legionnaires' Disease outbreaks in Canada. - -## Big Picture Overview -**LegioVue** contains a combination of tools that are used to do *de novo* assembly, sequence typing, cgMLST, and quality control for all input samples with the end goal in having the available data to confirm cluster outbreaks. Currently, clustering is not included in the pipeline but its addition is to come soon. With this, there are additional available steps on how to use all of the outputs to do cluster analysis. - -![LegioVue-WGS-Workflow.png](LegioVue-WGS-Workflow.png) ---- - -## Index -- [Installation](#installation) -- [Resource Requirements](#resources-requirements) -- [Quick Usage](#quick-usage) -- [Quick Outputs](#quick-outputs) -- [Pipeline Components and Settings](#pipeline-components-and-settings) -- [Limitations](#limitations) -- [Citations](#citations) -- [Contributing](#contributing) -- [Legal](#legal) - -## Installation -Installation requires both [nextflow](https://www.nextflow.io/) (minimum version tested `23.10.1`) and a dependency management system to run. - -Steps: -1. Download and install nextflow - 1. Download and install with [conda](https://docs.conda.io/en/latest/miniconda.html) - - Conda command: `conda create on nextflow -c conda-forge -c bioconda nextflow` - 2. Install with the instructions at https://www.nextflow.io/ - -2. Determine which dependency management system works best for you - - *Note*: Currently the plotting process is using a custom docker container - -3. Run the pipeline with one of the following profiles to handle dependencies (or use your [own profile](https://nf-co.re/docs/usage/getting_started/configuration) if you have one for your institution! The NML one is included as an example): - - `conda` - - `mamba` - - `singularity` - - `docker` - -## Resources Requirements -By default, the `kraken2` and `SPAdes` steps have a minimum resource usage allocation set to `8 cpus` and `48GB memory` using the nf-core `process_high` label. - -This can be adjusted (along with the other labels) by creating and passing a [custom configuration file](https://nf-co.re/docs/usage/getting_started/configuration) with `-c ` or by adjusting the `--max_cpus` and `--max_memory` parameters. More info can be found in the [usage doc](./docs/usage.md) - -## Quick Usage -Detailed run and parameter instructions are found in the [usage doc here](./docs/usage.md). - -To just get started and run the pipeline, the following basic command is all that is required: - -```bash -nextflow run phac-nml/legiovue \ - -profile \ - --fastq_dir \ - [Optional Args] -``` - -Where: -- `-profile `: The nextflow profile to use. - - Specification of a dependency management system (docker, singularity, conda) -- `--fastq_dir `: Path to directory containing paired Illumina `_R1` and `_R2` fastq files - - Fastqs must be formatted as `_{R1,R2}\*.fastq\*` - - At the moment everything before the first `_R1/_R2` is kept as the sample name - -> [!NOTE] -> The default kraken2 standard database is hosted on AWS. In the event the connection is interrupted the pipeline will fail out. It is recommended to use/download a database from [the kraken2 database zone](https://benlangmead.github.io/aws-indexes/k2) and include `--kraken2 ` in the command above. The 8GB standard DB is the default. - -## Quick Outputs -All of the outputs can be found in [the output docs](./docs/output.md). 
All outputs are by default put in the `results` folder with some of the major outputs being as follows: -- `spades/`: Contains the SPAdes assemblies (contigs as .fasta files) for each sample. -- `el_gato/el_gato_st.tsv`: Summarized el_gato ST calls for all samples. -- `chewbbaca/allele_calls/cgMLST/`: cgMLST profiles that can be used for downstream visualization. -- `overall.qc.tsv`: Final quality summary report for each sample throughout the different pipeline steps. Important quality flags can be found in this file. - -## Pipeline Components and Settings - -**`Kraken2`** and **`Bracken`** - -[Kraken2](https://github.com/DerrickWood/kraken2) is used to taxonomically profile the paired Illumina reads against the standard Kraken RefSeq database with a confidence level of 0.1 (`--confidence 0.1`). [Bracken](https://github.com/jenniferlu717/Bracken) is then used to estimate taxonomic abundances (including potential contaminants) from the Kraken profile. - -**`Trimmomatic`** - -[Trimmomatic](https://github.com/usadellab/Trimmomatic) is used to remove Illumina adapters (`ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:2:True`) and trim reads according to quality (`LEADING:3`, `TRAILING:3`, `SLIDINGWINDOW:4:20`). Reads shorter than 100bp are dropped (`MINLEN:100`). - -**`FastQC`** - -[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) provides quality information about the trimmed reads including estimates of duplication, %GC, and N content. Samples retaining fewer than 150,000 high-quality read pairs after trimming are removed unless `--min_reads ` is specified. - -**`SPAdes`** and **`QUAST`** - -High-quality reads (both paired and unpaired) are then assembled into Legionella genomes using the [SPAdes](https://github.com/ablab/spades) assembler and `--careful` option, which aims to minimize mismatches and short indels in the assembly. The quality of the resulting assemblies is evaluated with [QUAST](https://github.com/ablab/quast). At this step, genomes are compared to a _Legionella pneumophila_ [reference genome](data/C9_S.reference.fna) and an assembly quality score is calculated for each sample using a custom script. - -The `quast_analyzer.py` script assigns a score to each SPAdes assembly based on pre-cgMLST metrics (_e.g.,_ similarity to RefSeq complete _Lp_ genomes, N50, # contigs, %GC content) originally outlined in the supplementary appendix (Supplementary Table 2) of the following paper: - -> Gorzynski, J., Wee, B., Llano, M., Alves, J., Cameron, R., McMenamin, J., et al. (2022). Epidemiological analysis of Legionnaires’ disease in Scotland: a genomic study. The Lancet Microbe 3, e835–e845. doi: 10.1016/S2666-5247(22)00231-2 - -Quality thresholds and score effects have been updated in this pipeline to better capture quality issues that are likely to affect the interpretation of the resulting cgMLST profile. Assemblies are assigned a quality score out of 6, where a score of 6/6 represents an "excellent" high-quality _Legionella pneumophila_ assembly. - -**`el_gato`** - -[el_gato](https://github.com/appliedbinf/el_gato) performs _in silico_ Sequence-based Typing (SBT) of _Legionella pneumophila_ sequences based on the identification and comparison of 7 loci (_flaA, pilE, asd, mip, mompS, proA, neuA/neuAh_) against an allele database. In this pipeline SBT is first called on Illumina paired-end reads using a mapping/alignment approach that is recommended by the `el_gato` developers. 
If samples are not initially assigned a sequence type (ST = `MA?` or `MD-`), `el_gato` is run again on the assembled genome using an _in silico_ PCR-based approach. The resulting allele and ST calls are reported in `el_gato_st.tsv`. - -_Note: if the ST results are inconclusive after both approaches have been tried, users are encouraged to review the `possible_mlsts.txt` intermediate output for that sample in the pipeline results folder under `el_gato/reads/`_ - -**`chewBBACA`** - -Assembled _Legionella pneumophila_ genomes are passed to [chewBBACA](https://github.com/B-UMMI/chewBBACA), which performs Core Genome MultiLocus Sequence Typing (cgMLST) according to the published [Ridom SeqSphere](https://www.cgmlst.org/ncs/schema/Lpneumophila1410/locus/) 1521-loci cgMLST schema for _L. pneumophila_. - -**cgMLST Visualization and Clustering** - -**`PHYLOViZ`** and **`reporTree`** - -_Note: Reportree requires an update before it can be properly incorporated into the nextflow pipeline. Users can run reportree on their pipeline output separately for now to produce the same visualizations._ - -Visualize cgMLST profiles alongside sample metadata using one of the following two methods: - -i) Either drop the cgMLST profile (e.g., `cgMLST100.tsv`) directly into [PhyloViz](https://online2.phyloviz.net/index) and upload metadata for visualization, or, -ii) Perform partitioning (clustering) with [ReporTree](https://github.com/insapathogenomics/ReporTree), which will generate outputs (MST and metadata) that can be visualized with the local version of [GrapeTree](https://achtman-lab.github.io/GrapeTree/MSTree_holder.html). - -Detailed instructions for clustering and visualization are provided [separately](docs/clustering.md). - -**Quality Summary** - -LegioVue outputs a summary of quality metrics and warnings for each step of the workflow in the `overall.qc.tsv` file - -The final quality summary has two columns: `qc_status` and `qc_message` that can be used to quickly determine if a sample is good or may have an issue. The `qc_status` column will be any of the following statuses: -- Pass: The sample passes all checks! -- Warn: The sample was flagged for a specific warning -- Fail: The sample has failed out of the pipeline and may not be included in the final cgMLST profile. - -The `qc_message` column contains the reason for the `qc_status` and includes: - -| Message | Associated Status | Flag Reason | -| - | - | - | -| low_lpn_abundance | WARN | Low (< 75%) *L. pneumophila* abundance is not expected with isolate sequencing and may indicate contamination. | -| low_read_count | WARN | Low read count (< 300,000 reads default) has been shown to lead to poor, uninformative assemblies. | -| low_n50 | WARN | Low N50 scores (< 100,000) have been shown to negatively affect clustering outputs by inflating observed allele differences. | -| low_exact_allele_calls | WARN | Low chewBBACA exact allele calls (< 90%) indicate that there may be issues in the assembly, possibly affecting the cgMLST profile. | -| low_qc_score | WARN | Low QUAST-Analyzer QC score (< 4) indicates that there may be issues in the assembly, possibly affecting the cgMLST profile. | -| no_lpn_detected | FAIL | Very low (< 10% default) *L.pneumophila* abundance flags that the sample may not be *L.pneumophila* and sample is removed from the remainder of the pipeline | -| failing_read_count | FAIL | Post-trimming read count below failing threshold (< 150,000 reads default) has been shown to lead to poor, uninformative assemblies and sample is removed. 
| - -## Limitations -This pipeline is intended to be run on *Legionella pneumophila* paired illumina isolate sequencing data. In the future Nanopore long-read sequencing data will also be supported. - -## Citations -This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). - -> The nf-core framework for community-curated bioinformatics pipelines. -> -> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. -> -> Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x. -> In addition, references of tools and data used in this pipeline are as follows: - -Detailed citations for utilized tools are found in [CITATIONS.md](./CITATIONS.md) - -## Contributing -Contributions are welcome through creating PRs or Issues - -## Legal -Copyright 2024 Government of Canada - -Licensed under the MIT License (the "License"); you may not use this work except in compliance with the License. You may obtain a copy of the License at: - -https://opensource.org/license/mit/ - -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. +# LegioVue + +LegioVue is a nextflow pipeline for whole-genome analysis of _Legionella pneumophila_. It performs _in silico_ sequence typing, genome assembly, and core-genome analysis. It also provides detailed information about the quality of _L. pneumophila_ genomes. The name is an homage to the Bellevue-Stratford hotel, site of the first known outbreak of Legionnaire's Disease. + +This project serves as a repository for the LegioVue analysis pipeline along with validation notes and information on follow-up data analysis steps like clustering. This project is a GRDI-funded research project surrounding the **assessment and implementation of a whole genome sequencing scheme for rapid resolution of _Legionella pneumophila_ outbreaks within Canada to better protect vulnerable populations**. The goal is to generate and nationally deploy a standardized pipeline that will shift _L. pneumophila_ analysis from conventional sequence based typing to whole genome sequence-based typing and clustering, for rapid detection and response to Legionnaires' Disease outbreaks in Canada. + +## Big Picture Overview + +**LegioVue** contains a combination of tools that are used to do _de novo_ assembly, sequence typing, cgMLST, and quality control for all input samples with the end goal in having analyzed and formatted data to confirm cluster outbreaks. Currently, clustering is not included in the pipeline but its addition is to come soon. However, we include [additional instructions](./docs/clustering.md) on how to perform cluster analysis on generated outputs. 
+ +## ![LegioVue-WGS-Workflow.png](LegioVue-WGS-Workflow.png) + +## Index + +- [Installation](#installation) +- [Resource Requirements](#resources-requirements) +- [Quick Usage](#quick-usage) +- [Quick Outputs](#quick-outputs) +- [Pipeline Components and Settings](#pipeline-components-and-settings) +- [Limitations](#limitations) +- [Citations](#citations) +- [Contributing](#contributing) +- [Legal](#legal) + +## Installation + +Installation requires both [nextflow](https://www.nextflow.io/) (minimum version tested `23.10.1`) and a dependency management system to run. + +Steps: + +1. Download and install nextflow + + 1. Download and install with [conda](https://docs.conda.io/en/latest/miniconda.html) + - Conda command: `conda create -n nextflow -c conda-forge -c bioconda nextflow` + 2. Install with the instructions at https://www.nextflow.io/ + +2. Determine which dependency management system works best for you + + - _Note_: Currently the plotting process is using a custom docker container + +3. Run the pipeline with one of the following profiles to handle dependencies (or use your [own profile](https://nf-co.re/docs/usage/getting_started/configuration) if you have one for your institution! The NML one is included as an example): + - `conda` + - `mamba` + - `singularity` + - `docker` + +## Resources Requirements + +By default, the `kraken2` and `SPAdes` steps have a minimum resource usage allocation set to `8 cpus` and `48GB memory` using the nf-core `process_high` label. + +This can be adjusted (along with the other labels) by creating and passing a [custom configuration file](https://nf-co.re/docs/usage/getting_started/configuration) with `-c ` or by adjusting the `--max_cpus` and `--max_memory` parameters. More info can be found in the [usage doc](./docs/usage.md) + +The recommended `kraken2` database is the 8Gb standard database that can be found on the [AWS Index server](s3://genome-idx/kraken/standard_08gb_20240904) or the [the kraken2 database zone](https://benlangmead.github.io/aws-indexes/k2) so the required memory can be lowered a decent bit (16Gb) with minimal impact if resources are a limiting factor. + +## Quick Usage + +Detailed run and parameter instructions are found in the [usage doc here](./docs/usage.md). + +To just get started and run the pipeline, one of the following basic commands is all that is required to do so. The only difference between the two being in how the input fastq data is specified/found: + +Directory Input: + +```bash +nextflow run phac-nml/legiovue \ + -profile \ + --fastq_dir \ + --kraken2_db \ + [Optional Args] +``` + +Where: + +- `-profile `: The nextflow profile to use. + - Specification of a dependency management system (docker, singularity, conda) +- `--fastq_dir `: Path to directory containing paired Illumina `_R1` and `_R2` fastq files + - Fastqs must be formatted as `_{R1,R2}\*.fastq\*` + - At the moment everything before the first `_R1/_R2` is kept as the sample name +- `--kraken2_db `: Path to a kraken2 database + +Samplesheet CSV Input: + +```bash +nextflow run phac-nml/legiovue \ + -profile \ + --input \ + --kraken2_db \ + [Optional Args] +``` + +Where: + +- `-profile `: The nextflow profile to use. + - Specification of a dependency management system (docker, singularity, conda) +- `--input `: Path to a CSV file with the header line `sample,fastq_1,fastq_2` + - `sample` is the name of the sample + - `fastq_1,fastq_2` is the path to both the fastq reads + - Note that paired end sequencing is required at this time! 
+ - [Example file](./tests/test_data/input.csv) +- `--kraken2_db `: Path to a kraken2 database + +> [!NOTE] +> The recommended 8GB `kraken2` standard database can be found on the [AWS Index server](s3://genome-idx/kraken/standard_08gb_20240904) or [the kraken2 database zone](https://benlangmead.github.io/aws-indexes/k2). Download this before running the pipeline! + +## Quick Outputs + +All of the outputs can be found in [the output docs](./docs/output.md). All outputs are by default put in the `results` folder with some of the major outputs being as follows: + +- `spades/`: Contains the SPAdes assemblies (contigs as .fasta files) for each sample. +- `el_gato/el_gato_st.tsv`: Summarized el_gato ST calls for all samples. +- `chewbbaca/allele_calls/cgMLST/`: cgMLST profiles that can be used for downstream visualization. +- `overall.qc.csv`: Final quality summary report for each sample throughout the different pipeline steps. Important quality flags can be found in this file. + +## Pipeline Components and Settings + +**`Kraken2`** and **`Bracken`** + +[Kraken2](https://github.com/DerrickWood/kraken2) is used to taxonomically profile the paired Illumina reads against the standard Kraken RefSeq database with a confidence level of 0.1 (`--confidence 0.1`). [Bracken](https://github.com/jenniferlu717/Bracken) is then used to estimate taxonomic abundances (including potential contaminants) from the Kraken profile. + +**`Trimmomatic`** + +[Trimmomatic](https://github.com/usadellab/Trimmomatic) is used to remove Illumina adapters (`ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:2:True`) and trim reads according to quality (`LEADING:3`, `TRAILING:3`, `SLIDINGWINDOW:4:20`). Reads shorter than 100bp are dropped (`MINLEN:100`). + +**`FastQC`** + +[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) provides quality information about the trimmed reads including estimates of duplication, %GC, and N content. Samples retaining fewer than 60,000 high-quality reads after trimming (default `--min_reads`) are removed, and samples below 150,000 reads (default `--min_reads_warn`) are flagged with a warning. + +**`SPAdes`** and **`QUAST`** + +High-quality reads (both paired and unpaired) are then assembled into Legionella genomes using the [SPAdes](https://github.com/ablab/spades) assembler and `--careful` option, which aims to minimize mismatches and short indels in the assembly. The quality of the resulting assemblies is evaluated with [QUAST](https://github.com/ablab/quast). At this step, genomes are compared to a _Legionella pneumophila_ [reference genome](data/C9_S.reference.fna) and an assembly quality score is calculated for each sample using a custom script. + +The `quast_analyzer.py` script assigns a score to each SPAdes assembly based on pre-cgMLST metrics (_e.g.,_ similarity to RefSeq complete _Lp_ genomes, N50, # contigs, %GC content) originally outlined in the supplementary appendix (Supplementary Table 2) of the following paper: + +> Gorzynski, J., Wee, B., Llano, M., Alves, J., Cameron, R., McMenamin, J., et al. (2022). Epidemiological analysis of Legionnaires’ disease in Scotland: a genomic study. The Lancet Microbe 3, e835–e845. doi: 10.1016/S2666-5247(22)00231-2 + +Quality thresholds and score effects have been updated in this pipeline to better capture quality issues that are likely to affect the interpretation of the resulting cgMLST profile. Assemblies are assigned a quality score out of 6, where a score of 6/6 represents an "excellent" high-quality _Legionella pneumophila_ assembly.
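
To illustrate the variable scoring described above, the N50 contribution is scaled linearly between the `--min_n50_score` and `--max_n50_score` thresholds of `quast_analyzer.py` (defaults of 80,000 and 220,000). A minimal sketch of that calculation (the function name and the explicit clamping to the 0-1 range are illustrative assumptions):

```python
def n50_component(n50: int, bottom: int = 80000, top: int = 220000) -> float:
    """Scale an N50 value linearly into a 0-1 score between two thresholds.

    Values at or below `bottom` add nothing to the overall /6 quality score;
    values at or above `top` add the full point (clamping assumed here).
    """
    if n50 <= bottom:
        return 0.0
    if n50 >= top:
        return 1.0
    return round((n50 - bottom) / (top - bottom), 2)


# For example, an assembly with an N50 of 150,000 contributes 0.5 of a point
print(n50_component(150000))  # 0.5
```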
+ +**`el_gato`** + +[el_gato](https://github.com/appliedbinf/el_gato) performs _in silico_ Sequence-based Typing (SBT) of _Legionella pneumophila_ sequences based on the identification and comparison of 7 loci (_flaA, pilE, asd, mip, mompS, proA, neuA/neuAh_) against an allele database. In this pipeline SBT is first called on Illumina paired-end reads using a mapping/alignment approach that is recommended by the `el_gato` developers. If samples are not initially assigned a sequence type (ST = `MA?` or `MD-`), `el_gato` is run again on the assembled genome using an _in silico_ PCR-based approach. The resulting allele and ST calls are reported in `el_gato_st.tsv`. + +_Note: if the ST results are inconclusive after both approaches have been tried, users are encouraged to review the `possible_mlsts.txt` intermediate output for that sample in the pipeline results folder under `el_gato/reads/`_ + +**`chewBBACA`** + +Assembled _Legionella pneumophila_ genomes are passed to [chewBBACA](https://github.com/B-UMMI/chewBBACA), which performs Core Genome MultiLocus Sequence Typing (cgMLST) according to the published [Ridom SeqSphere](https://www.cgmlst.org/ncs/schema/Lpneumophila1410/locus/) 1521-loci cgMLST schema for _L. pneumophila_. + +**cgMLST Visualization and Clustering** + +**`PHYLOViZ`** and **`reporTree`** + +_Note: Reportree requires an update before it can be properly incorporated into the nextflow pipeline. Users can run reportree on their pipeline output separately for now to produce the same visualizations._ + +Visualize cgMLST profiles alongside sample metadata using one of the following two methods: + +i) Either drop the cgMLST profile (e.g., `cgMLST100.tsv`) directly into [PhyloViz](https://online2.phyloviz.net/index) and upload metadata for visualization, or, +ii) Perform partitioning (clustering) with [ReporTree](https://github.com/insapathogenomics/ReporTree), which will generate outputs (MST and metadata) that can be visualized with the local version of [GrapeTree](https://achtman-lab.github.io/GrapeTree/MSTree_holder.html). + +Detailed instructions for clustering and visualization are provided [separately](docs/clustering.md). + +**Quality Summary** + +LegioVue outputs a summary of quality metrics and warnings for each step of the workflow in the `overall.qc.csv` file + +The final quality summary has two columns: `qc_status` and `qc_message` that can be used to quickly determine if a sample is good or may have an issue. The `qc_status` column will be any of the following statuses: + +- Pass: The sample passes all checks! +- Warn: The sample was flagged for a specific warning +- Fail: The sample has failed out of the pipeline and may not be included in the final cgMLST profile. + +The `qc_message` column contains the reason for the `qc_status` and includes: + +| Message | Associated Status | Flag Reason | +| ---------------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| low_lpn_abundance | WARN | Low (< 75%) _L. pneumophila_ abundance is not expected with isolate sequencing and may indicate contamination. | +| low_read_count | WARN | Low read count (< 150,000 reads default) has been shown to lead to poor, uninformative assemblies. | +| low_n50 | WARN | Low N50 scores (< 100,000) have been shown to negatively affect clustering outputs by inflating observed allele differences. 
| +| low_exact_allele_calls | WARN | Low chewBBACA exact allele calls (< 90%) indicate that there may be issues in the assembly, possibly affecting the cgMLST profile. | +| low_qc_score | WARN | Low QUAST-Analyzer QC score (< 4) indicates that there may be issues in the assembly, possibly affecting the cgMLST profile. | +| no_lpn_detected | FAIL | Very low (< 10% default) _L.pneumophila_ abundance flags that the sample may not be _L.pneumophila_ and sample is removed from the remainder of the pipeline | +| failing_read_count | FAIL | Post-trimming read count below failing threshold (< 60,000 reads default) has been shown to lead to poor, uninformative assemblies and sample is removed. | + +## Limitations + +This pipeline is intended to be run on _Legionella pneumophila_ paired illumina isolate sequencing data. In the future Nanopore long-read sequencing data will also be supported. + +## Citations + +This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). + +> The nf-core framework for community-curated bioinformatics pipelines. +> +> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. +> +> Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x. +> In addition, references of tools and data used in this pipeline are as follows: + +Detailed citations for utilized tools are found in [CITATIONS.md](./CITATIONS.md) + +## Contributing + +Contributions are welcome through creating PRs or Issues + +## Legal + +Copyright 2024 Government of Canada + +Licensed under the MIT License (the "License"); you may not use this work except in compliance with the License. You may obtain a copy of the License at: + +https://opensource.org/license/mit/ + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json new file mode 100644 index 0000000..721981b --- /dev/null +++ b/assets/adaptivecard.json @@ -0,0 +1,67 @@ +{ + "type": "message", + "attachments": [ + { + "contentType": "application/vnd.microsoft.card.adaptive", + "contentUrl": null, + "content": { + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "msteams": { + "width": "Full" + }, + "type": "AdaptiveCard", + "version": "1.2", + "body": [ + { + "type": "TextBlock", + "size": "Large", + "weight": "Bolder", + "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", + "text": "nf-core/testest v${version} - ${runName}", + "wrap": true + }, + { + "type": "TextBlock", + "spacing": "None", + "text": "Completed at ${dateComplete} (duration: ${duration})", + "isSubtle": true, + "wrap": true + }, + { + "type": "TextBlock", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors. 
The full error message was: ${errorReport}.<% } %>", + "wrap": true + }, + { + "type": "TextBlock", + "text": "The command used to launch the workflow was as follows:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${commandLine}", + "isSubtle": true, + "wrap": true + } + ], + "actions": [ + { + "type": "Action.ShowCard", + "title": "Pipeline Configuration", + "card": { + "type": "AdaptiveCard", + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "body": [ + { + "type": "FactSet", + "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %> + ] + } + ] + } + } + ] + } + } + ] +} diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv new file mode 100644 index 0000000..8c11054 --- /dev/null +++ b/assets/samplesheet.csv @@ -0,0 +1,3 @@ +sample,fastq_1,fastq_2 +sample1,https://github.com/phac-nml/legiovue/raw/dev/tests/test_data/in1_R1.fastq.gz,https://github.com/phac-nml/legiovue/raw/dev/tests/test_data/in1_R2.fastq.gz +sample2,https://github.com/phac-nml/legiovue/raw/dev/tests/test_data/in2_R1.fastq.gz,https://github.com/phac-nml/legiovue/raw/dev/tests/test_data/in2_R2.fastq.gz diff --git a/assets/schema_input.json b/assets/schema_input.json new file mode 100644 index 0000000..b659c43 --- /dev/null +++ b/assets/schema_input.json @@ -0,0 +1,33 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/phac-nml/legiovue/main/assets/schema_input.json", + "title": "phac-nml/legiovue pipeline - params.input schema", + "description": "Schema for the file provided with params.input", + "type": "array", + "items": { + "type": "object", + "properties": { + "sample": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample name must be provided and cannot contain spaces", + "meta": ["id"] + }, + "fastq_1": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.f(ast)?q(\\.gz)?$", + "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq', '.fq.gz', '.fastq', or '.fastq.gz'" + }, + "fastq_2": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.f(ast)?q(\\.gz)?$", + "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq', '.fq.gz', '.fastq', or '.fastq.gz'" + } + }, + "required": ["sample", "fastq_1", "fastq_2"] + } +} diff --git a/assets/slackreport.json b/assets/slackreport.json new file mode 100644 index 0000000..ea86951 --- /dev/null +++ b/assets/slackreport.json @@ -0,0 +1,34 @@ +{ + "attachments": [ + { + "fallback": "Plain-text summary of the attachment.", + "color": "<% if (success) { %>good<% } else { %>danger<%} %>", + "author_name": "nf-core/testest ${version} - ${runName}", + "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", + "fields": [ + { + "title": "Command used to launch the workflow", + "value": "```${commandLine}```", + "short": false + } + <% + if (!success) { %> + , + { + "title": "Full error message", + "value": "```${errorReport}```", + "short": false + }, + { + "title": "Pipeline configuration", + "value": "<% out << summary.collect{ k,v -> k == "hook_url" ? "_${k}_: (_hidden_)" : ( ( v.class.toString().contains('Path') || ( v.class.toString().contains('String') && v.contains('/') ) ) ? 
"_${k}_: `${v}`" : (v.class.toString().contains('DateTime') ? ("_${k}_: " + v.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM))) : "_${k}_: ${v}") ) }.join(",\n") %>", + "short": false + } + <% } + %> + ], + "footer": "Completed at <% out << dateComplete.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM)) %> (duration: ${duration})" + } + ] +} diff --git a/bin/combine_qc_data.py b/bin/combine_qc_data.py index 3791ede..fe80b5c 100755 --- a/bin/combine_qc_data.py +++ b/bin/combine_qc_data.py @@ -42,7 +42,7 @@ def parse_args() -> argparse.ArgumentParser: '--quast_tsv', type=Path, required=False, - help="Quast summary TSV output" + help="QUAST summary TSV output" ) parser.add_argument( '-st', @@ -63,14 +63,21 @@ def parse_args() -> argparse.ArgumentParser: '--final_score_csv', type=Path, required=False, - help="Quast final score CSV output" + help="QUAST final score CSV output" ) parser.add_argument( - '--min_reads', + '--min_reads_fail', + type=int, + required=False, + default=60000, + help="Threshold for minimum number of reads required to allow sample to be passed through the pipeline" + ) + parser.add_argument( + '--min_reads_warn', type=int, required=False, default=150000, - help="Minimum number of reads required to be passed through the pipeline" + help="Threshold for minimum number of reads that will be given a QC warning" ) parser.add_argument( '--min_abundance_percent', @@ -145,7 +152,7 @@ def main() -> None: """Entry point""" parser = parse_args() args = parser.parse_args() - + # Parse each given file to add to our outdict sample = str(args.sample) outdict = {'sample': sample} @@ -180,13 +187,13 @@ def main() -> None: # Don't overwrite the failed reason as if it fails at abundance that should # be reported - if outdict['num_paired_trimmed_reads'] < args.min_reads and not failed: + if outdict['num_paired_trimmed_reads'] < args.min_reads_fail and not failed: failed = True failed_reason = ['failing_read_count'] - elif outdict['num_paired_trimmed_reads'] < 300000: + elif outdict['num_paired_trimmed_reads'] < args.min_reads_warn: warn_qual_criteria.append('low_read_count') - # Quast + # QUAST outdict['n50'] = 0 outdict['num_contigs'] = 0 outdict['pct_gc'] = 0 @@ -278,7 +285,7 @@ def main() -> None: outdict['qc_message'] = ';'.join(warn_qual_criteria) df = pd.DataFrame([outdict]) - df.to_csv(f'{sample}.qc.tsv', sep='\t', index=False) + df.to_csv(f'{sample}.qc.csv', sep=',', index=False) if __name__ == "__main__": diff --git a/bin/filter_lpn_abundance.py b/bin/filter_lpn_abundance.py index 1a0f0d7..9418126 100755 --- a/bin/filter_lpn_abundance.py +++ b/bin/filter_lpn_abundance.py @@ -3,7 +3,7 @@ Simple helper script to check that Bracken Lpn abundance is above the required input threshold -In the future we may want to include this as part of the +In the future we may want to include this as part of the nextflow/groovy script over it being a full process but with development time it is here for now ''' diff --git a/bin/plot_genome_cov.R b/bin/plot_genome_cov.R index ed57e78..1aae933 100755 --- a/bin/plot_genome_cov.R +++ b/bin/plot_genome_cov.R @@ -4,7 +4,7 @@ ## Legionella ST allele to help investigations on ## missing or non-called STs ## ------------------------------------------------ ## -library(argparse) +library(optparse) library(data.table) library(ggplot2) library(patchwork) @@ -55,7 +55,7 @@ create_plots <- function(gene, df) { area(2,1), area(3,1) ) - combined_plot <- 
plot1/plot2/plot3 + + combined_plot <- plot1/plot2/plot3 + plot_layout( design = areas ) + @@ -70,12 +70,17 @@ create_plots <- function(gene, df) { ## Main Script ## ## ----------- ## # Args -parser <- ArgumentParser() -parser$add_argument("-i", "--input_tsv", - help="Path to pysamstats TSV file with mapQ and baseQ annotated") -parser$add_argument("-o", "--outfile", default="el_gato_allele_plots.pdf", - help="Output plot filename") -args <- parser$parse_args() +option_list <- list( + make_option( + c("-i", "--input_tsv"), + help="Path to pysamstats TSV file with mapQ and baseQ annotated" + ), + make_option( + c("-o", "--outfile"), default="el_gato_allele_plots.pdf", + help="Output plot filename") + ) +opt_parser <- OptionParser(option_list = option_list) +args <- parse_args(opt_parser) # Split based on chrom df <- read.table(args$input_tsv, sep = '\t', header = TRUE) diff --git a/bin/quast_analyzer.py b/bin/quast_analyzer.py index e2d97a7..f036d9f 100755 --- a/bin/quast_analyzer.py +++ b/bin/quast_analyzer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Analyze quast transposed results TSV file +Analyze QUAST transposed results TSV file according to provided specifications """ import argparse @@ -37,7 +37,7 @@ def init_parser() -> argparse.ArgumentParser: Returns: argparse.ArgumentParser: Parser """ - parser = argparse.ArgumentParser(prog='quast_analyzer', description='Analyze quast results and output the comparison.') + parser = argparse.ArgumentParser(prog='quast_analyzer', description='Analyze QUAST results and output the comparison.') # Input/Output Options parser.add_argument( 'input_file', @@ -65,15 +65,27 @@ def init_parser() -> argparse.ArgumentParser: '--max_contigs', type=int, default=100, - help='Threshold for the number of contigs > 500bp assembled by SPAdes' + help='Threshold for the number of contigs > 500bp assembled by SPAdes to get scoring points' ) parser.add_argument( '--min_align_percent', type=int, default=75, - help='Thresold for minimum quast genome fraction percentage' + help='Threshold for minimum QUAST genome fraction percentage to get scoring points' ) - + parser.add_argument( + '--min_n50_score', + type=int, + default=80000, + help='Threshold for minimum QUAST N50 value to obtain scoring points' + ) + parser.add_argument( + '--max_n50_score', + type=int, + default=220000, + help='Threshold for maximum QUAST N50 value to get max scoring points' + ) + # Version # parser.add_argument( '-v', @@ -101,7 +113,7 @@ def parse_sample_line(sample_line: str, headers: list) -> dict: """Split sample line string into key-val dict based on headers Args: - sample_line (str): Tab separated sample data line + sample_line (str): Tab separated sample data line headers (list): List of headers Returns: @@ -111,13 +123,11 @@ def parse_sample_line(sample_line: str, headers: list) -> dict: return dict(zip(headers, fields)) -def calculate_score(metric: int, bottom=100000, top=300000): - """Calculate variable score based on a bottom and top range +def calculate_n50_score(metric: int, bottom: int, top: int) -> float: + """Calculate N50 variable score based on a bottom and top range determined from testing Args: metric (int): Metric to score - bottom (int, optional): Bottom of the score range. Defaults to 100000. - top (int, optional): Top of the score range. Defaults to 300000.
Returns: float: 2-digit calculated score between 0-1 @@ -131,7 +141,10 @@ def calculate_score(metric: int, bottom=100000, top=300000): return round((metric - bottom) / (top - bottom), 2) -def analyze_sample(sample: dict, max_contigs: int, min_align_percent: int) -> dict: +def analyze_sample( + sample: dict, max_contigs: int, min_align_percent: int, + min_n50_score: int, max_n50_score: int +) -> dict: """Extract and values from the sample dictionary Args: @@ -166,7 +179,7 @@ def analyze_sample(sample: dict, max_contigs: int, min_align_percent: int) -> di num_contigs_score = 1 # Evaluate "N50" - n50_score = calculate_score(n50) + n50_score = calculate_n50_score(n50, min_n50_score, max_n50_score) score += n50_score # Evaluate "Duplication ratio" @@ -261,7 +274,10 @@ def main() -> None: sample_data = parse_sample_line(line.strip(), headers) # Analyze the sample and append the result - result = analyze_sample(sample_data, args.max_contigs, args.min_align_percent) + result = analyze_sample( + sample_data, args.max_contigs, args.min_align_percent, + args.min_n50_score, args.max_n50_score + ) if result: results_list.append(result) diff --git a/conf/iridanext.config b/conf/iridanext.config new file mode 100644 index 0000000..7f3f36e --- /dev/null +++ b/conf/iridanext.config @@ -0,0 +1,32 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config for IRIDA-Next Plugin +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Set how to find wanted files +---------------------------------------------------------------------------------------- +*/ +iridanext { + enabled = true + output { + path = "${params.outdir}/iridanext.output.json.gz" + overwrite = true + files { + idkey = "id" + global = [ + "**/el_gato/el_gato_st.tsv", + "**/chewbbaca/allele_calls/results_statistics.tsv", + "**/chewbbaca/allele_calls/cgMLST/cgMLST100.tsv", + "**/chewbbaca/allele_calls/cgMLST/cgMLST99.tsv" + ] + samples = ["**/spades/*.contigs.fa"] + } + metadata { + samples { + csv { + path = "**/overall.qc.csv" + idcol = "sample" + } + } + } + } +} diff --git a/conf/modules.config b/conf/modules.config index 21be542..444e37b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,11 +10,148 @@ ---------------------------------------------------------------------------------------- */ process { + + withName: KRAKEN2_CLASSIFY { + publishDir = [ + path: { "${params.outdir}/kraken_bracken" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: BRACKEN { + publishDir = [ + path: { "${params.outdir}/kraken_bracken" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: TRIMMOMATIC { + publishDir = [ + path: { "${params.outdir}/trimmomatic" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: FASTQC { publishDir = [ path: { "${params.outdir}/fastqc" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.html' ] } + + withName: SPADES { + publishDir = [ + path: { "${params.outdir}/spades" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: QUAST { + publishDir = [ + path: { "${params.outdir}/quast" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: SCORE_QUAST { + publishDir = [ + path: { "${params.outdir}/quast" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: EL_GATO_READS { + publishDir = [ + path: { "${params.outdir}/el_gato/reads" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: EL_GATO_ASSEMBLY { + publishDir = [ + path: { "${params.outdir}/el_gato/assembly" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: EL_GATO_REPORT { + publishDir = [ + path: { "${params.outdir}/el_gato" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: COMBINE_EL_GATO { + publishDir = [ + path: { "${params.outdir}/el_gato" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CSVTK_JOIN_ALLELE_STATS { + publishDir = [ + path: { "${params.outdir}/el_gato/allele_stats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: PLOT_EL_GATO_ALLELES { + publishDir = [ + path: { "${params.outdir}/el_gato/plots" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CHEWBBACA_PREP_EXTERNAL_SCHEMA { + publishDir = [ + path: { "${params.outdir}/chewbbaca" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CHEWBBACA_ALLELE_CALL { + publishDir = [ + path: { "${params.outdir}/chewbbaca" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CHEWBBACA_EXTRACT_CGMLST { + publishDir = [ + path: { "${params.outdir}/chewbbaca/allele_calls" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CSVTK_CONCAT_QC_DATA { + publishDir = [ + path: { "${params.outdir}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] + } } diff --git a/conf/test.config b/conf/test.config index 1a94bd9..fbb656c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,7 +1,40 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Small Test Config + Nextflow config file for running minimal tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - to be written - + Defines input files and everything required to run a fast and simple pipeline test + using the directory input and with skipping over some of the non-required steps. 
+ + Use as follows: + nextflow run phac-nml/legiovue -profile test, ---------------------------------------------------------------------------------------- */ +// Resource limits are for nextflow >= 24.04.0 so also have to use the max_* params +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '2.h' + ] +} + +params { + config_profile_name = "Test profile" + config_profile_description = "Minimal test dataset to check pipeline function" + + // Input + fastq_dir = "${projectDir}/tests/test_data/" + kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904" + + // Filtering + min_reads = 100 + + // Skip all steps we can as the test_input will do them + skip_el_gato = true + skip_plotting = true + + // Limit resources for github actions + max_cpus = 2 + max_memory = '8.GB' + max_time = '2.h' +} diff --git a/conf/test_full.config b/conf/test_full.config index cf30ff5..131ae4c 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -1,7 +1,36 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Full Test Config + Nextflow config file for running minimal tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - to be written - + Defines input files and everything required to run a simple pipeline test that + uses the input parameter and all steps. + + Use as follows: + nextflow run phac-nml/legiovue -profile test_full, ---------------------------------------------------------------------------------------- */ +// Resource limits are for nextflow >= 24.04.0 so also have to use the max_* params +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '2.h' + ] +} + +params { + config_profile_name = "Full test profile" + config_profile_description = "Full test dataset to check pipeline function" + + // Input + input = "${projectDir}/assets/samplesheet.csv" + kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904" + + // Filtering + min_reads = 100 + + // Limit resources for github actions + max_cpus = 2 + max_memory = '8.GB' + max_time = '2.h' +} diff --git a/docs/README.md b/docs/README.md index a6968dd..79640ee 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,11 +3,10 @@ The phac-nml/LegioVue documentation is split up into the following pages: - [Usage](./usage.md) - - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. + - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. - [Output](./output.md) - - An overview of the different results produced by the pipeline and how to interpret them. + - An overview of the different results produced by the pipeline and how to interpret them. - [Clustering](./clustering.md) - - Some recommended follow-up clustering steps until they are in incorporated into the pipeline + - Some recommended follow-up clustering steps until they are in incorporated into the pipeline - [Investigations](./investigations.md) - - An overview of the different investigations done to determine key pipeline values - - TODO write this out + - An overview of the different investigations done to determine key pipeline values diff --git a/docs/clustering.md b/docs/clustering.md index 40ce1db..3e0e342 100644 --- a/docs/clustering.md +++ b/docs/clustering.md @@ -1,38 +1,45 @@ # phac-nml/LegioVue : Clustering -This document provides the neccessary steps to visualize the cgMLST output from LegioVue. 
-At the moment these steps need to be run separately using the ouputs of the LegioVue. Pending updates, these steps will be incorporated into the nextflow workflow directly. + +This document provides the neccessary steps to visualize the cgMLST output from LegioVue. +At the moment these steps need to be run separately using the ouputs of the LegioVue. Pending updates, these steps will be incorporated into the nextflow workflow directly. Visualizations of cgMLST data can be generated with or without clustering. Both options are presented below, though [partitioning and visualization with ReporTree](#partitioning-and-visualization-with-reportree) is the recommended approach if you are able to install and run [ReporTree](https://github.com/insapathogenomics/ReporTree) on the command-line. ### Visualization-only with PHYLOViZ GUI + Use this option if you are unable to install ReporTree, or if you simply want to visualize relative allele differences between isolates without setting cluster/partition thresholds: + 1. Navigate to https://online2.phyloviz.net/index in a browser window. -2. Scroll down and click on "Login-free upload" under **Test PHYLOViZ Online**. This will take you to a page where you can upload and visualize your cgMLST profile without storing any data in the application. Note that navigating away from this page will erase your data. -3. From the **Possible Input Formats** dropdown menu, select "Profile Data". -4. Under **Input Files**, upload your `results/chewbbaca/allele_calls/cgMLST/cgMLST100.tsv` file from LegioVue as Profile Data. Upload a `.tsv` metadata file as Auxiliary Data. **Note:** the "sample name" or similar column header (usually the first column) needs to match the cgMLST output in order to be visualized correctly. Change it to "FILE" to match the profile data or vice versa. -5. Select "Core Analysis" as the **Analysis Method**. -6. Provide a name and optional description for your dataset and click on **Launch Tree**. In a minute or two you will be redirected to a visualization of your data. -7. On the left sidebar, navigate to **Assign Colors** > **By Auxiliary Data** and select the appropriate metadata column (_E.g.,_ ST). Node and branch labels can be added by selecting the "Add Labels" checkbox under **Graphic Properties** > **Nodes** or **Links**. +2. Scroll down and click on "Login-free upload" under **Test PHYLOViZ Online**. This will take you to a page where you can upload and visualize your cgMLST profile without storing any data in the application. Note that navigating away from this page will erase your data. +3. From the **Possible Input Formats** dropdown menu, select "Profile Data". +4. Under **Input Files**, upload your `results/chewbbaca/allele_calls/cgMLST/cgMLST100.tsv` file from LegioVue as Profile Data. Upload a `.tsv` metadata file as Auxiliary Data. **Note:** the "sample name" or similar column header (usually the first column) needs to match the cgMLST output in order to be visualized correctly. Change it to "FILE" to match the profile data or vice versa. +5. Select "Core Analysis" as the **Analysis Method**. +6. Provide a name and optional description for your dataset and click on **Launch Tree**. In a minute or two you will be redirected to a visualization of your data. +7. On the left sidebar, navigate to **Assign Colors** > **By Auxiliary Data** and select the appropriate metadata column (_E.g.,_ ST). Node and branch labels can be added by selecting the "Add Labels" checkbox under **Graphic Properties** > **Nodes** or **Links**. 
**Important:** In this Minumum Spanning Tree (MST), branch (or "link") lengths represent the number of alleles that differ between linked isolates. The default schema that the pipeline uses for cgMLST determination has a maximum of 1521 possible alleles. These branch lengths tend to increase when there are many inferred (INF) alleles and fewer exact (EXC) alleles (which, in turn, is affected by underlying data quality) used to generate the profile data. These numbers can be found in the `overall.qc.tsv` output of the main pipeline and should be taken into consideration when interpreting the visualization of the profile data. ### Partitioning and Visualization with ReporTree -Reportree can be used to partition the MST of isolates according to different thresholds, which may be useful for epidemiological investigation. + +Reportree can be used to partition the MST of isolates according to different thresholds, which may be useful for epidemiological investigation. 1. First, install [ReporTree](https://github.com/insapathogenomics/ReporTree) either with Conda or Docker according to the installation instructions in the Readme file on their GitHub page. 2. Prepare a metadata file with columns for `sample` and any other data you wish to include for downstream visualization. 3. Activate ReporTree and run `grapetree` analysis, using as input the cgMLST profile data and prepared metadata from Step 2. An example command is below to use with the test dataset: + ``` reportree.py -m /metadata.tsv \ -a /results/chewbbaca/allele_calls/cgMLST/cgMLST100.tsv -thr 0-5 --columns_summary_report ST,n_ST \ --method MSTreeV2 --loci-called 1.0 --matrix-4-grapetree --analysis grapetree ``` + You may wish to modify certain values depending on your analysis: + - `-thr` indicates the threshold(s) to use for cluster partitioning. Setting `-thr 0-5` will request that ReporTree assign samples to clusters at six different allele thresholds, ranging from 0 allele differences to 5. You may also select distinct threshold values, for example `-thr 5,10,15,20`, for more exploratory analysis. -- `--loci-called` should correspond to the cgMLST profile used as input, _i.e.,_ `--loci-called 0.95` should be used if the input profile is `cgMLST95.tsv`. -- `--columns_summary_report` indicates columns from the metadata file that should be described for each cluster. For example, `ST,n_ST` requests that for each cluster, the ST and number of STs included in that cluster should be reported in the output. This information can help you investigate different clustering thresholds. +- `--loci-called` should correspond to the cgMLST profile used as input, _i.e.,_ `--loci-called 0.95` should be used if the input profile is `cgMLST95.tsv`. +- `--columns_summary_report` indicates columns from the metadata file that should be described for each cluster. For example, `ST,n_ST` requests that for each cluster, the ST and number of STs included in that cluster should be reported in the output. This information can help you investigate different clustering thresholds. - `--out` can be added to the above command to specify an existing directory and prefix for the output files. Ex. `--out reportree/TD1` will append "TD1" as a prefix to all output files. -4. Once you have your output files from ReporTree, navigate to the local implementation of [GrapeTree](https://achtman-lab.github.io/GrapeTree/MSTree_holder.html) to visualize the MST data. -5. Under **Inputs/Outputs**, select "Load Files" and upload both `*.nwk` and `*_metadata_w_partitions.tsv`. +4. 
Once you have your output files from ReporTree, navigate to the local implementation of [GrapeTree](https://achtman-lab.github.io/GrapeTree/MSTree_holder.html) to visualize the MST data. +5. Under **Inputs/Outputs**, select "Load Files" and upload both `*.nwk` and `*_metadata_w_partitions.tsv`. 6. Under **Tree Layout**, you can customize the MST visualization including exploring different partitions by selecting `MST-###` in **Node Style** > **Colour By:** diff --git a/docs/output.md b/docs/output.md index bf5ac9e..b8d28ff 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,67 +1,74 @@ # phac-nml/LegioVue: Outputs + This document describes the output produced by the pipeline. The directories listed below will be created in the results directory (by default) after the pipeline has finished. All paths are relative to the top-level results directory. ## Pipeline overview + The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [Preprocessing](#preprocessing) - - [Kraken2](#kraken2) - Taxonomic read classification - - [Bracken](#bracken) - Species abundance estimation from kraken2 output - - [Custom Abundance Check](#custom-abundance-check) - Filter samples with `< X%` *Legionella pneumophila* reads - - [Trimmomatic](#trimmomatic) - Trim and crop Illumina reads - - [FastQC](#fastqc) - Trimmed read QC plots + - [Kraken2](#kraken2) - Taxonomic read classification + - [Bracken](#bracken) - Species abundance estimation from kraken2 output + - [Custom Abundance Check](#custom-abundance-check) - Filter samples with `< X%` _Legionella pneumophila_ reads + - [Trimmomatic](#trimmomatic) - Trim and crop Illumina reads + - [FastQC](#fastqc) - Trimmed read QC plots - [Sequence Typing](#sequence-typing) - - [el_gato Reads](#el_gato-reads) - Sequence type (ST) input sample reads - - [el_gato Assembly](#el_gato-assembly) - Sequence type input sample assemblies when reads fail to generate an ST - - [el_gato Report](#el_gato-report) - Create PDF summary el_gato report - - [Pysamstats](#pysamstats) - Calculate positional depth, mapq, and baseq for each ST allele - - [Allele Reports](#allele-reports) - Create per-sample ST allele report pdf + - [el_gato Reads](#el_gato-reads) - Sequence type (ST) input sample reads + - [el_gato Assembly](#el_gato-assembly) - Sequence type input sample assemblies when reads fail to generate an ST + - [el_gato Report](#el_gato-report) - Create PDF summary el_gato report + - [Pysamstats](#pysamstats) - Calculate positional depth, mapq, and baseq for each ST allele + - [Allele Reports](#allele-reports) - Create per-sample ST allele report pdf - [Assembly](#assembly) - - [SPAdes](#spades) - *De novo* bacterial genome assembly - - [QUAST](#quast) - Assembly statistic report + - [SPAdes](#spades) - _De novo_ bacterial genome assembly + - [QUAST](#quast) - Assembly statistic report - [cgMLST and Clustering](#cgmlst-and-clustering) - - [chewBBACA](#chewbbaca) - cgMLST results + - [chewBBACA](#chewbbaca) - cgMLST results - [Final Quality Control](#final-quality-control) - - [QUAST Scoring Script](#quast-scoring-script) - Simple assembly score of quast output based on established criteria - - [Final QC Checks](#final-qc-checks) - Summary of pipeline QC metrics + - [QUAST Scoring Script](#quast-scoring-script) - Simple assembly score of QUAST output based on established criteria + - [Final QC Checks](#final-qc-checks) - Summary of pipeline QC metrics Additionally [Pipeline information](#pipeline-information) which includes report 
metrics generated during the workflow execution can also be found ### Preprocessing + Initial processing steps and statistic gathering #### Kraken2 +
Output files - `kraken_bracken/` - `*-kreport.tsv`: Kraken2 taxonomic report - `*-classified.tsv`: Kraken2 standard output -
+ [Kraken2](https://github.com/DerrickWood/kraken2/wiki/Manual#classification) classifies input sequences based on a taxonomic k-mer database where the input sequences are mapped to the lowest common ancestor of all genomes known to contain the given k-mer. -In the pipeline, kraken2 along with bracken are used to determine if there is any/enough *L.pneumophila* data to run through the pipeline +In the pipeline, kraken2 along with bracken are used to determine if there is any/enough _L.pneumophila_ data to run through the pipeline #### Bracken +
Output files - `kraken_bracken/` - `*-abundances.tsv`: Bracken abundance report - `*-braken-breakdown.tsv`: Bracken taxonomic report that matches kraken2 report -
+ [Bracken](https://github.com/jenniferlu717/Bracken/blob/v3.0/README.md) reestimates species abundance from kraken2 output. -In the pipeline, kraken2 along with bracken are used to determine if there is any/enough *L.pneumophila* data to run through the pipeline +In the pipeline, kraken2 along with bracken are used to determine if there is any/enough _L.pneumophila_ data to run through the pipeline #### Custom Abundance Check + Simply python program that takes in the bracken abundance report and determines if a sample is above the given threshold required to keep in the pipeline (default 10.0%) #### Trimmomatic +
Output files @@ -70,29 +77,32 @@ Simply python program that takes in the bracken abundance report and determines - `*_paired_R2.fastq.gz`: Paired trimmed read 2 to be used in the following pipeline steps - `*_unpaired_R1.fastq.gz`: Unpaired trimmed reads 1 to assist in SPAdes assembly - `*_unpaired_R1.fastq.gz`: Unpaired trimmed reads 2 to assist in SPAdes assembly - - `*.summary.txt`: Trimmomatic output summary -
+ - `*.summary.txt`: Trimmomatic output summary + [Trimmomatic](http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf) removes Illumina adapters and trim reads according to quality #### FastQC +
Output files - `fastqc/` - `*_fastqc.html`: FastQC per read quality summary report -
+ [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics and plots for the input reads. ![FastQC Report Image](images/fastqc.png) ----------- +--- ### Sequence Typing -*In silico* sequence typing and allele reporting using [el_gato](https://github.com/appliedbinf/el_gato) + +_In silico_ sequence typing and allele reporting using [el_gato](https://github.com/appliedbinf/el_gato) #### el_gato Reads +
Output files @@ -102,11 +112,12 @@ Simply python program that takes in the bracken abundance report and determines - `*_reads_vs_all_ref_filt_sorted.bam `: Pileup of reads for each ST allele used for building [allele report](#allele-reports) - `*_run.log `: Program logging info - `*_ST.tsv`: Called Sequence Type -
- + + Sequence-based Typing (SBT) of _Legionella pneumophila_ sequences using reads based on the identification and comparison of 7 loci (_flaA, pilE, asd, mip, mompS, proA, neuA/neuAh_) against an allele database. #### el_gato Assembly +
Output files @@ -114,77 +125,85 @@ Sequence-based Typing (SBT) of _Legionella pneumophila_ sequences using reads ba - `*_assembly.json`: Machine-readable summary for building [el_gato pdf report](#el_gato-report) - `*_run.log `: Program logging info - `*_ST.tsv`: Called Sequence Type -
+ Sequence-based Typing (SBT) of _Legionella pneumophila_ sequences using output assemblies based on the identification and comparison of 7 loci (_flaA, pilE, asd, mip, mompS, proA, neuA/neuAh_) against an allele database. The assemblies are only run when there is an inconclusive ST call as this was found to sometimes recover the ST. _Note: if the ST results are inconclusive after both approaches have been tried, users are encouraged to review the `possible_mlsts.txt` intermediate output for that sample in the pipeline results folder under `el_gato/reads/`_ #### el_gato Report +
Output files - `el_gato/` - `el_gato_report.pdf`: Final el_gato summary report including reads and assembly approaches -
+ Tabular summaries of locus information for all samples run through [el_gato](https://github.com/appliedbinf/el_gato) ![el_gato report](images/el_gato_report.png) #### Pysamstats +
Output files - `el_gato/allele_stats/` - `*.allele_stats.tsv`: Per-sample summary of depth, map quality, and base quality -
+ [Pysamstats](https://github.com/alimanfoo/pysamstats) combined output containing summary of depth, map quality, and base quality for each allele #### Allele Reports + Output files - `el_gato/plots/` - `*_allele_plots.pdf`: Per-sample plots of allele depth, map quality, and base quality - + Custom report plotting of the seven ST alleles looking at depth, map quality, and base quality for each sample. ![Allele Report](images/allele_report.png) ----------- +--- ### Assembly -*De novo* assembly and quality assessment + +_De novo_ assembly and quality assessment #### SPAdes + Output files - `spades/` - `*.contigs.fa`: SPAdes assembly contigs. - `*.scaffolds.fa`: SPAdes scaffold assembly - `*.spades.log`: SPAdes logging information - + -[SPAdes](https://github.com/ablab/spades) is an *de novo* de Bruijn graph-based assembly toolkit containing various assembly pipelines. In this pipeline we are using the `--careful` assembly flag to do the assembly and using the `contigs` to do subsequent analysis steps +[SPAdes](https://github.com/ablab/spades) is an _de novo_ de Bruijn graph-based assembly toolkit containing various assembly pipelines. In this pipeline we are using the `--careful` assembly flag to do the assembly and using the `contigs` to do subsequent analysis steps #### QUAST + Output files - `quast/` - - `report.html`: - - `transposed_report.tsv`: - + - `report.html`: + - `transposed_report.tsv`: + [QUAST](https://github.com/ablab/quast) is used to generate a single report with which to evaluate the quality of the assemblies sequence across all of the samples provided to the pipeline. Input genomes are compared to a _Legionella pneumophila_ [reference genome](../data/C9_S.reference.fna) and the transposed report is parsed downstream to report a final quality score. ----------- +--- ### cgMLST and Clustering + Core Genome MultiLocus Sequence Typing (cgMLST) using chewBACCA and the [Ridom SeqSphere](https://www.cgmlst.org/ncs/schema/Lpneumophila1410/locus/) 1521-loci cgMLST schema and how it can be used for follow-up clustering. #### ChewBBACA + Output files - `chewbbaca/allele_calls/` @@ -192,48 +211,52 @@ Core Genome MultiLocus Sequence Typing (cgMLST) using chewBACCA and the [Ridom S - `results_statistics.tsv`: Per-sample summary of classification type counts - `cgMLST/cgMLST.html`: Interactive line plot that displays number of loci in the cgMLST per threshold value (95/99,100) - `cgMLST/cgMLST###.tsv`: Allele calling results that masks all non-integer classifications that can be used for downstream visualization - + [ChewBBACA](https://chewbbaca.readthedocs.io/en/latest/index.html) cgMLST according to the published [Ridom SeqSphere](https://www.cgmlst.org/ncs/schema/Lpneumophila1410/locus/) 1521-loci cgMLST schema for _L. pneumophila_. The cgMLST allele calling results can be used downstream for clustering and visualization along with the STs. 
----------- +--- ### Final Quality Control + Finally summary scoring and metrics #### QUAST Scoring Script + Output files - `scored_quast_report.csv`: Scored quast report based on determined thresholds -Scored quast report based on adapted thresholds from [Gorzynski et al.](10.1016/S2666-5247(22)00231-2) to determine if the sample has any metrics that significantly deviate from the expected results +Scored QUAST report based on adapted thresholds from [Gorzynski et al.](<10.1016/S2666-5247(22)00231-2>) to determine if the sample has any metrics that significantly deviate from the expected results #### Final QC Checks + Output files -- `overall.qc.tsv`: Final collated overall summary report +- `overall.qc.csv`: Final collated overall summary report The final collated summary report that is created using the outputs from the other pipeline steps and checks some final quality criteria. The `qc_status` column will be any of the following statuses: + - Pass: The sample passes all checks! - Warn: The sample was flagged for a specific warning - Fail: The sample has failed out of the pipeline The `qc_message` column contains the reason for the `qc_status` and includes: -| Message | Associated Status | Flag Reason | -| - | - | - | -| low_lpn_abundance | WARN | Low (< 75% abundance) *L.pneumophila* abundance is not expected with isolate sequencing and may signify a problem sample | -| low_read_count | WARN | Low read count (< 300,000 reads default) has been shown to lead to poor, uninformative assemblies and sample is kicked out | -| low_n50 | WARN | Low N50 (< 100,000) scores have been shown to very negatively affect clustering outputs | -| low_exact_allele_calls | WARN | Low chewBBACA exact allele calls (< 90% called) show that there may be issues in the assembly | -| low_qc_score | WARN | Low QUAST-Analyzer QC score (< 4) shows that there may be issues in the assembly | -| no_lpn_detected | FAIL | Very little (< 10% default) *L.pneumophila* abundance flags that the sample may not be *L.pneumophila* and sample is kicked from pipeline | -| failing_read_count | FAIL | Read count below failing threshold (< 150,000 reads default) has been shown to lead to poor, uninformative assemblies and sample is kicked out | - ----------- +| Message | Associated Status | Flag Reason | +| ---------------------- | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| low_lpn_abundance | WARN | Low (< 75% abundance) _L.pneumophila_ abundance is not expected with isolate sequencing and may signify a problem sample | +| low_read_count | WARN | Low read count (< 150,000 reads default) has been shown to lead to poor, uninformative assemblies and sample is kicked out | +| low_n50 | WARN | Low N50 (< 80,000) scores have been shown to very negatively affect clustering outputs | +| low_exact_allele_calls | WARN | Low chewBBACA exact allele calls (< 90% called) show that there may be issues in the assembly | +| low_qc_score | WARN | Low QUAST-Analyzer QC score (< 4) shows that there may be issues in the assembly | +| no_lpn_detected | FAIL | Very little (< 10% default) _L.pneumophila_ abundance flags that the sample may not be _L.pneumophila_ and sample is kicked from pipeline | +| failing_read_count | FAIL | Read count below failing threshold (< 60,000 reads default) has been shown to lead to poor, uninformative assemblies and sample is kicked out | + +--- diff --git a/docs/roadmap.md b/docs/roadmap.md deleted 
file mode 100644 index 1e56a55..0000000 --- a/docs/roadmap.md +++ /dev/null @@ -1,29 +0,0 @@ -# Development Roadmap -What is going to be added. The order is not necessarily the priority in which they will be - -1. Test Dataset available and run with profile `test` and `test_full` - - This will allow the pipeline to be checked by new users to see that it has been correctly installed - -2. Parameter validation using nf-core plugin - - Check that inputs are as expected - - Better help statement - - Better version output - -3. Investigations Document - - Downsampling testing - - Tool testing - -4. CI Tests - - nf-test - - linting - -5. IRIDA-Next requirements - - Add in needed IRIDA next requirements and plugin - -6. Validation dataset and report for releases - - To make sure everything is working correctly on releases, have a validation report to go along with them - -7. Requested updates (or updates we were planning) - - Filtering out non *Legionella* reads after kraken/bracken - - Other tool testing - - For resource/speed/output optimization diff --git a/docs/usage.md b/docs/usage.md index a6f745d..1f43299 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,39 +1,44 @@ # phac-nml/LegioVue: Usage ## Introduction -This pipeline is intended to be run on *Legionella pneumophila* paired illumina isolate sequencing data. It generates *de novo* assemblies using [`SPAdes`](https://github.com/ablab/spades), ST types using [`el_gato`](https://github.com/appliedbinf/el_gato), cgMLST calls with [`chewbbaca`](https://chewbbaca.readthedocs.io/en/latest/index.html), and a summary QC report. The outputs of the pipeline can be used for other downstream applications. All parameters have been determined based on outbreak dataset testing. + +This pipeline is intended to be run on _Legionella pneumophila_ paired illumina isolate sequencing data. It generates _de novo_ assemblies using [`SPAdes`](https://github.com/ablab/spades), sequence types (ST) using [`el_gato`](https://github.com/appliedbinf/el_gato), cgMLST calls with [`chewBBACA`](https://chewbbaca.readthedocs.io/en/latest/index.html), and a summary QC report. The outputs of the pipeline can be used for other downstream applications. All parameters have been determined based on outbreak dataset testing. ## Index + - [Profiles](#profiles) - [Running the Pipeline](#running-the-pipeline) - - [`--fastq_dir`](#fastq_dir) + - [`--fastq_dir`](#fastq_dir) + - [`--input`](#input) - [All Parameters](#all-parameters) - - [Required](#required) - - [Optional](#optional) + - [Required](#required) + - [Optional](#optional) - [Core Nextflow Arguments](#core-nextflow-arguments) - - [`-resume`](#resume) - - [`-c`](#c) - - [Resource Labels](#resource-labels) + - [`-resume`](#resume) + - [`-c`](#c) + - [Resource Labels](#resource-labels) - [Other Run Notes](#other-run-notes) - - [Updating the pipeline](#updating-the-pipeline) - - [Reproducibility](#reproducibility) + - [Updating the pipeline](#updating-the-pipeline) + - [Reproducibility](#reproducibility) ## Profiles -Profiles are used to specify dependency installation, resources, and how to handle pipeline jobs. You can specify more than one profile but *avoid* passing in more than one dependency managment profile (Ex. Do not use both `singularity` and `mamba`). They can be passed with `-profile ` + +Profiles are used to specify dependency installation, resources, and how to handle pipeline jobs. You can specify more than one profile but _avoid_ passing in more than one dependency managment profile (Ex. 
Do not use both `singularity` and `mamba`). They can be passed with `-profile ` Available: + - `conda`: Utilize conda to install dependencies and environment management - `mamba`: Utilize mamba to install dependencies and environment management - `singularity`: Utilize singularity for dependencies and environment management - `docker`: Utilize docker to for dependencies and environment management -> [!NOTE] -> `el_gato` and the plotting currently are using custom docker containers. The `el_gato` container will be returned to the proper one upon a new release of the tool - -For testing the pipeline functions correctly, you can use the `test` or `test_full` profile (TO-DO Create these) +For testing the pipeline functions correctly, you can use the `test` or `test_full` profile ## Running the Pipeline -To run the pipeline the following basic command is all that is required: + +To get started, one of the following commands may be used to run the pipeline: + +Directory Input: ```bash nextflow run phac-nml/legiovue \ @@ -42,10 +47,21 @@ nextflow run phac-nml/legiovue \ [Optional Args] ``` +Samplesheet CSV Input: + +```bash +nextflow run phac-nml/legiovue \ + -profile \ + --input \ + [Optional Args] +``` + ### `--fastq_dir` -The only required argument is needed to get data into the pipeline. Fastqs must be formatted as `_{R1,R2}\*.fastq\*` so that they can be paired based on the name. Note that at the moment everything before the first `_R1/_R2` is kept as the sample name. + +Specify a directory to where paired FASTQ-formatted files are found. FASTQs must be formatted as `_{R1,R2}\*.fastq\*` so that they can be paired up correctly based on the file name. Note that at the moment everything before the first `_R1/_R2` is kept as the sample name. Example directory with 3 samples: + ``` ├── TDS-01_R1.fastq.gz @@ -56,30 +72,57 @@ Example directory with 3 samples: └── another-sample_S1_L001_R2_001.fastq.gz ``` +### `--input` + +Specify a CSV samplesheet containing information about each sample including the sample name and the FASTQ-formatted reads associated with the sample. + +The outputs are named based on the given sample name and the FASTQ data is input based on the path specified under `fastq_1` and `fastq_2`. The input FASTQ files can end with `.fq`, `.fq.gz`, `.fastq`, or `.fastq.gz` to be valid inputs. + +Example: +| sample | fastq_1 | fastq_2 | +| - | - | - | +| sample1 | fastqs/sample1_R1.fastq.gz | fastqs/sample1_R1.fastq.gz | +| sample2 | fastqs/sample2_R1.fastq.gz | fastqs/sample2_R2.fastq.gz | +| other-sample | other_samples/other-sample_R1.fq.gz | other_samples/other-sample_R2.fq.gz | +| more_sample_data | fastqs/more_sample_data_R1.fq | fastqs/more_sample_data_R2.fq | + ## All Parameters + Use `--help` to see all options formatted on the command line Use `--version` to see version information All of the required and optional parameters are defined as follows: ### Required + +It is required to pick one of the following to get fastq data into the pipeline | Parameter | Description | Type | Default | Notes | | - | - | - | - | - | | --fastq_dir | Path to directory containing paired fastq files | Path | null | See [--fastq_dir](#fastq_dir) | +| --input | Path to CSV file containing information on the paired fastq files | Path | null | See [--input](#input) | ### Optional -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --outdir | Directory name to output results to | Str | 'results' | | -| --min_abundance_percent | Minimum *L. 
pneumophila* abundance required after bracken to continue sample on | Float | 10.0 | Very permissive for now | -| --min_reads | Minimum reads required after trimmomatic to continue sample on | Int | 150,000 | Under 150,000 reads samples don't usually provide enough info for proper clustering / STs | -| --kraken2_db | Path to standard `kraken2` database for detecting *L. pneumophila* reads | Path | s3://genome-idx/kraken/standard_08gb_20240904 | Default is AWS hosted database by developers. It is better to use your own if you have one | -| --quast_ref | Path to reference sequence for some of the `quast` metrics | Path | data/C9_S.reference.fna | C9 was picked as a default reference but any good sequence will work | -| --prepped_schema | Path to a prepped `chewbbaca` schema to save running the prep command | Path | data/SeqSphere_1521_schema | Provided with pipeline | -| --schema_targets | Path to schema targets to prep for `chewbbaca` | Path | null | | -| --max_memory | Maximum memory allowed to be given to a job | Str | 128.GB | | -| --max_cpus | Maximum cpus allowed to be given to a job | Int | 16 | | -| --max_time | Maximum time allowed to be given to a job | Str | 240.h' | | + +| Parameter | Description | Type | Default | Notes | +| ----------------------- | ------------------------------------------------------------------------------------- | ------ | -------------------------- | ---------------------------------------------------------------------------------------------------------------------- | +| --outdir | Directory name to output results to | Str | 'results' | | +| --min_abundance_percent | Minimum _L. pneumophila_ abundance required after bracken to continue sample on | Float | 10.0 | Very permissive for now | +| --min_reads | Minimum reads required after trimmomatic to continue sample on | Int | 60,000 | Under 150,000 reads samples don't usually provide enough info for proper clustering / STs | +| --kraken2_db | Path to standard `kraken2` database for detecting _L. pneumophila_ reads | Path | | You will need to get your own database. 
The 8GB database from s3://genome-idx/kraken/standard_08gb_20240904 works well | +| --quast_ref | Path to reference sequence for some of the `QUAST` metrics | Path | data/C9_S.reference.fna | C9 was picked as a default reference but any good sequence will work | +| --max_contigs | Threshold for the number of contigs > 500bp assembled by SPAdes to get scoring points | 100 | Int | | +| --min_align_percent | Thresold for minimum QUAST genome fraction percentage to get scoring points | 75 | Float | | +| --min_reads_warn | Threshold for minimum number of reads that will be given a QC warning | 150000 | Int | | +| --min_n50_score | Thresold for minimum QUAST N50 value to obtain scoring points | 80000 | Int | | +| --max_n50_score | Thresold for maximum QUAST N50 score to get max scoring points | 220000 | Int | | +| skip_el_gato | Flag to skip running el_gato sequence typing | Bool | False | | +| skip_plotting | Flag to skip running the el_gato allele plotting | Bool | False | | +| --prepped_schema | Path to a prepped `chewBBACA` schema to save running the prep command | Path | data/SeqSphere_1521_schema | Provided with pipeline | +| --schema_targets | Path to schema targets to prep for `chewBBACA` | Path | null | | +| --publish_dir_mode | Specifies how intermediate files should be saved to the output directory | Str | copy | | +| --max_memory | Maximum memory allowed to be given to a job | Str | 128.GB | | +| --max_cpus | Maximum cpus allowed to be given to a job | Int | 16 | | +| --max_time | Maximum time allowed to be given to a job | Str | 240.h' | | ## Core Nextflow Arguments @@ -87,21 +130,26 @@ All of the required and optional parameters are defined as follows: > These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). ### `-resume` + Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. For input to be considered the same, not only the names must be identical but the files' contents as well. For more info about this parameter, see [this blog post](https://www.nextflow.io/blog/2019/demystifying-nextflow-resume.html). You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. ### `-c` + Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. #### Resource Labels + The following resource labels can be adjusted in a custom config file: + - `process_single` - Default: 1cpus, 4GB memory - `process_low` - Default: 2cpus, 8GB memory - `process_medium` - Default: 4cpus, 24GB memory - `process_high` - Default: 8cpus, 48GB memory ## Other Run Notes + If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. @@ -118,11 +166,12 @@ nextflow run phac-nml/legiovue -profile docker -params-file params.yaml with `params.yaml` containing: ```yaml -fastq_dir: './fastqs' -outdir: './results/' +fastq_dir: "./fastqs" +outdir: "./results/" ``` ### Updating the pipeline + When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. 
When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: ```bash @@ -130,4 +179,5 @@ nextflow pull phac-nml/legiovue ``` ### Reproducibility + It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. diff --git a/envs/plotting-env.yml b/envs/plotting-env.yml index fe5fade..9b9521a 100644 --- a/envs/plotting-env.yml +++ b/envs/plotting-env.yml @@ -4,7 +4,7 @@ channels: - bioconda dependencies: - r-base - - r-argparse + - r-optparse - r-tidyverse - r-patchwork - samtools diff --git a/main.nf b/main.nf index 109cf00..4a62727 100644 --- a/main.nf +++ b/main.nf @@ -14,24 +14,26 @@ nextflow.enable.dsl = 2 Input Checks and Help/Version ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// Help and Version -include { printHelp } from './modules/local/help.nf' -if ( params.help ){ - printHelp() - exit 0 -} +// Version print if ( params.version ){ - log.info("LegioVue v${workflow.manifest.version}") + log.info "${workflow.manifest.name} v${workflow.manifest.version}" exit 0 } -// Quick data checks -if ( params.profile ){ - log.error("Profile should have a single dash: -profile") +// NF-Schema parts +include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema' +// Validate input parameters +validateParameters() + +// Print summary of supplied parameters +log.info paramsSummaryLog(workflow) + +// Check that we have one proper input +if ( ! params.fastq_dir && ! params.input ){ + log.error "Please provide input data with either: '--input input.csv' or '--fastq_dir '" exit 1 -} -if ( ! 
params.fastq_dir ){ - log.error("Missing required argument '--fastq_dir '") +} else if ( params.fastq_dir && params.input ) { + log.error "Please provide input data with either: '--input input.csv' or '--fastq_dir ' but not both" exit 1 } diff --git a/modules.json b/modules.json index ae2e1f7..b153a73 100644 --- a/modules.json +++ b/modules.json @@ -1,17 +1,22 @@ { - "name": "phac-nml/LegioVue", - "homePage": "", - "repos": { - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "fastqc": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] - } + "name": "phac-nml/LegioVue", + "homePage": "", + "repos": { + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, + "fastqc": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + } + } + } } - } } - } } diff --git a/modules/local/bracken.nf b/modules/local/bracken.nf index 45ed563..e87431e 100644 --- a/modules/local/bracken.nf +++ b/modules/local/bracken.nf @@ -2,9 +2,6 @@ process BRACKEN { tag "$meta.id" label 'process_low' - publishDir "${params.outdir}/kraken_bracken", pattern: "*-abundances.tsv", mode: 'copy' - publishDir "${params.outdir}/kraken_bracken", pattern: "*-braken-breakdown.tsv", mode: 'copy' - conda "bioconda::bracken=2.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bracken:2.9--py38h2494328_0': @@ -19,6 +16,9 @@ process BRACKEN { tuple val(meta), path('*-braken-breakdown.tsv'), emit: breakdown path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ bracken \\ @@ -32,4 +32,15 @@ process BRACKEN { bracken: \$(echo \$(bracken -v) | cut -f2 -d'v') END_VERSIONS """ + + stub: + """ + touch ${meta.id}-braken-breakdown.tsv + touch ${meta.id}-abundances.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bracken: \$(echo \$(bracken -v) | cut -f2 -d'v') + END_VERSIONS + """ } diff --git a/modules/local/chewbbaca.nf b/modules/local/chewbbaca.nf index 717cef2..177a54a 100644 --- a/modules/local/chewbbaca.nf +++ b/modules/local/chewbbaca.nf @@ -1,8 +1,6 @@ process CHEWBBACA_PREP_EXTERNAL_SCHEMA { label 'process_low' - publishDir "${params.outdir}/chewbbaca", pattern: "prepped_schema", mode: 'copy' - conda "bioconda::chewbbaca=3.3.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/chewbbaca:3.3.5--pyhdfd78af_0': @@ -15,6 +13,9 @@ process CHEWBBACA_PREP_EXTERNAL_SCHEMA { path "prepped_schema", emit: schema path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ chewBBACA.py \\ @@ -28,13 +29,21 @@ process CHEWBBACA_PREP_EXTERNAL_SCHEMA { chewbbaca: \$(echo \$(chewBBACA.py --version 2>&1 | sed 's/^.*chewBBACA version: //g; s/Using.*\$//' )) END_VERSIONS """ + + stub: + """ + mkdir prepped_schema + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + chewbbaca: \$(echo \$(chewBBACA.py --version 2>&1 | sed 's/^.*chewBBACA version: //g; s/Using.*\$//' )) + END_VERSIONS + """ } process CHEWBBACA_ALLELE_CALL { label 'process_medium' - publishDir "${params.outdir}/chewbbaca", pattern: "allele_calls", mode: 'copy' - conda "bioconda::chewbbaca=3.3.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/chewbbaca:3.3.5--pyhdfd78af_0': @@ -45,10 +54,19 @@ process CHEWBBACA_ALLELE_CALL { path schema output: - path "allele_calls", emit: allele_calls + path "allele_calls/results_alleles.tsv", emit: results_alleles path "allele_calls/results_statistics.tsv", emit: statistics + path "allele_calls/cds_coordinates.tsv", emit: cds_coords + path "allele_calls/loci_summary_stats.tsv", emit: loci_summary + path "allele_calls/paralogous_counts.tsv", emit: paralogous_counts + path "allele_calls/paralogous_loci.tsv", emit: paralogous_loci + path "allele_calls/results_contigsInfo.tsv", emit: contig_info + path "allele_calls/*.txt", emit: allele_call_txt path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ # Move all assemblies to a directory @@ -69,30 +87,49 @@ process CHEWBBACA_ALLELE_CALL { chewbbaca: \$(echo \$(chewBBACA.py --version 2>&1 | sed 's/^.*chewBBACA version: //g; s/Using.*\$//' )) END_VERSIONS """ + + stub: + """ + mkdir allele_calls + touch allele_calls/results_statistics.tsv + touch allele_calls/results_alleles.tsv + touch allele_calls/cds_coordinates.tsv + touch allele_calls/loci_summary_stats.tsv + touch allele_calls/paralogous_counts.tsv + touch allele_calls/paralogous_loci.tsv + touch allele_calls/results_contigsInfo.tsv + touch allele_calls/invalid_cds.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + chewbbaca: \$(echo \$(chewBBACA.py --version 2>&1 | sed 's/^.*chewBBACA version: //g; s/Using.*\$//' )) + END_VERSIONS + """ } process CHEWBBACA_EXTRACT_CGMLST { label 'process_low' - publishDir "${params.outdir}/chewbbaca/$allele_calls", pattern: "cgMLST", mode: 'copy' - conda "bioconda::chewbbaca=3.3.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/chewbbaca:3.3.5--pyhdfd78af_0': 'biocontainers/chewbbaca:3.3.5--pyhdfd78af_0' }" input: - path allele_calls + path results_alleles output: - path "cgMLST", emit: cgmlst + path "cgMLST/*", emit: cgmlst path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ chewBBACA.py \\ ExtractCgMLST \\ - -i $allele_calls/results_alleles.tsv \\ + -i $results_alleles \\ -o cgMLST cat <<-END_VERSIONS > versions.yml @@ -100,4 +137,16 @@ process CHEWBBACA_EXTRACT_CGMLST { chewbbaca: \$(echo \$(chewBBACA.py --version 2>&1 | sed 's/^.*chewBBACA version: //g; s/Using.*\$//' )) END_VERSIONS """ + + stub: + """ + mkdir cgMLST + touch cgMLST/cgMLST99.tsv + touch cgMLST/cgMLST100.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + chewbbaca: \$(echo \$(chewBBACA.py --version 2>&1 | sed 's/^.*chewBBACA version: //g; s/Using.*\$//' )) + END_VERSIONS + """ } diff --git a/modules/local/el_gato.nf b/modules/local/el_gato.nf index e45b737..b385e22 100644 --- a/modules/local/el_gato.nf +++ b/modules/local/el_gato.nf @@ -2,12 +2,6 @@ process EL_GATO_READS { tag "$meta.id" label 'process_medium' - publishDir "${params.outdir}/el_gato/reads", pattern: "*.tsv", mode: 'copy' - publishDir "${params.outdir}/el_gato/reads", pattern: "*.bam*", mode: 'copy' - publishDir "${params.outdir}/el_gato/reads", pattern: "*.log", mode: 'copy' - publishDir "${params.outdir}/el_gato/reads", pattern: "*.json", mode: 'copy' - publishDir "${params.outdir}/el_gato/reads", pattern: "*_possible_mlsts.txt", mode: 'copy' - conda "bioconda::el_gato=1.20.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/el_gato:1.20.2--py311h7e72e81_0' : @@ -24,6 +18,9 @@ process EL_GATO_READS { tuple val(meta), path("${meta.id}_reads.json"), emit: json path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: def reads_in = "--read1 ${reads[0]} --read2 ${reads[1]}" """ @@ -53,18 +50,32 @@ process EL_GATO_READS { el_gato: \$(el_gato.py --version | sed 's/^el_gato version: //') END_VERSIONS """ + + stub: + """ + # Due to splitCSV have to actually have a CSV in stub + echo "Sample ST flaA pilE asd mip mompS proA neuA_neuAH" > ${meta.id}_ST.tsv + echo "${meta.id} MD- - - - - - - -" >> ${meta.id}_ST.tsv + + touch ${meta.id}.bam + touch ${meta.id}.bam.bai + touch ${meta.id}_possible_mlsts.txt + touch ${meta.id}_run.log + touch ${meta.id}_reads.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + el_gato: \$(el_gato.py --version | sed 's/^el_gato version: //') + END_VERSIONS + """ } -// TODO: Combine to one function at some point as -// maintaining 2 is a pain process EL_GATO_ASSEMBLY { tag "$meta.id" label 'process_low' - label 'error_ignore' // Non-legion samples explode here otherwise - - publishDir "${params.outdir}/el_gato/assembly", pattern: "*.tsv", mode: 'copy' - publishDir "${params.outdir}/el_gato/assembly", pattern: "*.log", mode: 'copy' - publishDir "${params.outdir}/el_gato/assembly", pattern: "*.json", mode: 'copy' + // Non-legionella or really low cov assemblies explode here otherwise + // Due to an issue in el_gato with samples that can't find any loci + label 'error_ignore' conda "bioconda::el_gato=1.20.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -80,6 +91,9 @@ process EL_GATO_ASSEMBLY { tuple val(meta), path("${meta.id}_assembly.json"), emit: json path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ el_gato.py \\ @@ -99,13 +113,23 @@ process EL_GATO_ASSEMBLY { el_gato: \$(el_gato.py --version | sed 's/^el_gato version: //') END_VERSIONS """ + + stub: + """ + touch ${meta.id}_ST.tsv + touch ${meta.id}_run.log + touch ${meta.id}_assembly.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + el_gato: \$(el_gato.py --version | sed 's/^el_gato version: //') + END_VERSIONS + """ } process EL_GATO_REPORT { label 'process_low' - publishDir "${params.outdir}/el_gato", pattern: "*.pdf", mode: 'copy' - conda "bioconda::el_gato=1.20.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/el_gato:1.20.2--py311h7e72e81_0' : @@ -119,6 +143,9 @@ process EL_GATO_REPORT { path "*.pdf", emit: pdf path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ elgato_report.py \\ @@ -130,13 +157,21 @@ process EL_GATO_REPORT { el_gato: \$(el_gato.py --version | sed 's/^el_gato version: //') END_VERSIONS """ + + stub: + """ + touch el_gato_report.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + el_gato: \$(el_gato.py --version | sed 's/^el_gato version: //') + END_VERSIONS + """ } process COMBINE_EL_GATO { label 'process_low' - publishDir "${params.outdir}/el_gato", pattern: "el_gato_st.tsv", mode: 'copy' - conda "conda-forge::pandas=2.2.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pandas:2.2.1' : @@ -150,6 +185,9 @@ process COMBINE_EL_GATO { path "el_gato_st.tsv", emit: report path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: def reads_arg = reads_st ? "--reads_tsv $reads_st" : "" def assembly_arg = assembly_st ? "--assembly_tsv $assembly_st" : "" @@ -163,4 +201,14 @@ process COMBINE_EL_GATO { combine_el_gato: 0.1.0 END_VERSIONS """ -} \ No newline at end of file + + stub: + """ + touch el_gato_st.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + combine_el_gato: 0.1.0 + END_VERSIONS + """ +} diff --git a/modules/local/help.nf b/modules/local/help.nf deleted file mode 100644 index 7a37936..0000000 --- a/modules/local/help.nf +++ /dev/null @@ -1,45 +0,0 @@ -// Temp help statement -def printHelp() { - log.info""" -Usage: - nextflow run phac-nml/legiovue -profile --fastq_dir - -Description: - Pipeline to generates Legionella pneumophila de novo assemblies, ST types, cgMLST calls, and QC summaries for clustering and reporting - -Nextflow arguments (single DASH): - -profile Allowed values: conda, mamba, singularity, apptainer, docker - -c Add in custom config files for resources or own cluster - -Mandatory workflow arguments: - --fastq_dir Path to directory containing paired legionella fastq data - -Optional: - ## Basic Args ## - --outdir Output directory (Default: ./results) - - ## Filtering ## - --min_abundance_percent Minimum L. 
pneumophila abundance required (Default: 10.0) - --min_reads Minimum reads required after trimmomatic (Default: 150000) - - ## Kraken/Bracken ## - --kraken2_db Path to standard kraken2 database - (Default: s3://genome-idx/kraken/standard_08gb_20240904) - - ## Quast ## - --quast_ref Path to reference sequence for some of the quast metrics - (Default data/C9_S.reference.fna) - - ## Chewbbaca ## - --prepped_schema Path to a prepped chewbbaca schema to save running the prep command - (Default: data/SeqSphere_1521_schema) - --schema_targets Path to schema targets to prep for chewbbaca if not using the default SeqSphere_1521 - - ## Other Generic Args ## - --help Prints this statement - --version Prints the pipeline version - --max_memory Maximum memory to allow to be allocated when running processes (Default: 128G) - --max_cpus Maximum cpus to allow to be allocated when running processes (Default: 16) - --max_time Maximum time to allow to be allocated when running processes (Default: 240.h) -""".stripIndent() -} diff --git a/modules/local/kraken.nf b/modules/local/kraken.nf index e559cbb..37c923a 100644 --- a/modules/local/kraken.nf +++ b/modules/local/kraken.nf @@ -2,9 +2,6 @@ process KRAKEN2_CLASSIFY { tag "$meta.id" label 'process_high' - publishDir "${params.outdir}/kraken_bracken", pattern: "*-classified.tsv", mode: 'copy' - publishDir "${params.outdir}/kraken_bracken", pattern: "*-kreport.tsv", mode: 'copy' - conda "bioconda::kraken2=2.1.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-8706a1dd73c6cc426e12dd4dd33a5e917b3989ae:c8cbdc8ff4101e6745f8ede6eb5261ef98bdaff4-0' : @@ -19,6 +16,9 @@ process KRAKEN2_CLASSIFY { tuple val(meta), path('*-kreport.tsv'), emit: report path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: def gz_arg = reads[0].endsWith('.gz') ? "--gzip-compressed" : "" """ @@ -38,4 +38,15 @@ process KRAKEN2_CLASSIFY { kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') END_VERSIONS """ + + stub: + """ + touch ${meta.id}-classified.tsv + touch ${meta.id}-kreport.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + END_VERSIONS + """ } diff --git a/modules/local/plotting.nf b/modules/local/plotting.nf index 5f8bd6b..9ea55f2 100644 --- a/modules/local/plotting.nf +++ b/modules/local/plotting.nf @@ -1,12 +1,11 @@ -process PLOT_PYSAMSTATS_TSV { +process PLOT_EL_GATO_ALLELES { tag "$meta.id" label 'process_low' - publishDir "${params.outdir}/el_gato/plots", pattern: "*_allele_plots.pdf", mode: 'copy' - conda "$projectDir/envs/plotting-env.yml" - // Custom built for this... - container "docker://darianhole/legio-plotting:0.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" input: tuple val(meta), path(tsv) @@ -15,13 +14,12 @@ process PLOT_PYSAMSTATS_TSV { tuple val(meta), path("${meta.id}_allele_plots.pdf"), emit: plot path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: - // Special handling of using executables based on a docker micromamba image - // https://stackoverflow.com/a/78027234 - // https://micromamba-docker.readthedocs.io/en/latest/faq.html#how-can-i-use-a-mambaorg-micromamba-based-image-with-apptainer - def run_cmd = workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? '/usr/local/bin/_entrypoint.sh plot_genome_cov.R' : 'plot_genome_cov.R' """ - $run_cmd \\ + plot_genome_cov.R \\ --input_tsv $tsv \\ --outfile ${meta.id}_allele_plots.pdf @@ -30,4 +28,14 @@ plot_genome_cov: 0.1.0 END_VERSIONS """ + + stub: + """ + touch ${meta.id}_allele_plots.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + plot_genome_cov: 0.1.0 + END_VERSIONS + """ } diff --git a/modules/local/pysamstats.nf b/modules/local/pysamstats.nf index 842d0f0..cc557f1 100644 --- a/modules/local/pysamstats.nf +++ b/modules/local/pysamstats.nf @@ -15,6 +15,9 @@ process PYSAMSTATS { tuple val(meta), path("${meta.id}.${type}.stats.tsv"), emit: tsv path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ pysamstats \\ @@ -27,4 +30,14 @@ pysamstats: \$(pysamstats -h | tail -n 2 | grep -Eo ": \\S+" | cut -d" " -f2) END_VERSIONS """ + + stub: + """ + touch ${meta.id}.${type}.stats.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pysamstats: \$(pysamstats -h | tail -n 2 | grep -Eo ": \\S+" | cut -d" " -f2) + END_VERSIONS + """ } diff --git a/modules/local/qc.nf b/modules/local/qc.nf index 093e459..90c3e78 100644 --- a/modules/local/qc.nf +++ b/modules/local/qc.nf @@ -15,9 +15,12 @@ process COMBINE_SAMPLE_DATA { path(chewbbaca_stats) output: - tuple val(meta), path("*.tsv"), emit: tsv + tuple val(meta), path("*.csv"), emit: csv path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: def trimmomatic_arg = trimmomatic_summary ? "-tr $trimmomatic_summary" : "" def quast_report_arg = quast_report ?
"-qa $quast_report" : "" @@ -34,11 +37,22 @@ process COMBINE_SAMPLE_DATA { $st_report_arg \\ $chewbbaca_stats_arg \\ --min_abundance_percent ${params.min_abundance_percent} \\ - --min_reads ${params.min_reads} + --min_reads_fail ${params.min_reads} \\ + --min_reads_warn ${params.min_reads_warn} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + combine_qc_data: 0.2.0 + END_VERSIONS + """ + + stub: + """ + touch ${meta.id}.csv cat <<-END_VERSIONS > versions.yml "${task.process}": - combine_qc_data: 0.1.0 + combine_qc_data: 0.2.0 END_VERSIONS """ } diff --git a/modules/local/quast.nf b/modules/local/quast.nf index d3f5e75..aaf77fd 100644 --- a/modules/local/quast.nf +++ b/modules/local/quast.nf @@ -1,11 +1,6 @@ process QUAST { label 'process_medium' - publishDir "${params.outdir}/quast", pattern: "transposed_report.tsv", mode: 'copy' - publishDir "${params.outdir}/quast", pattern: "report.html", mode: 'copy' - publishDir "${params.outdir}/quast", pattern: "report.pdf", mode: 'copy' - publishDir "${params.outdir}/quast", pattern: "*_stats", mode: 'copy' - conda "bioconda::quast=5.2.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/quast:5.2.0--py39pl5321h2add14b_1' : @@ -22,6 +17,9 @@ process QUAST { path "*_stats", emit: stats_folders path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ quast \\ @@ -35,6 +33,19 @@ process QUAST { quast: \$(quast.py --version 2>&1 | sed 's/^.*QUAST v//; s/ .*\$//') END_VERSIONS """ + + stub: + """ + touch transposed_report.tsv + touch report.html + touch report.pdf + mkdir quast_stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + quast: \$(quast.py --version 2>&1 | sed 's/^.*QUAST v//; s/ .*\$//') + END_VERSIONS + """ } process SCORE_QUAST { @@ -54,9 +65,16 @@ process SCORE_QUAST { path "scored_quast_report.csv", emit: report path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ quast_analyzer.py \\ + --max_contigs ${params.max_contigs} \\ + --min_align_percent ${params.min_align_percent} \\ + --min_n50_score ${params.min_n50_score} \\ + --max_n50_score ${params.max_n50_score} \\ $transposed_report \\ --outfile scored_quast_report.csv @@ -66,4 +84,14 @@ process SCORE_QUAST { quast_analyzer: 0.1.0 END_VERSIONS """ + + stub: + """ + touch scored_quast_report.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + quast_analyzer: 0.1.0 + END_VERSIONS + """ } diff --git a/modules/local/reportree.nf b/modules/local/reportree.nf deleted file mode 100644 index e3d7e30..0000000 --- a/modules/local/reportree.nf +++ /dev/null @@ -1,36 +0,0 @@ -process REPORTREE { - label 'process_medium' - - publishDir "${params.outdir}/reportree", pattern: "", mode: 'copy' - - // No conda at the moment as the install for the env is a pain - // conda "$projectDir/envs/" - // Only a docker container that they host at the moment - // Container also missing ps so won't work - container "insapathogenomics/reportree" - - input: - path cgmlst - path metadata - - output: - path "versions.yml", emit: versions - - script: - def metadata_arg = metadata ? 
"-m $metadata" : "" - """ - reportree.py \\ - $metadata_arg \\ - -a $cgmlst/cgMLST100.tsv \\ - -thr 0-5 \\ - --method MSTreeV2 \\ - --loci-called 1.0 \\ - --matrix-4-grapetree \\ - --analysis grapetree - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ReporTree: \$(reportree.py -v | head -n 1 | cut -d' ' -f2) - END_VERSIONS - """ -} diff --git a/modules/local/spades.nf b/modules/local/spades.nf index e251601..38b221b 100644 --- a/modules/local/spades.nf +++ b/modules/local/spades.nf @@ -2,9 +2,6 @@ process SPADES { tag "$meta.id" label 'process_high' - publishDir "${params.outdir}/spades", pattern: "*.fa", mode: 'copy' - publishDir "${params.outdir}/spades", pattern: "*.log", mode: 'copy' - conda "bioconda::spades=4.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/spades:4.0.0--h5fb382e_1' : @@ -20,15 +17,30 @@ process SPADES { tuple val(meta), path('*.spades.log'), emit: log path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: def reads_paired_in = "-1 ${reads_paired[0]} -2 ${reads_paired[1]}" - def reads_unpaired_in = "--s1 ${reads_single[0]} --s2 ${reads_single[1]}" """ + # Have to check if we have data in the unpaired reads + # Doing it with bash for now as the relative nextflow file constructor path + # wasn't playing nice with File class and gzip input stream + unpaired_1_in="" + unpaired_2_in="" + if [ "\$(zcat ${reads_single[0]} | wc -c)" -gt 0 ]; then + unpaired_1_in="--s1 ${reads_single[0]}" + fi + if [ "\$(zcat ${reads_single[1]} | wc -c)" -gt 0 ]; then + unpaired_2_in="--s2 ${reads_single[1]}" + fi + # We found that using --careful works best for Legionella spades.py \\ --threads $task.cpus \\ $reads_paired_in \\ - $reads_unpaired_in \\ + \$unpaired_1_in \\ + \$unpaired_2_in \\ --careful \\ -o ./ @@ -50,6 +62,20 @@ process SPADES { cat <<-END_VERSIONS > versions.yml "${task.process}": spades: \$(spades.py --version 2>&1 | sed -n 's/^.*SPAdes genome assembler v//p') - END_VERSION + END_VERSIONS + """ + + stub: + """ + # Output naming + touch ${meta.id}.spades.log + touch ${meta.id}.scaffolds.fa + touch ${meta.id}.contigs.fa + touch ${meta.id}.warnings.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + spades: \$(spades.py --version 2>&1 | sed -n 's/^.*SPAdes genome assembler v//p') + END_VERSIONS """ } diff --git a/modules/local/trimmomatic.nf b/modules/local/trimmomatic.nf index 05ff1a5..7b64b73 100644 --- a/modules/local/trimmomatic.nf +++ b/modules/local/trimmomatic.nf @@ -2,9 +2,6 @@ process TRIMMOMATIC { tag "$meta.id" label 'process_medium' - publishDir "${params.outdir}/trimmomatic", pattern: "*.fastq.gz", mode: 'copy' - publishDir "${params.outdir}/trimmomatic", pattern: "*.summary.txt", mode: 'copy' - conda "bioconda::trimmomatic=0.39" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/trimmomatic:0.39--hdfd78af_2': @@ -20,6 +17,9 @@ process TRIMMOMATIC { tuple val(meta), path("*.summary.txt"), emit: summary path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: // I've included the current args here for now, may make a modules config later """ @@ -43,4 +43,32 @@ process TRIMMOMATIC { trimmomatic: \$(trimmomatic -version) END_VERSIONS """ + + stub: + """ + # Due to read filtering need a read to continue pipeline + # Note no using newlines as it breaks the versions + read="@read1 + TTT + + + CCC + " + + # Create read files + echo -e \$read > ${meta.id}_paired_R1.fastq + echo -e \$read > ${meta.id}_paired_R2.fastq + gzip ${meta.id}_paired_R1.fastq + gzip ${meta.id}_paired_R2.fastq + + # Summary files and unpaired reads + touch ${meta.id}.summary.txt + touch ${meta.id}_unpaired_R1.fastq.gz + touch ${meta.id}_unpaired_R2.fastq.gz + + # Versions + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trimmomatic: \$(trimmomatic -version) + END_VERSIONS + """ } diff --git a/modules/local/utils.nf b/modules/local/utils.nf index a926125..6f08a50 100644 --- a/modules/local/utils.nf +++ b/modules/local/utils.nf @@ -14,6 +14,9 @@ process CREATE_ABUNDANCE_FILTER { tuple val(meta), path("${meta.id}.check.csv"), emit: abundance_check path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ filter_lpn_abundance.py \\ @@ -26,14 +29,24 @@ process CREATE_ABUNDANCE_FILTER { filter_lpn_abundance: 0.1.0 END_VERSIONS """ + + stub: + """ + # Due to splitCSV have to actually have a CSV in stub + echo "sample,pass" > ${meta.id}.check.csv + echo "${meta.id},YES" >> ${meta.id}.check.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + filter_lpn_abundance: 0.1.0 + END_VERSIONS + """ } -process CSVTK_COMBINE_STATS { +process CSVTK_JOIN_ALLELE_STATS { tag "$meta.id" label 'process_single' - publishDir "${params.outdir}/el_gato/allele_stats", pattern: "*.tsv", mode: 'copy' - conda "bioconda::csvtk=0.30.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0': @@ -46,6 +59,9 @@ process CSVTK_COMBINE_STATS { tuple val(meta), path("${meta.id}.allele_stats.tsv"), emit: tsv path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ csvtk \\ @@ -61,32 +77,52 @@ process CSVTK_COMBINE_STATS { csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) END_VERSIONS """ + + stub: + """ + touch ${meta.id}.allele_stats.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ } -process CSVTK_COMBINE{ +process CSVTK_CONCAT_QC_DATA { label 'process_single' - publishDir "${params.outdir}", pattern: "*.tsv", mode: 'copy' - conda "bioconda::csvtk=0.30.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0': 'biocontainers/csvtk:0.30.0--h9ee0642_0' }" input: - path tsvs + path csvs output: - path "overall.qc.tsv", emit: tsv + path "overall.qc.csv", emit: csv path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: """ csvtk \\ concat \\ - -tT \\ - $tsvs \\ - > overall.qc.tsv + $csvs \\ + > overall.qc.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ + + stub: + """ + touch overall.qc.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 0000000..9d79af9 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::multiqc=1.20 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf new file mode 100644 index 0000000..105f926 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -0,0 +1,24 @@ +process CUSTOM_DUMPSOFTWAREVERSIONS { + label 'process_single' + + // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.20--pyhdfd78af_0' : + 'biocontainers/multiqc:1.20--pyhdfd78af_0' }" + + input: + path versions + + output: + path "software_versions.yml" , emit: yml + path "software_versions_mqc.yml", emit: mqc_yml + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + template 'dumpsoftwareversions.py' +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml new file mode 100644 index 0000000..dc1e412 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -0,0 +1,43 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: custom_dumpsoftwareversions +description: Custom module used to dump software versions within the nf-core pipeline + template +keywords: + - custom + - dump + - version +tools: + - custom: + description: Custom module used to dump software versions within the nf-core pipeline + template + homepage: https://github.com/nf-core/tools + documentation: https://github.com/nf-core/tools + licence: ["MIT"] + identifier: "" +input: + - - versions: + type: file + description: YML file containing software versions + pattern: "*.yml" +output: + - yml: + - software_versions.yml: + type: file + description: Standard YML file containing software versions + pattern: "software_versions.yml" + - mqc_yml: + - software_versions_mqc.yml: + type: file + description: MultiQC custom content YML file containing software versions + pattern: "software_versions_mqc.yml" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py 
b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py
new file mode 100644
index 0000000..b83b32c
--- /dev/null
+++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+
+"""Provide functions to merge multiple versions.yml files."""
+
+import platform
+from textwrap import dedent
+
+import yaml
+
+
+def _make_versions_html(versions):
+    """Generate a tabular HTML output of all versions for MultiQC."""
+    html = [
+        dedent(
+            """\\
+            <style>
+            #nf-core-versions tbody:nth-child(even) {
+                background-color: #f2f2f2;
+            }
+            </style>
+            <table class="table" style="width:100%" id="nf-core-versions">
+                <thead>
+                    <tr>
+                        <th> Process Name </th>
+                        <th> Software </th>
+                        <th> Version </th>
+                    </tr>
+                </thead>
+            """
+        )
+    ]
+    for process, tmp_versions in sorted(versions.items()):
+        html.append("<tbody>")
+        for i, (tool, version) in enumerate(sorted(tmp_versions.items())):
+            html.append(
+                dedent(
+                    f"""\\
+                    <tr>
+                        <td><samp>{process if (i == 0) else ''}</samp></td>
+                        <td><samp>{tool}</samp></td>
+                        <td><samp>{version}</samp></td>
+                    </tr>
+                    """
+                )
+            )
+        html.append("</tbody>")
+    html.append("</table>
") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 0000000..b1e1630 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,43 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + file(process.out.mqc_yml[0]).readLines()[0..10], + file(process.out.yml[0]).readLines()[0..7] + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 0000000..5f59a93 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,76d454d92244589d32455833f7c1ba6d" + ], + [ + "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", + " \\n\\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n \\n \\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n\\n\\n \\n\\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\" + ], + [ + "CUSTOM_DUMPSOFTWAREVERSIONS:", + " python: 3.11.7", + " 
yaml: 5.4.1", + "TOOL1:", + " tool1: 0.11.9", + "TOOL2:", + " tool2: '1.9'", + "Workflow:" + ] + ], + "timestamp": "2024-01-09T23:01:18.710682" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 0000000..405aa24 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/nextflow.config b/nextflow.config index 0215594..a1a9fc5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,26 +9,34 @@ // Global default params, used in configs params { // Required Input fastq options (Pick 1) - fastq_dir = "" + input = null + fastq_dir = null // Input data filtering based on testing - min_abundance_percent = 10.0 - min_reads = 150000 + min_abundance_percent = 10 + min_reads = 60000 + // Kraken + Braken // Link to AWS, if connection is lost or not available user needs to pass // own database path - kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904" + kraken2_db = null - // Quast ref for scoring + // Quast and quality scoring based on testing quast_ref = "$projectDir/data/C9_S.reference.fna" + max_contigs = 100 + min_align_percent = 75 + min_reads_warn = 150000 + min_n50_score = 80000 + max_n50_score = 220000 // El_Gato skip_el_gato = false + skip_plotting = false // ChewBBACA cgMLST params prepped_schema = "$projectDir/data/SeqSphere_1521_schema" - schema_targets = "" + schema_targets = null // Metadata -- To add with clustering later // Or maybe can add to join to final output file? @@ -36,6 +44,7 @@ params { // Generic useful options outdir = 'results' + publish_dir_mode = 'copy' help = false version = false @@ -53,6 +62,9 @@ includeConfig 'conf/base.config' // ToDoMaybeEventually - Convert more process args to the modules config includeConfig 'conf/modules.config' +// IRIDA-Next config +includeConfig 'conf/iridanext.config' + // Profiles profiles { debug { @@ -149,11 +161,14 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } - nml { includeConfig 'conf/nml.config' } + test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + nml { includeConfig 'conf/nml.config' } } +// Override the default Docker registry when required +process.ext.override_configured_container_registry = true + // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled // Set to your registry if you have a mirror of containers @@ -163,6 +178,27 @@ podman.registry = 'quay.io' singularity.registry = 'quay.io' charliecloud.registry = 'quay.io' +// Nextflow plugins and settings +plugins { + id 'nf-schema@2.2.1' + id 'nf-iridanext@0.2.0' + id 'nf-prov@1.2.4' +} +validation { + help { + enabled = true + } +} +prov { + enabled = true + formats { + legacy { + file = "${params.outdir}/pipeline_info/manifest.json" + overwrite = true + } + } +} + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. 
Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -202,11 +238,11 @@ dag { manifest { name = "phac-nml/LegioVue" author = """Darian Hole, Molly Pratt, Jennifer Tanner""" - homePage = "" + homePage = """https://github.com/phac-nml/legiovue""" description = """Legionella pneumophila WGS analysis""" mainScript = "main.nf" nextflowVersion = "!>=23.10.1" - version = "0.1.0" + version = "0.2.0" doi = "" defaultBranch = "main" } diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..afa7a3f --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,234 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/phac-nml/legiovue/main/nextflow_schema.json", + "title": "phac-nml/LegioVue pipeline parameters", + "description": "Legionella pneumophila WGS analysis", + "type": "object", + "$defs": { + "input_data_option_choose_1": { + "title": "Input data option (choose 1)", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data", + "properties": { + "input": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_input.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.", + "fa_icon": "fas fa-file-csv" + }, + "fastq_dir": { + "type": "string", + "fa_icon": "fas fa-folder-open", + "description": "Path to directory containing paired (_R1/_R2) gzipped fastq files", + "format": "directory-path", + "hidden": true + } + } + }, + "filtering_options": { + "title": "Filtering options", + "type": "object", + "description": "Options on minimum values for filtering out input data", + "default": "", + "properties": { + "kraken2_db": { + "type": "string", + "exists": true, + "format": "directory-path", + "fa_icon": "fas fa-server", + "description": "Path to kraken2 database to use for abundance checks" + }, + "min_abundance_percent": { + "type": "number", + "default": 10, + "description": "Minimum L.pn abundance to keep sample in pipeline", + "fa_icon": "fas fa-percentage", + "minimum": 0, + "maximum": 100 + }, + "min_reads": { + "type": "integer", + "default": 60000, + "description": "Minimum number of paired reads to keep sample in pipeline", + "minimum": 0 + } + }, + "required": ["kraken2_db"], + "fa_icon": "fas fa-filter" + }, + "skipping_options": { + "title": "Step skipping options", + "type": "object", + "description": "Processes that can be skipped through specifying the following parameters", + "default": "", + "properties": { + "skip_el_gato": { + "type": "boolean", + "description": "Skip running all el_gato sequence typing aspects", + "fa_icon": "fas fa-forward" + }, + "skip_plotting": { + "type": "boolean", + "description": "Skip plotting el_gato allele profiles", + "fa_icon": "fas fa-forward" + } + }, + "fa_icon": "fas fa-forward" + }, + "quast_and_quality_options": { + "title": "QUAST and Quality options", + "type": "object", + "description": "Options for adjusting running QUAST", + "default": "", + "properties": { + "quast_ref": { + "type": "string", + "default": "data/C9_S.reference.fna", + "format": "file-path", + 
"fa_icon": "fas fa-file-alt", + "description": "Path to reference fasta file to base QUAST alignment metrics off of", + "hidden": true + }, + "max_contigs": { + "type": "integer", + "default": 100, + "description": "Threshold for the number of contigs > 500bp assembled by SPAdes to get scoring points", + "minimum": 0 + }, + "min_align_percent": { + "type": "number", + "default": 75, + "description": "Thresold for minimum QUAST genome fraction percentage to get scoring points", + "fa_icon": "fas fa-percentage", + "minimum": 0, + "maximum": 100 + }, + "min_reads_warn": { + "type": "integer", + "default": 150000, + "description": "Threshold for minimum number of reads that will be given a QC warning", + "minimum": 0 + }, + "min_n50_score": { + "type": "integer", + "default": 80000, + "description": "Thresold for minimum QUAST N50 value to obtain scoring points", + "minimum": 0 + }, + "max_n50_score": { + "type": "integer", + "default": 220000, + "description": "Thresold for maximum QUAST N50 score to get max scoring points", + "minimum": 0 + } + } + }, + "chewbbaca_options": { + "title": "ChewBBACA options", + "type": "object", + "description": "Specifies the input for the chewBBACA schema to utilize for cgMLST", + "default": "", + "properties": { + "schema_targets": { + "type": "string", + "description": "ChewBBACA schema targets directory", + "format": "directory-path", + "hidden": true + }, + "prepped_schema": { + "type": "string", + "default": "data/SeqSphere_1521_schema", + "description": "ChewBBACA prepped schema directory", + "format": "directory-path", + "hidden": true + } + }, + "fa_icon": "fab fa-cc-diners-club" + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open", + "default": "results" + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "hidden": true + }, + "max_memory": { + "type": "string", + "default": "128.GB", + "fa_icon": "fas fa-database", + "description": "Maximum memory to allow for a process to assign", + "hidden": true + }, + "max_cpus": { + "type": "integer", + "default": 16, + "fa_icon": "fas fa-microchip", + "description": "Maximum CPUs to allow for a process to assign", + "hidden": true + }, + "max_time": { + "type": "string", + "default": "240.h", + "fa_icon": "fas fa-clock", + "description": "Maximum time to allow for a process to assign", + "hidden": true + }, + "help": { + "type": "boolean", + "description": "Display help info and exit", + "fa_icon": "far fa-question-circle", + "hidden": true + }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + } + }, + "required": ["outdir"] + } + }, + "allOf": [ + { + "$ref": "#/$defs/input_data_option_choose_1" + }, + { + "$ref": "#/$defs/filtering_options" + }, + { + "$ref": "#/$defs/skipping_options" + }, + { + "$ref": "#/$defs/quast_and_quality_options" + }, + { + "$ref": "#/$defs/chewbbaca_options" + }, + { + "$ref": "#/$defs/generic_options" + } + ] +} diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 0000000..2fa82ad --- /dev/null +++ b/nf-test.config @@ -0,0 +1,8 @@ +config { + + testsDir "tests" + workDir ".nf-test" + configFile "tests/nextflow.config" + profile "docker" + +} diff --git a/subworkflows/local/format_input.nf b/subworkflows/local/format_input.nf index 8a301da..7721ced 100644 --- a/subworkflows/local/format_input.nf +++ b/subworkflows/local/format_input.nf @@ -5,6 +5,7 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include {samplesheetToList } from 'plugin/nf-schema' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -19,12 +20,19 @@ */ workflow FORMAT_INPUT { main: - // Just adapting to the metamap format using fromFilePairs if ( params.fastq_dir ) { + // Just adapting to the metamap format using fromFilePairs Channel .fromFilePairs("${params.fastq_dir}/*_{R1,R2}*.fastq*", checkIfExists:true) .map { it -> [ [id: it[0]], it[1] ] } .set { ch_paired_fastqs } + } else { + // Matching the above formatting by creating a list of the fastq file pairs + // Schema requires pairs at the moment so this is ok. 
If we want to support ONT + // data later will need to adjust the logic + Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) + .map { it -> [ it[0], [it[1], it[2]] ] } + .set { ch_paired_fastqs } } emit: diff --git a/tests/main.nf.test b/tests/main.nf.test new file mode 100644 index 0000000..4d25155 --- /dev/null +++ b/tests/main.nf.test @@ -0,0 +1,182 @@ +nextflow_pipeline { + + name "Full Pipeline NF-Tests for LegioVUE" + script "main.nf" + + //--- Test 1 + test("Conflicting Input Data Sources Specified") { + tag "fail" + + when { + params { + input = "${projectDir}/assets/samplesheet.csv" + fastq_dir = "${baseDir}/tests/test_data/" + outdir = "results" + kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904" + } + } + then { + // Status + assert workflow.failed + + // Message + assert workflow.stdout.contains("ERROR ~ Please provide input data with either: '--input input.csv' or '--fastq_dir ' but not both") + } + } + + //--- Test 2 + test("No Input Data Sources Specified") { + tag "fail" + + when { + params { + outdir = "results" + kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904" + } + } + then { + // Status + assert workflow.failed + + // Message + assert workflow.stdout.contains("ERROR ~ Please provide input data with either: '--input input.csv' or '--fastq_dir '") + } + } + + //--- Test 3 + test("Missing Input Fastq Test") { + tag "fail" + + when { + params { + input = "$baseDir/tests/test_data/input_none.csv" + outdir = "results" + kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904" + min_reads = 100 + } + } + then { + // Status + assert workflow.failed + + // Message + assert workflow.stdout.contains("ERROR ~ Validation of pipeline parameters failed!") + } + } + + //--- Test 4 + test("Data Run Test") { + tag "success" + + when { + params { + fastq_dir = "${projectDir}/tests/test_data/" + kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904" + outdir = "results" + min_reads = 100 + max_memory = "8.GB" + max_cpus = 2 + max_time = "1.h" + } + } + then { + // + // Status + // + assert workflow.success + // One fail which is from el_gato assemblies on low data + assert workflow.trace.failed().size() == 1 + + // + // Channels and Files Exist + // + def lines = [] + assert path("$launchDir/results").exists() + + // Final QC File + assert path("$launchDir/results/overall.qc.csv").exists() + + lines = path("$launchDir/results/overall.qc.csv").text + assert lines.contains("sample,lpn_abundance,num_paired_trimmed_reads,pct_paired_reads_passing_qc,n50,num_contigs,pct_gc,assembly_len,largest_contig,st,st_approach,chewbbaca_exc,chewbbaca_inf,chewbbaca_pct_exc,final_qc_score,qc_status,qc_message") + assert lines.contains("in1,100.0,4662,93.24,716,19,43.59,95953,1810,MD-,reads,0,0,0.0,2.0,WARN,low_read_count;low_n50;low_exact_allele_calls;low_qc_score") + assert lines.contains("in2,100.0,12,100.0,0,0,0,0,0,NA,NA,0,0,0.0,0,FAIL,failing_read_count") + + // ChewBBACA Allele Call data + assert path("$launchDir/results/chewbbaca/allele_calls/results_statistics.tsv").exists() + + lines = path("$launchDir/results/chewbbaca/allele_calls/results_statistics.tsv").text + assert lines.contains("FILE\tEXC\tINF\tPLOT3\tPLOT5\tLOTSC\tNIPH\tNIPHEM\tALM\tASM\tPAMA\tLNF\tInvalid CDSs\tClassified_CDSs\tTotal_CDSs") + assert lines.contains("in1\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t1521\t0\t0\t40") + + assert path("$launchDir/results/chewbbaca/allele_calls/cgMLST/missing_loci_stats.tsv").exists() + + lines = 
path("$launchDir/results/chewbbaca/allele_calls/cgMLST/missing_loci_stats.tsv").text + assert lines.contains("FILE\tmissing\tpercentage") + assert lines.contains("in1\t1521\t1.0") + + // el_gato final allele data + assert path("$launchDir/results/el_gato/el_gato_st.tsv").exists() + + lines = path("$launchDir/results/el_gato/el_gato_st.tsv").text + assert lines.contains("Sample\tST\tflaA\tpilE\tasd\tmip\tmompS\tproA\tneuA_neuAH\tapproach") + assert lines.contains("in1\tMD-\t-\t-\t-\t-\t-\t-\t-\treads") + + // el_gato expected subdirectories and files + assert path("$launchDir/results/el_gato/el_gato_report.pdf").exists() + assert path("$launchDir/results/el_gato/allele_stats").exists() + assert path("$launchDir/results/el_gato/plots").exists() + assert path("$launchDir/results/el_gato/reads").exists() + + // IRIDA Next output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_global.findAll { it.path == "chewbbaca/allele_calls/cgMLST/cgMLST99.tsv" }.size() == 1 + assert iridanext_global.findAll { it.path == "chewbbaca/allele_calls/cgMLST/cgMLST100.tsv" }.size() == 1 + assert iridanext_global.findAll { it.path == "chewbbaca/allele_calls/results_statistics.tsv" }.size() == 1 + assert iridanext_global.findAll { it.path == "el_gato/el_gato_st.tsv" }.size() == 1 + + assert iridanext_samples.in1.findAll { it.path == "spades/in1.contigs.fa" }.size() == 1 + + assert iridanext_metadata.in1.num_paired_trimmed_reads == "4662" + assert iridanext_metadata.in1.st_approach == "reads" + assert iridanext_metadata.in1.qc_status == "WARN" + assert iridanext_metadata.in1.qc_message == "low_read_count;low_n50;low_exact_allele_calls;low_qc_score" + + // Bracken final output (depends on kraken output) + assert path("$launchDir/results/kraken_bracken/in1-abundances.tsv").exists() + + lines = path("$launchDir/results/kraken_bracken/in1-abundances.tsv").text + assert lines.contains("name\ttaxonomy_id\ttaxonomy_lvl\tkraken_assigned_reads\tadded_reads\tnew_est_reads\tfraction_total_reads") + assert lines.contains("Legionella pneumophila\t446\tS\t1443\t407\t1850\t1.00000") + + // Trimmomatic + assert path("$launchDir/results/trimmomatic/in1_paired_R1.fastq.gz").exists() + assert path("$launchDir/results/trimmomatic/in1_paired_R2.fastq.gz").exists() + + assert path("$launchDir/results/trimmomatic/in1.summary.txt").exists() + + lines = path("$launchDir/results/trimmomatic/in1.summary.txt").text + assert lines.contains("Input Read Pairs: 5000") + assert lines.contains("Both Surviving Reads: 4662") + + // QUAST scored report (final step for quast) + assert path("$launchDir/results/quast/scored_quast_report.csv").exists() + + lines = path("$launchDir/results/quast/scored_quast_report.csv").text + assert lines.contains("sample,num_contigs,N50,duplication_ratio,percent_alignment,assembly_length,GC_content,final_score,score_rating") + assert lines.contains("in1.contigs,1,0.0,1,0,0,0,2.0,unideal") + + // Just checking that outdirs exist + assert path("$launchDir/results/fastqc").exists() + assert path("$launchDir/results/spades").exists() + + // Pipeline info and tracking + assert path("$launchDir/results/pipeline_info").exists() + assert path("$launchDir/results/pipeline_info/software_versions.yml").exists() + assert path("$launchDir/results/pipeline_info/manifest.json").exists() + } + } +} diff --git 
a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 0000000..4de138f --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,20 @@ +/* +======================================================================================== + Nextflow config file for running tests +======================================================================================== +*/ +// Resource limits are for nextflow >= 24.04.0 so also have to use the max_* params +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params.max_memory = "8.GB" +params.max_cpus = 2 +params.max_time = "1.h" + +/* Remove gzipping on JSON output for testing/asserts on file contents */ +iridanext.output.path = "${params.outdir}/iridanext.output.json" diff --git a/tests/test_data/in1_R1.fastq.gz b/tests/test_data/in1_R1.fastq.gz new file mode 100644 index 0000000..6618245 Binary files /dev/null and b/tests/test_data/in1_R1.fastq.gz differ diff --git a/tests/test_data/in1_R2.fastq.gz b/tests/test_data/in1_R2.fastq.gz new file mode 100644 index 0000000..b99f084 Binary files /dev/null and b/tests/test_data/in1_R2.fastq.gz differ diff --git a/tests/test_data/in2_R1.fastq.gz b/tests/test_data/in2_R1.fastq.gz new file mode 100644 index 0000000..88f0848 Binary files /dev/null and b/tests/test_data/in2_R1.fastq.gz differ diff --git a/tests/test_data/in2_R2.fastq.gz b/tests/test_data/in2_R2.fastq.gz new file mode 100644 index 0000000..39fc342 Binary files /dev/null and b/tests/test_data/in2_R2.fastq.gz differ diff --git a/tests/test_data/input_none.csv b/tests/test_data/input_none.csv new file mode 100644 index 0000000..a0fcc93 --- /dev/null +++ b/tests/test_data/input_none.csv @@ -0,0 +1,2 @@ +sample,fastq_1,fastq_2 +missing,none_R1.fastq,none_R2.fastq diff --git a/workflows/legiovue.nf b/workflows/legiovue.nf index 12cf0ae..3672df2 100644 --- a/workflows/legiovue.nf +++ b/workflows/legiovue.nf @@ -20,10 +20,11 @@ include { CHEWBBACA_ALLELE_CALL } from '../modules/local/chewbbaca.n include { CHEWBBACA_EXTRACT_CGMLST } from '../modules/local/chewbbaca.nf' include { PYSAMSTATS as PYSAMSTATS_MAPQ } from '../modules/local/pysamstats.nf' include { PYSAMSTATS as PYSAMSTATS_BASEQ } from '../modules/local/pysamstats.nf' -include { CSVTK_COMBINE_STATS } from '../modules/local/utils.nf' -include { PLOT_PYSAMSTATS_TSV } from '../modules/local/plotting.nf' +include { CSVTK_JOIN_ALLELE_STATS } from '../modules/local/utils.nf' +include { PLOT_EL_GATO_ALLELES } from '../modules/local/plotting.nf' include { COMBINE_SAMPLE_DATA } from '../modules/local/qc.nf' -include { CSVTK_COMBINE } from '../modules/local/utils.nf' +include { CSVTK_CONCAT_QC_DATA } from '../modules/local/utils.nf' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,6 +47,9 @@ workflow LEGIOVUE { ch_paired_fastqs // channel: [ val(meta), [ file(fastq_1), file(fastq_2) ] ] main: + // 0. Make version channel + ch_versions = Channel.empty() + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // 1. Kraken and Bracken Check with maybe(?) 
Host Removal (TODO) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // @@ -54,10 +58,13 @@ workflow LEGIOVUE { ch_paired_fastqs, ch_kraken2_db ) + ch_versions = ch_versions.mix(KRAKEN2_CLASSIFY.out.versions) + BRACKEN( KRAKEN2_CLASSIFY.out.report, ch_kraken2_db ) + ch_versions = ch_versions.mix(BRACKEN.out.versions) // Remove NON L.pn samples to allow good clustering // This is a temp python/process solution until we @@ -66,6 +73,8 @@ workflow LEGIOVUE { CREATE_ABUNDANCE_FILTER( BRACKEN.out.abundance ) + ch_versions = ch_versions.mix(CREATE_ABUNDANCE_FILTER.out.versions) + CREATE_ABUNDANCE_FILTER.out.abundance_check .splitCsv(header:true, sep:',') .branch{ meta, row -> @@ -85,6 +94,7 @@ workflow LEGIOVUE { ch_abundance_filter.pass .join(ch_paired_fastqs, by: [0]) ) + ch_versions = ch_versions.mix(TRIMMOMATIC.out.versions) // Filter by min count TRIMMOMATIC.out.trimmed_reads @@ -99,6 +109,7 @@ workflow LEGIOVUE { FASTQC( ch_filtered_paired_fastqs.pass ) + ch_versions = ch_versions.mix(FASTQC.out.versions) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // 4. SPAdes @@ -107,6 +118,7 @@ workflow LEGIOVUE { ch_filtered_paired_fastqs.pass .join(TRIMMOMATIC.out.unpaired_reads, by: [0]) ) + ch_versions = ch_versions.mix(SPADES.out.versions) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // 5. Quast @@ -116,17 +128,22 @@ workflow LEGIOVUE { .collect{ it[1] }, ch_quast_ref ) + ch_versions = ch_versions.mix(QUAST.out.versions) + SCORE_QUAST( QUAST.out.report ) + ch_versions = ch_versions.mix(SCORE_QUAST.out.versions) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // 6. El_Gato - Second round with assemblies for failing samples only // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + ch_el_gato_report = Channel.value([]) if ( ! params.skip_el_gato ){ EL_GATO_READS( ch_filtered_paired_fastqs.pass ) + ch_versions = ch_versions.mix(EL_GATO_READS.out.versions) // El_gato consensus is only for samples where the reads output an // inconclusive ST as it has been found to potentially call one @@ -142,6 +159,7 @@ workflow LEGIOVUE { .map{ it -> it[0] } .join(SPADES.out.contigs, by:[0]) ) + ch_versions = ch_versions.mix(EL_GATO_ASSEMBLY.out.versions) // Combine and add in the approach used COMBINE_EL_GATO( @@ -153,6 +171,8 @@ workflow LEGIOVUE { .collectFile(name: 'assembly_st.tsv', keepHeader: true) .ifEmpty([]) ) + ch_versions = ch_versions.mix(COMBINE_EL_GATO.out.versions) + ch_el_gato_report = COMBINE_EL_GATO.out.report.collect().ifEmpty([]) // PDF Report // Have to rejoin the el_gato reads output as its hard to remake a csv @@ -164,25 +184,35 @@ workflow LEGIOVUE { .collect{ it[1] } .ifEmpty([]) ) + ch_versions = ch_versions.mix(EL_GATO_REPORT.out.versions) // Alleles Stats with pysamstats and plots // Visualize the called alleles to help // investigate potential issue calls - PYSAMSTATS_MAPQ( - EL_GATO_READS.out.bam_bai, - "mapq" - ) - PYSAMSTATS_BASEQ( - EL_GATO_READS.out.bam_bai, - "baseq" - ) - CSVTK_COMBINE_STATS( - PYSAMSTATS_MAPQ.out.tsv - .join(PYSAMSTATS_BASEQ.out.tsv, by:[0]) - ) - PLOT_PYSAMSTATS_TSV( - CSVTK_COMBINE_STATS.out.tsv - ) + if ( ! 
params.skip_plotting ){ + PYSAMSTATS_MAPQ( + EL_GATO_READS.out.bam_bai, + "mapq" + ) + ch_versions = ch_versions.mix(PYSAMSTATS_MAPQ.out.versions) + + PYSAMSTATS_BASEQ( + EL_GATO_READS.out.bam_bai, + "baseq" + ) + ch_versions = ch_versions.mix(PYSAMSTATS_BASEQ.out.versions) + + CSVTK_JOIN_ALLELE_STATS( + PYSAMSTATS_MAPQ.out.tsv + .join(PYSAMSTATS_BASEQ.out.tsv, by:[0]) + ) + ch_versions = ch_versions.mix(CSVTK_JOIN_ALLELE_STATS.out.versions) + + PLOT_EL_GATO_ALLELES( + CSVTK_JOIN_ALLELE_STATS.out.tsv + ) + ch_versions = ch_versions.mix(PLOT_EL_GATO_ALLELES.out.versions) + } } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // @@ -193,15 +223,19 @@ workflow LEGIOVUE { ch_schema_targets ) ch_prepped_schema = CHEWBBACA_PREP_EXTERNAL_SCHEMA.out.schema + ch_versions = ch_versions.mix(CHEWBBACA_PREP_EXTERNAL_SCHEMA.out.versions) } CHEWBBACA_ALLELE_CALL( SPADES.out.contigs .collect{ it[1] }, ch_prepped_schema ) + ch_versions = ch_versions.mix(CHEWBBACA_ALLELE_CALL.out.versions) + CHEWBBACA_EXTRACT_CGMLST( - CHEWBBACA_ALLELE_CALL.out.allele_calls + CHEWBBACA_ALLELE_CALL.out.results_alleles ) + ch_versions = ch_versions.mix(CHEWBBACA_EXTRACT_CGMLST.out.versions) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // 8. QC + Summaries @@ -214,7 +248,6 @@ workflow LEGIOVUE { // Create some value channels using `.collect()` ch_quast_report = QUAST.out.report.collect().ifEmpty([]) ch_quast_score = SCORE_QUAST.out.report.collect().ifEmpty([]) - ch_el_gato_report = COMBINE_EL_GATO.out.report.collect().ifEmpty([]) ch_quast_report = QUAST.out.report.collect().ifEmpty([]) ch_allele_stats = CHEWBBACA_ALLELE_CALL.out.statistics.collect().ifEmpty([]) @@ -227,22 +260,18 @@ workflow LEGIOVUE { ch_el_gato_report, ch_allele_stats ) + ch_versions = ch_versions.mix(COMBINE_SAMPLE_DATA.out.versions) - CSVTK_COMBINE( - COMBINE_SAMPLE_DATA.out.tsv + CSVTK_CONCAT_QC_DATA( + COMBINE_SAMPLE_DATA.out.csv .collect{ it[1] } ) + ch_versions = ch_versions.mix(CSVTK_CONCAT_QC_DATA.out.versions) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // 9. Clustering (TO-DO) + // 9. Version Output // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // chewbacca -> reportree -> manual vis (right now) - // As reportree isn't easy to get installed in pipeline - // Its current container is missing PS which nextflow needs - - // ReporTree container and conda env are not going to play nice - // REPORTREE( - // CHEWBBACA_EXTRACT_CGMLST.out.cgmlst, - // ch_metadata - // ) + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) }
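With the hard-coded `publishDir` directives removed from the local modules, publishing and the new `when: task.ext.when == null || task.ext.when` guards are driven from configuration (`conf/modules.config` is already included from `nextflow.config`, and the diff adds the `publish_dir_mode` parameter). A minimal sketch of how such a modules config could wire the two together, assuming nf-core-style `withName` selectors; the patterns and paths shown are illustrative and are not part of this changeset:

```groovy
// Hypothetical excerpt of conf/modules.config (the real file is not shown in this diff).
// `ext.when` feeds the `when: task.ext.when == null || task.ext.when` guard added to each
// process, and `publishDir` replaces the per-module publishDir lines that were removed.
process {
    withName: 'TRIMMOMATIC' {
        publishDir = [
            path: { "${params.outdir}/trimmomatic" },
            mode: params.publish_dir_mode,
            pattern: "*.{fastq.gz,summary.txt}"
        ]
    }
    withName: 'PLOT_EL_GATO_ALLELES' {
        // Example of disabling a process from config instead of workflow logic
        ext.when = { !params.skip_plotting }
        publishDir = [
            path: { "${params.outdir}/el_gato/plots" },
            mode: params.publish_dir_mode,
            pattern: "*_allele_plots.pdf"
        ]
    }
}
```

Keeping the skip logic in `ext.when` closures centralises it in one file, while the `when:` block added to every process simply honours whatever the configuration decides.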
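The `stub:` blocks added throughout the modules pair with the new nf-test files: running with `-stub-run` walks the complete DAG using the placeholder `touch`/`echo` outputs, so channel wiring and publishing can be checked quickly without executing the real tools. A sketch of what a stub-level pipeline test could look like, following the conventions of `tests/main.nf.test`; the test name, tag, and assertions are illustrative assumptions rather than part of this diff:

```groovy
// Hypothetical stub-level pipeline test; `-stub-run` makes Nextflow execute the
// stub: blocks (touch/echo placeholders) instead of the real commands.
nextflow_pipeline {

    name "Stub run of LegioVue"
    script "main.nf"

    test("Stub Data Run Test") {
        tag "stub"
        options "-stub-run"

        when {
            params {
                fastq_dir  = "${projectDir}/tests/test_data/"
                kraken2_db = "s3://genome-idx/kraken/standard_08gb_20240904"
                outdir     = "results"
                min_reads  = 100
            }
        }
        then {
            assert workflow.success
            assert path("$launchDir/results/overall.qc.csv").exists()
        }
    }
}
```

nf-test's `options` directive passes extra command-line flags to Nextflow for that test only, so the same profile and parameters can be reused for both real and stub runs.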