From a2bbed138735d8953ed791a385917a70a6178d4b Mon Sep 17 00:00:00 2001 From: Ian Light Date: Wed, 29 Mar 2023 11:44:41 +0000 Subject: [PATCH 001/198] partial implementation metagenomicprofiling subwor --- CITATIONS.md | 18 +- conf/modules.config | 2 +- conf/test.config | 2 +- docs/development/manual_tests.md | 24 +- modules.json | 25 ++ modules/nf-core/kraken2/kraken2/main.nf | 58 +++++ modules/nf-core/kraken2/kraken2/meta.yml | 75 ++++++ .../krakenuniq/preloadedkrakenuniq/main.nf | 224 ++++++++++++++++++ .../krakenuniq/preloadedkrakenuniq/meta.yml | 78 ++++++ modules/nf-core/malt/run/main.nf | 41 ++++ modules/nf-core/malt/run/meta.yml | 54 +++++ modules/nf-core/maltextract/main.nf | 39 +++ modules/nf-core/maltextract/meta.yml | 51 ++++ modules/nf-core/metaphlan3/metaphlan3/main.nf | 48 ++++ .../nf-core/metaphlan3/metaphlan3/meta.yml | 58 +++++ nextflow.config | 20 +- nextflow_schema.json | 29 ++- subworkflows/local/bamfiltering.nf | 18 +- subworkflows/local/metagenomics_profiling.nf | 186 +++++++++++++++ workflows/eager.nf | 28 ++- 20 files changed, 1046 insertions(+), 32 deletions(-) create mode 100644 modules/nf-core/kraken2/kraken2/main.nf create mode 100644 modules/nf-core/kraken2/kraken2/meta.yml create mode 100644 modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf create mode 100644 modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml create mode 100644 modules/nf-core/malt/run/main.nf create mode 100644 modules/nf-core/malt/run/meta.yml create mode 100644 modules/nf-core/maltextract/main.nf create mode 100644 modules/nf-core/maltextract/meta.yml create mode 100644 modules/nf-core/metaphlan3/metaphlan3/main.nf create mode 100644 modules/nf-core/metaphlan3/metaphlan3/meta.yml create mode 100644 subworkflows/local/metagenomics_profiling.nf diff --git a/CITATIONS.md b/CITATIONS.md index 07bcca065..ec2920423 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,7 +18,7 @@ - [Falco](https://doi.org/10.12688%2Ff1000research.21142.2) - > de Sena Brandine, G., Smith, A.D. (2019) Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Res., 8, 1874. doi: [10.12688%2Ff1000research.21142.2](https://doi.org/10.12688%2Ff1000research.21142.2) + > de Sena Brandine, G., Smith, A.D. (2019). Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Res., 8, 1874. doi: [10.12688%2Ff1000research.21142.2](https://doi.org/10.12688%2Ff1000research.21142.2) - [fastp](https://doi.org/10.1093/bioinformatics/bty560) @@ -44,6 +44,22 @@ > Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. doi: [10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z) +- [MALT](https://www.nature.com/articles/s41559-017-0446-6) + + > Vågene, Å.J., Herbig, A., Campana, M.G., Nelly, M., García, R., Warinner, C., Sabin, S., Spyrou, M.A., Valtueña, A.A., Huson, D., Tuross, N., Bos, K.I. & Krause, J. (2018). Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nat Ecol Evol 2, 520–528. doi: [10.1038/s41559-017-0446-6](https://doi.org/10.1038/s41559-017-0446-6) + +- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) + + > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. + +- [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0) + + > Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. 
KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0 + +- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) + + > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088 + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/conf/modules.config b/conf/modules.config index 3ab0325bc..6f9691471 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -357,7 +357,7 @@ process { enabled: params.bamfiltering_generatemappedfastq ] ext.args = [ - params.metagenomicscreening_input == 'all' ? '' : '-F 4', + params.metagenomics_screening_input == 'all' ? '' : '-F 4', ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.library_id}_mapped" } } diff --git a/conf/test.config b/conf/test.config index 11e19a857..ba4d16f58 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,7 +33,7 @@ params { bamfiltering_mappingquality = 37 // Metagenomic screening - run_metagenomicscreening = false + run_metagenomics_screening = false } diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 8bc17c9f6..b08e26880 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -208,8 +208,8 @@ All possible parameters bamfiltering_savefilteredbam = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomicscreening = false - metagenomicscreening_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + run_metagenomics_screening = false + metagenomics_screening_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED ``` Tests @@ -270,45 +270,45 @@ nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log f ## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with unmapped reads to metagenomics # Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. 
of reads in dumped FASTQ should match approx unmapped reads in results/mapping/*.flagstat
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomicscreening -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening -dump-channels
 
 ## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with mapped only reads going to metagenomics
 # Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should match approx mapped reads in results/mapping/*.flagstat
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomicscreening --metagenomicscreening_input 'mapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening --metagenomics_screening_input 'mapped' -dump-channels
 
 ## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with all reads going to metagenomics
 # Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should match total reads in results/mapping/*.flagstat
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomicscreening --metagenomicscreening_input 'all' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening --metagenomics_screening_input 'all' -dump-channels
 
 ## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with unmapped reads to metagenomics
 # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat. Note: No filtered flagstat expected!
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_screening_input 'unmapped' -dump-channels
 
 ## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ
 # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other FASTQ in the bam_filtering directory. Note: No filtered flagstat expected!
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_screening_input 'unmapped' -dump-channels
 
 ## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with mapped only reads going to metagenomics
 # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should be roughly matching mapped reads as calculated from results/mapping/*.flagstat. Note: No filtered flagstat expected!
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'mapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_screening_input 'mapped' -dump-channels
 
 ## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with all reads going to metagenomics
 # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should be roughly matching total reads as calculated from results/mapping/*.flagstat. Note: No filtered flagstat expected!
 ## Some reads lost, not 100% why; the command looks OK, but it is not just the unmapped reads missing, as more than that are lost
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'all' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_screening_input 'all' -dump-channels
 
 ## Check BAM filtering ONLY length filtering, with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ
 ## Metagenomics with length only
 # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min >= 50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other FASTQ in the bam_filtering directory.
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels --bamfiltering_minreadlength 50
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_screening_input 'unmapped' -dump-channels --bamfiltering_minreadlength 50
 
 ## Check BAM filtering ONLY quality filtering, with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ
 ## Metagenomics with quality only
 # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is not different and RL reads min <= 50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other FASTQ in the bam_filtering directory.
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_screening_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37
 
 ## Check what happens when we skip paired-end merging and send reads to metagenomics
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 --preprocessing_skippairmerging +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_screening_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 --preprocessing_skippairmerging ``` ## Deduplication diff --git a/modules.json b/modules.json index 4402d9af3..c42af3b6e 100644 --- a/modules.json +++ b/modules.json @@ -75,6 +75,31 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "kraken2/kraken2": { + "branch": "master", + "git_sha": "7c695e0147df1157413e06246d9b0094617d3e6b", + "installed_by": ["modules"] + }, + "krakenuniq/preloadedkrakenuniq": { + "branch": "master", + "git_sha": "a6eb17f65b3ee5761c25c075a6166c9f76733cee", + "installed_by": ["modules"] + }, + "malt/run": { + "branch": "master", + "git_sha": "75027bf77472b1f4fd2cdd7e46f83119dfb0f2c6", + "installed_by": ["modules"] + }, + "maltextract": { + "branch": "master", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "installed_by": ["modules"] + }, + "metaphlan3/metaphlan3": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba", diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf new file mode 100644 index 000000000..5901064e7 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -0,0 +1,58 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + val save_output_fastqs + val save_reads_assignment + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment + tuple val(meta), path('*report.txt') , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : "" + def unclassified_option = save_output_fastqs ? 
"--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 000000000..7129fe3a0 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,75 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: boolean + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: boolean + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf new file mode 100644 index 000000000..0cb402f77 --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf @@ -0,0 +1,224 @@ +process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + tag "$meta.id" + label 'process_high' + + conda "bioconda::krakenuniq=1.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.2--pl5321h19e8d03_0': + 'quay.io/biocontainers/krakenuniq:1.0.2--pl5321h19e8d03_0' }" + + input: + tuple val(meta), path(fastqs) + path db + val ram_chunk_size + val save_output_fastqs + val report_file + val save_output + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classified.txt') , optional:true, emit: classified_assignment + tuple val(meta), path('*report.txt') , emit: report + + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : '' + def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + if (meta.single_end) { + """ + krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result=\$1 + # Strip any file extensions. + echo "\${result%%.*}" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + PREFIX="\$(strip_suffix "\${FASTQ}")" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + $args2 \\ + "\${FASTQ}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. + result="\${result%_}" + echo "\${result%.}" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } + + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_fastqs ? 
"--unclassified-out ${unclassified}" : '' + def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + if (meta.single_end) { + """ + echo krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result=\$1 + # Strip any file extensions. + echo "\${result%%.*}" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + echo "\${FASTQ}" + PREFIX="\$(strip_suffix "\${FASTQ}")" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + $args2 \\ + "\${FASTQ}" + + touch "\${PREFIX}.classified.fastq.gz" + touch "\${PREFIX}.krakenuniq.classified.txt" + touch "\${PREFIX}.krakenuniq.report.txt" + touch "\${PREFIX}.unclassified.fastq.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + echo krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. + result="\${result%_}" + echo "\${result%.}" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + echo "\${FASTQ[@]}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + + touch "\${PREFIX}.classified_1.fastq.gz" "\${PREFIX}.classified_2.fastq.gz" + touch "\${PREFIX}.krakenuniq.classified.txt" + touch "\${PREFIX}.krakenuniq.report.txt" + touch "\${PREFIX}.unclassified_1.fastq.gz" "\${PREFIX}.unclassified_2.fastq.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml new file mode 100644 index 000000000..4ac645c55 --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml @@ -0,0 +1,78 @@ +name: "krakenuniq_preloadedkrakenuniq" +description: Classifies metagenomic sequence data using unique k-mer counts +keywords: + - classify + - metagenomics + - kmers + - fastq + - db +tools: + - "krakenuniq": + description: "Metagenomics classifier with unique k-mer counting for more specific results" + homepage: https://github.com/fbreitwieser/krakenuniq + documentation: https://github.com/fbreitwieser/krakenuniq + doi: 10.1186/s13059-018-1568-0 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fastqs: + type: file + description: List of input FastQ files + - db: + type: directory + description: KrakenUniq database + - ram_chunk_size: + type: val + description: Amount of maximum amount of RAM each chunk of database that should be loaded at any one time + pattern: "*GB" + - save_output_fastqs: + type: boolean + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: boolean + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the KrakenUniq database. + pattern: "*.fastq.gz" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the KrakenUniq database. + pattern: "*.fastq.gz" + - classified_assignment: + type: file + description: | + KrakenUniq output file indicating the taxonomic assignment of + each input read ## DOUBLE CHECK!! + - report: + type: file + description: | + KrakenUniq report containing stats about classified + and not classifed reads. + pattern: "*.report.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@mjamy" + - "@Midnighter" diff --git a/modules/nf-core/malt/run/main.nf b/modules/nf-core/malt/run/main.nf new file mode 100644 index 000000000..61b592dcb --- /dev/null +++ b/modules/nf-core/malt/run/main.nf @@ -0,0 +1,41 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda "bioconda::malt=0.61" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.61--hdfd78af_0' : + 'quay.io/biocontainers/malt:0.61--hdfd78af_0' }" + + input: + tuple val(meta), path(fastqs) + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + malt-run \\ + -t $task.cpus \\ + -v \\ + -o . \\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + --index $index/ |&tee ${prefix}-malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/malt/run/meta.yml b/modules/nf-core/malt/run/meta.yml new file mode 100644 index 000000000..2a7944642 --- /dev/null +++ b/modules/nf-core/malt/run/meta.yml @@ -0,0 +1,54 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. 
+keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf + + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "*-malt-run.log" + +authors: + - "@jfy133" diff --git a/modules/nf-core/maltextract/main.nf b/modules/nf-core/maltextract/main.nf new file mode 100644 index 000000000..d44b54c60 --- /dev/null +++ b/modules/nf-core/maltextract/main.nf @@ -0,0 +1,39 @@ +process MALTEXTRACT { + + label 'process_medium' + + conda "bioconda::hops=0.35" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hops:0.35--hdfd78af_1' : + 'quay.io/biocontainers/hops:0.35--hdfd78af_1' }" + + input: + path rma6 + path taxon_list + path ncbi_dir + + output: + path "results" , emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + MaltExtract \\ + -Xmx${task.memory.toGiga()}g \\ + -p $task.cpus \\ + -i ${rma6.join(' ')} \\ + -t $taxon_list \\ + -r $ncbi_dir \\ + -o results/ \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + maltextract: \$(MaltExtract --help | head -n 2 | tail -n 1 | sed 's/MaltExtract version//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/maltextract/meta.yml b/modules/nf-core/maltextract/meta.yml new file mode 100644 index 000000000..c365a7c5e --- /dev/null +++ b/modules/nf-core/maltextract/meta.yml @@ -0,0 +1,51 @@ +name: maltextract +description: Tool for evaluation of MALT results for true positives of ancient metagenomic taxonomic screening +keywords: + - malt + - MaltExtract + - HOPS + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome + - authentication + - damage + - edit distance +tools: + - maltextract: + description: Java tool to work with ancient metagenomics + homepage: https://github.com/rhuebler/hops + documentation: https://github.com/rhuebler/hops + tool_dev_url: https://github.com/rhuebler/hops + doi: "10.1186/s13059-019-1903-0" + licence: ["GPL 3"] + +input: + - rma6: + type: file + description: RMA6 files from MALT + pattern: "*.rma6" + - taxon_list: + type: file + description: List of target taxa to evaluate + pattern: "*.txt" + - ncbi_dir: + type: directory + description: Directory containing NCBI taxonomy map and tre files + pattern: "${ncbi_dir}/" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + 
- results: + type: directory + description: Directory containing MaltExtract text results files + pattern: "*.rma6" + +authors: + - "@jfy133" diff --git a/modules/nf-core/metaphlan3/metaphlan3/main.nf b/modules/nf-core/metaphlan3/metaphlan3/main.nf new file mode 100644 index 000000000..34f8705cc --- /dev/null +++ b/modules/nf-core/metaphlan3/metaphlan3/main.nf @@ -0,0 +1,48 @@ +process METAPHLAN3_METAPHLAN3 { + tag "$meta.id" + label 'process_high' + + conda "bioconda::metaphlan=3.0.12" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' : + 'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }" + + input: + tuple val(meta), path(input) + path metaphlan_db + + output: + tuple val(meta), path("*_profile.txt") , emit: profile + tuple val(meta), path("*.biom") , emit: biom + tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_type = ("$input".endsWith(".fastq.gz") || "$input".endsWith(".fq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" + def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input" + def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt" + + """ + BT2_DB=`find -L "${metaphlan_db}" -name "*rev.1.bt2" -exec dirname {} \\;` + + metaphlan \\ + --nproc $task.cpus \\ + $input_type \\ + $input_data \\ + $args \\ + $bowtie2_out \\ + --bowtie2db \$BT2_DB \\ + --biom ${prefix}.biom \\ + --output_file ${prefix}_profile.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaphlan3/metaphlan3/meta.yml b/modules/nf-core/metaphlan3/metaphlan3/meta.yml new file mode 100644 index 000000000..659d83a95 --- /dev/null +++ b/modules/nf-core/metaphlan3/metaphlan3/meta.yml @@ -0,0 +1,58 @@ +name: metaphlan3_metaphlan3 +description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data. +keywords: + - metagenomics + - classification + - fastq + - bam + - fasta +tools: + - metaphlan3: + description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance + homepage: https://huttenhower.sph.harvard.edu/metaphlan/ + documentation: https://github.com/biobakery/MetaPhlAn + doi: "10.7554/eLife.65088" + licence: ["MIT License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: Metaphlan 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPHlAn marker database) and intermediate bowtie2 alignment files (bowtie2out) + pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}" + - metaphlan_db: + type: file + description: | + Directory containing pre-downloaded and uncompressed MetaPhlAn3 database downloaded from: http://cmprod1.cibio.unitn.it/biobakery3/metaphlan_databases/. + Note that you will also need to specify `--index` and the database version name (e.g. 'mpa_v31_CHOCOPhlAn_201901') in your module.conf ext.args for METAPHLAN3_METAPHLAN3! + pattern: "*/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - profile: + type: file + description: Tab-separated output file of the predicted taxon relative abundances + pattern: "*.{txt}" + - biom: + type: file + description: General-use format for representing biological sample by observation contingency tables + pattern: "*.{biom}" + - bowtie2out: + type: file + description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPHlAn marker database ( not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3 ) + pattern: "*.{bowtie2out.txt}" + +authors: + - "@MGordon09" diff --git a/nextflow.config b/nextflow.config index ca382c9e1..a67e9b2de 100644 --- a/nextflow.config +++ b/nextflow.config @@ -109,8 +109,24 @@ params { bamfiltering_savefilteredbams = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomicscreening = false - metagenomicscreening_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + run_metagenomics_screening = false + metagenomics_screening_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + metagenomics_profiling_tool = '' + metagenomics_profiling_database = '' + metagenomics_profiling_krakenuniq_ram_chunk_size = '16G' + metagenomics_profiling_krakenuniq_save_reads = false + metagenomics_profiling_krakenuniq_save_readclassifications = false + metagenomics_profiling_kraken2_save_reads = false + metagenomics_profiling_kraken2_save_readclassification = false + metagenomics_profiling_kraken2_save_minimizers = false + metagenomics_profiling_malt_mode = 'BlastN' + metagenomics_profiling_malt_alignment_mode = 'SemiGlobal' + metagenomics_profiling_malt_save_reads = false + metagenomics_profiling_malt_sam_output = false + metagenomics_profiling_malt_percent_identity = 85 + metagenomics_profiling_malt_top_percent = 1 + metagenomics_profiling_malt_max_queries = 100 + metagenomics_profiling_malt_memory_mode = 'load' // Deduplication options skip_deduplication = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 26cfca816..5f1ebc08d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -544,7 +544,7 @@ "bamfiltering_retainunmappedgenomicbam": { "type": "boolean", "description": "Specify to retain unmapped reads in the BAM file used for downstream genomic 
analyses.",
-          "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomicscreening_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`",
+          "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_screening_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`",
           "fa_icon": "fas fa-piggy-bank"
         },
         "bamfiltering_generateunmappedfastq": {
@@ -574,19 +574,42 @@
       "description": "Options related to metagenomic screening.",
       "default": "",
       "properties": {
-        "run_metagenomicscreening": {
+        "run_metagenomics_screening": {
           "type": "boolean",
           "description": "Turn on metagenomic screening of mapped, unmapped, or all reads.",
           "fa_icon": "fas fa-power-off",
           "help_text": "Turns on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline."
         },
-        "metagenomicscreening_input": {
+        "metagenomics_screening_input": {
           "type": "string",
           "default": "unmapped",
           "description": "Specify which type of reads to go into metagenomic screening.",
           "enum": ["unmapped", "mapped", "all"],
           "fa_icon": "fas fa-hand-pointer",
           "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g., when doing competitive mapping against a genomic reference of multiple genomes and wishing to apply LCA correction), or all reads.\n\n> \u26a0\ufe0f If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in the results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`"
+        },
+        "metagenomics_profiling_tool": {
+          "type": "string",
+          "default": "",
+          "description": "Specify which tool to use for metagenomic profiling and screening.",
+          "enum": ["malt", "metaphlan3", "kraken2", "krakenuniq"],
+          "fa_icon": "fas fa-hand-pointer",
+          "help_text": "Select which tool to run metagenomics profiling with on the designated metagenomics_screening_input. These tools behave vastly differently, as they perform read profiling using different methods, and they yield vastly different results."
+        },
+        "metagenomics_profiling_database": {
+          "type": "string",
+          "format": "directory-path",
+          "default": "",
+          "description": "Specify a database directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.",
+          "fa_icon": "fas fa-hand-pointer",
+          "help_text": "Specify which metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. 
These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." + }, + "metagenomics_profiling_malt_group_size": { + "type": "integer", + "default": 0, + "description": "Define group sizes for running multiple fastq files into malt.", + "fa_icon": "fas fa-hand-pointer", + "help_text": "Very large fastq files or many fastq files run through malt at the same time can lead to excessively long runtimes. This parameter allows for parallelization of malt runs. Please note, malt is resource heavy and setting this value above the default will spawn N/metagenomics_profiling_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." } }, "fa_icon": "fas fa-search" diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index be77b58f7..b5e587701 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -69,19 +69,19 @@ workflow FILTER_BAM { // // Generate unmapped bam (no additional filtering) if the unmapped bam OR unmapped for metagneomics selected - if ( params.bamfiltering_generateunmappedfastq || ( params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) ) { + if ( params.bamfiltering_generateunmappedfastq || ( params.run_metagenomics_screening && params.metagenomics_screening_input == 'unmapped' ) ) { SAMTOOLS_FASTQ_UNMAPPED ( bam.map{[ it[0], it[1] ]}, false ) ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_UNMAPPED.out.versions.first() ) } // Solution to the Andrades Valtueña-Light Problem: mapped bam for metagenomics (with options for quality- and length filtered) - if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' ) ) ) { + if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomics_screening && ( params.metagenomics_screening_input == 'mapped' || params.metagenomics_screening_input == 'all' ) ) ) { SAMTOOLS_FASTQ_MAPPED ( bam.map{[ it[0], it[1] ]}, false ) ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_MAPPED.out.versions.first() ) } - if ( ( params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { + if ( ( params.run_metagenomics_screening && params.metagenomics_screening_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq .mix(SAMTOOLS_FASTQ_UNMAPPED.out.singleton) .mix(SAMTOOLS_FASTQ_UNMAPPED.out.other) @@ -96,7 +96,7 @@ workflow FILTER_BAM { } // TODO: see request https://github.com/nf-core/eager/issues/945 - if ( ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' ) ) && params.preprocessing_skippairmerging ) { + if ( ( params.run_metagenomics_screening && ( params.metagenomics_screening_input == 'mapped' || params.metagenomics_screening_input == 'all' ) ) && params.preprocessing_skippairmerging ) { ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq .mix(SAMTOOLS_FASTQ_MAPPED.out.singleton) .mix(SAMTOOLS_FASTQ_MAPPED.out.other) @@ -111,15 +111,15 @@ workflow FILTER_BAM { } // Routing for metagenomic screening -> first accounting for paired-end mapping, then merged mapping, then no metagenomics - if ( ( 
params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { + if ( ( params.run_metagenomics_screening && params.metagenomics_screening_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads - } else if ( ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' ) ) && params.preprocessing_skippairmerging ) { + } else if ( ( params.run_metagenomics_screening && ( params.metagenomics_screening_input == 'mapped' || params.metagenomics_screening_input == 'all' ) ) && params.preprocessing_skippairmerging ) { ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads - } else if ( params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) { + } else if ( params.run_metagenomics_screening && params.metagenomics_screening_input == 'unmapped' ) { ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_UNMAPPED.out.other - } else if ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' )) { + } else if ( params.run_metagenomics_screening && ( params.metagenomics_screening_input == 'mapped' || params.metagenomics_screening_input == 'all' )) { ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_MAPPED.out.other - } else if ( !params.run_metagenomicscreening ) { + } else if ( !params.run_metagenomics_screening ) { ch_fastq_for_metagenomics = Channel.empty() } diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf new file mode 100644 index 000000000..d031923d4 --- /dev/null +++ b/subworkflows/local/metagenomics_profiling.nf @@ -0,0 +1,186 @@ +// +// Complexity filtering and metagenomics screening of sequencing reads +// + +// Much taken from nf-core/taxprofile subworkflows/local/profiling.nf + +include { MALT_RUN } from '../../modules/nf-core/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' +include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main' +include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf-core/metaphlan3/metaphlan3/main' + +workflow METAGENOMICS_PROFILING { + + take: + reads // channel: [ [ meta ] , [ reads ] ] + database // channel: [ [ meta ] , path ] + + main: + + ch_versions = Channel.empty() + ch_raw_classifications = Channel.empty() + ch_raw_profiles = Channel.empty() + ch_multiqc_files = Channel.empty() + // TODO: malt, metaphylan, kraken2, krakenuniq + // TODO: maltextract, krakenparse + + /* + PREPARE PROFILER INPUT CHANNELS & RUN PROFILING + */ + + // Each tool as a slightly different input structure and generally separate + // input channels for reads vs database. We restructure the channel tuple + // for each tool and make liberal use of multiMap to keep reads/database + // channel element order in sync with each other + + if ( params.metagenomics_profiling_tool == 'malt' ) { + + if ( params.metagenomics_profiling_malt_group_size > 0 ) { + ch_input_for_malt = reads + .map { + meta, reads -> + + // Reset entire input meta for MALT to just database name, + // as we don't run run on a per-sample basis due to huge datbaases + // so all samples are in one run and so sample-specific metadata + // unnecessary. Set as database name to prevent `null` job ID and prefix. 
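+                    // Illustrative sketch (hypothetical values): an element such as
+                    // [ [id:'sample1'], reads ] becomes [ [id: database], reads, database ],
+                    // so the groupTuple(by: [0,2], size: ..., remainder: true) below can batch
+                    // up to metagenomics_profiling_malt_group_size samples into one MALT run.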
+ + def temp_meta = [ id: database ] + + // Combine reduced sample metadata with updated database parameters metadata, + // make sure id is db_name for publishing purposes. + + [ temp_meta, reads, database ] + + } + .groupTuple(by: [0,2], size: params.metagenomics_profiling_malt_group_size, remainder: true) + .multiMap { + meta, reads, database -> + reads: [ meta, reads ] + database: database + } + } + + else { + ch_input_for_malt = reads + .map { + meta, reads -> + + // Reset entire input meta for MALT to just database name, + // as we don't run run on a per-sample basis due to huge datbaases + // so all samples are in one run and so sample-specific metadata + // unnecessary. Set as database name to prevent `null` job ID and prefix. + + def temp_meta = [ id: database ] + + // Combine reduced sample metadata with updated database parameters metadata, + // make sure id is db_name for publishing purposes. + + [ temp_meta, reads, database ] + + } + .groupTuple(by: [0,2]) + .multiMap { + meta, reads, database -> + reads: [ meta, reads ] + database: database + } + } + + ch_input_for_malt.reads.dump() + ch_input_for_malt.database.dump() + + // MALT: We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database, unless otherwise specified + + MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database ) + + ch_maltrun_for_megan = MALT_RUN.out.rma6 + .transpose() + .map{ + meta, rma -> + // re-extract meta from file names, use filename without rma to + // ensure we keep paired-end information in downstream filenames + // when no pair-merging + def meta_new = meta.clone() + meta_new['db_name'] = meta.id + meta_new['id'] = rma.baseName + [ meta_new, rma ] + } + + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan ) + } + + if ( params.metagenomics_profiling_tool == 'metaphlan3' ) { + + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}." 
+ !it[0].is_fasta + } + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + METAPHLAN3_METAPHLAN3 ( ch_input_for_metaphlan3.reads, database ) + ch_versions = ch_versions.mix( METAPHLAN3_METAPHLAN3.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3_METAPHLAN3.out.profile ) + + } + + if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { + ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq + .map { + meta, reads, db_meta, db -> + [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] + } + .groupTuple(by: [0,2,3]) + .multiMap { + single_meta, reads, db_meta, db -> + reads: [ single_meta + db_meta, reads.flatten() ] + db: db + } + // Hardcode to _always_ produce the report file (which is our basic output, and goes into) + KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_profiling_krakenuniq_ram_chunk_size, params.metagenomics_profiling_krakenuniq_save_reads, true, params.metagenomics_profiling_krakenuniq_save_readclassifications ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + + } + + if ( params.metagenomics_profiling_tool == 'kraken2' ) { + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .map { + meta, reads, db_meta, db -> + [ meta, reads, db_meta, db ] + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_profiling_kraken2_save_reads, params.metagenomics_profiling_kraken2_save_readclassification ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( + KRAKEN2_KRAKEN2.out.report + // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use. + // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names. + .map { meta, report -> [meta + [tool: 'kraken2'], report]} + ) + + } + + emit: + classifications = ch_raw_classifications + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} diff --git a/workflows/eager.nf b/workflows/eager.nf index 863480701..422f6d5fd 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -17,13 +17,26 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' 
} -// Check failing parameter combinations +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Check failing parameter combinations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } // TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} -// Report possible warnings +// TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing +if ( params.run_metagenomics_screening && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening") } + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Report possible warnings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. Adapter trimming will be skipped!") /* @@ -54,6 +67,7 @@ include { PREPROCESSING } from '../subworkflows/local/preprocessing' include { MAP } from '../subworkflows/local/map' include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' include { DEDUPLICATE } from '../subworkflows/local/deduplicate' +include { METAGENOMICS_PROFILING } from '../subworkflows/local/metagenomics_profiling' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -172,7 +186,7 @@ workflow EAGER { // SUBWORKFLOW: bam filtering (length, mapped/unmapped, quality etc.) // - if ( params.run_bamfiltering || params.run_metagenomicscreening ) { + if ( params.run_bamfiltering || params.run_metagenomics_screening ) { ch_mapped_for_bamfilter = MAP.out.bam .join(MAP.out.bai) @@ -215,6 +229,14 @@ workflow EAGER { ch_dedupped_flagstat = Channel.empty() } + // + // SUBWORKFLOW: metagenomics screening + // + //TODO: finish and figure out how exactly to call with proper database (check via a helper function?) 
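+    // A possible shape for the helper hinted at in the TODO above (an illustrative
+    // sketch only, with a hypothetical name; not part of this commit):
+    //     def resolveMetagenomicsDatabase() {
+    //         return file( params.metagenomics_profiling_database, checkIfExists: true )
+    //     }
+    // Resolving and validating the path here would let the subworkflow assume a valid database.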
+ if ( params.run_metagenomics_screening ) { + METAGENOMICS_PROFILING ( ch_bamfiltered_for_metagenomics, params.metagenomics_profiling_database ) // TODO: implement full metagenomics screening main subworkflow + } + // that then calls complexityfilter, profiling, postprocessing // // MODULE: MultiQC From b09072f641c2b8a9af0cc48b455b8a0efab7ab40 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Wed, 29 Mar 2023 14:03:59 +0000 Subject: [PATCH 002/198] 'full' metagenomics profiling implementation --- conf/modules.config | 50 ++++++++ nextflow.config | 22 ++-- nextflow_schema.json | 126 ++++++++++++++++++- subworkflows/local/metagenomics_profiling.nf | 38 ++---- workflows/eager.nf | 2 +- 5 files changed, 191 insertions(+), 47 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6f9691471..de135afda 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -423,4 +423,54 @@ process { pattern: '*.flagstat' ] } + + withName: MALT_RUN { + ext.args = [ + "-m ${params.metagenomics_profiling_malt_mode}", + "-at ${params.metagenomics_profiling_malt_alignment_mode}", + "-top ${params.metagenomics_profiling_malt_top_percent}", + "-id ${params.metagenomics_profiling_malt_min_percent_identity}", + "-mq ${params.metagenomics_profiling_malt_max_queries}", + "--memoryMode ${params.metagenomics_profiling_malt_memory_mode}", + params.metagenomics_profiling_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_profiling_malt_min_support_percent}" : "-sup ${params.metagenomics_profiling_malt_min_support_reads}", + params.metagenomics_profiling_malt_sam_output ? "-a . -f SAM" : "", + params.metagenomics_profiling_malt_save_reads ? "--alignments ./ -za false" : "" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, + mode: params.publish_dir_mode, + pattern: '*.{rma6,log,sam}' + ] + } + + withName: KRAKEN2_KRAKEN2 { + ext.prefix = params.perform_runmerging ? "${meta.id}.kraken2" : "${meta.id}_${meta.run_accession}.kraken2" + ext.args = [ + params.metagenomics_profiling_kraken2_save_minimizers ? "-report-minimizer-data" : "" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/profiling/kraken2/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,fastq.gz}' + ] + } + + withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/profiling/krakenuniq/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,fastq.gz}' + ] + } + + withName: METAPHLAN3_METAPHLAN3 { + ext.prefix = params.perform_runmerging ? 
{ "${meta.id}.metaphlan3" } : { "${meta.id}_${meta.run_accession}.metaphlan3" } + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/profiling/metaphlan3/" }, + mode: params.publish_dir_mode, + pattern: '*.{biom,txt}' + ] + } + + } diff --git a/nextflow.config b/nextflow.config index a67e9b2de..3814b7c4b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -109,24 +109,28 @@ params { bamfiltering_savefilteredbams = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomics_screening = false - metagenomics_screening_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED - metagenomics_profiling_tool = '' - metagenomics_profiling_database = '' - metagenomics_profiling_krakenuniq_ram_chunk_size = '16G' + run_metagenomics_screening = false + metagenomics_screening_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + metagenomics_profiling_tool = null + metagenomics_profiling_database = null metagenomics_profiling_krakenuniq_save_reads = false - metagenomics_profiling_krakenuniq_save_readclassifications = false + metagenomics_profiling_krakenuniq_save_read_classifications = false + metagenomics_profiling_krakenuniq_ram_chunk_size = '16G' metagenomics_profiling_kraken2_save_reads = false metagenomics_profiling_kraken2_save_readclassification = false metagenomics_profiling_kraken2_save_minimizers = false metagenomics_profiling_malt_mode = 'BlastN' metagenomics_profiling_malt_alignment_mode = 'SemiGlobal' - metagenomics_profiling_malt_save_reads = false - metagenomics_profiling_malt_sam_output = false - metagenomics_profiling_malt_percent_identity = 85 + metagenomics_profiling_malt_min_percent_identity = 85 metagenomics_profiling_malt_top_percent = 1 metagenomics_profiling_malt_max_queries = 100 metagenomics_profiling_malt_memory_mode = 'load' + metagenomics_profiling_malt_min_support_mode = 'percent' + metagenomics_profiling_malt_min_support_percent = 0.01 + metagenomics_profiling_malt_min_support_reads = 1 + metagenomics_profiling_malt_sam_output = false + metagenomics_profiling_malt_save_reads = false + metagenomics_profiling_malt_group_size = 0 // Deduplication options skip_deduplication = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 5f1ebc08d..0a4368af9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -590,26 +590,140 @@ }, "metagenomics_profiling_tool": { "type": "string", - "default": "", "description": "Specify which tool to use for metagenomic profiling and screening.", "enum": ["malt", "metaphlan2", "kraken2", "krakenuniq"], - "fa_icon": "fas fa-hand-pointer", + "fa_icon": "fas fa-toolbox", "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different reuslts." }, "metagenomics_profiling_database": { "type": "string", "format": "directory-path", - "default": "", "description": "Specify a databse directory to run metagenomics profiling on. 
In the case of kraken2, this can be a tar.gz of the directory.",
-            "fa_icon": "fas fa-hand-pointer",
+            "fa_icon": "fas fa-database",
             "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size."
         },
+        "metagenomics_profiling_krakenuniq_save_reads": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of KrakenUniq-aligned reads",
+            "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`"
+        },
+        "metagenomics_profiling_krakenuniq_save_read_classifications": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
+            "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that that read received.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
+        },
+        "metagenomics_profiling_krakenuniq_ram_chunk_size": {
+            "type": "string",
+            "default": "16G",
+            "description": "Specify how large to chunk the database when loading it into memory for KrakenUniq",
+            "fa_icon": "fas fa-database",
+            "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database to with this parameter, which is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n"
+        },
+        "metagenomics_profiling_kraken2_save_reads": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of Kraken2-aligned reads",
+            "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - kraken2: `--classified-out` and `--unclassified-out`"
+        },
+        "metagenomics_profiling_kraken2_save_readclassification": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of Kraken2 per-read taxonomic assignment file",
+            "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that that read received.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`"
+        },
+        "metagenomics_profiling_kraken2_save_minimizers": {
+            "type": "boolean",
+            "description": "Turn on saving minimizer information in the kraken2 report, thus increasing it to an eight-column layout.",
+            "fa_icon": "fas fa-save",
+            "help_text": "Turn on saving minimizer information in the kraken2 report, thus increasing it to an eight-column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command."
+        },
+        "metagenomics_profiling_malt_mode": {
+            "type": "string",
+            "default": "BlastN",
+            "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.",
+            "fa_icon": "fas fa-align-left",
+            "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n",
+            "enum": ["BlastN", "BlastP", "BlastX"]
+        },
+        "metagenomics_profiling_malt_alignment_mode": {
+            "type": "string",
+            "default": "SemiGlobal",
+            "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.",
+            "fa_icon": "fas fa-align-center",
+            "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST-like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`",
+            "enum": ["Local", "SemiGlobal"]
+        },
+        "metagenomics_profiling_malt_min_percent_identity": {
+            "type": "integer",
+            "default": 85,
+            "description": "Percent identity value threshold for MALT.",
+            "fa_icon": "fas fa-id-card",
+            "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`"
+        },
+        "metagenomics_profiling_malt_top_percent": {
+            "type": "integer",
+            "default": 1,
+            "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).",
+            "fa_icon": "fas fa-percent",
+            "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit score is within\n10% of the best score for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`"
+        },
+        "metagenomics_profiling_malt_min_support_mode": {
+            "type": "string",
+            "default": "percent",
+            "description": "Specify whether to use a percentage or the raw number of reads as the minimum support required for a taxon to be retained for MALT. Options: 'percent', 'reads'.",
+            "fa_icon": "fas fa-drumstick-bite",
+            "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`",
+            "enum": ["percent", "reads"]
+        },
+        "metagenomics_profiling_malt_min_support_percent": {
+            "type": "number",
+            "default": 0.01,
+            "description": "Specify the minimum percentage of a sample's total reads a taxon is required to have to be retained for MALT.",
+            "fa_icon": "fas fa-percentage",
+            "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`"
+        },
+        "metagenomics_profiling_malt_min_support_reads": {
+            "type": "integer",
+            "default": 1,
+            "description": "Specify the minimum number of reads a taxon is required to have to be retained for MALT. Not compatible with --malt_min_support_mode 'percent'.",
+            "fa_icon": "fas fa-sort-numeric-up-alt",
+            "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n"
+        },
+        "metagenomics_profiling_malt_max_queries": {
+            "type": "integer",
+            "default": 100,
+            "description": "Specify the maximum number of queries a read can have for MALT.",
+            "fa_icon": "fas fa-phone",
+            "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`"
+        },
+        "metagenomics_profiling_malt_memory_mode": {
+            "type": "string",
+            "default": "load",
+            "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as it can be very slow. Options: 'load', 'page', 'map'.",
+            "fa_icon": "fas fa-memory",
+            "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior to seed look-up; this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look-up to begin before the\nentire database is loaded. Note that the page and map modes do not work properly with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`",
+            "enum": ["load", "page", "map"]
+        },
+        "metagenomics_profiling_malt_sam_output": {
+            "type": "boolean",
+            "description": "Specify to also produce gzipped SAM alignment files. Note these include both aligned and unaligned reads, and will result in very large file sizes.",
+            "fa_icon": "fas fa-file-alt",
+            "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to the more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`"
+        },
+        "metagenomics_profiling_malt_save_reads": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of MALT-aligned reads",
+            "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produced by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`"
+        },
         "metagenomics_profiling_malt_group_size": {
             "type": "integer",
             "default": 0,
             "description": "Define group sizes for running multiple fastq files into malt.",
-            "fa_icon": "fas fa-hand-pointer",
-            "help_text": "Very large fastq files or many fastq files run through malt at the same time can lead to excessively long runtimes. This parameter allows for parallelization of malt runs.
Please note, malt is resource heavy and setting this value above the default will spawn N/metagenomics_profiling_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." + "fa_icon": "fas fa-barcode", + "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_profiling_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." } }, "fa_icon": "fas fa-search" diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index d031923d4..0cf02795c 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -87,11 +87,8 @@ workflow METAGENOMICS_PROFILING { } } - ch_input_for_malt.reads.dump() - ch_input_for_malt.database.dump() - // MALT: We groupTuple to have all samples in one channel for MALT as database - // loading takes a long time, so we only want to run it once per database, unless otherwise specified + // loading takes a long time, so we only want to run it once per database, unless otherwise specified (eg grouping samples) MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database ) @@ -108,25 +105,14 @@ workflow METAGENOMICS_PROFILING { [ meta_new, rma ] } - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log ) } if ( params.metagenomics_profiling_tool == 'metaphlan3' ) { - ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}." 
- !it[0].is_fasta - } - .multiMap { - it -> - reads: [it[0] + it[2], it[1]] - db: it[3] - } - - METAPHLAN3_METAPHLAN3 ( ch_input_for_metaphlan3.reads, database ) + METAPHLAN3_METAPHLAN3 ( reads , database ) ch_versions = ch_versions.mix( METAPHLAN3_METAPHLAN3.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3_METAPHLAN3.out.profile ) @@ -145,27 +131,17 @@ workflow METAGENOMICS_PROFILING { db: db } // Hardcode to _always_ produce the report file (which is our basic output, and goes into) - KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_profiling_krakenuniq_ram_chunk_size, params.metagenomics_profiling_krakenuniq_save_reads, true, params.metagenomics_profiling_krakenuniq_save_readclassifications ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( reads , database , params.metagenomics_profiling_krakenuniq_ram_chunk_size, params.metagenomics_profiling_krakenuniq_save_reads, true, params.metagenomics_profiling_krakenuniq_save_read_classifications ) ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) } if ( params.metagenomics_profiling_tool == 'kraken2' ) { - ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .map { - meta, reads, db_meta, db -> - [ meta, reads, db_meta, db ] - } - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_profiling_kraken2_save_reads, params.metagenomics_profiling_kraken2_save_readclassification ) + KRAKEN2_KRAKEN2 ( reads, database, params.metagenomics_profiling_kraken2_save_reads, params.metagenomics_profiling_kraken2_save_readclassification ) ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) @@ -179,8 +155,8 @@ workflow METAGENOMICS_PROFILING { } emit: + versions = ch_versions // channel: [ versions.yml ] classifications = ch_raw_classifications profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom - versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } diff --git a/workflows/eager.nf b/workflows/eager.nf index 422f6d5fd..b606fa1de 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -29,7 +29,7 @@ if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mapping if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} // TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing -if ( params.run_metagenomics_screening && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening") } +if ( params.run_metagenomics_screening && ! 
params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 34a3014e87e43d23e582db9dc0704488b86136d4 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 31 Mar 2023 12:17:56 +0200 Subject: [PATCH 003/198] add metagenomics subworkflow --- subworkflows/local/metagenomics.nf | 44 ++++++++++++++++++++++++++++++ workflows/eager.nf | 40 ++++++++------------------- 2 files changed, 56 insertions(+), 28 deletions(-) create mode 100644 subworkflows/local/metagenomics.nf diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf new file mode 100644 index 000000000..e0fb72e58 --- /dev/null +++ b/subworkflows/local/metagenomics.nf @@ -0,0 +1,44 @@ +include { METAGENOMICS_COMPLEXITYFILTER } from './metagenomics_complexityfilter' +include { METAGENOMICS_PROFILING } from './metagenomics_profiling' + +workflow METAGENOMICS { + take: ch_bamfiltered_for_metagenomics + + main: + // Define channels + ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_bamfiltered_for_metagenomics = ch_bamfiltered_for_metagenomics + .map{ meta, fastq -> + [meta+['single_end':true], fastq] + } + + // + // Run the complexity filter subworkflow + // + + if ( params.run_metagenomics_complexityfiltering ) { + METAGENOMICS_COMPLEXITYFILTER( ch_bamfiltered_for_metagenomics ) + ch_reads_for_metagenomics = METAGENOMICS_COMPLEXITYFILTER.out.fastq + ch_versions = ch_versions.mix(METAGENOMICS_COMPLEXITYFILTER.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(METAGENOMICS_COMPLEXITYFILTER.out.fastq.collect{it[1]}.ifEmpty([])) + } else { + ch_reads_for_metagenomics = ch_bamfiltered_for_metagenomics + } + + // + // Run the profiling subworkflow + // + + database = params.metagenomics_profiling_database + + METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) + ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) + + emit: + ch_versions = ch_versions + ch_multiqc_files = ch_multiqc_files + + +} \ No newline at end of file diff --git a/workflows/eager.nf b/workflows/eager.nf index 8e1becdc6..3a49d3b50 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -73,14 +73,13 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // TODO rename to active: index_reference, filter_bam etc. 
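+// (Illustrative aside on the TODO above, not part of the commit: such a rename could be
+// staged with include aliases, e.g. `include { FILTER_BAM as BAM_FILTER } from './bamfiltering'`,
+// keeping old call sites valid while subworkflows are renamed one at a time.)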
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
-include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing'
-include { PREPROCESSING } from '../subworkflows/local/preprocessing'
-include { MAP } from '../subworkflows/local/map'
-include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf'
-include { DEDUPLICATE } from '../subworkflows/local/deduplicate'
-include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter'
-include { METAGENOMICS_PROFILING } from '../subworkflows/local/metagenomics_profiling'
+include { INPUT_CHECK        } from '../subworkflows/local/input_check'
+include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing'
+include { PREPROCESSING      } from '../subworkflows/local/preprocessing'
+include { MAP                } from '../subworkflows/local/map'
+include { FILTER_BAM         } from '../subworkflows/local/bamfiltering.nf'
+include { DEDUPLICATE        } from '../subworkflows/local/deduplicate'
+include { METAGENOMICS       } from '../subworkflows/local/metagenomics'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -247,30 +246,15 @@ workflow EAGER {
     }
 
     //
-    // Section: Metagenomics screening
+    // Section: Metagenomics
     //
     //TODO: finish and figure out how exactly to call with proper database (check via a helper function?)
 
     if ( params.run_metagenomics_screening ) {
-        METAGENOMICS_PROFILING ( ch_bamfiltered_for_metagenomics, params.metagenomics_profiling_database ) // TODO: implement full metagenomics screening main subworkflow
-    }
-    // that then calls complexityfilter, profiling, postprocessing
-
-    if( params.run_metagenomics_screening ) {
-        ch_bamfiltered_for_metagenomics = ch_bamfiltered_for_metagenomics
-            .map{ meta, fastq ->
-                [meta+['single_end':true], fastq]
-            }
-
-        // Check if a complexity filter is wanted?
-        if ( params.run_metagenomics_complexityfiltering ) {
-            METAGENOMICS_COMPLEXITYFILTER( ch_bamfiltered_for_metagenomics )
-            ch_reads_for_metagenomics = METAGENOMICS_COMPLEXITYFILTER.out.fastq
-            ch_versions = ch_versions.mix(METAGENOMICS_COMPLEXITYFILTER.out.versions.first())
-            ch_multiqc_files = ch_multiqc_files.mix(METAGENOMICS_COMPLEXITYFILTER.out.fastq.collect{it[1]}.ifEmpty([]))
-        } else {
-            ch_reads_for_metagenomics = ch_bamfiltered_for_metagenomics
-        }
+        METAGENOMICS ( ch_bamfiltered_for_metagenomics )
+        ch_versions = ch_versions.mix( METAGENOMICS.out.ch_versions )
+        ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS.out.ch_multiqc_files )
+    }
 
     //

From 6245b4d4716df03ae492ddf7d6abb536f4d68152 Mon Sep 17 00:00:00 2001
From: Ian Light <86308592+ilight1542@users.noreply.github.com>
Date: Fri, 21 Apr 2023 10:24:53 +0200
Subject: [PATCH 004/198] Update nextflow_schema.json from PR review

Co-authored-by: James A. Fellows Yates
---
 nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 0a4368af9..da82697e9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -593,7 +593,7 @@
             "description": "Specify which tool to use for metagenomic profiling and screening.",
             "enum": ["malt", "metaphlan2", "kraken2", "krakenuniq"],
             "fa_icon": "fas fa-toolbox",
-            "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different reuslts."
+ "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_screening_input. Which tool to use will depend on your specific context, as each tool uses a different method and database. See literature of the tools for recommendations" }, "metagenomics_profiling_database": { "type": "string", From b9767d4cdd4ec0ce7a7fad9ef81bd80584b76e05 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 21 Apr 2023 08:32:08 +0000 Subject: [PATCH 005/198] jfy comments on help text updates --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index da82697e9..917536d12 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -600,7 +600,7 @@ "format": "directory-path", "description": "Specify a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.", "fa_icon": "fas fa-database", - "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." + "help_text": "Specify your metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." }, "metagenomics_profiling_krakenuniq_save_reads": { "type": "boolean", From 6b3f895326c9e486dae4e566101e0883fd4025cc Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 21 Apr 2023 08:33:27 +0000 Subject: [PATCH 006/198] clarified metagenomics database description --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 917536d12..cd3e35e9d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -598,7 +598,7 @@ "metagenomics_profiling_database": { "type": "string", "format": "directory-path", - "description": "Specify a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.", + "description": "Specify the path to a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.", "fa_icon": "fas fa-database", "help_text": "Specify your metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." 
}, From 603eb0981ef7f24b4a65598c5fd003363035fd43 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Wed, 29 Mar 2023 11:44:41 +0000 Subject: [PATCH 007/198] partial implementation metagenomicprofiling subwor --- CITATIONS.md | 18 +- conf/modules.config | 2 +- conf/test.config | 2 +- docs/development/manual_tests.md | 24 +- modules.json | 25 ++ modules/nf-core/kraken2/kraken2/main.nf | 58 +++++ modules/nf-core/kraken2/kraken2/meta.yml | 75 ++++++ .../krakenuniq/preloadedkrakenuniq/main.nf | 224 ++++++++++++++++++ .../krakenuniq/preloadedkrakenuniq/meta.yml | 78 ++++++ modules/nf-core/malt/run/main.nf | 41 ++++ modules/nf-core/malt/run/meta.yml | 54 +++++ modules/nf-core/maltextract/main.nf | 39 +++ modules/nf-core/maltextract/meta.yml | 51 ++++ modules/nf-core/metaphlan3/metaphlan3/main.nf | 48 ++++ .../nf-core/metaphlan3/metaphlan3/meta.yml | 58 +++++ nextflow.config | 32 ++- nextflow_schema.json | 29 ++- subworkflows/local/bamfiltering.nf | 18 +- subworkflows/local/metagenomics_profiling.nf | 186 +++++++++++++++ workflows/eager.nf | 27 ++- 20 files changed, 1051 insertions(+), 38 deletions(-) create mode 100644 modules/nf-core/kraken2/kraken2/main.nf create mode 100644 modules/nf-core/kraken2/kraken2/meta.yml create mode 100644 modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf create mode 100644 modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml create mode 100644 modules/nf-core/malt/run/main.nf create mode 100644 modules/nf-core/malt/run/meta.yml create mode 100644 modules/nf-core/maltextract/main.nf create mode 100644 modules/nf-core/maltextract/meta.yml create mode 100644 modules/nf-core/metaphlan3/metaphlan3/main.nf create mode 100644 modules/nf-core/metaphlan3/metaphlan3/meta.yml create mode 100644 subworkflows/local/metagenomics_profiling.nf diff --git a/CITATIONS.md b/CITATIONS.md index 5effbc663..6ee0bb663 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,7 +18,7 @@ - [Falco](https://doi.org/10.12688%2Ff1000research.21142.2) - > de Sena Brandine, G., Smith, A.D. (2019) Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Res., 8, 1874. doi: [10.12688%2Ff1000research.21142.2](https://doi.org/10.12688%2Ff1000research.21142.2) + > de Sena Brandine, G., Smith, A.D. (2019). Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Res., 8, 1874. doi: [10.12688%2Ff1000research.21142.2](https://doi.org/10.12688%2Ff1000research.21142.2) - [fastp](https://doi.org/10.1093/bioinformatics/bty560) @@ -56,6 +56,22 @@ > Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of sequencing libraries. Nature Methods, 10(4), 325–327. doi: [10.1038/nmeth.2375](https://doi.org/10.1038/nmeth.2375) +- [MALT](https://www.nature.com/articles/s41559-017-0446-6) + + > Vågene, Å.J., Herbig, A., Campana, M.G., Nelly, M., García, R., Warinner, C., Sabin, S., Spyrou, M.A., Valtueña, A.A., Huson, D., Tuross, N., Bos, K.I. & Krause, J. (2018). Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nat Ecol Evol 2, 520–528. doi: [10.1038/s41559-017-0446-6](https://doi.org/10.1038/s41559-017-0446-6) + +- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) + + > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. + +- [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0) + + > Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. 
KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0 + +- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) + + > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088 + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/conf/modules.config b/conf/modules.config index 55cf9bdd5..12cfe6438 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -357,7 +357,7 @@ process { enabled: params.bamfiltering_generatemappedfastq ] ext.args = [ - params.metagenomicscreening_input == 'all' ? '' : '-F 4', + params.metagenomics_input == 'all' ? '' : '-F 4', ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.library_id}_mapped" } } diff --git a/conf/test.config b/conf/test.config index 11e19a857..ba4d16f58 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,7 +33,7 @@ params { bamfiltering_mappingquality = 37 // Metagenomic screening - run_metagenomicscreening = false + run_metagenomics_screening = false } diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 8bc17c9f6..1ed7e43a8 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -208,8 +208,8 @@ All possible parameters bamfiltering_savefilteredbam = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomicscreening = false - metagenomicscreening_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + run_metagenomics_screening = false + metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED ``` Tests @@ -270,45 +270,45 @@ nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log f ## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with unmapped reads to metagenomics # Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. 
of reads in dumped FASTQ should match approx unmapped reads in results/mapping/*.flagstat
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomicscreening -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening -dump-channels

## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with mapped only reads going to metagenomics
# Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should match approx mapped reads in results/mapping/*.flagstat

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomicscreening --metagenomicscreening_input 'mapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening --metagenomics_input 'mapped' -dump-channels

## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with all reads going to metagenomics
# Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should match total reads in results/mapping/*.flagstat

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomicscreening --metagenomicscreening_input 'all' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening --metagenomics_input 'all' -dump-channels
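
## Aside, not one of the documented tests: the 'expected' side of the read-count
## comparisons above can be taken straight from the flagstat text (file name illustrative):
# awk 'NR==1{total=$1} /mapped \(/{print total-$1; exit}' results/mapping/sample.flagstat   # unmapped = total - mapped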
## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with unmapped reads to metagenomics
# Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat. Note: No filtered flagstat expected!

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels

## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ
# Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other FASTQ in the bam_filtering directory. Note: No filtered flagstat expected!

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels

## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with mapped only reads going to metagenomics
# Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should be roughly matching mapped reads as calculated from results/mapping/*.flagstat. Note: No filtered flagstat expected!

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'mapped' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'mapped' -dump-channels
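
## Aside, not one of the documented tests: the 'observed' side of the comparisons is
## simply the record count of the dumped FASTQ (four lines per read; file name illustrative):
# zcat sample_unmapped_other.fastq.gz | awk 'END{print NR/4}'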
## Check BAM filtering NO LENGTH/QUALITY with metagenomics screening, with all reads going to metagenomics
# Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should be roughly matching total reads as calculated from results/mapping/*.flagstat. Note: No filtered flagstat expected!
## Some reads are lost here and it is not 100% clear why, as the command looks OK... but it is not just the unmapped reads, as more than that go missing

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'all' -dump-channels
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'all' -dump-channels

## Check BAM filtering ONLY length filtering, with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ
## Metagenomics with length only
# Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min >= 50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other FASTQ in the bam_filtering directory.

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels --bamfiltering_minreadlength 50
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels --bamfiltering_minreadlength 50

## Check BAM filtering ONLY quality filtering, with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ
## Metagenomics with quality only
# Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is not different and RL reads min <= 50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other FASTQ in the bam_filtering directory.

-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37
+nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37

## Check what happens when we do paired-end merging and sending reads to metagenomics...
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomicscreening --metagenomicscreening_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 --preprocessing_skippairmerging +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 --preprocessing_skippairmerging ``` ## Deduplication diff --git a/modules.json b/modules.json index 6b1b8157c..9e9d60d46 100644 --- a/modules.json +++ b/modules.json @@ -80,6 +80,31 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "kraken2/kraken2": { + "branch": "master", + "git_sha": "7c695e0147df1157413e06246d9b0094617d3e6b", + "installed_by": ["modules"] + }, + "krakenuniq/preloadedkrakenuniq": { + "branch": "master", + "git_sha": "a6eb17f65b3ee5761c25c075a6166c9f76733cee", + "installed_by": ["modules"] + }, + "malt/run": { + "branch": "master", + "git_sha": "75027bf77472b1f4fd2cdd7e46f83119dfb0f2c6", + "installed_by": ["modules"] + }, + "maltextract": { + "branch": "master", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "installed_by": ["modules"] + }, + "metaphlan3/metaphlan3": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, "mtnucratio": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf new file mode 100644 index 000000000..5901064e7 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -0,0 +1,58 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + val save_output_fastqs + val save_reads_assignment + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment + tuple val(meta), path('*report.txt') , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : "" + def unclassified_option = save_output_fastqs ? 
"--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 000000000..7129fe3a0 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,75 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: boolean + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: boolean + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf new file mode 100644 index 000000000..0cb402f77 --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf @@ -0,0 +1,224 @@ +process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + tag "$meta.id" + label 'process_high' + + conda "bioconda::krakenuniq=1.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.2--pl5321h19e8d03_0': + 'quay.io/biocontainers/krakenuniq:1.0.2--pl5321h19e8d03_0' }" + + input: + tuple val(meta), path(fastqs) + path db + val ram_chunk_size + val save_output_fastqs + val report_file + val save_output + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classified.txt') , optional:true, emit: classified_assignment + tuple val(meta), path('*report.txt') , emit: report + + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : '' + def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + if (meta.single_end) { + """ + krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result=\$1 + # Strip any file extensions. + echo "\${result%%.*}" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + PREFIX="\$(strip_suffix "\${FASTQ}")" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + $args2 \\ + "\${FASTQ}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. + result="\${result%_}" + echo "\${result%.}" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } + + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_fastqs ? 
"--unclassified-out ${unclassified}" : '' + def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + if (meta.single_end) { + """ + echo krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result=\$1 + # Strip any file extensions. + echo "\${result%%.*}" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + echo "\${FASTQ}" + PREFIX="\$(strip_suffix "\${FASTQ}")" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + $args2 \\ + "\${FASTQ}" + + touch "\${PREFIX}.classified.fastq.gz" + touch "\${PREFIX}.krakenuniq.classified.txt" + touch "\${PREFIX}.krakenuniq.report.txt" + touch "\${PREFIX}.unclassified.fastq.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + echo krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. + result="\${result%_}" + echo "\${result%.}" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + echo "\${FASTQ[@]}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + + touch "\${PREFIX}.classified_1.fastq.gz" "\${PREFIX}.classified_2.fastq.gz" + touch "\${PREFIX}.krakenuniq.classified.txt" + touch "\${PREFIX}.krakenuniq.report.txt" + touch "\${PREFIX}.unclassified_1.fastq.gz" "\${PREFIX}.unclassified_2.fastq.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml new file mode 100644 index 000000000..4ac645c55 --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml @@ -0,0 +1,78 @@ +name: "krakenuniq_preloadedkrakenuniq" +description: Classifies metagenomic sequence data using unique k-mer counts +keywords: + - classify + - metagenomics + - kmers + - fastq + - db +tools: + - "krakenuniq": + description: "Metagenomics classifier with unique k-mer counting for more specific results" + homepage: https://github.com/fbreitwieser/krakenuniq + documentation: https://github.com/fbreitwieser/krakenuniq + doi: 10.1186/s13059-018-1568-0 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
+  - fastqs:
+      type: file
+      description: List of input FastQ files
+  - db:
+      type: directory
+      description: KrakenUniq database
+  - ram_chunk_size:
+      type: val
+      description: Maximum amount of RAM each chunk of the database should occupy when loaded at any one time
+      pattern: "*GB"
+  - save_output_fastqs:
+      type: boolean
+      description: |
+        If true, optional commands are added to save classified and unclassified reads
+        as fastq files
+  - report_file:
+      type: boolean
+      description: If true, a classification report file is generated for each input
+  - save_output:
+      type: boolean
+      description: |
+        If true, an optional command is added to save a file reporting the taxonomic
+        classification of each input read
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - classified_reads_fastq:
+      type: file
+      description: |
+        Reads classified as belonging to any of the taxa
+        on the KrakenUniq database.
+      pattern: "*.fastq.gz"
+  - unclassified_reads_fastq:
+      type: file
+      description: |
+        Reads not classified to any of the taxa
+        on the KrakenUniq database.
+      pattern: "*.fastq.gz"
+  - classified_assignment:
+      type: file
+      description: |
+        KrakenUniq output file indicating the taxonomic assignment of
+        each input read
+  - report:
+      type: file
+      description: |
+        KrakenUniq report containing stats about classified
+        and unclassified reads.
+      pattern: "*.report.txt"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@mjamy"
+  - "@Midnighter"
diff --git a/modules/nf-core/malt/run/main.nf b/modules/nf-core/malt/run/main.nf
new file mode 100644
index 000000000..61b592dcb
--- /dev/null
+++ b/modules/nf-core/malt/run/main.nf
@@ -0,0 +1,41 @@
+process MALT_RUN {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::malt=0.61"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/malt:0.61--hdfd78af_0' :
+        'quay.io/biocontainers/malt:0.61--hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(fastqs)
+    path index
+
+    output:
+    tuple val(meta), path("*.rma6")          , emit: rma6
+    tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments
+    tuple val(meta), path("*.log")           , emit: log
+    path "versions.yml"                      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    malt-run \\
+        -t $task.cpus \\
+        -v \\
+        -o . \\
+        $args \\
+        --inFile ${fastqs.join(' ')} \\
+        --index $index/ |&tee ${prefix}-malt-run.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/malt/run/meta.yml b/modules/nf-core/malt/run/meta.yml
new file mode 100644
index 000000000..2a7944642
--- /dev/null
+++ b/modules/nf-core/malt/run/meta.yml
@@ -0,0 +1,54 @@
+name: malt_run
+description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics.
+keywords:
+  - malt
+  - alignment
+  - metagenomics
+  - ancient DNA
+  - aDNA
+  - palaeogenomics
+  - archaeogenomics
+  - microbiome
+tools:
+  - malt:
+      description: A tool for mapping metagenomic data
+      homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/
+      documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf
+      doi: "10.1038/s41559-017-0446-6"
+      licence: ["GPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fastqs:
+      type: file
+      description: Input FASTQ files
+      pattern: "*.{fastq.gz,fq.gz}"
+  - index:
+      type: directory
+      description: Index/database directory from malt-build
+      pattern: "*/"
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - rma6:
+      type: file
+      description: MEGAN6 RMA6 file
+      pattern: "*.rma6"
+  - alignments:
+      type: file
+      description: Alignment files in Tab, Text or MEGAN-compatible SAM format
+      pattern: "*.{tab,txt,sam}"
+  - log:
+      type: file
+      description: Log of verbose MALT stdout
+      pattern: "*-malt-run.log"
+
+authors:
+  - "@jfy133"
diff --git a/modules/nf-core/maltextract/main.nf b/modules/nf-core/maltextract/main.nf
new file mode 100644
index 000000000..d44b54c60
--- /dev/null
+++ b/modules/nf-core/maltextract/main.nf
@@ -0,0 +1,39 @@
+process MALTEXTRACT {
+
+    label 'process_medium'
+
+    conda "bioconda::hops=0.35"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/hops:0.35--hdfd78af_1' :
+        'quay.io/biocontainers/hops:0.35--hdfd78af_1' }"
+
+    input:
+    path rma6
+    path taxon_list
+    path ncbi_dir
+
+    output:
+    path "results"      , emit: results
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    MaltExtract \\
+        -Xmx${task.memory.toGiga()}g \\
+        -p $task.cpus \\
+        -i ${rma6.join(' ')} \\
+        -t $taxon_list \\
+        -r $ncbi_dir \\
+        -o results/ \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        maltextract: \$(MaltExtract --help | head -n 2 | tail -n 1 | sed 's/MaltExtract version//')
+    END_VERSIONS
+    """
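+
+    // e.g. (a sketch with hypothetical resources): with task.memory = 16.GB and
+    // task.cpus = 4, the command above resolves to
+    // 'MaltExtract -Xmx16g -p 4 -i <rma6 files> -t <taxon list> -r <NCBI dir> -o results/'.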
+}
diff --git a/modules/nf-core/maltextract/meta.yml b/modules/nf-core/maltextract/meta.yml
new file mode 100644
index 000000000..c365a7c5e
--- /dev/null
+++ b/modules/nf-core/maltextract/meta.yml
@@ -0,0 +1,51 @@
+name: maltextract
+description: Tool for evaluation of MALT results for true positives of ancient metagenomic taxonomic screening
+keywords:
+  - malt
+  - MaltExtract
+  - HOPS
+  - alignment
+  - metagenomics
+  - ancient DNA
+  - aDNA
+  - palaeogenomics
+  - archaeogenomics
+  - microbiome
+  - authentication
+  - damage
+  - edit distance
+tools:
+  - maltextract:
+      description: Java tool to work with ancient metagenomics
+      homepage: https://github.com/rhuebler/hops
+      documentation: https://github.com/rhuebler/hops
+      tool_dev_url: https://github.com/rhuebler/hops
+      doi: "10.1186/s13059-019-1903-0"
+      licence: ["GPL 3"]
+
+input:
+  - rma6:
+      type: file
+      description: RMA6 files from MALT
+      pattern: "*.rma6"
+  - taxon_list:
+      type: file
+      description: List of target taxa to evaluate
+      pattern: "*.txt"
+  - ncbi_dir:
+      type: directory
+      description: Directory containing NCBI taxonomy map and tre files
+      pattern: "${ncbi_dir}/"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - results:
+      type: directory
+      description: Directory containing MaltExtract text results files
+      pattern: "results/"
+
+authors:
+  - "@jfy133"
diff --git a/modules/nf-core/metaphlan3/metaphlan3/main.nf b/modules/nf-core/metaphlan3/metaphlan3/main.nf
new file mode 100644
index 000000000..34f8705cc
--- /dev/null
+++ b/modules/nf-core/metaphlan3/metaphlan3/main.nf
@@ -0,0 +1,48 @@
+process METAPHLAN3_METAPHLAN3 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::metaphlan=3.0.12"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' :
+        'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }"
+
+    input:
+    tuple val(meta), path(input)
+    path metaphlan_db
+
+    output:
+    tuple val(meta), path("*_profile.txt")   , emit: profile
+    tuple val(meta), path("*.biom")          , emit: biom
+    tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out
+    path "versions.yml"                      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def input_type = ("$input".endsWith(".fastq.gz") || "$input".endsWith(".fq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam"
+    def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input"
+    def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt"
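+
+    // For instance (a sketch, hypothetical filenames): a paired input
+    // ['test_1.fastq.gz', 'test_2.fastq.gz'] resolves to '--input_type fastq' with
+    // '--bowtie2out test.bowtie2out.txt', whereas a 'test.bowtie2out.txt' input selects
+    // '--input_type bowtie2out' and skips the intermediate bowtie2out file.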
+
+    """
+    BT2_DB=`find -L "${metaphlan_db}" -name "*rev.1.bt2" -exec dirname {} \\;`
+
+    metaphlan \\
+        --nproc $task.cpus \\
+        $input_type \\
+        $input_data \\
+        $args \\
+        $bowtie2_out \\
+        --bowtie2db \$BT2_DB \\
+        --biom ${prefix}.biom \\
+        --output_file ${prefix}_profile.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/metaphlan3/metaphlan3/meta.yml b/modules/nf-core/metaphlan3/metaphlan3/meta.yml
new file mode 100644
index 000000000..659d83a95
--- /dev/null
+++ b/modules/nf-core/metaphlan3/metaphlan3/meta.yml
@@ -0,0 +1,58 @@
+name: metaphlan3_metaphlan3
+description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data.
+keywords:
+  - metagenomics
+  - classification
+  - fastq
+  - bam
+  - fasta
+tools:
+  - metaphlan3:
+      description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance
+      homepage: https://huttenhower.sph.harvard.edu/metaphlan/
+      documentation: https://github.com/biobakery/MetaPhlAn
+      doi: "10.7554/eLife.65088"
+      licence: ["MIT License"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: MetaPhlAn 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPhlAn marker database) and intermediate bowtie2 alignment files (bowtie2out)
+      pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}"
+  - metaphlan_db:
+      type: file
+      description: |
+        Directory containing pre-downloaded and uncompressed MetaPhlAn3 database downloaded from: http://cmprod1.cibio.unitn.it/biobakery3/metaphlan_databases/.
+        Note that you will also need to specify `--index` and the database version name (e.g. 'mpa_v31_CHOCOPhlAn_201901') in your module.conf ext.args for METAPHLAN3_METAPHLAN3!
+      pattern: "*/"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - profile:
+      type: file
+      description: Tab-separated output file of the predicted taxon relative abundances
+      pattern: "*.{txt}"
+  - biom:
+      type: file
+      description: General-use format for representing biological sample by observation contingency tables
+      pattern: "*.{biom}"
+  - bowtie2out:
+      type: file
+      description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPhlAn marker database ( not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3 )
+      pattern: "*.{bowtie2out.txt}"
+
+authors:
+  - "@MGordon09"
diff --git a/nextflow.config b/nextflow.config
index 664baaedd..a1c1f42a6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -109,14 +109,30 @@ params {
     bamfiltering_savefilteredbams            = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified
 
     // Metagenomic Screening
-    run_metagenomicscreening                 = false
-    run_metagenomics_complexityfiltering     = false
-    metagenomicscreening_input               = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED
-    metagenomics_complexity_tool             = 'bbduk'
-    metagenomics_complexity_savefastq        = false
-    metagenomics_complexity_entropy          = 0.3
-    metagenomics_prinseq_mode                = 'entropy'
-    metagenomics_prinseq_dustscore           = 0.5
+    run_metagenomics                              = false
+    metagenomics_input                            = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED
+    run_metagenomics_complexityfiltering          = false
+    metagenomics_complexity_tool                  = 'bbduk'
+    metagenomics_complexity_savefastq             = false
+    metagenomics_complexity_entropy               = 0.3
+    metagenomics_prinseq_mode                     = 'entropy'
+    metagenomics_prinseq_dustscore                = 0.5
+    metagenomics_profiling_tool                   = ''
+    metagenomics_profiling_database               = ''
+    metagenomics_krakenuniq_ram_chunk_size        = '16G'
+    metagenomics_krakenuniq_save_reads            = false
+    metagenomics_krakenuniq_save_readclassifications = false
+    metagenomics_kraken2_save_reads               = false
+    metagenomics_kraken2_save_readclassification  = false
+    metagenomics_kraken2_save_minimizers          = false
+    metagenomics_malt_mode                        = 'BlastN'
+    metagenomics_malt_alignment_mode              = 'SemiGlobal'
+    metagenomics_malt_save_reads                  = false
+    metagenomics_malt_sam_output                  = false
+    metagenomics_malt_percent_identity            = 85
+    metagenomics_malt_top_percent                 = 1
+    metagenomics_malt_max_queries                 = 100
+    metagenomics_malt_memory_mode                 = 'load'
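+
+    // e.g. (a sketch, hypothetical paths) a typical user override in a custom config:
+    //   params {
+    //       metagenomics_profiling_tool     = 'kraken2'
+    //       metagenomics_profiling_database = '/path/to/kraken2_db'
+    //   }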
 
     // Deduplication options
     skip_deduplication = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index d165a2155..89fd16192 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -544,7 +544,7 @@
         "bamfiltering_retainunmappedgenomicbam": {
             "type": "boolean",
             "description": "Specify to retain unmapped reads in the BAM file used for downstream genomic analyses.",
-            "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomicscreening_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`",
+            "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_screening_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`",
             "fa_icon": "fas fa-piggy-bank"
         },
         "bamfiltering_generateunmappedfastq": {
@@ -574,13 +574,13 @@
             "description": "Options to related to metagenomic screening.",
             "default": "",
             "properties": {
-                "run_metagenomicscreening": {
+                "run_metagenomics_screening": {
                     "type": "boolean",
                     "description": "Turn on metagenomic screening of mapped, unmapped, or all reads.",
                     "fa_icon": "fas fa-power-off",
                     "help_text": "Turns on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline."
                 },
-                "metagenomicscreening_input": {
+                "metagenomics_screening_input": {
                     "type": "string",
                     "default": "unmapped",
                     "description": "Specify which type of reads to go into metagenomic screening.",
                    "fa_icon": "fas fa-hand-pointer",
                     "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction), or all reads.\n\n> \u26a0\ufe0f If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`"
                 },
+                "metagenomics_profiling_tool": {
+                    "type": "string",
+                    "default": "",
+                    "description": "Specify which tool to use for metagenomic profiling and screening.",
+                    "enum": ["malt", "metaphlan3", "kraken2", "krakenuniq"],
+                    "fa_icon": "fas fa-hand-pointer",
+                    "help_text": "Select which tool to run metagenomics profiling on the designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different results."
+                },
+                "metagenomics_profiling_database": {
+                    "type": "string",
+                    "format": "directory-path",
+                    "default": "",
+                    "description": "Specify the path to a database directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.",
+                    "fa_icon": "fas fa-hand-pointer",
+                    "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size."
+                },
+                "metagenomics_profiling_malt_group_size": {
+                    "type": "integer",
+                    "default": 0,
+                    "description": "Define how many fastq files should be grouped into a single MALT run.",
+                    "fa_icon": "fas fa-hand-pointer",
+                    "help_text": "Very large fastq files or many fastq files run through malt at the same time can lead to excessively long runtimes. This parameter allows for parallelization of malt runs. Please note, malt is resource heavy and setting this value above the default will spawn N/metagenomics_profiling_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster."
+                },
                 "run_metagenomics_complexityfiltering": {
                     "type": "boolean",
                     "fa_icon": "fas fa-power-off",
diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf
index be77b58f7..3506d4505 100644
--- a/subworkflows/local/bamfiltering.nf
+++ b/subworkflows/local/bamfiltering.nf
@@ -69,19 +69,19 @@ workflow FILTER_BAM {
     //
 
     // Generate unmapped bam (no additional filtering) if the unmapped bam OR unmapped for metagneomics selected
-    if ( params.bamfiltering_generateunmappedfastq || ( params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) ) {
+    if ( params.bamfiltering_generateunmappedfastq || ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) ) {
         SAMTOOLS_FASTQ_UNMAPPED ( bam.map{[ it[0], it[1] ]}, false )
         ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_UNMAPPED.out.versions.first() )
     }
 
     // Solution to the Andrades Valtueña-Light Problem: mapped bam for metagenomics (with options for quality- and length filtered)
-    if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' ) ) ) {
+    if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) ) {
         SAMTOOLS_FASTQ_MAPPED ( bam.map{[ it[0], it[1] ]}, false )
         ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_MAPPED.out.versions.first() )
     }
 
-    if ( ( params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) && params.preprocessing_skippairmerging ) {
+    if ( ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) {
         ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq
             .mix(SAMTOOLS_FASTQ_UNMAPPED.out.singleton)
             .mix(SAMTOOLS_FASTQ_UNMAPPED.out.other)
@@ -96,7 +96,7 @@ workflow FILTER_BAM {
     }
 
     // TODO: see request https://github.com/nf-core/eager/issues/945
-    if ( ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' ) ) && params.preprocessing_skippairmerging ) {
+    if ( ( params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) {
         ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq
             .mix(SAMTOOLS_FASTQ_MAPPED.out.singleton)
             .mix(SAMTOOLS_FASTQ_MAPPED.out.other)
@@ -111,15 +111,15 @@ workflow FILTER_BAM {
     }
 
     // Routing for metagenomic screening -> first accounting for paired-end mapping, then merged mapping, then no metagenomics
-    if ( ( params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) && params.preprocessing_skippairmerging ) {
+    if ( ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) {
         ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads
-    } else if ( ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' ) ) && params.preprocessing_skippairmerging ) {
+    } else if ( ( params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) {
         ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads
-    } else if ( params.run_metagenomicscreening && params.metagenomicscreening_input == 'unmapped' ) {
+    } else if ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) {
         ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_UNMAPPED.out.other
-    } else if ( params.run_metagenomicscreening && ( params.metagenomicscreening_input == 'mapped' || params.metagenomicscreening_input == 'all' )) {
+    } else if ( params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' )) {
         ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_MAPPED.out.other
-    } else if ( !params.run_metagenomicscreening ) {
+    } else if ( !params.run_metagenomics_screening ) {
         ch_fastq_for_metagenomics = Channel.empty()
     }
 
diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf
new file mode 100644
index 000000000..9c8cc58f3
--- /dev/null
+++ b/subworkflows/local/metagenomics_profiling.nf
@@ -0,0 +1,186 @@
+//
+// Metagenomics profiling of sequencing reads
+//
+
+// Much taken from nf-core/taxprofiler subworkflows/local/profiling.nf
+
+include { MALT_RUN                       } from '../../modules/nf-core/malt/run/main'
+include { KRAKEN2_KRAKEN2                } from '../../modules/nf-core/kraken2/kraken2/main'
+include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main'
+include { METAPHLAN3_METAPHLAN3          } from '../../modules/nf-core/metaphlan3/metaphlan3/main'
+
+workflow METAGENOMICS_PROFILING {
+
+    take:
+    reads    // channel: [ [ meta ] , [ reads ] ]
+    database // channel: [ [ meta ] , path ]
+
+    main:
+
+    ch_versions            = Channel.empty()
+    ch_raw_classifications = Channel.empty()
+    ch_raw_profiles        = Channel.empty()
+    ch_multiqc_files       = Channel.empty()
+    // TODO: malt, metaphlan, kraken2, krakenuniq
+    // TODO: maltextract, krakenparse
+
+    /*
+        PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
+    */
+
+    // Each tool has a slightly different input structure and generally separate
+    // input channels for reads vs database. We restructure the channel tuple
+    // for each tool and make liberal use of multiMap to keep reads/database
+    // channel element order in sync with each other
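+
+    // e.g. (a sketch, hypothetical names): with two libraries and a single MALT
+    // database, the grouped channels built below end up shaped as:
+    //   reads:    [ [ id:'malt_db' ], [ lib1.fastq.gz, lib2.fastq.gz ] ]
+    //   database: malt_db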
+
+    if ( params.metagenomics_profiling_tool == 'malt' ) {
+
+        if ( params.metagenomics_malt_group_size > 0 ) {
+            ch_input_for_malt = reads
+                .map {
+                    meta, reads ->
+
+                        // Reset entire input meta for MALT to just database name,
+                        // as we don't run on a per-sample basis due to huge databases
+                        // so all samples are in one run and so sample-specific metadata
+                        // unnecessary. Set as database name to prevent `null` job ID and prefix.
+
+                        def temp_meta = [ id: database ]
+
+                        // Combine reduced sample metadata with updated database parameters metadata,
+                        // make sure id is db_name for publishing purposes.
+
+                        [ temp_meta, reads, database ]
+
+                }
+                .groupTuple(by: [0,2], size: params.metagenomics_malt_group_size, remainder: true)
+                .multiMap {
+                    meta, reads, database ->
+                        reads: [ meta, reads ]
+                        database: database
+                }
+        }
+
+        else {
+            ch_input_for_malt = reads
+                .map {
+                    meta, reads ->
+
+                        // Reset entire input meta for MALT to just database name,
+                        // as we don't run on a per-sample basis due to huge databases
+                        // so all samples are in one run and so sample-specific metadata
+                        // unnecessary. Set as database name to prevent `null` job ID and prefix.
+
+                        def temp_meta = [ id: database ]
+
+                        // Combine reduced sample metadata with updated database parameters metadata,
+                        // make sure id is db_name for publishing purposes.
+
+                        [ temp_meta, reads, database ]
+
+                }
+                .groupTuple(by: [0,2])
+                .multiMap {
+                    meta, reads, database ->
+                        reads: [ meta, reads ]
+                        database: database
+                }
+        }
+
+        ch_input_for_malt.reads.dump()
+        ch_input_for_malt.database.dump()
+
+        // MALT: We groupTuple to have all samples in one channel for MALT as database
+        // loading takes a long time, so we only want to run it once per database, unless otherwise specified
+
+        MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database )
+
+        ch_maltrun_for_megan = MALT_RUN.out.rma6
+            .transpose()
+            .map{
+                meta, rma ->
+                    // re-extract meta from file names, use filename without rma to
+                    // ensure we keep paired-end information in downstream filenames
+                    // when no pair-merging
+                    def meta_new = meta.clone()
+                    meta_new['db_name'] = meta.id
+                    meta_new['id'] = rma.baseName
+                [ meta_new, rma ]
+            }
+
+        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log )
+        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
+        ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan )
+    }
+
+    if ( params.metagenomics_profiling_tool == 'metaphlan3' ) {
+
+        ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
+            .filter{
+                if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}."
+                !it[0].is_fasta
+            }
+            .multiMap {
+                it ->
+                    reads: [it[0] + it[2], it[1]]
+                    db: it[3]
+            }
+
+        METAPHLAN3_METAPHLAN3 ( ch_input_for_metaphlan3.reads, database )
+        ch_versions = ch_versions.mix( METAPHLAN3_METAPHLAN3.out.versions.first() )
+        ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3_METAPHLAN3.out.profile )
+
+    }
+
+    if ( params.metagenomics_profiling_tool == 'krakenuniq' ) {
+        ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq
+            .map {
+                meta, reads, db_meta, db ->
+                    [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
+            }
+            .groupTuple(by: [0,2,3])
+            .multiMap {
+                single_meta, reads, db_meta, db ->
+                    reads: [ single_meta + db_meta, reads.flatten() ]
+                    db: db
+            }
+        // Hardcode to _always_ produce the report file (which is our basic output, and goes into MultiQC)
+        KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_krakenuniq_save_reads, true, params.metagenomics_krakenuniq_save_readclassifications )
+        ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
+        ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() )
+        ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment )
+        ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
+
+    }
+
+    if ( params.metagenomics_profiling_tool == 'kraken2' ) {
+        ch_input_for_kraken2 = ch_input_for_profiling.kraken2
+            .map {
+                meta, reads, db_meta, db ->
+                    [ meta, reads, db_meta, db ]
+            }
+            .multiMap {
+                it ->
+                    reads: [ it[0] + it[2], it[1] ]
+                    db: it[3]
+            }
+
+        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken2_save_reads, params.metagenomics_kraken2_save_readclassification )
+        ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report )
+        ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
+        ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment )
+        ch_raw_profiles = ch_raw_profiles.mix(
+            KRAKEN2_KRAKEN2.out.report
+            // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use.
+            // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names.
+            .map { meta, report -> [meta + [tool: 'kraken2'], report]}
+        )
+
+    }
+
+    emit:
+    classifications = ch_raw_classifications
+    profiles        = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom
+    versions        = ch_versions // channel: [ versions.yml ]
+    mqc             = ch_multiqc_files
+}
diff --git a/workflows/eager.nf b/workflows/eager.nf
index 22ccd8483..2e47423ff 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -17,7 +17,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
 
-// Check failing parameter combinations
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Check failing parameter combinations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
 if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0.
Pick one or the other functionality.") } if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { // entropy score was set but dust method picked. If no dust-score provided, assume it was an error and fail @@ -35,7 +40,15 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri // TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} -// Report possible warnings +// TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing +if ( params.run_metagenomics_screening && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening") } + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Report possible warnings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. Adapter trimming will be skipped!") /* @@ -67,6 +80,7 @@ include { MAP } from '../subworkflows/local/map' include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' include { DEDUPLICATE } from '../subworkflows/local/deduplicate' include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter' +include { METAGENOMICS_PROFILING } from '../subworkflows/local/metagenomics_profiling' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -189,7 +203,7 @@ workflow EAGER { // SUBWORKFLOW: bam filtering (length, mapped/unmapped, quality etc.) // - if ( params.run_bamfiltering || params.run_metagenomicscreening ) { + if ( params.run_bamfiltering || params.run_metagenomics_screening ) { ch_mapped_for_bamfilter = MAP.out.bam .join(MAP.out.bai) @@ -282,6 +296,13 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix(PRESEQ_LCEXTRAP.out.lc_extrap.collect{it[1]}.ifEmpty([])) ch_versions = ch_versions.mix( PRESEQ_LCEXTRAP.out.versions ) } + // SUBWORKFLOW: metagenomics screening + // + //TODO: finish and figure out how exactly to call with proper database (check via a helper function?) 
+    if ( params.run_metagenomics_screening ) {
+        METAGENOMICS_PROFILING ( ch_bamfiltered_for_metagenomics, params.metagenomics_profiling_database ) // TODO: implement full metagenomics screening main subworkflow
+    }
+    // that then calls complexityfilter, profiling, postprocessing
 
     //
     // MODULE: MultiQC

From 15d8bb548c674d12dd6160d578c907b6550be716 Mon Sep 17 00:00:00 2001
From: Ian Light
Date: Wed, 29 Mar 2023 14:03:59 +0000
Subject: [PATCH 008/198] 'full' metagenomics profiling implementation

---
 conf/modules.config                          |  48 +++++++
 nextflow_schema.json                         | 126 ++++++++++++++++++-
 subworkflows/local/metagenomics_profiling.nf |  33 +----
 workflows/eager.nf                           |   2 +-
 4 files changed, 174 insertions(+), 35 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 12cfe6438..68902c9a1 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -480,4 +480,52 @@ process {
             enabled: false
         ]
     }
+    withName: MALT_RUN {
+        ext.args = [
+            "-m ${params.metagenomics_profiling_malt_mode}",
+            "-at ${params.metagenomics_profiling_malt_alignment_mode}",
+            "-top ${params.metagenomics_profiling_malt_top_percent}",
+            "-id ${params.metagenomics_profiling_malt_min_percent_identity}",
+            "-mq ${params.metagenomics_profiling_malt_max_queries}",
+            "--memoryMode ${params.metagenomics_profiling_malt_memory_mode}",
+            params.metagenomics_profiling_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_profiling_malt_min_support_percent}" : "-sup ${params.metagenomics_profiling_malt_min_support_reads}",
+            params.metagenomics_profiling_malt_sam_output ? "-a . -f SAM" : "",
+            params.metagenomics_profiling_malt_save_reads ? "--alignments ./ -za false" : ""
+        ].join(' ').trim()
+        publishDir = [
+            path: { "${params.outdir}/metagenomics_screening/profiling/malt/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{rma6,log,sam}'
+        ]
+    }
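+
+    // With the schema defaults, the MALT_RUN ext.args above resolves to (a sketch):
+    //   -m BlastN -at SemiGlobal -top 1 -id 85 -mq 100 --memoryMode load -supp 0.01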
+
+    withName: KRAKEN2_KRAKEN2 {
+        ext.prefix = params.perform_runmerging ? { "${meta.id}.kraken2" } : { "${meta.id}_${meta.run_accession}.kraken2" }
+        ext.args = [
+            params.metagenomics_profiling_kraken2_save_minimizers ? "--report-minimizer-data" : ""
+        ].join(' ').trim()
+        publishDir = [
+            path: { "${params.outdir}/metagenomics_screening/profiling/kraken2/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{txt,fastq.gz}'
+        ]
+    }
+
+    withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ {
+        publishDir = [
+            path: { "${params.outdir}/metagenomics_screening/profiling/krakenuniq/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{txt,fastq.gz}'
+        ]
+    }
+
+    withName: METAPHLAN3_METAPHLAN3 {
+        ext.prefix = params.perform_runmerging ? { "${meta.id}.metaphlan3" } : { "${meta.id}_${meta.run_accession}.metaphlan3" }
+        publishDir = [
+            path: { "${params.outdir}/metagenomics_screening/profiling/metaphlan3/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{biom,txt}'
+        ]
+    }
+
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 89fd16192..e8a9471c2 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -590,26 +590,140 @@
         },
         "metagenomics_profiling_tool": {
             "type": "string",
-            "default": "",
             "description": "Specify which tool to use for metagenomic profiling and screening.",
             "enum": ["malt", "metaphlan3", "kraken2", "krakenuniq"],
            "fa_icon": "fas fa-toolbox",
             "help_text": "Select which tool to run metagenomics profiling on the designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different results."
        },
        "metagenomics_profiling_database": {
            "type": "string",
            "format": "directory-path",
-            "default": "",
            "description": "Specify the path to a database directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.",
            "fa_icon": "fas fa-database",
            "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size."
        },
+        "metagenomics_profiling_krakenuniq_save_reads": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of KrakenUniq-aligned reads",
+            "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`"
+        },
+        "metagenomics_profiling_krakenuniq_save_read_classifications": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
+            "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that the read received.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
+        },
+        "metagenomics_profiling_krakenuniq_ram_chunk_size": {
+            "type": "string",
+            "default": "16G",
+            "description": "Specify how large to chunk the database when loading into memory for KrakenUniq",
+            "fa_icon": "fas fa-database",
+            "help_text": "nf-core/eager utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preload` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database with this parameter, which is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n"
+        },
+        "metagenomics_profiling_kraken2_save_reads": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of Kraken2-aligned reads",
+            "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - kraken2: `--classified-out` and `--unclassified-out`"
+        },
+        "metagenomics_profiling_kraken2_save_readclassification": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of Kraken2 per-read taxonomic assignment file",
+            "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that the read received.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`"
+        },
+        "metagenomics_profiling_kraken2_save_minimizers": {
+            "type": "boolean",
+            "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.",
+            "fa_icon": "fas fa-save",
+            "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command."
+        },
+        "metagenomics_profiling_malt_mode": {
+            "type": "string",
+            "default": "BlastN",
+            "description": "Specify which alignment mode to use for MALT. Options: 'BlastN', 'BlastP', 'BlastX'.",
+            "fa_icon": "fas fa-align-left",
+            "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n",
+            "enum": ["BlastN", "BlastP", "BlastX"]
+        },
+        "metagenomics_profiling_malt_alignment_mode": {
+            "type": "string",
+            "default": "SemiGlobal",
+            "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.",
+            "fa_icon": "fas fa-align-center",
+            "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`",
+            "enum": ["Local", "SemiGlobal"]
+        },
+        "metagenomics_profiling_malt_min_percent_identity": {
+            "type": "integer",
+            "default": 85,
+            "description": "Percent identity value threshold for MALT.",
+            "fa_icon": "fas fa-id-card",
+            "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`"
+        },
+        "metagenomics_profiling_malt_top_percent": {
+            "type": "integer",
+            "default": 1,
+            "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).",
+            "fa_icon": "fas fa-percent",
+            "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit score is within\n10% of the best score for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`"
+        },
+        "metagenomics_profiling_malt_min_support_mode": {
+            "type": "string",
+            "default": "percent",
+            "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.",
+            "fa_icon": "fas fa-drumstick-bite",
+            "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`",
+            "enum": ["percent", "reads"]
+        },
+        "metagenomics_profiling_malt_min_support_percent": {
+            "type": "number",
+            "default": 0.01,
+            "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.",
+            "fa_icon": "fas fa-percentage",
+            "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`"
+        },
+        "metagenomics_profiling_malt_min_support_reads": {
+            "type": "integer",
+            "default": 1,
+            "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt. Not compatible with --malt_min_support_mode 'percent'.",
+            "fa_icon": "fas fa-sort-numeric-up-alt",
+            "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n"
+        },
+        "metagenomics_profiling_malt_max_queries": {
+            "type": "integer",
+            "default": 100,
+            "description": "Specify the maximum number of queries a read can have for MALT.",
+            "fa_icon": "fas fa-phone",
+            "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`"
+        },
+        "metagenomics_profiling_malt_memory_mode": {
+            "type": "string",
+            "default": "load",
+            "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.",
+            "fa_icon": "fas fa-memory",
+            "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior to seed look-up, this\nis slow but compatible with all servers/file systems.
`'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look-up prior to entire\ndatabase loading. Note that Page and Map modes do not work properly with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`",
+            "enum": ["load", "page", "map"]
+        },
+        "metagenomics_profiling_malt_sam_output": {
+            "type": "boolean",
+            "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and that the files are gzipped. Note this will result in very large file sizes.",
+            "fa_icon": "fas fa-file-alt",
+            "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`"
+        },
+        "metagenomics_profiling_malt_save_reads": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Turn on saving of MALT-aligned reads",
+            "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produced by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`"
+        },
+        "metagenomics_profiling_malt_group_size": {
+            "type": "integer",
+            "default": 0,
+            "description": "Define how many fastq files should be grouped into a single MALT run.",
+            "fa_icon": "fas fa-barcode",
+            "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_profiling_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster."
        },
        "run_metagenomics_complexityfiltering": {
            "type": "boolean",
            "fa_icon": "fas fa-power-off",
diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf
index 9c8cc58f3..4fba84f1f 100644
--- a/subworkflows/local/metagenomics_profiling.nf
+++ b/subworkflows/local/metagenomics_profiling.nf
@@ -87,11 +87,8 @@ workflow METAGENOMICS_PROFILING {
             }
         }
 
-        ch_input_for_malt.reads.dump()
-        ch_input_for_malt.database.dump()
-
         // MALT: We groupTuple to have all samples in one channel for MALT as database
-        // loading takes a long time, so we only want to run it once per database, unless otherwise specified
+        // loading takes a long time, so we only want to run it once per database, unless otherwise specified (eg grouping samples)
 
         MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database )
 
@@ -108,25 +105,14 @@ workflow METAGENOMICS_PROFILING {
             [ meta_new, rma ]
         }
 
-        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log )
         ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
         ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan )
+        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log )
     }
 
     if ( params.metagenomics_profiling_tool == 'metaphlan3' ) {
 
-        ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
-            .filter{
-                if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}."
-                !it[0].is_fasta
-            }
-            .multiMap {
-                it ->
-                    reads: [it[0] + it[2], it[1]]
-                    db: it[3]
-            }
-
-        METAPHLAN3_METAPHLAN3 ( ch_input_for_metaphlan3.reads, database )
+        METAPHLAN3_METAPHLAN3 ( reads , database )
         ch_versions = ch_versions.mix( METAPHLAN3_METAPHLAN3.out.versions.first() )
         ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3_METAPHLAN3.out.profile )
 
@@ -150,20 +136,11 @@ workflow METAGENOMICS_PROFILING {
         ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() )
         ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment )
         ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
+        ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
 
     }
 
     if ( params.metagenomics_profiling_tool == 'kraken2' ) {
-        ch_input_for_kraken2 = ch_input_for_profiling.kraken2
-            .map {
-                meta, reads, db_meta, db ->
-                    [ meta, reads, db_meta, db ]
-            }
-            .multiMap {
-                it ->
-                    reads: [ it[0] + it[2], it[1] ]
-                    db: it[3]
-            }
 
-        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken2_save_reads, params.metagenomics_kraken2_save_readclassification )
+        KRAKEN2_KRAKEN2 ( reads, database, params.metagenomics_kraken2_save_reads, params.metagenomics_kraken2_save_readclassification )
         ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report )
@@ -179,8 +156,8 @@ workflow METAGENOMICS_PROFILING {
     }
 
     emit:
+    versions = ch_versions // channel: [ versions.yml ]
     classifications = ch_raw_classifications
     profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom
-    versions = ch_versions // channel: [ versions.yml ]
     mqc = ch_multiqc_files
 }
diff --git a/workflows/eager.nf b/workflows/eager.nf
index 2e47423ff..670a99515 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -41,7 +41,7 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri
 if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads.
For all other cases, please set --deduplication_tool to 'markduplicates'."}
 
 // TODO add any other metagenomics screening parameter checks, e.g. complexity filtering, post-processing
-if ( params.run_metagenomics_screening && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening") }
+if ( params.run_metagenomics_screening && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") }

From 9e963fb29d877e8a8ae7b8de971628667636f8c9 Mon Sep 17 00:00:00 2001
From: Ian Light <86308592+ilight1542@users.noreply.github.com>
Date: Fri, 21 Apr 2023 10:24:53 +0200
Subject: [PATCH 009/198] Update nextflow_schema.json from PR review

Co-authored-by: James A. Fellows Yates
---
 nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index e8a9471c2..ee9f97da0 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -593,7 +593,7 @@
             "description": "Specify which tool to use for metagenomic profiling and screening.",
             "enum": ["malt", "metaphlan3", "kraken2", "krakenuniq"],
             "fa_icon": "fas fa-toolbox",
-            "help_text": "Select which tool to run metagenomics profiling on the designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different results."
+            "help_text": "Select which tool to run metagenomics profiling on the designated metagenomics_screening_input. Which tool to use will depend on your specific context, as each tool uses a different method and database. See the literature of the tools for recommendations."

From c53754cfa26d450890b35a6a51b68eb3602386b8 Mon Sep 17 00:00:00 2001
From: Ian Light
Date: Fri, 21 Apr 2023 08:32:08 +0000
Subject: [PATCH 010/198] jfy comments on help text updates

---
 nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index ee9f97da0..6d517da5b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -600,7 +600,7 @@
         "format": "directory-path",
         "description": "Specify the path to a database directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.",
         "fa_icon": "fas fa-database",
-        "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size."
+        "help_text": "Specify your metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size."
}, "metagenomics_profiling_krakenuniq_save_reads": { "type": "boolean", From 36185360f062dffc8852161fdf34d1cd214b5a4f Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 21 Apr 2023 08:33:27 +0000 Subject: [PATCH 011/198] clarified metagenomics database description --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 6d517da5b..b32e634bb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -598,7 +598,7 @@ "metagenomics_profiling_database": { "type": "string", "format": "directory-path", - "description": "Specify a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.", + "description": "Specify the path to a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.", "fa_icon": "fas fa-database", "help_text": "Specify your metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." }, From 61c82540eb26c3bd9ca4a22664db2e3ec82313b9 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 21 Apr 2023 11:11:41 +0000 Subject: [PATCH 012/198] updated for nf-core linting --- conf/modules.config | 20 ++++++------- conf/test.config | 2 +- docs/development/manual_tests.md | 24 +++++++-------- modules.json | 10 +++---- nextflow.config | 48 ++++++++++++++++-------------- nextflow_schema.json | 48 +++++++++++++++--------------- subworkflows/local/bamfiltering.nf | 19 ++++++------ workflows/eager.nf | 6 ++-- 8 files changed, 90 insertions(+), 87 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 68902c9a1..940f47cce 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -482,15 +482,15 @@ process { } withName: MALT_RUN { ext.args = [ - "-m ${params.metagenomics_profiling_malt_mode}", - "-at ${params.metagenomics_profiling_malt_alignment_mode}", - "-top ${params.metagenomics_profiling_malt_top_percent}", - "-id ${params.metagenomics_profiling_malt_min_percent_identity}", - "-mq ${params.metagenomics_profiling_malt_max_queries}", - "--memoryMode ${params.metagenomics_profiling_malt_memory_mode}", - params.metagenomics_profiling_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_profiling_malt_min_support_percent}" : "-sup ${params.metagenomics_profiling_malt_min_support_reads}", - params.metagenomics_profiling_malt_sam_output ? "-a . -f SAM" : "", - params.metagenomics_profiling_malt_save_reads ? "--alignments ./ -za false" : "" + "-m ${params.metagenomics_malt_mode}", + "-at ${params.metagenomics_malt_alignment_mode}", + "-top ${params.metagenomics_malt_top_percent}", + "-id ${params.metagenomics_malt_min_percent_identity}", + "-mq ${params.metagenomics_malt_max_queries}", + "--memoryMode ${params.metagenomics_malt_memory_mode}", + params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_malt_min_support_reads}", + params.metagenomics_malt_sam_output ? "-a . -f SAM" : "", + params.metagenomics_malt_save_reads ? 
"--alignments ./ -za false" : "" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, @@ -502,7 +502,7 @@ process { withName: KRAKEN2_KRAKEN2 { ext.prefix = params.perform_runmerging ? "${meta.id}.kraken2" : "${meta.id}_${meta.run_accession}.kraken2" ext.args = [ - params.metagenomics_profiling_kraken2_save_minimizers ? "-report-minimizer-data" : "" + params.metagenomics_kraken2_save_minimizers ? "-report-minimizer-data" : "" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/kraken2/" }, diff --git a/conf/test.config b/conf/test.config index ba4d16f58..f655f0e3d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,7 +33,7 @@ params { bamfiltering_mappingquality = 37 // Metagenomic screening - run_metagenomics_screening = false + run_metagenomics = false } diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 26be61964..5449d1c46 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -208,7 +208,7 @@ All possible parameters bamfiltering_savefilteredbam = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomics_screening = false + run_metagenomics = false metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED ``` @@ -270,46 +270,46 @@ nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log f ## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with unmapped reads to metagenomics # Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match approx unmmaped reads in results/mapping/*.flagstat -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening -dump-channels -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics -dump-channels ## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with mapped only reads going to metagenomics # Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. 
of reads in dumped FASTQ should match approx mmaped reads in results/mapping/*.flagstat -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening --metagenomics_input 'mapped' -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics --metagenomics_input 'mapped' -dump-channels ## Check BAM filtering (mapped only/length/quality on genomic bam) with metagenomics screening, with all reads going to metagenomics # Expect: filtered BAM (samtools stats | grep SN total/mapped same), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should match total reads in results/mapping/*.flagstat -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics_screening --metagenomics_input 'all' -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --bamfiltering_minreadlength 50 --bamfiltering_mappingquality 37 --run_metagenomics --metagenomics_input 'all' -dump-channels ## Check BAM filtering NO LENGTH/QAULITY with metagenomics screening, with unmapped reads to metagenomics # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat. Note: No filtered flagstat expected! -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics --metagenomics_input 'unmapped' -dump-channels ## Check BAM filtering NO LENGTH/QAULITY with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other fASTQ in bam_filtering directoryt. Note: No filtered flagstat expected! 
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics --metagenomics_input 'unmapped' -dump-channels ## Check BAM filtering NO LENGTH/QAULITY with metagenomics screening, with mapped only reads going to metagenomics # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should be roughly matching mappd reads as calculated from results/mapping/*.flagstatt. Note: No filtered flagstat expected! -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'mapped' -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics --metagenomics_input 'mapped' -dump-channels ## Check BAM filtering NO LENGTH/QAULITY with metagenomics screening, with all reads going to metagenomics # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min <50), and a dump() on the ch_bam_for_metagenomics channel should report mapped_other. Nr. of reads in dumped FASTQ should be roughly matching total reads as calculated from results/mapping/*.flagstatt. Note: No filtered flagstat expected! ## Some reads lost, not 100% why command looks OK... but not just unmapped as more than that -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'all' -dump-channels +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics --metagenomics_input 'all' -dump-channels ## Check BAM filtering ONLY length filtering, with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ ## Metagenomics with length only # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is different and RL reads min >= 50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other fASTQ in bam_filtering directoryt. 
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels --bamfiltering_minreadlength 50 +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics --metagenomics_input 'unmapped' -dump-channels --bamfiltering_minreadlength 50 ## Check BAM filtering ONLY length filtering, with metagenomics screening, with unmapped reads to metagenomics and save unmapped FASTQ ## Metagenomics with length only # Expect: filtered BAM (samtools stats SN quality average < 36.7 or view -q 0 vs. -q 37 is not different and RL reads min <= 50), and a dump() on the ch_bam_for_metagenomics channel should report unmapped_other. Nr. of reads in dumped FASTQ should match unmapped reads as calculated from results/mapping/*.flagstat; and unmapped other fASTQ in bam_filtering directoryt. -nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics --metagenomics_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 ## Check what happens when we do paired-end merging and sending reads to metagenomics... 
-nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics_screening --metagenomics_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 --preprocessing_skippairmerging +nextflow run ../main.nf -profile test,singularity --outdir ./results -ansi-log false --input data/samplesheet.tsv --fasta data/reference/Mammoth_MT_Krause.fasta --run_bamfiltering --bamfiltering_savefilteredbams --run_metagenomics --metagenomics_input 'unmapped' -dump-channels --bamfiltering_mappingquality 37 --preprocessing_skippairmerging ``` ## Deduplication diff --git a/modules.json b/modules.json index 9e9d60d46..0825fd8d1 100644 --- a/modules.json +++ b/modules.json @@ -125,11 +125,6 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, - "prinseqplusplus": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] - }, "preseq/ccurve": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", @@ -140,6 +135,11 @@ "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, + "prinseqplusplus": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, "samtools/faidx": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/nextflow.config b/nextflow.config index a1c1f42a6..c2c78f1de 100644 --- a/nextflow.config +++ b/nextflow.config @@ -109,30 +109,34 @@ params { bamfiltering_savefilteredbams = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomics = false - metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED - run_metagenomics_complexityfiltering = false - metagenomics_complexity_tool = 'bbduk' - metagenomics_complexity_savefastq = false - metagenomics_complexity_entropy = 0.3 - metagenomics_prinseq_mode = 'entropy' - metagenomics_prinseq_dustscore = 0.5 - metagenomics_profiling_tool = '' - metagenomics_profiling_database = '' - metagenomics_krakenuniq_ram_chunk_size = '16G' - metagenomics_krakenuniq_save_reads = false - metagenomics_krakenuniq_save_readclassifications = false - metagenomics_kraken2_save_reads = false - metagenomics_kraken2_save_readclassification = false - metagenomics_kraken2_save_minimizers = false - metagenomics_malt_mode = 'BlastN' - metagenomics_malt_alignment_mode = 'SemiGlobal' - metagenomics_malt_save_reads = false - metagenomics_malt_sam_output = false - metagenomics_malt_percent_identity = 85 - metagenomics_malt_top_percent = 1 + run_metagenomics = false + metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + run_metagenomics_complexityfiltering = false + metagenomics_complexity_tool = 'bbduk' + metagenomics_complexity_savefastq = false + metagenomics_complexity_entropy = 0.3 + metagenomics_prinseq_mode = 'entropy' + metagenomics_prinseq_dustscore = 0.5 + metagenomics_profiling_tool = null + metagenomics_profiling_database = null + metagenomics_krakenuniq_ram_chunk_size = '16G' + metagenomics_krakenuniq_save_reads = false + metagenomics_krakenuniq_save_read_classifications = false + 
metagenomics_kraken2_save_reads = false + metagenomics_kraken2_save_readclassification = false + metagenomics_kraken2_save_minimizers = false + metagenomics_malt_mode = 'BlastN' + metagenomics_malt_alignment_mode = 'SemiGlobal' + metagenomics_malt_save_reads = false + metagenomics_malt_sam_output = false + metagenomics_malt_min_support_mode = 'percent' + metagenomics_malt_min_support_percent = 0.01 + metagenomics_malt_min_support_reads = 1 + metagenomics_malt_min_percent_identity = 85 + metagenomics_malt_top_percent = 1 metagenomics_malt_max_queries = 100 metagenomics_malt_memory_mode = 'load' + metagenomics_malt_group_size = 0 // Deduplication options skip_deduplication = false diff --git a/nextflow_schema.json b/nextflow_schema.json index b32e634bb..87dbd079c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -544,7 +544,7 @@ "bamfiltering_retainunmappedgenomicbam": { "type": "boolean", "description": "Specify to retain unmapped reads in the BAM file used for downstream genomic analyses.", - "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_screening_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", + "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", "fa_icon": "fas fa-piggy-bank" }, "bamfiltering_generateunmappedfastq": { @@ -574,13 +574,13 @@ "description": "Options to related to metagenomic screening.", "default": "", "properties": { - "run_metagenomics_screening": { + "run_metagenomics": { "type": "boolean", "description": "Turn on metagenomic screening of mapped, unmapped, or all reads.", "fa_icon": "fas fa-power-off", "help_text": "Turns on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline." }, - "metagenomics_screening_input": { + "metagenomics_input": { "type": "string", "default": "unmapped", "description": "Specify which type of reads to go into metagenomic screening.", @@ -593,53 +593,53 @@ "description": "Specify which tool to use for metagenomic profiling and screening.", "enum": ["malt", "metaphlan2", "kraken2", "krakenuniq"], "fa_icon": "fas fa-toolbox", - "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_screening_input. Which tool to use will depend on your specific context, as each tool uses a different method and database. See literature of the tools for recommendations" + "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_input. Which tool to use will depend on your specific context, as each tool uses a different method and database. 
See literature of the tools for recommendations" }, "metagenomics_profiling_database": { "type": "string", "format": "directory-path", "description": "Specify the path to a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.", "fa_icon": "fas fa-database", - "help_text": "Specify your metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." + "help_text": "Specify your metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." }, - "metagenomics_profiling_krakenuniq_save_reads": { + "metagenomics_krakenuniq_save_reads": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of KrakenUniq-aligned reads", "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`" }, - "metagenomics_profiling_krakenuniq_save_read_classifications": { + "metagenomics_krakenuniq_save_read_classifications": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file", "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`" }, - "metagenomics_profiling_krakenuniq_ram_chunk_size": { + "metagenomics_krakenuniq_ram_chunk_size": { "type": "string", "default": "16G", "description": "Specify how large to chunk database when loading into memory for KrakenUniq", "fa_icon": "fas fa-database", "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. 
You can specify the amount of RAM to chunk the database to with this parameter, and is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n" }, - "metagenomics_profiling_kraken2_save_reads": { + "metagenomics_kraken2_save_reads": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of Kraken2-aligned reads", "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - kraken2: `--classified-out` and `--unclassified-out`" }, - "metagenomics_profiling_kraken2_save_readclassification": { + "metagenomics_kraken2_save_readclassification": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of Kraken2 per-read taxonomic assignment file", "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`" }, - "metagenomics_profiling_kraken2_save_minimizers": { + "metagenomics_kraken2_save_minimizers": { "type": "boolean", "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.", "fa_icon": "fas fa-save", "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command." }, - "metagenomics_profiling_malt_mode": { + "metagenomics_malt_mode": { "type": "string", "default": "BlastN", "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", @@ -647,7 +647,7 @@ "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", "enum": ["BlastN", "BlastP", "BlastX"] }, - "metagenomics_profiling_malt_alignment_mode": { + "metagenomics_malt_alignment_mode": { "type": "string", "default": "SemiGlobal", "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", @@ -655,21 +655,21 @@ "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", "enum": ["Local", "SemiGlobal"] }, - "metagenomics_profiling_malt_min_percent_identity": { + "metagenomics_malt_min_percent_identity": { "type": "integer", "default": 85, "description": "Percent identity value threshold for MALT.", "fa_icon": "fas fa-id-card", "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. 
Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" }, - "metagenomics_profiling_malt_top_percent": { + "metagenomics_malt_top_percent": { "type": "integer", "default": 1, "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", "fa_icon": "fas fa-percent", "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" }, - "metagenomics_profiling_malt_min_support_mode": { + "metagenomics_malt_min_support_mode": { "type": "string", "default": "percent", "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.", @@ -677,28 +677,28 @@ "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", "enum": ["percent", "reads"] }, - "metagenomics_profiling_malt_min_support_percent": { + "metagenomics_malt_min_support_percent": { "type": "number", "default": 0.01, "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", "fa_icon": "fas fa-percentage", "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" }, - "metagenomics_profiling_malt_min_support_reads": { + "metagenomics_malt_min_support_reads": { "type": "integer", "default": 1, "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt. Not compatible with --malt_min_support_mode 'percent'.", "fa_icon": "fas fa-sort-numeric-up-alt", "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" }, - "metagenomics_profiling_malt_max_queries": { + "metagenomics_malt_max_queries": { "type": "integer", "default": 100, "description": "Specify the maximum number of queries a read can have for MALT.", "fa_icon": "fas fa-phone", "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" }, - "metagenomics_profiling_malt_memory_mode": { + "metagenomics_malt_memory_mode": { "type": "string", "default": "load", "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", @@ -706,24 +706,24 @@ "help_text": "\nHow to load the database into memory. 
Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", "enum": ["load", "page", "map"] }, - "metagenomics_profiling_malt_sam_output": { + "metagenomics_malt_sam_output": { "type": "boolean", "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", "fa_icon": "fas fa-file-alt", "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" }, - "metagenomics_profiling_malt_save_reads": { + "metagenomics_malt_save_reads": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of MALT-aligned reads", "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produce by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`" }, - "metagenomics_profiling_malt_group_size": { + "metagenomics_malt_group_size": { "type": "integer", "default": 0, "description": "Define group sizes for running multiple fastq files into malt.", "fa_icon": "fas fa-barcode", - "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_profiling_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." + "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." 
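All of the `metagenomics_malt_*` options above are stitched into a single MALT command line by the `ext.args` block for `MALT_RUN` shown in `conf/modules.config` earlier in this patch: each option contributes one flag, booleans go through ternaries, and empty strings drop out after `join(' ').trim()`. A standalone sketch of that pattern, with `opts` standing in for the pipeline's `params` object and invented values:

```nextflow
// 'opts' is a stand-in for params; the values are made up for the demo.
def opts = [ metagenomics_malt_top_percent: 1, metagenomics_malt_sam_output: false ]

def args = [
    "-top ${opts.metagenomics_malt_top_percent}",           // always emitted
    opts.metagenomics_malt_sam_output ? "-a . -f SAM" : ""  // vanishes when false
].join(' ').trim()

println args // prints: -top 1
```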
}, "run_metagenomics_complexityfiltering": { "type": "boolean", diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index 7fca12733..6b0c960ba 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -69,19 +69,19 @@ workflow FILTER_BAM { // // Generate unmapped bam (no additional filtering) if the unmapped bam OR unmapped for metagneomics selected - if ( params.bamfiltering_generateunmappedfastq || ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) ) { + if ( params.bamfiltering_generateunmappedfastq || ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) ) { SAMTOOLS_FASTQ_UNMAPPED ( bam.map{[ it[0], it[1] ]}, false ) ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_UNMAPPED.out.versions.first() ) } // Solution to the Andrades Valtueña-Light Problem: mapped bam for metagenomics (with options for quality- and length filtered) - if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) ) { + if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) ) { SAMTOOLS_FASTQ_MAPPED ( bam.map{[ it[0], it[1] ]}, false ) ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_MAPPED.out.versions.first() ) } - if ( ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { + if ( ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq .mix(SAMTOOLS_FASTQ_UNMAPPED.out.singleton) .mix(SAMTOOLS_FASTQ_UNMAPPED.out.other) @@ -96,7 +96,7 @@ workflow FILTER_BAM { } // TODO: see request https://github.com/nf-core/eager/issues/945 - if ( ( params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { + if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq .mix(SAMTOOLS_FASTQ_MAPPED.out.singleton) .mix(SAMTOOLS_FASTQ_MAPPED.out.other) @@ -111,16 +111,15 @@ workflow FILTER_BAM { } // Routing for metagenomic screening -> first accounting for paired-end mapping, then merged mapping, then no metagenomics - if ( ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { + if ( ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads - } else if ( ( params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { + } else if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads - } else if ( params.run_metagenomics_screening && params.metagenomics_input == 'unmapped' ) { + } else if ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) { ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_UNMAPPED.out.other - } else if ( 
params.run_metagenomics_screening && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' )) { + } else if ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' )) { ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_MAPPED.out.other - } else if ( !params.run_metagenomics_screening ) { - } else if ( !params.run_metagenomics_screening ) { + } else if ( !params.run_metagenomics ) { ch_fastq_for_metagenomics = Channel.empty() } diff --git a/workflows/eager.nf b/workflows/eager.nf index 670a99515..0cce86fb5 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -41,7 +41,7 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} // TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing -if ( params.run_metagenomics_screening && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") } +if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -203,7 +203,7 @@ workflow EAGER { // SUBWORKFLOW: bam filtering (length, mapped/unmapped, quality etc.) // - if ( params.run_bamfiltering || params.run_metagenomics_screening ) { + if ( params.run_bamfiltering || params.run_metagenomics ) { ch_mapped_for_bamfilter = MAP.out.bam .join(MAP.out.bai) @@ -299,7 +299,7 @@ workflow EAGER { // SUBWORKFLOW: metagenomics screening // //TODO: finish and figure out how exactly to call with proper database (check via a helper function?) - if ( params.run_metagenomics_screening ) { + if ( params.run_metagenomics ) { METAGENOMICS_PROFILING ( ch_bamfiltered_for_metagenomics, params.metagenomics_profiling_database ) // TODO: implement full metagenomics screening main subworkflow } // that then calls complexityfilter, profiling, postprocessing From 4cb682f79dc1cc641c16b26e9f944058e4294b70 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Apr 2023 08:58:21 +0000 Subject: [PATCH 013/198] initial creating of metagenomics_postprocessing --- .../local/metagenomics_postprocessing.nf | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 subworkflows/local/metagenomics_postprocessing.nf diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf new file mode 100644 index 000000000..cd213b78e --- /dev/null +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -0,0 +1,36 @@ +// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! 
:)
+// https://github.com/nf-core/modules/tree/master/subworkflows
+// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace:
+// https://nf-co.re/join
+// TODO nf-core: A subworkflow SHOULD import at least two modules
+
+include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main'
+
+workflow METAGENOMICS_POSTPROCESSING {
+
+    take:
+    // TODO nf-core: edit input (take) channels
+    ch_bam // channel: [ val(meta), [ bam ] ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // TODO nf-core: substitute modules here for the modules of your subworkflow
+
+    SAMTOOLS_SORT ( ch_bam )
+    ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first())
+
+    SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam )
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first())
+
+    emit:
+    // TODO nf-core: edit emitted channels
+    bam = SAMTOOLS_SORT.out.bam // channel: [ val(meta), [ bam ] ]
+    bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ]
+    csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ]
+
+    versions = ch_versions // channel: [ versions.yml ]
+}
+

From 21f6ecb79bb906fb83b5bcc5f278b58daa43ea6e Mon Sep 17 00:00:00 2001
From: Merlin Szymanski
Date: Fri, 28 Apr 2023 10:59:48 +0200
Subject: [PATCH 014/198] Adjust parameter names

---
 conf/test_humanbam.config | 2 +-
 nextflow.config           | 6 +++---
 workflows/eager.nf        | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/conf/test_humanbam.config b/conf/test_humanbam.config
index 713f34598..b7071f99d 100644
--- a/conf/test_humanbam.config
+++ b/conf/test_humanbam.config
@@ -41,5 +41,5 @@ params {
     bamfiltering_mappingquality = 37

     // Metagenomic screening
-    run_metagenomicscreening = false
+    run_metagenomics = false
 }
diff --git a/nextflow.config b/nextflow.config
index c2c78f1de..3571f1356 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -134,9 +134,9 @@ params {
     metagenomics_malt_min_support_reads    = 1
     metagenomics_malt_min_percent_identity = 85
     metagenomics_malt_top_percent          = 1
-    metagenomics_malt_max_queries = 100
-    metagenomics_malt_memory_mode = 'load'
-    metagenomics_malt_group_size = 0
+    metagenomics_malt_max_queries          = 100
+    metagenomics_malt_memory_mode          = 'load'
+    metagenomics_malt_group_size           = 0

     // Deduplication options
     skip_deduplication = false
diff --git a/workflows/eager.nf b/workflows/eager.nf
index 6d69df0b0..35715c784 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -250,7 +250,7 @@ workflow EAGER {
     //
    //TODO: finish and figure out how exactly to call with proper database (check via a helper function?)
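For orientation before the hunk continues, a minimal sketch of how a caller could include and invoke the `METAGENOMICS_POSTPROCESSING` template above. The include path follows the patch, while the input channel and file name are invented:

```nextflow
include { METAGENOMICS_POSTPROCESSING } from './subworkflows/local/metagenomics_postprocessing'

workflow {
    // One [ meta, bam ] tuple, matching the subworkflow's take: declaration.
    ch_bam = Channel.of( [ [ id: 'sample1' ], file('sample1.bam') ] )

    METAGENOMICS_POSTPROCESSING ( ch_bam )

    // The sorted BAM and its index come back through the emit: block.
    METAGENOMICS_POSTPROCESSING.out.bam.view()
}
```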
- if ( params.run_metagenomics_screening ) { + if ( params.run_metagenomics ) { METAGENOMICS ( ch_bamfiltered_for_metagenomics ) ch_versions = ch_versions.mix( METAGENOMICS.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS.out.ch_multiqc_files ) From 71f525e1ed1fb5ff6c65b0e6a3bb4be0351132b8 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 28 Apr 2023 11:32:31 +0200 Subject: [PATCH 015/198] Fix nextflow schema --- conf/modules.config | 2 - nextflow.config | 6 +- nextflow_schema.json | 301 +++---------------- subworkflows/local/metagenomics_profiling.nf | 4 +- 4 files changed, 52 insertions(+), 261 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 79dd878c1..17b0701b3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -501,7 +501,6 @@ process { } withName: KRAKEN2_KRAKEN2 { - ext.prefix = params.perform_runmerging ? "${meta.id}.kraken2" : "${meta.id}_${meta.run_accession}.kraken2" ext.args = [ params.metagenomics_kraken2_save_minimizers ? "-report-minimizer-data" : "" ].join(' ').trim() @@ -521,7 +520,6 @@ process { } withName: METAPHLAN3_METAPHLAN3 { - ext.prefix = params.perform_runmerging ? { "${meta.id}.metaphlan3" } : { "${meta.id}_${meta.run_accession}.metaphlan3" } publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/metaphlan3/" }, mode: params.publish_dir_mode, diff --git a/nextflow.config b/nextflow.config index 3571f1356..521e33fd8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -120,10 +120,8 @@ params { metagenomics_profiling_tool = null metagenomics_profiling_database = null metagenomics_krakenuniq_ram_chunk_size = '16G' - metagenomics_krakenuniq_save_reads = false - metagenomics_krakenuniq_save_read_classifications = false - metagenomics_kraken2_save_reads = false - metagenomics_kraken2_save_readclassification = false + metagenomics_kraken_save_reads = false + metagenomics_kraken_save_read_classifications = false metagenomics_kraken2_save_minimizers = false metagenomics_malt_mode = 'BlastN' metagenomics_malt_alignment_mode = 'SemiGlobal' diff --git a/nextflow_schema.json b/nextflow_schema.json index 8cff03409..45d7cfded 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -219,14 +216,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -315,10 +305,7 @@ "default": "fastqc", "description": "Specify which tool to use for sequencing quality control.", "help_text": "Specify which tool to use for sequencing quality control.\n\nFalco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. 
We recommend using falco with very large datasets (due to reduced memory constraints).", - "enum": [ - "fastqc", - "falco" - ], + "enum": ["fastqc", "falco"], "fa_icon": "fas fa-hammer" }, "skip_preprocessing": { @@ -331,10 +318,7 @@ "type": "string", "default": "fastp", "description": "Specify which preprocessing tool to use.", - "enum": [ - "fastp", - "adapterremoval" - ], + "enum": ["fastp", "adapterremoval"], "help_text": "Specify which preprocessing tool to use.\n\nAdapterRemoval is commonly used in palaeogenomics, however fastp has similar performance and has many additional functionality (including inbuilt complexity trimming) that can be often useful.", "fa_icon": "fas fa-hammer" }, @@ -342,7 +326,7 @@ "type": "boolean", "description": "Specify to skip read-pair merging.", "fa_icon": "fas fa-forward", - "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA, or when you want to utilise mate-pair 'spatial' information..\n\n\u26a0\ufe0f If you run this and also with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will be NOT be mapped when aligning with either bwa or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> \u26a0\ufe0f If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" + "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA, or when you want to utilise mate-pair 'spatial' information..\n\n⚠️ If you run this and also with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will be NOT be mapped when aligning with either bwa or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> ⚠️ If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" }, "preprocessing_excludeunmerged": { "type": "boolean", @@ -388,7 +372,7 @@ "type": "integer", "default": 0, "description": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation.", - "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation. 
By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`\n", + "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n⚠️ when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`\n", "fa_icon": "fas fa-cut" }, "preprocessing_trim3p": { @@ -396,7 +380,7 @@ "default": 0, "description": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation.", "fa_icon": "fas fa-cut", - "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. 
By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`\n" + "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n⚠️ when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`\n" }, "preprocessing_savepreprocessedreads": { "type": "boolean", @@ -468,12 +452,7 @@ "mapping_tool": { "type": "string", "default": "bowtie2", - "enum": [ - "bwaaln", - "bwamem", - "bowtie2", - "circularmapper" - ], + "enum": ["bwaaln", "bwamem", "bowtie2", "circularmapper"], "description": "Specify which mapper to use.", "help_text": "Specify which mapping tool to use. Options are BWA aln ('`bwaaln`'), BWA mem ('`bwamem`'), circularmapper ('`circularmapper`'), or bowtie2 ('`bowtie2`'). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. 
CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming). Default is 'bwaaln'\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)", "fa_icon": "fas fa-layer-group" @@ -560,12 +539,12 @@ "default": 4, "fa_icon": "fas fa-flag", "description": "Specify the SAM format flag of reads to remove during BAM filtering for downstream genomic steps. Generally not recommended to change.", - "help_text": "You can use this to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file to for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> \u26a0\ufe0f Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies tool parameter(s):\n> - SAMtools: `-F`" + "help_text": "You can use this to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file to for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> ⚠️ Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies tool parameter(s):\n> - SAMtools: `-F`" }, "bamfiltering_retainunmappedgenomicbam": { "type": "boolean", "description": "Specify to retain unmapped reads in the BAM file used for downstream genomic analyses.", - "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", + "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. 
By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> ⚠️ This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", "fa_icon": "fas fa-piggy-bank" }, "bamfiltering_generateunmappedfastq": { @@ -605,150 +584,9 @@ "type": "string", "default": "unmapped", "description": "Specify which type of reads to go into metagenomic screening.", - "enum": [ - "unmapped", - "mapped", - "all" - ], + "enum": ["unmapped", "mapped", "all"], "fa_icon": "fas fa-hand-pointer", - "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction), or all reads.\n\n> \u26a0\ufe0f If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`" - }, - "metagenomics_profiling_tool": { - "type": "string", - "description": "Specify which tool to use for metagenomic profiling and screening.", - "enum": ["malt", "metaphlan2", "kraken2", "krakenuniq"], - "fa_icon": "fas fa-toolbox", - "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_input. Which tool to use will depend on your specific context, as each tool uses a different method and database. See literature of the tools for recommendations" - }, - "metagenomics_profiling_database": { - "type": "string", - "format": "directory-path", - "description": "Specify the path to a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.", - "fa_icon": "fas fa-database", - "help_text": "Specify your metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." 
- }, - "metagenomics_krakenuniq_save_reads": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Turn on saving of KrakenUniq-aligned reads", - "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`" - }, - "metagenomics_krakenuniq_save_read_classifications": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file", - "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`" - }, - "metagenomics_krakenuniq_ram_chunk_size": { - "type": "string", - "default": "16G", - "description": "Specify how large to chunk database when loading into memory for KrakenUniq", - "fa_icon": "fas fa-database", - "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database to with this parameter, and is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n" - }, - "metagenomics_kraken2_save_reads": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Turn on saving of Kraken2-aligned reads", - "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - kraken2: `--classified-out` and `--unclassified-out`" - }, - "metagenomics_kraken2_save_readclassification": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Turn on saving of Kraken2 per-read taxonomic assignment file", - "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`" - }, - "metagenomics_kraken2_save_minimizers": { - "type": "boolean", - "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.", - "fa_icon": "fas fa-save", - "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command." - }, - "metagenomics_malt_mode": { - "type": "string", - "default": "BlastN", - "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", - "fa_icon": "fas fa-align-left", - "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. 
Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", - "enum": ["BlastN", "BlastP", "BlastX"] - }, - "metagenomics_malt_alignment_mode": { - "type": "string", - "default": "SemiGlobal", - "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", - "fa_icon": "fas fa-align-center", - "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", - "enum": ["Local", "SemiGlobal"] - }, - "metagenomics_malt_min_percent_identity": { - "type": "integer", - "default": 85, - "description": "Percent identity value threshold for MALT.", - "fa_icon": "fas fa-id-card", - "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" - }, - "metagenomics_malt_top_percent": { - "type": "integer", - "default": 1, - "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", - "fa_icon": "fas fa-percent", - "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" - }, - "metagenomics_malt_min_support_mode": { - "type": "string", - "default": "percent", - "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.", - "fa_icon": "fas fa-drumstick-bite", - "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", - "enum": ["percent", "reads"] - }, - "metagenomics_malt_min_support_percent": { - "type": "number", - "default": 0.01, - "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", - "fa_icon": "fas fa-percentage", - "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" - }, - "metagenomics_malt_min_support_reads": { - "type": "integer", - "default": 1, - "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt. Not compatible with --malt_min_support_mode 'percent'.", - "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. 
Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" - }, - "metagenomics_malt_max_queries": { - "type": "integer", - "default": 100, - "description": "Specify the maximum number of queries a read can have for MALT.", - "fa_icon": "fas fa-phone", - "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" - }, - "metagenomics_malt_memory_mode": { - "type": "string", - "default": "load", - "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", - "fa_icon": "fas fa-memory", - "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", - "enum": ["load", "page", "map"] - }, - "metagenomics_malt_sam_output": { - "type": "boolean", - "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", - "fa_icon": "fas fa-file-alt", - "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" - }, - "metagenomics_malt_save_reads": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Turn on saving of MALT-aligned reads", - "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produce by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`" - }, - "metagenomics_malt_group_size": { - "type": "integer", - "default": 0, - "description": "Define group sizes for running multiple fastq files into malt.", - "fa_icon": "fas fa-barcode", - "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." 
+            "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g., when doing competitive mapping against a genomic reference of multiple genomes and wish to apply LCA correction), or all reads.\n\n> ⚠️ If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in the results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`"
        },
        "run_metagenomics_complexityfiltering": {
            "type": "boolean",
            "fa_icon": "fas fa-filter",
            "help_text": "Turns on a subworkflow of the pipeline that filters the fastq-files for complexity before the metagenomics profiling.\nUse the metagenomics_complexity_tool parameter to select a method.",
            "description": "Run a complexity filter on the metagenomics input files before classification. Specify the tool to use with the `metagenomics_complexity_tool` parameter, save with `metagenomics_complexity_savefastq`"
        },
-        "metagenomics_complexity_savefastq": {
-            "type": "boolean",
-            "fa_icon": "fas fa-save",
-            "description": "Save FASTQ files containing the complexity filtered reads (before metagenomic classification).",
-            "help_text": "Save the complexity-filtered fastq-files to the results directory"
-        },
        "metagenomics_complexity_tool": {
            "type": "string",
            "default": "bbduk",
            "description": "Specify which tool to use for trimming, filtering, or reformatting of fastq reads that go into metagenomics screening.",
-            "enum": [
-                "bbduk",
-                "prinseq"
-            ],
-            "fa_icon": "fas fa-hand-pointer",
+            "enum": ["bbduk", "prinseq"],
+            "fa_icon": "fas fa-toolbox",
            "help_text": "You can select which tool is used to generate a final set of reads for the metagenomic classifier after any necessary trimming, filtering or reformatting of the reads.\n\nThis intermediate file is not saved in the results directory, unless marked with `--metagenomics_complexity_savefastq`."
        },
+        "metagenomics_complexity_savefastq": {
+            "type": "boolean",
+            "fa_icon": "fas fa-save",
+            "description": "Save FASTQ files containing the complexity filtered reads (before metagenomic classification).",
+            "help_text": "Save the complexity-filtered fastq-files to the results directory"
+        },
        "metagenomics_complexity_entropy": {
            "type": "number",
            "fa_icon": "fas fa-sort-numeric-up",
@@ -783,10 +618,7 @@
        "metagenomics_prinseq_mode": {
            "type": "string",
            "default": "entropy",
-            "enum": [
-                "entropy",
-                "dust"
-            ],
+            "enum": ["entropy", "dust"],
            "fa_icon": "fas fa-check-square",
            "description": "Specify the complexity filter mode for PRINSEQ++",
            "help_text": "Specify the complexity filter mode for PRINSEQ++.\n\nUse the selected mode together with the correct flag:\n'dust' requires the `--metagenomics_prinseq_dustscore` parameter set\n'entropy' requires the `--metagenomics_complexity_entropy` parameter set\n\n> Sets one of the tool parameter(s):\n> - PRINSEQ++: `-lc_entropy`\n> - PRINSEQ++: `-lc_dust`"
@@ -801,12 +633,7 @@
        "metagenomics_profiling_tool": {
            "type": "string",
            "description": "Specify which tool to use for metagenomic profiling and screening.",
-            "enum": [
-                "malt",
-                "metaphlan2",
-                "kraken2",
-                "krakenuniq"
-            ],
+            "enum": ["malt", "metaphlan2", "kraken2", "krakenuniq"],
            "fa_icon": "fas fa-toolbox",
            "help_text": "Select which tool to run metagenomics profiling with on the designated metagenomics_screening_input. These tools behave vastly differently, as they perform read profiling using different methods, and they yield vastly different results."
        },
        "metagenomics_profiling_database": {
            "type": "string",
            "format": "directory-path",
            "description": "Specify the path to a database directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.",
            "fa_icon": "fas fa-database",
            "help_text": "Select which metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size."
        },
-        "metagenomics_profiling_krakenuniq_save_reads": {
+        "metagenomics_kraken_save_reads": {
            "type": "boolean",
            "fa_icon": "fas fa-save",
-            "description": "Turn on saving of KrakenUniq-aligned reads",
+            "description": "Turn on saving of reads assigned by KrakenUniq or Kraken2",
            "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`"
        },
-        "metagenomics_profiling_krakenuniq_save_read_classifications": {
+        "metagenomics_kraken_save_read_classifications": {
            "type": "boolean",
            "fa_icon": "fas fa-save",
-            "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
+            "description": "Turn on saving of KrakenUniq or Kraken2 per-read taxonomic assignment file",
            "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that the read received.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
        },
-        "metagenomics_profiling_krakenuniq_ram_chunk_size": {
+        "metagenomics_krakenuniq_ram_chunk_size": {
            "type": "string",
            "default": "16G",
            "description": "Specify how large to chunk the database when loading into memory for KrakenUniq",
            "fa_icon": "fas fa-database",
            "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database to with this parameter, which is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n"
        },
-        "metagenomics_profiling_kraken2_save_reads": {
-            "type": "boolean",
-            "fa_icon": "fas fa-save",
-            "description": "Turn on saving of Kraken2-aligned reads",
-            "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - kraken2: `--classified-out` and `--unclassified-out`"
-        },
-        "metagenomics_profiling_kraken2_save_readclassification": {
-            "type": "boolean",
-            "fa_icon": "fas fa-save",
-            "description": "Turn on saving of Kraken2 per-read taxonomic assignment file",
-            "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`"
-        },
-        "metagenomics_profiling_kraken2_save_minimizers": {
+        "metagenomics_kraken2_save_minimizers": {
            "type": "boolean",
            "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.",
            "fa_icon": "fas fa-save",
            "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command."
        },
-        "metagenomics_profiling_malt_mode": {
+        "metagenomics_malt_mode": {
            "type": "string",
            "default": "BlastN",
            "description": "Specify which alignment mode to use for MALT. 
Options: 'BlastN', 'BlastP', 'BlastX'.",
            "fa_icon": "fas fa-align-left",
            "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n",
-            "enum": [
-                "BlastN",
-                "BlastP",
-                "BlastX"
-            ]
+            "enum": ["BlastN", "BlastP", "BlastX"]
        },
-        "metagenomics_profiling_malt_alignment_mode": {
+        "metagenomics_malt_alignment_mode": {
            "type": "string",
            "default": "SemiGlobal",
            "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.",
            "fa_icon": "fas fa-align-center",
            "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST-like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`",
-            "enum": [
-                "Local",
-                "SemiGlobal"
-            ]
+            "enum": ["Local", "SemiGlobal"]
        },
-        "metagenomics_profiling_malt_min_percent_identity": {
+        "metagenomics_malt_min_percent_identity": {
            "type": "integer",
            "default": 85,
            "description": "Percent identity value threshold for MALT.",
            "fa_icon": "fas fa-id-card",
            "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`"
        },
-        "metagenomics_profiling_malt_top_percent": {
+        "metagenomics_malt_top_percent": {
            "type": "integer",
            "default": 1,
            "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).",
            "fa_icon": "fas fa-percent",
            "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`"
        },
-        "metagenomics_profiling_malt_min_support_mode": {
+        "metagenomics_malt_min_support_mode": {
            "type": "string",
            "default": "percent",
            "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.",
            "fa_icon": "fas fa-drumstick-bite",
            "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`",
-            "enum": [
-                "percent",
-                "reads"
-            ]
+            "enum": ["percent", "reads"]
        },
-        "metagenomics_profiling_malt_min_support_percent": {
+        "metagenomics_malt_min_support_percent": {
            "type": "number",
            "default": 0.01,
            "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.",
            "fa_icon": "fas fa-percentage",
            "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. 
This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`"
        },
-        "metagenomics_profiling_malt_min_support_reads": {
+        "metagenomics_malt_min_support_reads": {
            "type": "integer",
            "default": 1,
            "description": "Specify a minimum number of reads a taxon of the sample total is required to have to be retained in MALT. Not compatible with --malt_min_support_mode 'percent'.",
            "fa_icon": "fas fa-sort-numeric-up-alt",
            "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n"
        },
-        "metagenomics_profiling_malt_max_queries": {
+        "metagenomics_malt_max_queries": {
            "type": "integer",
            "default": 100,
            "description": "Specify the maximum number of queries a read can have for MALT.",
            "fa_icon": "fas fa-phone",
            "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`"
        },
-        "metagenomics_profiling_malt_memory_mode": {
+        "metagenomics_malt_memory_mode": {
            "type": "string",
            "default": "load",
            "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as it can be very slow. Options: 'load', 'page', 'map'.",
            "fa_icon": "fas fa-memory",
            "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior to seed look up; this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior to entire\ndatabase loading. Note that Page and Map modes do not work properly with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`",
-            "enum": [
-                "load",
-                "page",
-                "map"
-            ]
+            "enum": ["load", "page", "map"]
        },
-        "metagenomics_profiling_malt_sam_output": {
+        "metagenomics_malt_sam_output": {
            "type": "boolean",
            "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and the files are gzipped. Note this will result in very large file sizes.",
            "fa_icon": "fas fa-file-alt",
            "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. 
\n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" }, - "metagenomics_profiling_malt_save_reads": { + "metagenomics_malt_save_reads": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of MALT-aligned reads", "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produce by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`" }, - "metagenomics_profiling_malt_group_size": { + "metagenomics_malt_group_size": { "type": "integer", "default": 0, "description": "Define group sizes for running multiple fastq files into malt.", "fa_icon": "fas fa-barcode", - "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_profiling_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." + "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." } }, "fa_icon": "fas fa-search" @@ -972,11 +773,8 @@ "type": "string", "default": "markduplicates", "description": "Specify which tool to use for deduplication.", - "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` (Peltzer et al. 2016) is offered. The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> \u26a0\ufe0f DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.", - "enum": [ - "markduplicates", - "dedup" - ], + "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` (Peltzer et al. 2016) is offered. The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> ⚠️ DeDup can only be used on collapsed (i.e. 
merged) reads from paired-end sequencing.", + "enum": ["markduplicates", "dedup"], "fa_icon": "fas fa-layer-group" } }, @@ -1023,10 +821,7 @@ "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf):\n\nc curve is used to compute the expected complexity curve of a mapped read file with a hypergeometric formula\n\nlc extrap is used to generate the expected yield for theoretical larger experiments and bounds on the number of distinct reads in the library and the associated confidence intervals, which is computed by bootstrapping the observed duplicate counts histogram.", "description": "Specify which mode of preseq to run.", "fa_icon": "fas fa-toggle-on", - "enum": [ - "c_curve", - "lc_extrap" - ] + "enum": ["c_curve", "lc_extrap"] }, "mapstats_preseq_stepsize": { "type": "integer", diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 4fba84f1f..da47d0b8a 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -131,7 +131,7 @@ workflow METAGENOMICS_PROFILING { db: db } // Hardcode to _always_ produce the report file (which is our basic output, and goes into) - KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_krakenuniq_save_reads, true, params.metagenomics_krakenuniq_save_readclassifications ) + KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_kraken_save_reads, true, params.metagenomics_kraken_save_readclassifications ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) @@ -142,7 +142,7 @@ workflow METAGENOMICS_PROFILING { if ( params.metagenomics_profiling_tool == 'kraken2' ) { - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken2_save_reads, params.metagenomics_kraken2_save_readclassification ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken_save_reads, params.metagenomics_kraken_save_readclassification ) ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) From 19e0a62d90172e440a224c43c2fdefe52a9ec1ae Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Apr 2023 09:40:45 +0000 Subject: [PATCH 016/198] consistency check for postprocessing metagenomics --- workflows/eager.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/eager.nf b/workflows/eager.nf index 35715c784..139cb3c98 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -43,6 +43,10 @@ if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmer // TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing if ( params.run_metagenomics && ! 
params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") }
+if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") }
+
+if ( params.metagenomics_postprocessing_tool == 'krakenparse' && params.metagenomics_profiling_tool != 'kraken2' && params.metagenomics_profiling_tool != 'krakenuniq' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenparse' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") }
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     Report possible warnings

From ad070569557dab86173de082fc7ca323b2a3eda8 Mon Sep 17 00:00:00 2001
From: Ian Light
Date: Fri, 28 Apr 2023 09:45:05 +0000
Subject: [PATCH 017/198] partial implem of subwrkflw metagenom_postpr

---
 subworkflows/local/metagenomics.nf           | 12 +++-
 .../local/metagenomics_postprocessing.nf     | 30 +++++-----
 subworkflows/local/metagenomics_profiling.nf | 56 +++++++++----------
 3 files changed, 54 insertions(+), 44 deletions(-)

diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf
index e0fb72e58..fb45a033b 100644
--- a/subworkflows/local/metagenomics.nf
+++ b/subworkflows/local/metagenomics.nf
@@ -1,5 +1,6 @@
 include { METAGENOMICS_COMPLEXITYFILTER } from './metagenomics_complexityfilter'
 include { METAGENOMICS_PROFILING } from './metagenomics_profiling'
+include { METAGENOMICS_POSTPROCESSING } from './metagenomics_postprocessing'

 workflow METAGENOMICS {
     take: ch_bamfiltered_for_metagenomics
@@ -36,9 +37,16 @@ workflow METAGENOMICS {
     ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions.first() )
     ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) )

+    //
+    // Run the post profiling subworkflow
+    //
+    if ( params.metagenomics_postprocessing_tool == 'maltextract' || params.metagenomics_postprocessing_tool == 'krakenparse' ) {
+        METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input )
+        ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions.first() )
+        ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_POSTPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
+    }
+
     emit:
     ch_versions = ch_versions
     ch_multiqc_files = ch_multiqc_files
-
-}
\ No newline at end of file
+}

diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf
index cd213b78e..bbd1f965e 100644
--- a/subworkflows/local/metagenomics_postprocessing.nf
+++ b/subworkflows/local/metagenomics_postprocessing.nf
@@ -4,33 +4,37 @@
 // https://nf-co.re/join
 // TODO nf-core: A subworkflow SHOULD import at least two modules

-include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main'
-include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main'
+include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main'

 workflow METAGENOMICS_POSTPROCESSING {

    take:
    // TODO nf-core: edit input (take) channels
-    ch_bam // channel: [ val(meta), [ bam ] ]
+    ch_postprocessing_input // different between kraken and malt

    main:

-    ch_versions = Channel.empty()
+    ch_versions      = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+    ch_results       = Channel.empty()

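+    // Expected input shapes, as currently mixed into `postprocessing_input` by
+    // METAGENOMICS_PROFILING (a sketch for orientation only; not enforced here):
+    //   malt:                 [ [ meta ], rma6 ]   - one RMA6 alignment file per element
+    //   kraken2 / krakenuniq: [ [ meta ], report ] - the per-sample classification report
+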
     // TODO nf-core: substitute modules here for the modules of your subworkflow

-    SAMTOOLS_SORT ( ch_bam )
-    ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first())
-
-    SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam )
-    ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first())
+    if ( params.metagenomics_postprocessing_tool == 'maltextract' ) {
+        MALTEXTRACT ( ch_postprocessing_input, params.taxon_list, params.ncbi_dir )
+        ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first() )
+        ch_results  = ch_results.mix( MALTEXTRACT.out.results )
+    }
+    else if ( params.metagenomics_postprocessing_tool == 'krakenparse' ) {
+        // TODO: @merlin finish implementation/merge with your implementation
+        // (note: the KRAKENPARSE module is not yet included above)
+        KRAKENPARSE ( ch_postprocessing_input )
+        ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first() )
+        ch_results  = ch_results.mix( KRAKENPARSE.out.results )
+    }

     emit:
     // TODO nf-core: edit emitted channels
-    bam      = SAMTOOLS_SORT.out.bam           // channel: [ val(meta), [ bam ] ]
-    bai      = SAMTOOLS_INDEX.out.bai          // channel: [ val(meta), [ bai ] ]
-    csi      = SAMTOOLS_INDEX.out.csi          // channel: [ val(meta), [ csi ] ]
+    versions          = ch_versions
+    results_directory = ch_results

-    versions = ch_versions                     // channel: [ versions.yml ]
 }

diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf
index 4fba84f1f..1244ced5e 100644
--- a/subworkflows/local/metagenomics_profiling.nf
+++ b/subworkflows/local/metagenomics_profiling.nf
@@ -21,8 +21,7 @@ workflow METAGENOMICS_PROFILING {
     ch_raw_classifications = Channel.empty()
     ch_raw_profiles        = Channel.empty()
     ch_multiqc_files       = Channel.empty()
-    // TODO: malt, metaphylan, kraken2, krakenuniq
-    // TODO: maltextract, krakenparse
+    ch_postprocessing_input = Channel.empty()

     /*
         PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
     */

-    // Each tool as a slightly different input structure and generally separate
+    // Each tool has a slightly different input structure and generally separate
     // input channels for reads vs database. We restructure the channel tuple
     // for each tool and make liberal use of multiMap to keep reads/database
     // channel element order in sync with each other
@@ -87,8 +86,9 @@ workflow METAGENOMICS_PROFILING {
         }
     }

-    // MALT: We groupTuple to have all samples in one channel for MALT as database
-    // loading takes a long time, so we only want to run it once per database, unless otherwise specified (eg grouping samples)
+    // MALT: We can groupTuple to have all samples in one channel for MALT as database
+    // since loading takes a long time, we only want to run it once per database
+    // unless otherwise specified (eg grouping samples)

         MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database )
@@ -105,12 +105,13 @@ workflow METAGENOMICS_PROFILING {
             [ meta_new, rma ]
         }

-        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
-        ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan )
-        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log )
+        ch_versions             = ch_versions.mix( MALT_RUN.out.versions.first() )
+        ch_raw_classifications  = ch_raw_classifications.mix( ch_maltrun_for_megan )
+        ch_multiqc_files        = ch_multiqc_files.mix( MALT_RUN.out.log )
+        ch_postprocessing_input = ch_postprocessing_input.mix( ch_maltrun_for_megan )
     }

-    if ( params.metagenomics_profiling_tool == 'metaphlan3' ) {
+    else if ( params.metagenomics_profiling_tool == 'metaphlan3' ) {

         METAPHLAN3_METAPHLAN3 ( reads , database )
         ch_versions = ch_versions.mix( METAPHLAN3_METAPHLAN3.out.versions.first() )
         ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3_METAPHLAN3.out.profile )

     }

-    if ( params.metagenomics_profiling_tool == 'krakenuniq' ) {
+    else if ( params.metagenomics_profiling_tool == 'krakenuniq' ) {

         ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq
             .map {
                 meta, reads, db_meta, db ->
                 [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
             }
             .groupTuple(by: [0,2,3])
             .multiMap {
                 single_meta, reads, db_meta, db ->
                     reads: [ single_meta + db_meta, reads.flatten() ]
                     db: db
             }
         // Hardcode to _always_ produce the report file (which 
is our basic output, and goes into) KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_krakenuniq_save_reads, true, params.metagenomics_krakenuniq_save_readclassifications ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) - ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) - ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - + ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_postprocessing_input = ch_postprocessing_input.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) } - if ( params.metagenomics_profiling_tool == 'kraken2' ) { + else if ( params.metagenomics_profiling_tool == 'kraken2' ) { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken2_save_reads, params.metagenomics_kraken2_save_readclassification ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) - ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) - ch_raw_profiles = ch_raw_profiles.mix( - KRAKEN2_KRAKEN2.out.report - // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use. - // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names. 
- .map { meta, report -> [meta + [tool: 'kraken2'], report]} - ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report ) + ch_postprocessing_input = ch_postprocessing_input.mix( KRAKEN2_KRAKEN2.out.report ) } emit: - versions = ch_versions // channel: [ versions.yml ] - classifications = ch_raw_classifications - profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom - mqc = ch_multiqc_files + versions = ch_versions // channel: [ versions.yml ] + classifications = ch_raw_classifications + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom + postprocessing_input = ch_postprocessing_input + mqc = ch_multiqc_files } From 243b779043abd324b7518f08416d81770060552b Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 28 Apr 2023 11:36:24 +0200 Subject: [PATCH 018/198] WIP: Fix metagenomics subworkflow import - WIP: prepare kraken input channel --- subworkflows/local/metagenomics.nf | 2 +- subworkflows/local/metagenomics_profiling.nf | 30 ++++++++++---------- workflows/eager.nf | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index e0fb72e58..083f573a8 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -37,7 +37,7 @@ workflow METAGENOMICS { ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) emit: - ch_versions = ch_versions + versions = ch_versions ch_multiqc_files = ch_multiqc_files diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index da47d0b8a..d6d0eaf42 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -11,9 +11,8 @@ include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf workflow METAGENOMICS_PROFILING { - take: - reads // channel: [ [ meta ] , [ reads ] ] - database // channel: [ [ meta ] , path ] + take: reads // channel: [ [ meta ] , [ reads ] ] + take: database // channel: [ [ meta ] , path ] main: @@ -119,17 +118,18 @@ workflow METAGENOMICS_PROFILING { } if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { - ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq - .map { - meta, reads, db_meta, db -> - [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] - } - .groupTuple(by: [0,2,3]) - .multiMap { - single_meta, reads, db_meta, db -> - reads: [ single_meta + db_meta, reads.flatten() ] - db: db - } + reads.view() + /*reads = reads + .map { + meta, reads -> + [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] + } + .groupTuple(by: [0,2,3]) + .multiMap { + single_meta, reads, db_meta, db -> + reads: [ single_meta + db_meta, reads.flatten() ] + db: db + } // Hardcode to _always_ produce the report file (which is our basic output, and goes into) KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_kraken_save_reads, true, params.metagenomics_kraken_save_readclassifications ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) @@ -137,7 +137,7 @@ workflow 
METAGENOMICS_PROFILING { ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - + */ } if ( params.metagenomics_profiling_tool == 'kraken2' ) { diff --git a/workflows/eager.nf b/workflows/eager.nf index 35715c784..4d1fb519e 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -79,7 +79,7 @@ include { PREPROCESSING } from '../subworkflows/local/preprocessing' include { MAP } from '../subworkflows/local/map' include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' include { DEDUPLICATE } from '../subworkflows/local/deduplicate' -include { METAGENOMICS } from '../subworkflows/local/deduplicate' +include { METAGENOMICS } from '../subworkflows/local/metagenomics' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From a839b620ba37b221be40756b1f577fdaa6d8e32c Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Apr 2023 09:55:46 +0000 Subject: [PATCH 019/198] updated param name for kraken_save_reads all files --- subworkflows/local/metagenomics_profiling.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index f2f560970..7289ea0c6 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -132,7 +132,7 @@ workflow METAGENOMICS_PROFILING { db: db } // Hardcode to _always_ produce the report file (which is our basic output, and goes into) - KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_krakenuniq_save_reads, true, params.metagenomics_krakenuniq_save_readclassifications ) + KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_kraken_save_reads, true, params.metagenomics_krakenuniq_save_readclassifications ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) @@ -143,7 +143,7 @@ workflow METAGENOMICS_PROFILING { else if ( params.metagenomics_profiling_tool == 'kraken2' ) { - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken2_save_reads, params.metagenomics_kraken2_save_readclassification ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken_save_reads, params.metagenomics_kraken_save_read_classifications ) ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) From faedcef1f12691200d69d67fddd0a59d25d40e66 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 5 May 2023 11:58:36 +0200 Subject: [PATCH 020/198] Complete krakenuniq workflow This commit completes the krakenuniq workflow, tested with the test profile. Other classifiers not yet touched! 
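In sketch form, the per-sample wiring added below first pairs every sample
with the single database path and then splits the tuple again (channel shapes
noted as comments; `krakenuniq_reads`/`krakenuniq_db` are the intermediate
names used in the diff):

    reads = reads.combine(database)                                       // [ meta, reads, db ]
    krakenuniq_reads = reads.map{ meta, reads, database -> [meta, reads] } // [ meta, reads ]
    krakenuniq_db    = reads.map{ meta, reads, database -> [database] }    // [ db ]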
--- conf/modules.config | 6 +-- docs/development/manual_tests.md | 22 +++++++++ subworkflows/local/metagenomics.nf | 4 +- subworkflows/local/metagenomics_profiling.nf | 48 ++++++++++---------- workflows/eager.nf | 7 --- 5 files changed, 50 insertions(+), 37 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 17b0701b3..cbb850bd1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -456,7 +456,7 @@ process { ext.prefix = { "${meta.id}_${meta.library_id}_complexity" } publishDir = [ [ - path: { "${params.outdir}/metagenomic_complexity_filter/" }, + path: { "${params.outdir}/metagenomics_screening/complexity_filter/prinseq" }, mode: params.publish_dir_mode, pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}', enabled: params.metagenomics_complexity_savefastq @@ -468,7 +468,7 @@ process { ext.prefix = { "${meta.id}_${meta.library_id}_complexity" } ext.args = { "entropymask=f entropy=${params.metagenomics_complexity_entropy}" } publishDir = [ - path: { "${params.outdir}/metagenomic_complexity_filter/" }, + path: { "${params.outdir}/metagenomics_screening/complexity_filter/bbduk/" }, mode: params.publish_dir_mode, pattern: '*.{fastq.gz,log}', enabled: params.metagenomics_complexity_savefastq @@ -511,7 +511,7 @@ process { ] } - withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + withName: ".*KRAKENUNIQ_PRELOADEDKRAKENUNIQ" { publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/krakenuniq/" }, mode: params.publish_dir_mode, diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 5449d1c46..ee1ed4790 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -389,3 +389,25 @@ nextflow run main.nf -profile docker,test --outdir ./results/AR_dedup_merged -du ## Expect: deduplication directory with a single bam,bai,flagstat for the library (3 files total). Flagstat for each library should include fewer mapped reads than the mapped bam version. Check that duplicate at NC_007596.2:187-187 is removed. 
nextflow run main.nf -profile docker,test --input ~/eager_dsl2_testing/input/only_PE/pe_only.tsv --outdir ./results/AR_dedup_merged_PE_only -dump-channels -ansi-log false --preprocessing_tool 'adapterremoval' --deduplication_tool 'dedup' --preprocessing_excludeunmerged -resume ``` + +## Metagenomics + +### Complexityfilter + +### Profiling + +#### Krakenuniq + +```bash +### With saved reads +# Use only the -profile test dataset, provide a custom kraken database +# Expected: directory with 2 fastq-files and 1 textfile for each sample, containing classified, unclassified reads and the raw krakenuniq profile +nextflow run main.nf -profile test,singularity --outdir out --run_metagenomics --metagenomics_profiling_tool krakenuniq --metagenomics_profiling_database ../runtest/refseq_rel215/kraken/Mito_db_kmer22/ --metagenomics_kraken_save_reads +``` + +```bash +### Without saved reads +# Use only the -profile test dataset, provide a custom kraken database +# Expected: directory with 1 textfile for each sample: the raw krakenuniq profile +nextflow run main.nf -profile test,singularity --outdir out --run_metagenomics --metagenomics_profiling_tool krakenuniq --metagenomics_profiling_database ../runtest/refseq_rel215/kraken/Mito_db_kmer22/ +``` diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 083f573a8..7344721a1 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -30,9 +30,10 @@ workflow METAGENOMICS { // Run the profiling subworkflow // - database = params.metagenomics_profiling_database + database = Channel.fromPath(params.metagenomics_profiling_database) METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) + ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) @@ -40,5 +41,4 @@ workflow METAGENOMICS { versions = ch_versions ch_multiqc_files = ch_multiqc_files - } \ No newline at end of file diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index d6d0eaf42..793e09fbe 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -11,23 +11,21 @@ include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf workflow METAGENOMICS_PROFILING { - take: reads // channel: [ [ meta ] , [ reads ] ] - take: database // channel: [ [ meta ] , path ] + take: + reads + database main: - - ch_versions = Channel.empty() - ch_raw_classifications = Channel.empty() - ch_raw_profiles = Channel.empty() - ch_multiqc_files = Channel.empty() - // TODO: malt, metaphylan, kraken2, krakenuniq - // TODO: maltextract, krakenparse + ch_versions = Channel.empty() + ch_raw_classifications = Channel.empty() + ch_raw_profiles = Channel.empty() + ch_multiqc_files = Channel.empty() /* PREPARE PROFILER INPUT CHANNELS & RUN PROFILING */ - // Each tool as a slightly different input structure and generally separate + // Each tool has a slightly different input structure and generally separate // input channels for reads vs database. 
We restructure the channel tuple // for each tool and make liberal use of multiMap to keep reads/database // channel element order in sync with each other @@ -118,26 +116,26 @@ workflow METAGENOMICS_PROFILING { } if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { - reads.view() - /*reads = reads - .map { - meta, reads -> - [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] - } - .groupTuple(by: [0,2,3]) - .multiMap { - single_meta, reads, db_meta, db -> - reads: [ single_meta + db_meta, reads.flatten() ] - db: db - } - // Hardcode to _always_ produce the report file (which is our basic output, and goes into) - KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_kraken_save_reads, true, params.metagenomics_kraken_save_readclassifications ) + // run kraken uniq per sample, to preserve the meta-data + + reads = reads.combine(database) + krakenuniq_reads = reads.map{meta, reads, database -> [meta, reads]} + krakenuniq_db = reads.map{meta, reads, database -> [database]} + + KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( + krakenuniq_reads, + krakenuniq_db, + params.metagenomics_krakenuniq_ram_chunk_size, + params.metagenomics_kraken_save_reads, + true, + params.metagenomics_kraken_save_read_classifications + ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - */ } if ( params.metagenomics_profiling_tool == 'kraken2' ) { diff --git a/workflows/eager.nf b/workflows/eager.nf index 4d1fb519e..32a8f730c 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -286,13 +286,6 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix(PRESEQ_LCEXTRAP.out.lc_extrap.collect{it[1]}.ifEmpty([])) ch_versions = ch_versions.mix( PRESEQ_LCEXTRAP.out.versions ) } - // SUBWORKFLOW: metagenomics screening - // - //TODO: finish and figure out how exactly to call with proper database (check via a helper function?) 
- if ( params.run_metagenomics ) { - METAGENOMICS_PROFILING ( ch_bamfiltered_for_metagenomics, params.metagenomics_profiling_database ) // TODO: implement full metagenomics screening main subworkflow - } - // that then calls complexityfilter, profiling, postprocessing // // MODULE: MultiQC From b26ea9a3941ff6bdbb1aec7ebadcbcd29ceffe9e Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 5 May 2023 10:02:24 +0000 Subject: [PATCH 021/198] partial implementation of krakenparse --- bin/kraken_parse.py | 100 ++++++++++++++++++ conf/modules.config | 2 +- modules/local/filter_bam_fragment_length.nf | 2 +- modules/local/krakenparse.nf | 69 ++++++++++++ nextflow.config | 52 ++++----- nextflow_schema.json | 6 +- subworkflows/local/metagenomics.nf | 14 +-- .../local/metagenomics_postprocessing.nf | 9 +- workflows/eager.nf | 2 +- 9 files changed, 213 insertions(+), 43 deletions(-) create mode 100644 bin/kraken_parse.py create mode 100644 modules/local/krakenparse.nf diff --git a/bin/kraken_parse.py b/bin/kraken_parse.py new file mode 100644 index 000000000..20533e2ee --- /dev/null +++ b/bin/kraken_parse.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +# Written by Maxime Borry and released under the MIT license. +# See git repository (https://github.com/nf-core/eager) for full license text. + +import argparse +import csv + +def _get_args(): + '''This function parses and return arguments passed in''' + parser = argparse.ArgumentParser( + prog='kraken_parse', + formatter_class=argparse.RawDescriptionHelpFormatter, + description='Parsing kraken') + parser.add_argument('krakenReport', help="path to kraken report file") + parser.add_argument( + '-c', + dest="count", + default=50, + help="Minimum number of hits on clade to report it. Default = 50") + parser.add_argument( + '-or', + dest="readout", + default=None, + help="Read count output file. Default = .read_kraken_parsed.csv") + parser.add_argument( + '-ok', + dest="kmerout", + default=None, + help="Kmer Output file. 
Default = .kmer_kraken_parsed.csv") + + args = parser.parse_args() + + infile = args.krakenReport + countlim = int(args.count) + readout = args.readout + kmerout = args.kmerout + + return(infile, countlim, readout, kmerout) + + +def _get_basename(file_name): + if ("/") in file_name: + basename = file_name.split("/")[-1].split(".")[0] + else: + basename = file_name.split(".")[0] + return(basename) + + +def parse_kraken(infile, countlim): + ''' + INPUT: + infile (str): path to kraken report file + countlim (int): lowest count threshold to report hit + OUTPUT: + resdict (dict): key=taxid, value=readCount + ''' + with open(infile, 'r') as f: + read_dict = {} + kmer_dict = {} + csvreader = csv.reader(f, delimiter='\t') + for line in csvreader: + reads = int(line[1]) + if reads >= countlim: + taxid = line[6] + kmer = line[3] + unique_kmer = line[4] + try: + kmer_duplicity = float(kmer)/float(unique_kmer) + except ZeroDivisionError: + kmer_duplicity = 0 + read_dict[taxid] = reads + kmer_dict[taxid] = kmer_duplicity + + return(read_dict, kmer_dict) + + +def write_output(resdict, infile, outfile): + with open(outfile, 'w') as f: + basename = _get_basename(infile) + f.write(f"TAXID,{basename}\n") + for akey in resdict.keys(): + f.write(f"{akey},{resdict[akey]}\n") + + +if __name__ == '__main__': + INFILE, COUNTLIM, readout, kmerout = _get_args() + + if not readout: + read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv" + else: + read_outfile = readout + if not kmerout: + kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv" + else: + kmer_outfile = kmerout + + read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM) + write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile) + write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile) diff --git a/conf/modules.config b/conf/modules.config index 17b0701b3..3108fd50a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -489,7 +489,7 @@ process { "-id ${params.metagenomics_malt_min_percent_identity}", "-mq ${params.metagenomics_malt_max_queries}", "--memoryMode ${params.metagenomics_malt_memory_mode}", - params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_malt_min_support_reads}", + params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_min_support_reads}", params.metagenomics_malt_sam_output ? "-a . -f SAM" : "", params.metagenomics_malt_save_reads ? "--alignments ./ -za false" : "" ].join(' ').trim() diff --git a/modules/local/filter_bam_fragment_length.nf b/modules/local/filter_bam_fragment_length.nf index abbadb87a..65f29d971 100644 --- a/modules/local/filter_bam_fragment_length.nf +++ b/modules/local/filter_bam_fragment_length.nf @@ -12,7 +12,7 @@ process FILTER_BAM_FRAGMENT_LENGTH { output: tuple val(meta), path("*filtered.bam"), emit: bam - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf new file mode 100644 index 000000000..6fb58eec4 --- /dev/null +++ b/modules/local/krakenparse.nf @@ -0,0 +1,69 @@ +// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! 
:) +// https://github.com/nf-core/modules/tree/master/modules/nf-core/ +// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: +// https://nf-co.re/join +// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. +// All other parameters MUST be provided using the "task.ext" directive, see here: +// https://www.nextflow.io/docs/latest/process.html#ext +// where "task.ext" is a string. +// Any parameters that need to be evaluated in the context of a particular sample +// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. +// TODO nf-core: Software that can be piped together SHOULD be added to separate module files +// unless there is a run-time, storage advantage in implementing in this way +// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: +// bwa mem | samtools view -B -T ref.fasta +// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty +// list (`[]`) instead of a file can be used to work around this issue. + +process KRAKENPARSE { + tag "$meta.id" + label 'process_single' + + // TODO nf-core: List required Conda package(s). + // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). + // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. + // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + input: + // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" + // MUST be provided as an input via a Groovy Map called "meta". + // This information may not be required in some instances e.g. indexing reference genome files: + // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf + // TODO nf-core: Where applicable please provide/convert compressed files as input/output + // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. + tuple val(meta), path(report) + + output: + tuple val(meta), path(read_out), emit: read_kraken_parsed + tuple val(meta), path(kmer_out), emit: kmer_kraken_parsed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + + // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 + // If the software is unable to output a version number on the command-line then it can be manually specified + // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf + // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive + // TODO default value for KRAKEN_PARSE min reads shared with MALTEXTRACT, but recommended defaults in tools is 50 vs 1, respectively: add check and warning? 
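+ // NOTE (hedged sketch, not validated against a run): for a hypothetical meta.id of
+ // 'sample1', the call below should write 'sample1.read_kraken_parsed.csv' and
+ // 'sample1.kmer_kraken_parsed.csv', with params.metagenomics_min_support_reads passed
+ // through as kraken_parse.py's `-c` minimum per-clade read count.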
+ def read_out = "${meta.id}.read_kraken_parsed.csv" + def kmer_out = "${meta.id}.kmer_kraken_parsed.csv" + """ + kraken_parse.py \\ + -c ${params.metagenomics_min_support_reads} \\ + -or $read_out \\ + -ok $kmer_out \\ + $ch_postprocessing_input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 521e33fd8..a6bc1cc8c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -109,32 +109,32 @@ params { bamfiltering_savefilteredbams = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomics = false - metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED - run_metagenomics_complexityfiltering = false - metagenomics_complexity_tool = 'bbduk' - metagenomics_complexity_savefastq = false - metagenomics_complexity_entropy = 0.3 - metagenomics_prinseq_mode = 'entropy' - metagenomics_prinseq_dustscore = 0.5 - metagenomics_profiling_tool = null - metagenomics_profiling_database = null - metagenomics_krakenuniq_ram_chunk_size = '16G' - metagenomics_kraken_save_reads = false - metagenomics_kraken_save_read_classifications = false - metagenomics_kraken2_save_minimizers = false - metagenomics_malt_mode = 'BlastN' - metagenomics_malt_alignment_mode = 'SemiGlobal' - metagenomics_malt_save_reads = false - metagenomics_malt_sam_output = false - metagenomics_malt_min_support_mode = 'percent' - metagenomics_malt_min_support_percent = 0.01 - metagenomics_malt_min_support_reads = 1 - metagenomics_malt_min_percent_identity = 85 - metagenomics_malt_top_percent = 1 - metagenomics_malt_max_queries = 100 - metagenomics_malt_memory_mode = 'load' - metagenomics_malt_group_size = 0 + run_metagenomics = false + metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + run_metagenomics_complexityfiltering = false + metagenomics_complexity_tool = 'bbduk' + metagenomics_complexity_savefastq = false + metagenomics_complexity_entropy = 0.3 + metagenomics_prinseq_mode = 'entropy' + metagenomics_prinseq_dustscore = 0.5 + metagenomics_profiling_tool = null + metagenomics_profiling_database = null + metagenomics_krakenuniq_ram_chunk_size = '16G' + metagenomics_kraken_save_reads = false + metagenomics_kraken_save_read_classifications = false + metagenomics_kraken2_save_minimizers = false + metagenomics_malt_mode = 'BlastN' + metagenomics_malt_alignment_mode = 'SemiGlobal' + metagenomics_malt_save_reads = false + metagenomics_malt_sam_output = false + metagenomics_malt_min_support_mode = 'percent' + metagenomics_malt_min_support_percent = 0.01 + metagenomics_min_support_reads = 1 + metagenomics_malt_min_percent_identity = 85 + metagenomics_malt_top_percent = 1 + metagenomics_malt_max_queries = 100 + metagenomics_malt_memory_mode = 'load' + metagenomics_malt_group_size = 0 // Deduplication options skip_deduplication = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 45d7cfded..16c5c3881 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -714,12 +714,12 @@ "fa_icon": "fas fa-percentage", "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. 
This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" }, - "metagenomics_malt_min_support_reads": { + "metagenomics_min_support_reads": { "type": "integer", "default": 1, - "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt. Not compatible with --malt_min_support_mode 'percent'.", + "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt or kraken. Not compatible with --malt_min_support_mode 'percent'.", "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" + "help_text": "For usage in malt: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \n For usage in kraken2 or krakenuniq: Specify the number of hits on a clade to retain it in the final report when using kraken_parse. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" }, "metagenomics_malt_max_queries": { "type": "integer", diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index fb45a033b..cf8802ea1 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -31,19 +31,19 @@ workflow METAGENOMICS { // Run the profiling subworkflow // - database = params.metagenomics_profiling_database - - METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) + METAGENOMICS_PROFILING( ch_reads_for_metagenomics, params.metagenomics_profiling_database ) ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) // // Run the post profiling subworkflow // - if ( params.metagenomics_postprocessing_tool == 'maltextract' || params.metagenomics_postprocessing_tool == 'krakenparse' ) { } - METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) - ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_POSTPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + + if ( params.metagenomics_postprocessing_tool ) { + METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) + ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_POSTPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + } emit: ch_versions = ch_versions diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index bbd1f965e..6229abfa5 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -5,6 +5,7 @@ // TODO nf-core: A subworkflow SHOULD import at least two modules include { MALTEXTRACT } from '../../../modules/nf-core/maltextract/main' +include { KRAKENPARSE } from '../../../modules/local/krakenparse' workflow METAGENOMICS_POSTPROCESSING { @@ -17,24 +18,24 @@ workflow 
METAGENOMICS_POSTPROCESSING { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // TODO nf-core: substitute modules here for the modules of your subworkflow - if ( params.metagenomics_postprocessing_tool == 'maltextract') { MALTEXTRACT ( ch_postprocessing_input, params.taxon_list, params.ncbi_dir ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first() ) ch_results = ch_results.mix( MALTEXTRACT.out.results ) } else if ( params.metagenomics_postprocessing_tool == 'krakenparse' ) { - // TODO: @merlin fininsh implementation/merge with your implementation KRAKENPARSE ( ch_postprocessing_input ) ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first() ) ch_results = ch_results.mix( KRAKENPARSE.out.results ) } - +// TODO check how to actually emit krakenparse output channels into one directory +// TODO check if necessary to have merge_kraken_parsed +// TODO add paths for multiqc files (maltextract and maybe kraken parse) emit: // TODO nf-core: edit emitted channels versions = ch_versions results_directory = ch_results + mqc = ch_multiqc_files } diff --git a/workflows/eager.nf b/workflows/eager.nf index 139cb3c98..c8f11a32a 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -294,7 +294,7 @@ workflow EAGER { // //TODO: finish and figure out how exactly to call with proper database (check via a helper function?) if ( params.run_metagenomics ) { - METAGENOMICS_PROFILING ( ch_bamfiltered_for_metagenomics, params.metagenomics_profiling_database ) // TODO: implement full metagenomics screening main subworkflow + METAGENOMICS ( ch_bamfiltered_for_metagenomics ) // TODO: implement full metagenomics screening main subworkflow } // that then calls complexityfilter, profiling, postprocessing From 5df230548d2928ba3b547f371c0bd21f339f872f Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 19 May 2023 10:37:35 +0000 Subject: [PATCH 022/198] untested implementation of postprocessing --- bin/merge_kraken_res.py | 81 +++++++++++++++++++ modules.json | 30 ++----- modules/local/krakenmerge.nf | 69 ++++++++++++++++ modules/nf-core/amps/main.nf | 40 +++++++++ modules/nf-core/amps/meta.yml | 66 +++++++++++++++ subworkflows/local/metagenomics.nf | 4 +- .../local/metagenomics_postprocessing.nf | 46 +++++++---- subworkflows/local/metagenomics_profiling.nf | 4 +- 8 files changed, 298 insertions(+), 42 deletions(-) create mode 100644 bin/merge_kraken_res.py create mode 100644 modules/local/krakenmerge.nf create mode 100644 modules/nf-core/amps/main.nf create mode 100644 modules/nf-core/amps/meta.yml diff --git a/bin/merge_kraken_res.py b/bin/merge_kraken_res.py new file mode 100644 index 000000000..35d0de499 --- /dev/null +++ b/bin/merge_kraken_res.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +# Written by Maxime Borry and released under the MIT license. +# Modifications for DSL2 compliance and integration into eager DSL2 release by @ilight1542 +# See git repository (https://github.com/nf-core/eager) for full license text. + +import argparse +import os +import pandas as pd +import numpy as np + +def _get_args(): + '''This function parses and return arguments passed in''' + parser = argparse.ArgumentParser( + prog='merge_kraken_res', + formatter_class=argparse.RawDescriptionHelpFormatter, + description='Merging csv count files in one table') + parser.add_argument( + '-or', + dest="readout", + default="kraken_read_count_table.csv", + help="Read count output file. 
Default = kraken_read_count_table.csv") + parser.add_argument( + '-ok', + dest="kmerout", + default="kraken_kmer_unicity_table.csv", + help="Kmer unicity output file. Default = kraken_kmer_unicity_table.csv") + parser.add_argument( + '-inr', + dest="input_read_csvs", + help="Paths to .read_ outputs from krakenparse") + parser.add_argument( + '-ink', + dest="input_kmer_csvs", + help="Paths to .kmer_ outputs from krakenparse" + ) + + args = parser.parse_args() + + read_paths_in = args.input_read_csvs + kmer_paths_in = args.input_kmer_csvs + readout = args.readout + kmerout = args.kmerout + + return(readout, kmerout, read_paths_in, kmer_paths_in) + + +def get_csv(read_paths_in, kmer_paths_in): + kmer = [i for i in kmer_paths_in] + read = [i for i in read_paths_in] + return(read, kmer) + + +def _get_basename(file_name): + if ("/") in file_name: + basename = file_name.split("/")[-1].split(".")[0] + else: + basename = file_name.split(".")[0] + return(basename) + + +def merge_csv(all_csv): + df = pd.read_csv(all_csv[0], index_col=0) + for i in range(1, len(all_csv)): + df_tmp = pd.read_csv(all_csv[i], index_col=0) + df = pd.merge(left=df, right=df_tmp, on='TAXID', how='outer') + df.fillna(0, inplace=True) + return(df) + + +def write_csv(pd_dataframe, outfile): + pd_dataframe.to_csv(outfile) + + +if __name__ == "__main__": + READOUT, KMEROUT, READ_PATHS_IN, KMER_PATHS_IN = _get_args() + reads, kmers = get_csv(READ_PATHS_IN,KMER_PATHS_IN) + read_df = merge_csv(reads) + kmer_df = merge_csv(kmers) + write_csv(read_df, READOUT) + write_csv(kmer_df, KMEROUT) diff --git a/modules.json b/modules.json index 7cbe366ea..495991a35 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "amps": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "bbmap/bbduk": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", @@ -110,31 +115,6 @@ "git_sha": "7c695e0147df1157413e06246d9b0094617d3e6b", "installed_by": ["modules"] }, - "kraken2/kraken2": { - "branch": "master", - "git_sha": "7c695e0147df1157413e06246d9b0094617d3e6b", - "installed_by": ["modules"] - }, - "krakenuniq/preloadedkrakenuniq": { - "branch": "master", - "git_sha": "a6eb17f65b3ee5761c25c075a6166c9f76733cee", - "installed_by": ["modules"] - }, - "malt/run": { - "branch": "master", - "git_sha": "75027bf77472b1f4fd2cdd7e46f83119dfb0f2c6", - "installed_by": ["modules"] - }, - "maltextract": { - "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] - }, - "metaphlan3/metaphlan3": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] - }, "multiqc": { "branch": "master", "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba", diff --git a/modules/local/krakenmerge.nf b/modules/local/krakenmerge.nf new file mode 100644 index 000000000..b0bdfdcd3 --- /dev/null +++ b/modules/local/krakenmerge.nf @@ -0,0 +1,69 @@ +// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :) +// https://github.com/nf-core/modules/tree/master/modules/nf-core/ +// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: +// https://nf-co.re/join +// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. 
+// All other parameters MUST be provided using the "task.ext" directive, see here: +// https://www.nextflow.io/docs/latest/process.html#ext +// where "task.ext" is a string. +// Any parameters that need to be evaluated in the context of a particular sample +// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. +// TODO nf-core: Software that can be piped together SHOULD be added to separate module files +// unless there is a run-time, storage advantage in implementing in this way +// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: +// bwa mem | samtools view -B -T ref.fasta +// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty +// list (`[]`) instead of a file can be used to work around this issue. + +process KRAKENMERGE { + label 'process_single' + + // TODO nf-core: List required Conda package(s). + // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). + // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. + // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + input: + // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" + // MUST be provided as an input via a Groovy Map called "meta". + // This information may not be required in some instances e.g. indexing reference genome files: + // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf + + // TODO check if this works + path karken_parse_reads + path kraken_parse_kmers + + output: + path "kraken_read_count_table.csv" , emit: read_count_table + path "kraken_kmer_duplication.csv" , emit: kmer_duplication_table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + + // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 + // If the software is unable to output a version number on the command-line then it can be manually specified + // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf + // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive + // TODO default value for KRAKEN_PARSE min reads shared with MALTEXTRACT, but recommended defaults in tools is 50 vs 1, respectively: add check and warning? 
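+ // NOTE (hedged summary of merge_kraken_res.py above): the script outer-joins the
+ // per-sample kraken_parse CSVs on TAXID and fills gaps with 0, so each merged table
+ // below should end up holding one column per input sample.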
+ def read_out = "kraken_read_count_table.csv" + def kmer_out = "kraken_kmer_duplication.csv" + """ + merge_kraken_res.py \\ + -or $read_out \\ + -ok $kmer_out \\ + -inr $karken_parse_reads \\ + -ink $kraken_parse_kmers + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/amps/main.nf b/modules/nf-core/amps/main.nf new file mode 100644 index 000000000..00a59b160 --- /dev/null +++ b/modules/nf-core/amps/main.nf @@ -0,0 +1,40 @@ +process AMPS { + label 'process_low' + + conda "bioconda::hops=0.35" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hops:0.35--hdfd78af_1' : + 'biocontainers/hops:0.35--hdfd78af_1' }" + + input: + path maltextract_results + path taxon_list + val filter + + output: + path "results/heatmap_overview_Wevid.json" , emit: json + path "results/heatmap_overview_Wevid.pdf" , emit: summary_pdf + path "results/heatmap_overview_Wevid.tsv" , emit: tsv + path "results/pdf_candidate_profiles/" , emit: candidate_pdfs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + postprocessing.AMPS.r \\ + -r $maltextract_results \\ + -n $taxon_list \\ + -m $filter \\ + -t $task.cpus \\ + -j \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + amps: \$(echo \$(hops --version 2>&1) | sed 's/HOPS version//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/amps/meta.yml b/modules/nf-core/amps/meta.yml new file mode 100644 index 000000000..34f825af3 --- /dev/null +++ b/modules/nf-core/amps/meta.yml @@ -0,0 +1,66 @@ +name: amps +description: Post-processing script of the MaltExtract component of the HOPS package +keywords: + - malt + - MaltExtract + - HOPS + - amps + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome + - authentication + - damage + - edit distance + - post Post-processing + - visualisation +tools: + - amps: + description: Post-processing script of the MaltExtract tool for ancient metagenomics + homepage: "https://github.com/rhuebler/HOPS" + documentation: "https://github.com/keyfm/amps" + tool_dev_url: "https://github.com/keyfm/amps" + doi: "10.1186/s13059-019-1903-0" + licence: ["GPL >=3"] + +input: + - maltextract_results: + type: directory + description: MaltExtract output directory + pattern: "results/" + - taxon_list: + type: file + description: List of target taxa to evaluate used in MaltExtract + pattern: "*.txt" + - filter: + type: string + description: The filter mode used in MaltExtract + pattern: "def_anc|default|scan|ancient|crawl" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - json: + type: file + description: Candidate summary heatmap in MultiQC compatible JSON format + pattern: "heatmap_overview_Wevid.json" + - summary_pdf: + type: file + description: Candidate summary heatmap in PDF format + pattern: "heatmap_overview_Wevid.pdf" + - tsv: + type: file + description: Candidate summary heatmap in TSV format + pattern: "heatmap_overview_Wevid.tsv" + - candidate_pdfs: + type: directory + description: Directory of per sample output PDFs organised by reference + pattern: "pdf_candidate_profiles/" + +authors: + - "@jfy133" diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 
7604aa70e..29da3cd45 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -39,10 +39,10 @@ workflow METAGENOMICS { ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) // - // Run the post profiling subworkflow + // Run the post profiling subworkflow (optionally run for malt, mandatory for kraken2/krakenuniq) // - if ( params.metagenomics_postprocessing_tool ) { + if ( params.metagenomics_postprocessing_tool || params.metagenomics_profiling_tool == 'kraken2' || params.metagenomics_profiling_tool == 'krakenuniq' ) { METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_POSTPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 6229abfa5..d462e4188 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -5,37 +5,55 @@ // TODO nf-core: A subworkflow SHOULD import at least two modules include { MALTEXTRACT } from '../../../modules/nf-core/maltextract/main' +include { AMPS } from '../../../modules/nf-core/amps/main' include { KRAKENPARSE } from '../../../modules/local/krakenparse' +include { KRAKENMERGE } from '../../../modules/local/krakenmerge' workflow METAGENOMICS_POSTPROCESSING { take: - // TODO nf-core: edit input (take) channels ch_postprocessing_input // different between kraken and malt main: - ch_versions = Channel.empty() + ch_results = Channel.empty() ch_multiqc_files = Channel.empty() if ( params.metagenomics_postprocessing_tool == 'maltextract') { + MALTEXTRACT ( ch_postprocessing_input, params.taxon_list, params.ncbi_dir ) - ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first() ) - ch_results = ch_results.mix( MALTEXTRACT.out.results ) + + AMPS ( MALTEXTRACT.out.results, params.taxon_list, params.metagenomics_malt_filter ) + + ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) + ch_results = ch_results.mix( AMPS.out.results.summary_pdf, AMPS.out.tsv, AMPS.out.summary_pdf ) + ch_multiqc_files = ch_multiqc_files.mix( AMPS.out.results.json ) + } - else if ( params.metagenomics_postprocessing_tool == 'krakenparse' ) { + + else if ( params.metagenomics_profiling_tool == 'kraken2' || params.metagenomics_profiling_tool == 'krakenuniq' ) { + KRAKENPARSE ( ch_postprocessing_input ) - ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first() ) - ch_results = ch_results.mix( KRAKENPARSE.out.results ) + + ch_list_of_kraken_parse_reads = KRAKENPARSE.out.read_kraken_parsed.map { + meta, read_out -> [ read_out ] + } + ch_list_of_kraken_parse_kmer = KRAKENPARSE.out.kmer_kraken_parsed.map { + meta, kmer_out -> [ kmer_out ] + } + + KRAKENMERGE ( ch_list_of_kraken_parse_reads , ch_list_of_kraken_parse_kmer ) + + ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first(), KRAKENMERGE.out.versions.first() ) + ch_results = ch_results.mix( KRAKENMERGE.out.read_count_table, KRAKENMERGE.out.kmer_duplication_table ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENMERGE.out.read_count_table, KRAKENMERGE.out.kmer_duplication_table ) + } -// TODO check how to actually emit krakenparse output channels into one directory -// TODO check if necessary to have merge_kraken_parsed -// TODO add paths for multiqc files (maltextract 
and maybe kraken parse) + emit: - // TODO nf-core: edit emitted channels - versions = ch_versions - results_directory = ch_results - mqc = ch_multiqc_files + versions = ch_versions + results = ch_results + mqc = ch_multiqc_files } diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index d55c6a24a..e337bcaba 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -144,14 +144,15 @@ workflow METAGENOMICS_PROFILING { } else if ( params.metagenomics_profiling_tool == 'kraken2' ) { + // run kraken2 over all samples KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken_save_reads, params.metagenomics_kraken_save_read_classifications ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report ) ch_postprocessing_input = ch_postprocessing_input.mix( KRAKEN2_KRAKEN2.out.report ) - } emit: @@ -160,4 +161,5 @@ workflow METAGENOMICS_PROFILING { profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom postprocessing_input = ch_postprocessing_input mqc = ch_multiqc_files + } From cd96724ac7e21aa4ccc1146c1be48a80e91a066d Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 26 May 2023 10:32:41 +0200 Subject: [PATCH 023/198] Fix paths for subworkflow module import --- subworkflows/local/metagenomics_postprocessing.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index d462e4188..7fddb4b55 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -4,10 +4,10 @@ // https://nf-co.re/join // TODO nf-core: A subworkflow SHOULD import at least two modules -include { MALTEXTRACT } from '../../../modules/nf-core/maltextract/main' -include { AMPS } from '../../../modules/nf-core/amps/main' -include { KRAKENPARSE } from '../../../modules/local/krakenparse' -include { KRAKENMERGE } from '../../../modules/local/krakenmerge' +include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' +include { AMPS } from '../../modules/nf-core/amps/main' +include { KRAKENPARSE } from '../../modules/local/krakenparse' +include { KRAKENMERGE } from '../../modules/local/krakenmerge' workflow METAGENOMICS_POSTPROCESSING { From 1151e7499d6a83b94f6a7908c0268465309b684d Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 26 May 2023 09:35:07 +0000 Subject: [PATCH 024/198] modules configuration for ext.args --- conf/modules.config | 28 +++++++++++++++++++ nextflow.config | 11 ++++++++ .../local/metagenomics_postprocessing.nf | 4 +-- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2f16d26a8..8338595d8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -526,4 +526,32 @@ process { pattern: '*.{biom,txt}' ] } + + withName: MALTEXTRACT { + ext.args = [ + "-f ${params.metagenomics_maltextract_filter}", + "-a ${params.metagenomics_maltextract_toppercent}", + "--minPI ${params.metagenomics_maltextract_percentidentity}, + params.metagenomics_maltextract_destackingoff ? "--destackingOff" : "", + params.metagenomics_maltextract_downsamplingoff ? 
"--downSampOff" : "", + params.metagenomics_maltextract_duplicateremovaloff ? "--dupRemOff" : "", + params.metagenomics_maltextract_matches ? "--matches" : "", + params.metagenomics_maltextract_megansummary ? "--meganSummary" : "", + params.metagenomics_maltextract_topalignment ? "--useTopAlignment" : "", + meta.single_stranded ? "--singleStranded" : "" #TODO: figure out how to parse single strandedness, + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, + mode: params.publish_dir_mode, + pattern: '*/results/*' + ] + } + + withName: AMPS { + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, + mode: params.publish_dir_mode, + pattern: '*/results/*' + ] + } } diff --git a/nextflow.config b/nextflow.config index a6bc1cc8c..4eed1956b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -135,6 +135,17 @@ params { metagenomics_malt_max_queries = 100 metagenomics_malt_memory_mode = 'load' metagenomics_malt_group_size = 0 + metagenomics_maltextract_taxon_list = null + metagenomics_maltextract_ncbi_dir = null + metagenomics_maltextract_filter = 'def_anc' + metagenomics_maltextract_toppercent = 0.01 + metagenomics_maltextract_destackingoff = false + metagenomics_maltextract_downsamplingoff = false + metagenomics_maltextract_duplicateremovaloff = false + metagenomics_maltextract_matches = false + metagenomics_maltextract_megansummary = false + metagenomics_maltextract_percentidentity = 85.0 + metagenomics_maltextract_topalignment = false // Deduplication options skip_deduplication = false diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 7fddb4b55..4cb5dea7a 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -21,9 +21,9 @@ workflow METAGENOMICS_POSTPROCESSING { if ( params.metagenomics_postprocessing_tool == 'maltextract') { - MALTEXTRACT ( ch_postprocessing_input, params.taxon_list, params.ncbi_dir ) + MALTEXTRACT ( ch_postprocessing_input, params.metagenomics_maltextract_taxon_list, params.metagenomics_maltextract_ncbi_dir ) - AMPS ( MALTEXTRACT.out.results, params.taxon_list, params.metagenomics_malt_filter ) + AMPS ( MALTEXTRACT.out.results, params.taxon_list, params.metagenomics_malt_filter, params.metagenomics_maltextract_filter ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) ch_results = ch_results.mix( AMPS.out.results.summary_pdf, AMPS.out.tsv, AMPS.out.summary_pdf ) From a6799ded122041e466a299ee866fac5f0b8d8af4 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 26 May 2023 09:36:13 +0000 Subject: [PATCH 025/198] correct filter call in submodule AMPS --- subworkflows/local/metagenomics_postprocessing.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 4cb5dea7a..017517a10 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -23,7 +23,7 @@ workflow METAGENOMICS_POSTPROCESSING { MALTEXTRACT ( ch_postprocessing_input, params.metagenomics_maltextract_taxon_list, params.metagenomics_maltextract_ncbi_dir ) - AMPS ( MALTEXTRACT.out.results, params.taxon_list, params.metagenomics_malt_filter, params.metagenomics_maltextract_filter ) + AMPS ( MALTEXTRACT.out.results, params.taxon_list, 
params.metagenomics_maltextract_filter ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) ch_results = ch_results.mix( AMPS.out.results.summary_pdf, AMPS.out.tsv, AMPS.out.summary_pdf ) From 8d0dcaef0766176fc8577eb70c050c38e89ee26a Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 26 May 2023 11:41:20 +0200 Subject: [PATCH 026/198] make kraken_parse.py executable, fix krakenparse.nf module input --- bin/kraken_parse.py | 52 +++++++++----------- bin/merge_kraken_res.py | 0 modules/local/krakenparse.nf | 2 +- subworkflows/local/metagenomics.nf | 4 ++ subworkflows/local/metagenomics_profiling.nf | 13 ++++- 5 files changed, 39 insertions(+), 32 deletions(-) mode change 100644 => 100755 bin/kraken_parse.py mode change 100644 => 100755 bin/merge_kraken_res.py diff --git a/bin/kraken_parse.py b/bin/kraken_parse.py old mode 100644 new mode 100755 index 20533e2ee..0e23f6dca --- a/bin/kraken_parse.py +++ b/bin/kraken_parse.py @@ -6,28 +6,22 @@ import argparse import csv + def _get_args(): - '''This function parses and return arguments passed in''' + """This function parses and return arguments passed in""" parser = argparse.ArgumentParser( - prog='kraken_parse', - formatter_class=argparse.RawDescriptionHelpFormatter, - description='Parsing kraken') - parser.add_argument('krakenReport', help="path to kraken report file") + prog="kraken_parse", formatter_class=argparse.RawDescriptionHelpFormatter, description="Parsing kraken" + ) + parser.add_argument("krakenReport", help="path to kraken report file") parser.add_argument( - '-c', - dest="count", - default=50, - help="Minimum number of hits on clade to report it. Default = 50") + "-c", dest="count", default=50, help="Minimum number of hits on clade to report it. Default = 50" + ) parser.add_argument( - '-or', - dest="readout", - default=None, - help="Read count output file. Default = .read_kraken_parsed.csv") + "-or", dest="readout", default=None, help="Read count output file. Default = .read_kraken_parsed.csv" + ) parser.add_argument( - '-ok', - dest="kmerout", - default=None, - help="Kmer Output file. Default = .kmer_kraken_parsed.csv") + "-ok", dest="kmerout", default=None, help="Kmer Output file. 
Default = .kmer_kraken_parsed.csv" + ) args = parser.parse_args() @@ -36,7 +30,7 @@ def _get_args(): readout = args.readout kmerout = args.kmerout - return(infile, countlim, readout, kmerout) + return (infile, countlim, readout, kmerout) def _get_basename(file_name): @@ -44,21 +38,21 @@ def _get_basename(file_name): basename = file_name.split("/")[-1].split(".")[0] else: basename = file_name.split(".")[0] - return(basename) + return basename def parse_kraken(infile, countlim): - ''' + """ INPUT: infile (str): path to kraken report file countlim (int): lowest count threshold to report hit OUTPUT: resdict (dict): key=taxid, value=readCount - ''' - with open(infile, 'r') as f: + """ + with open(infile, "r") as f: read_dict = {} kmer_dict = {} - csvreader = csv.reader(f, delimiter='\t') + csvreader = csv.reader(f, delimiter="\t") for line in csvreader: reads = int(line[1]) if reads >= countlim: @@ -66,32 +60,32 @@ def parse_kraken(infile, countlim): kmer = line[3] unique_kmer = line[4] try: - kmer_duplicity = float(kmer)/float(unique_kmer) + kmer_duplicity = float(kmer) / float(unique_kmer) except ZeroDivisionError: kmer_duplicity = 0 read_dict[taxid] = reads kmer_dict[taxid] = kmer_duplicity - return(read_dict, kmer_dict) + return (read_dict, kmer_dict) def write_output(resdict, infile, outfile): - with open(outfile, 'w') as f: + with open(outfile, "w") as f: basename = _get_basename(infile) f.write(f"TAXID,{basename}\n") for akey in resdict.keys(): f.write(f"{akey},{resdict[akey]}\n") -if __name__ == '__main__': +if __name__ == "__main__": INFILE, COUNTLIM, readout, kmerout = _get_args() if not readout: - read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv" + read_outfile = _get_basename(INFILE) + ".read_kraken_parsed.csv" else: read_outfile = readout if not kmerout: - kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv" + kmer_outfile = _get_basename(INFILE) + ".kmer_kraken_parsed.csv" else: kmer_outfile = kmerout diff --git a/bin/merge_kraken_res.py b/bin/merge_kraken_res.py old mode 100644 new mode 100755 diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf index 6fb58eec4..ef36714fc 100644 --- a/modules/local/krakenparse.nf +++ b/modules/local/krakenparse.nf @@ -59,7 +59,7 @@ process KRAKENPARSE { -c ${params.metagenomics_min_support_reads} \\ -or $read_out \\ -ok $kmer_out \\ - $ch_postprocessing_input + $report cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 29da3cd45..6b59c2cc9 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -42,12 +42,16 @@ workflow METAGENOMICS { // Run the post profiling subworkflow (optionally run for malt, mandatory for kraken2/krakenuniq) // + if ( params.metagenomics_postprocessing_tool || params.metagenomics_profiling_tool == 'kraken2' || params.metagenomics_profiling_tool == 'krakenuniq' ) { + METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) + ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_POSTPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) } + emit: versions = ch_versions ch_multiqc_files = ch_multiqc_files diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index e337bcaba..32a2539d1 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -144,9 +144,18 
@@ workflow METAGENOMICS_PROFILING { } else if ( params.metagenomics_profiling_tool == 'kraken2' ) { - // run kraken2 over all samples + // run kraken2 per sample - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, database, params.metagenomics_kraken_save_reads, params.metagenomics_kraken_save_read_classifications ) + reads = reads.combine(database) + kraken2_reads = reads.map{meta, reads, database -> [meta, reads]} + kraken2_db = reads.map{meta, reads, database -> [database]} + + KRAKEN2_KRAKEN2 ( + kraken2_reads, + kraken2_db, + params.metagenomics_kraken_save_reads, + params.metagenomics_kraken_save_read_classifications + ) ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) From 19f04c96db53f4b56a0445227cd55547ca1f67c2 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 26 May 2023 09:42:21 +0000 Subject: [PATCH 027/198] modules.config updating for karken postprocessing --- conf/modules.config | 15 +++++++++++++++ subworkflows/local/metagenomics_postprocessing.nf | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 8338595d8..c2fc89940 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -554,4 +554,19 @@ process { pattern: '*/results/*' ] } + + withName: KRAKENPARSE{ + publishDir = [ + enabled: false + ] + } + + withName: KRAKENMERGE { + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/postprocessing/kraken_merge/" }, + mode: params.publish_dir_mode, + pattern: '*.csv' + ] + } + } diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 017517a10..d0ac43942 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -31,7 +31,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - else if ( params.metagenomics_profiling_tool == 'kraken2' || params.metagenomics_profiling_tool == 'krakenuniq' ) { + else if ( params.metagenomics_postprocessing_tool == 'krakenmerge' ) { KRAKENPARSE ( ch_postprocessing_input ) From 57c7b32c33a0948a96fc1f8a5954cfd0977ab051 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 26 May 2023 09:42:41 +0000 Subject: [PATCH 028/198] nextflow config metagenomics_postprocessing --- nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow.config b/nextflow.config index 4eed1956b..885b84f77 100644 --- a/nextflow.config +++ b/nextflow.config @@ -135,6 +135,7 @@ params { metagenomics_malt_max_queries = 100 metagenomics_malt_memory_mode = 'load' metagenomics_malt_group_size = 0 + metagenomics_postprocessing_tool = null metagenomics_maltextract_taxon_list = null metagenomics_maltextract_ncbi_dir = null metagenomics_maltextract_filter = 'def_anc' From f5b32c606f9281fc40c89fd9cb3d2384c5d1850a Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 26 May 2023 12:15:34 +0200 Subject: [PATCH 029/198] Fix minor typos --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c2fc89940..978600761 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -531,14 +531,14 @@ process { ext.args = [ "-f ${params.metagenomics_maltextract_filter}", "-a ${params.metagenomics_maltextract_toppercent}", - "--minPI ${params.metagenomics_maltextract_percentidentity}, + "--minPI ${params.metagenomics_maltextract_percentidentity}", params.metagenomics_maltextract_destackingoff ? 
"--destackingOff" : "", params.metagenomics_maltextract_downsamplingoff ? "--downSampOff" : "", params.metagenomics_maltextract_duplicateremovaloff ? "--dupRemOff" : "", params.metagenomics_maltextract_matches ? "--matches" : "", params.metagenomics_maltextract_megansummary ? "--meganSummary" : "", params.metagenomics_maltextract_topalignment ? "--useTopAlignment" : "", - meta.single_stranded ? "--singleStranded" : "" #TODO: figure out how to parse single strandedness, + // meta.single_stranded ? "--singleStranded" : "" #TODO: figure out how to parse single strandedness, ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, From 66310117f0c1bb02721298ab1dd7a6d1d15a4a9e Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 26 May 2023 15:12:10 +0200 Subject: [PATCH 030/198] Fix krakenparse args, merge_kraken module+script. Complete Workflow for kraken2 --- bin/merge_kraken_res.py | 55 ++++++++----------- conf/modules.config | 2 +- modules/local/krakenmerge.nf | 10 ++-- modules/local/krakenparse.nf | 10 +--- .../local/metagenomics_postprocessing.nf | 2 +- 5 files changed, 32 insertions(+), 47 deletions(-) diff --git a/bin/merge_kraken_res.py b/bin/merge_kraken_res.py index 35d0de499..f13a38ee2 100755 --- a/bin/merge_kraken_res.py +++ b/bin/merge_kraken_res.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Written by Maxime Borry and released under the MIT license. # Modifications for DSL2 compliance and integration into eager DSL2 release by @ilight1542 @@ -9,46 +9,39 @@ import pandas as pd import numpy as np + def _get_args(): - '''This function parses and return arguments passed in''' + """This function parses and return arguments passed in""" parser = argparse.ArgumentParser( - prog='merge_kraken_res', + prog="merge_kraken_res", formatter_class=argparse.RawDescriptionHelpFormatter, - description='Merging csv count files in one table') + description="Merging csv count files in one table", + ) parser.add_argument( - '-or', + "-or", dest="readout", default="kraken_read_count_table.csv", - help="Read count output file. Default = kraken_read_count_table.csv") + help="Read count output file. Default = kraken_read_count_table.csv", + ) parser.add_argument( - '-ok', + "-ok", dest="kmerout", default="kraken_kmer_unicity_table.csv", - help="Kmer unicity output file. Default = kraken_kmer_unicity_table.csv") - parser.add_argument( - '-inr', - dest="input_read_csvs", - help="Paths to .read_ outputs from krakenparse") - parser.add_argument( - '-ink', - dest="input_kmer_csvs", - help="Paths to .kmer_ outputs from krakenparse" + help="Kmer unicity output file. 
Default = kraken_kmer_unicity_table.csv", ) - args = parser.parse_args() - read_paths_in = args.input_read_csvs - kmer_paths_in = args.input_kmer_csvs - readout = args.readout - kmerout = args.kmerout + readout = args.readout + kmerout = args.kmerout - return(readout, kmerout, read_paths_in, kmer_paths_in) + return (readout, kmerout) -def get_csv(read_paths_in, kmer_paths_in): - kmer = [i for i in kmer_paths_in] - read = [i for i in read_paths_in] - return(read, kmer) +def get_csv(): + tmp = [i for i in os.listdir() if ".csv" in i] + kmer = [i for i in tmp if ".kmer_" in i] + read = [i for i in tmp if ".read_" in i] + return (read, kmer) def _get_basename(file_name): @@ -56,16 +49,16 @@ def _get_basename(file_name): basename = file_name.split("/")[-1].split(".")[0] else: basename = file_name.split(".")[0] - return(basename) + return basename def merge_csv(all_csv): df = pd.read_csv(all_csv[0], index_col=0) for i in range(1, len(all_csv)): df_tmp = pd.read_csv(all_csv[i], index_col=0) - df = pd.merge(left=df, right=df_tmp, on='TAXID', how='outer') + df = pd.merge(left=df, right=df_tmp, on="TAXID", how="outer", validate="1:1") df.fillna(0, inplace=True) - return(df) + return df def write_csv(pd_dataframe, outfile): @@ -73,8 +66,8 @@ def write_csv(pd_dataframe, outfile): if __name__ == "__main__": - READOUT, KMEROUT, READ_PATHS_IN, KMER_PATHS_IN = _get_args() - reads, kmers = get_csv(READ_PATHS_IN,KMER_PATHS_IN) + READOUT, KMEROUT = _get_args() + reads, kmers = get_csv() read_df = merge_csv(reads) kmer_df = merge_csv(kmers) write_csv(read_df, READOUT) diff --git a/conf/modules.config b/conf/modules.config index 978600761..40797a5db 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -502,7 +502,7 @@ process { withName: KRAKEN2_KRAKEN2 { ext.args = [ - params.metagenomics_kraken2_save_minimizers ? "-report-minimizer-data" : "" + "--report-minimizer-data" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/kraken2/" }, diff --git a/modules/local/krakenmerge.nf b/modules/local/krakenmerge.nf index b0bdfdcd3..4e9fc168c 100644 --- a/modules/local/krakenmerge.nf +++ b/modules/local/krakenmerge.nf @@ -22,10 +22,10 @@ process KRAKENMERGE { // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. - conda "conda-forge::python=3.8.3" + conda "conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" // MUST be provided as an input via a Groovy Map called "meta". 
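// NOTE (assumption, untested): the swap to the pandas 1.5.2 package/containers above is
// presumably needed because merge_kraken_res.py imports pandas, which the plain
// python:3.8.3 images would not ship.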
@@ -33,7 +33,7 @@ process KRAKENMERGE { // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf // TODO check if this works - path karken_parse_reads + path kraken_parse_reads path kraken_parse_kmers output: @@ -58,8 +58,6 @@ process KRAKENMERGE { merge_kraken_res.py \\ -or $read_out \\ -ok $kmer_out \\ - -inr $karken_parse_reads \\ - -ink $kraken_parse_kmers cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf index ef36714fc..3da22c8d1 100644 --- a/modules/local/krakenparse.nf +++ b/modules/local/krakenparse.nf @@ -1,7 +1,3 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join // TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. // All other parameters MUST be provided using the "task.ext" directive, see here: // https://www.nextflow.io/docs/latest/process.html#ext @@ -12,8 +8,6 @@ // unless there is a run-time, storage advantage in implementing in this way // e.g. it's ok to have a single module for bwa to output BAM instead of SAM: // bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. process KRAKENPARSE { tag "$meta.id" @@ -37,8 +31,8 @@ process KRAKENPARSE { tuple val(meta), path(report) output: - tuple val(meta), path(read_out), emit: read_kraken_parsed - tuple val(meta), path(kmer_out), emit: kmer_kraken_parsed + tuple val(meta), path("*read_kraken_parsed.csv"), emit: read_kraken_parsed + tuple val(meta), path("*kmer_kraken_parsed.csv"), emit: kmer_kraken_parsed path "versions.yml" , emit: versions when: diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index d0ac43942..14953c56b 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -42,7 +42,7 @@ workflow METAGENOMICS_POSTPROCESSING { meta, kmer_out -> [ kmer_out ] } - KRAKENMERGE ( ch_list_of_kraken_parse_reads , ch_list_of_kraken_parse_kmer ) + KRAKENMERGE ( ch_list_of_kraken_parse_reads.collect() , ch_list_of_kraken_parse_kmer.collect() ) ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first(), KRAKENMERGE.out.versions.first() ) ch_results = ch_results.mix( KRAKENMERGE.out.read_count_table, KRAKENMERGE.out.kmer_duplication_table ) From 2be88618526b14a4c978d9f5d51d76e54438161f Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 9 Jun 2023 09:01:33 +0000 Subject: [PATCH 031/198] implementation of singlestrnd args into malt: test --- conf/modules.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 40797a5db..1b0d5b307 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -519,9 +519,9 @@ process { ] } - withName: METAPHLAN3_METAPHLAN3 { + withName: METAPHLAN_METAPHLAN { publishDir = [ - path: { "${params.outdir}/metagenomics_screening/profiling/metaphlan3/" }, + path: { "${params.outdir}/metagenomics_screening/profiling/metaphlan/" }, mode: params.publish_dir_mode, pattern: '*.{biom,txt}' ] @@ -538,7 +538,7 @@ process { params.metagenomics_maltextract_matches ? 
"--matches" : "", params.metagenomics_maltextract_megansummary ? "--meganSummary" : "", params.metagenomics_maltextract_topalignment ? "--useTopAlignment" : "", - // meta.single_stranded ? "--singleStranded" : "" #TODO: figure out how to parse single strandedness, + "${meta.single_stranded}" ? "--singleStranded" : "" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, From c8b6c99b08df46cfffd4bf46530298830849eceb Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 9 Jun 2023 09:02:23 +0000 Subject: [PATCH 032/198] tried to add meta singlestrnd to maltextr input --- CITATIONS.md | 4 ++-- modules.json | 4 ++-- .../metaphlan3 => metaphlan/metaphlan}/main.nf | 16 ++++++++-------- .../metaphlan3 => metaphlan/metaphlan}/meta.yml | 15 ++++++++------- nextflow_schema.json | 4 ++-- subworkflows/local/metagenomics.nf | 2 +- subworkflows/local/metagenomics_profiling.nf | 16 ++++++++-------- workflows/eager.nf | 5 +++-- 8 files changed, 34 insertions(+), 32 deletions(-) rename modules/nf-core/{metaphlan3/metaphlan3 => metaphlan/metaphlan}/main.nf (78%) rename modules/nf-core/{metaphlan3/metaphlan3 => metaphlan/metaphlan}/meta.yml (74%) diff --git a/CITATIONS.md b/CITATIONS.md index 6ee0bb663..652413151 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -68,9 +68,9 @@ > Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0 -- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) +- [MetaPhlAn](https://doi.org/10.1038/s41587-023-01688-w) - > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088 + > Blanco-Míguez, A., Beghini, F., Cumbo, F. et al. Extending and improving metagenomic taxonomic profiling with uncharacterized species using MetaPhlAn 4. Nat Biotechnol (2023). doi: [10.1038/s41587-023-01688-w](https://doi.org/10.1038/s41587-023-01688-w) ## Software packaging/containerisation tools diff --git a/modules.json b/modules.json index 495991a35..579e8d2ab 100644 --- a/modules.json +++ b/modules.json @@ -105,9 +105,9 @@ "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, - "metaphlan3/metaphlan3": { + "metaphlan/metaphlan": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "mtnucratio": { diff --git a/modules/nf-core/metaphlan3/metaphlan3/main.nf b/modules/nf-core/metaphlan/metaphlan/main.nf similarity index 78% rename from modules/nf-core/metaphlan3/metaphlan3/main.nf rename to modules/nf-core/metaphlan/metaphlan/main.nf index 34f8705cc..15bd42858 100644 --- a/modules/nf-core/metaphlan3/metaphlan3/main.nf +++ b/modules/nf-core/metaphlan/metaphlan/main.nf @@ -1,15 +1,15 @@ -process METAPHLAN3_METAPHLAN3 { +process METAPHLAN_METAPHLAN { tag "$meta.id" - label 'process_high' + label 'process_medium' - conda "bioconda::metaphlan=3.0.12" + conda "bioconda::metaphlan=4.0.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' : - 'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }" + 'https://depot.galaxyproject.org/singularity/metaphlan:4.0.6--pyhca03a8a_0' : + 'biocontainers/metaphlan:4.0.6--pyhca03a8a_0' }" input: tuple val(meta), path(input) - path metaphlan_db + path metaphlan_db_latest output: tuple val(meta), path("*_profile.txt") , emit: profile @@ -28,7 +28,7 @@ process METAPHLAN3_METAPHLAN3 { def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt" """ - BT2_DB=`find -L "${metaphlan_db}" -name "*rev.1.bt2" -exec dirname {} \\;` + BT2_DB=`find -L "${metaphlan_db_latest}" -name "*rev.1.bt2l" -exec dirname {} \\;` metaphlan \\ --nproc $task.cpus \\ @@ -42,7 +42,7 @@ process METAPHLAN3_METAPHLAN3 { cat <<-END_VERSIONS > versions.yml "${task.process}": - metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}') + metaphlan: \$(metaphlan --version 2>&1 | awk '{print \$3}') END_VERSIONS """ } diff --git a/modules/nf-core/metaphlan3/metaphlan3/meta.yml b/modules/nf-core/metaphlan/metaphlan/meta.yml similarity index 74% rename from modules/nf-core/metaphlan3/metaphlan3/meta.yml rename to modules/nf-core/metaphlan/metaphlan/meta.yml index 659d83a95..cb74bd594 100644 --- a/modules/nf-core/metaphlan3/metaphlan3/meta.yml +++ b/modules/nf-core/metaphlan/metaphlan/meta.yml @@ -1,17 +1,17 @@ -name: metaphlan3_metaphlan3 +name: metaphlan_metaphlan description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data. keywords: - metagenomics - classification - fastq - - bam - fasta + - sam tools: - - metaphlan3: + - metaphlan: description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance homepage: https://huttenhower.sph.harvard.edu/metaphlan/ documentation: https://github.com/biobakery/MetaPhlAn - doi: "10.7554/eLife.65088" + doi: "10.1038/s41587-023-01688-w" licence: ["MIT License"] input: @@ -22,13 +22,13 @@ input: e.g. [ id:'test', single_end:false ] - input: type: file - description: Metaphlan 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPHlAn marker database) and intermediate bowtie2 alignment files (bowtie2out) + description: Metaphlan can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPHlAn marker database) and intermediate bowtie2 alignment files (bowtie2out) pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}" - metaphlan_db: type: file description: | - Directory containing pre-downloaded and uncompressed MetaPhlAn3 database downloaded from: http://cmprod1.cibio.unitn.it/biobakery3/metaphlan_databases/. - Note that you will also need to specify `--index` and the database version name (e.g. 'mpa_v31_CHOCOPhlAn_201901') in your module.conf ext.args for METAPHLAN3_METAPHLAN3! + Directory containing pre-downloaded and uncompressed MetaPhlAn database downloaded from: http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/. + Note that you will also need to specify `--index` and the database version name (e.g. 'mpa_vJan21_TOY_CHOCOPhlAnSGB_202103') in your module.conf ext.args for METAPHLAN_METAPHLAN! 
pattern: "*/" output: @@ -56,3 +56,4 @@ output: authors: - "@MGordon09" + - "@LilyAnderssonLee" diff --git a/nextflow_schema.json b/nextflow_schema.json index 16c5c3881..eb4e90bd3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -633,7 +633,7 @@ "metagenomics_profiling_tool": { "type": "string", "description": "Specify which tool to use for metagenomic profiling and screening.", - "enum": ["malt", "metaphlan2", "kraken2", "krakenuniq"], + "enum": ["malt", "metaphlan", "kraken2", "krakenuniq"], "fa_icon": "fas fa-toolbox", "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different reuslts." }, @@ -719,7 +719,7 @@ "default": 1, "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt or kraken. Not compatible with --malt_min_support_mode 'percent'.", "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "For usage in malt: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \n For usage in kraken2 or krakenuniq: Specify the number of hits on a clade to retain it in the final report when using kraken_parse. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" + "help_text": "For usage in malt or kraken: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \n For usage in kraken2 or krakenuniq: Specify the number of hits on a clade to retain it in the final report when using kraken_parse. Default: 1. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. 
\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n"
        },
        "metagenomics_malt_max_queries": {
            "type": "integer",
diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf
index 6b59c2cc9..0ba78ba16 100644
--- a/subworkflows/local/metagenomics.nf
+++ b/subworkflows/local/metagenomics.nf
@@ -43,7 +43,7 @@ workflow METAGENOMICS {
     //
 
-    if ( params.metagenomics_postprocessing_tool || params.metagenomics_profiling_tool == 'kraken2' || params.metagenomics_profiling_tool == 'krakenuniq' ) {
+    if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || params.metagenomics_profiling_tool == 'kraken2' || params.metagenomics_profiling_tool == 'krakenuniq' ) {
 
         METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input )
 
diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf
index 32a2539d1..6d45cfb4f 100644
--- a/subworkflows/local/metagenomics_profiling.nf
+++ b/subworkflows/local/metagenomics_profiling.nf
@@ -4,10 +4,10 @@
 // Much taken from nf-core/taxprofile subworkflows/local/profiling.nf
 
-include { MALT_RUN } from '../../modules/nf-core/malt/run/main'
-include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main'
-include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main'
-include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf-core/metaphlan3/metaphlan3/main'
+include { MALT_RUN                       } from '../../modules/nf-core/malt/run/main'
+include { KRAKEN2_KRAKEN2                } from '../../modules/nf-core/kraken2/kraken2/main'
+include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main'
+include { METAPHLAN_METAPHLAN            } from '../../modules/nf-core/metaphlan/metaphlan/main'
 
 workflow METAGENOMICS_PROFILING {
 
@@ -111,11 +111,11 @@ workflow METAGENOMICS_PROFILING {
 
     }
 
-    else if ( params.metagenomics_profiling_tool == 'metaphlan3' ) {
+    else if ( params.metagenomics_profiling_tool == 'metaphlan' ) {
 
-        METAPHLAN3_METAPHLAN3 ( reads , database )
-        ch_versions = ch_versions.mix( METAPHLAN3_METAPHLAN3.out.versions.first() )
-        ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3_METAPHLAN3.out.profile )
+        METAPHLAN_METAPHLAN ( reads , database )
+        ch_versions = ch_versions.mix( METAPHLAN_METAPHLAN.out.versions.first() )
+        ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN_METAPHLAN.out.profile )
 
     }
 
diff --git a/workflows/eager.nf b/workflows/eager.nf
index f58de8b2c..1306dacc8 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -45,7 +45,7 @@ if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { exi
 if ( params.metagenomics_postprocessing_tool == 'maltextract' && ! params.metagenomics_profiling_tool != 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") }
 
-if ( params.metagenomics_postprocessing_tool == 'krakenparse' && ( ! params.metagenomics_profiling_tool != 'kraken2' || ! params.metagenomics_profiling_tool != 'krakenuniq' ) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenparse' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") }
+if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && ( params.metagenomics_profiling_tool != 'kraken2' &&
params.metagenomics_profiling_tool != 'krakenuniq' ) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") }
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -55,6 +55,8 @@ if ( params.metagenomics_postprocessing_tool == 'krakenparse' && ( ! params.meta
     Report possible warnings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
 if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. Adapter trimming will be skipped!")
 
+if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && params.metagenomics_min_support_reads == 1 ) log.warn("[nf-core/eager] Warning: The minimum number of reads required for krakenmerge to output a node is still at the default value, which is tuned for MALT and MaltExtract. Consider setting --metagenomics_min_support_reads 50, the recommended value for krakenmerge.")
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     CONFIG FILES
@@ -253,7 +255,6 @@ workflow EAGER {
     // Section: Metagenomics
     //
 
-    //TODO: finish and figure out how exactly to call with proper database (check via a helper function?)
     if ( params.run_metagenomics ) {
         METAGENOMICS ( ch_bamfiltered_for_metagenomics )
         ch_versions = ch_versions.mix( METAGENOMICS.out.versions.first() )

From 41f0f8adebc082c1329429f5026d698028b72d1e Mon Sep 17 00:00:00 2001
From: Merlin Szymanski
Date: Fri, 16 Jun 2023 11:49:02 +0200
Subject: [PATCH 033/198] Fix MALT run with and without malt_group_size

In our workflow, database is a channel, not a value, so most of the code
didn't work. I remodelled the ch_input_for_malt channel to fit this fact.
---
 conf/modules.config                          |  2 +-
 subworkflows/local/metagenomics_profiling.nf | 97 ++++++++----------------
 2 files changed, 40 insertions(+), 59 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 1b0d5b307..f99f503bf 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -538,7 +538,7 @@ process {
                 params.metagenomics_maltextract_matches ?
- - [ temp_meta, reads, database ] + // Reset entire input meta for MALT to just database name, + // as we don't run run on a per-sample basis due to huge databases + // so all samples are in one run and so sample-specific metadata + // unnecessary. Set as database name to prevent `null` job ID and prefix. + if ( params.metagenomics_malt_group_size > 0 ) { + ch_input_for_malt_tmp = reads + .map { meta, reads -> reads } + .collate( params.metagenomics_malt_group_size ) //collate into bins of defined lengths + .map{ + reads -> + // add new meta with db-name as id + [[id: file(params.metagenomics_profiling_database).getBaseName() ], reads] } - .groupTuple(by: [0,2], size: params.metagenomics_malt_group_size, remainder: true) - .multiMap { + .combine(database) //combine with database + .multiMap{ + // and split apart again meta, reads, database -> - reads: [ meta, reads ] + reads: [meta, reads] database: database } + ch_input_for_malt = ch_input_for_malt_tmp.reads + database = ch_input_for_malt_tmp.database } else { ch_input_for_malt = reads - .map { - meta, reads -> - - // Reset entire input meta for MALT to just database name, - // as we don't run run on a per-sample basis due to huge datbaases - // so all samples are in one run and so sample-specific metadata - // unnecessary. Set as database name to prevent `null` job ID and prefix. - - def temp_meta = [ id: database ] - - // Combine reduced sample metadata with updated database parameters metadata, - // make sure id is db_name for publishing purposes. - - [ temp_meta, reads, database ] - - } - .groupTuple(by: [0,2]) - .multiMap { - meta, reads, database -> - reads: [ meta, reads ] - database: database + .map { meta, reads -> reads } + .collect() + .map{ + // make sure id is db_name for publishing purposes. 
+ reads -> + [[id: file(params.metagenomics_profiling_database).getBaseName() ], reads] } } - // MALT: We can groupTuple to have all samples in one channel for MALT as database - // since loading takes a long time, we only want to run it once per database - // unless otherwise specified (eg grouping samples) - - MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database ) + // Run MALT + MALT_RUN ( ch_input_for_malt, database ) ch_maltrun_for_megan = MALT_RUN.out.rma6 - .transpose() - .map{ - meta, rma -> - // re-extract meta from file names, use filename without rma to - // ensure we keep paired-end information in downstream filenames - // when no pair-merging - def meta_new = meta.clone() - meta_new['db_name'] = meta.id - meta_new['id'] = rma.baseName - [ meta_new, rma ] - } + .transpose() + .map { + meta, rma -> + // re-extract meta from file names, use filename without rma to + // ensure we keep paired-end information in downstream filenames + // when no pair-merging + def meta_new = meta.clone() + meta_new['db_name'] = meta.id + meta_new['id'] = rma.baseName + [ meta_new, rma ] + } ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan ) @@ -123,8 +104,8 @@ workflow METAGENOMICS_PROFILING { // run kraken uniq per sample, to preserve the meta-data reads = reads.combine(database) - krakenuniq_reads = reads.map{meta, reads, database -> [meta, reads]} - krakenuniq_db = reads.map{meta, reads, database -> [database]} + krakenuniq_reads = reads.map{ meta, reads, database -> [meta, reads] } + krakenuniq_db = reads.map{ meta, reads, database -> [database] } KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( krakenuniq_reads, From 83a875ba40c33463a7a01d318bb4d435c3dbedcc Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Jun 2023 12:14:00 +0200 Subject: [PATCH 034/198] make maltextract postprocessing possible --- subworkflows/local/metagenomics.nf | 2 +- subworkflows/local/metagenomics_postprocessing.nf | 2 +- subworkflows/local/metagenomics_profiling.nf | 2 +- workflows/eager.nf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 0ba78ba16..c3809244a 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -43,7 +43,7 @@ workflow METAGENOMICS { // - if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || params.metagenomics_profiling_tool == 'kraken2' || params.metagenomics_profiling_tool == 'krakenuniq' ) { + if ( params.metagenomics_postprocessing_tool || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool)) { METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 14953c56b..0893cf1a1 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -31,7 +31,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - else if ( params.metagenomics_postprocessing_tool == 'krakenmerge' ) { + else { KRAKENPARSE ( ch_postprocessing_input ) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 550357eb6..957c06f48 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -89,7 +89,7 @@ workflow METAGENOMICS_PROFILING { ch_versions = ch_versions.mix( 
MALT_RUN.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan ) ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log ) - ch_postprocessing_input = ch_postprocessing_input.mix( ch_maltrun_for_megan ) + ch_postprocessing_input = ch_postprocessing_input.mix( ch_maltrun_for_megan ) } else if ( params.metagenomics_profiling_tool == 'metaphlan' ) { diff --git a/workflows/eager.nf b/workflows/eager.nf index 1306dacc8..d26fdec9d 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -43,7 +43,7 @@ if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmer // TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") } -if ( params.metagenomics_postprocessing_tool == 'maltextract' && ! params.metagenomics_profiling_tool != 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") } +if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") } if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && ( ! params.metagenomics_profiling_tool != 'kraken2' || ! params.metagenomics_profiling_tool != 'krakenuniq' ) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") } From 5dc49c99ac96ca24a61f8286b7a47c5a3ecc8252 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Jun 2023 12:23:27 +0200 Subject: [PATCH 035/198] Krakenuniq workflow done --- bin/kraken_parse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/kraken_parse.py b/bin/kraken_parse.py index 0e23f6dca..abfe28961 100755 --- a/bin/kraken_parse.py +++ b/bin/kraken_parse.py @@ -54,6 +54,8 @@ def parse_kraken(infile, countlim): kmer_dict = {} csvreader = csv.reader(f, delimiter="\t") for line in csvreader: + if line[0].startswith("#") or line[0] == "%": + continue reads = int(line[1]) if reads >= countlim: taxid = line[6] From acecf8ab648f5e264d7b89fb54d965294abe72ae Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 23 Jun 2023 08:59:00 +0000 Subject: [PATCH 036/198] implementation of metaphlanmerge --- modules.json | 5 +++ modules/local/krakenparse.nf | 15 ------- .../metaphlan/mergemetaphlantables/main.nf | 33 ++++++++++++++ .../metaphlan/mergemetaphlantables/meta.yml | 45 +++++++++++++++++++ subworkflows/local/metagenomics.nf | 2 +- .../local/metagenomics_postprocessing.nf | 26 ++++++----- workflows/eager.nf | 2 + 7 files changed, 100 insertions(+), 28 deletions(-) create mode 100644 modules/nf-core/metaphlan/mergemetaphlantables/main.nf create mode 100644 modules/nf-core/metaphlan/mergemetaphlantables/meta.yml diff --git a/modules.json b/modules.json index 579e8d2ab..c5fd01c1d 100644 --- a/modules.json +++ b/modules.json @@ -105,6 +105,11 @@ "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, + "metaphlan/mergemetaphlantables": { + "branch": "master", + "git_sha": "9aa59197c0fb35c29e315bcd10c0fc9e1afc70a8", + "installed_by": ["modules"] + }, 
"metaphlan/metaphlan": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf index 3da22c8d1..1dc875302 100644 --- a/modules/local/krakenparse.nf +++ b/modules/local/krakenparse.nf @@ -1,22 +1,7 @@ -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. -// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. -// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta - process KRAKENPARSE { tag "$meta.id" label 'process_single' - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : diff --git a/modules/nf-core/metaphlan/mergemetaphlantables/main.nf b/modules/nf-core/metaphlan/mergemetaphlantables/main.nf new file mode 100644 index 000000000..94c70cd6f --- /dev/null +++ b/modules/nf-core/metaphlan/mergemetaphlantables/main.nf @@ -0,0 +1,33 @@ +process METAPHLAN_MERGEMETAPHLANTABLES { + label 'process_single' + + conda "bioconda::metaphlan=4.0.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/metaphlan:4.0.6--pyhca03a8a_0' : + 'quay.io/biocontainers/metaphlan:4.0.6--pyhca03a8a_0' }" + + input: + tuple val(meta), path(profiles) + + output: + tuple val(meta), path("${prefix}.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + merge_metaphlan_tables.py \\ + $args \\ + -o ${prefix}.txt \\ + ${profiles} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaphlan: \$(metaphlan --version 2>&1 | awk '{print \$3}') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml b/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml new file mode 100644 index 000000000..3c93964b9 --- /dev/null +++ b/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml @@ -0,0 +1,45 @@ +name: "metaphlan_mergemetaphlantables" +description: Merges output abundance tables from MetaPhlAn4 +keywords: + - metagenomics + - classification + - merge + - table + - profiles +tools: + - metaphlan4: + description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance + homepage: https://huttenhower.sph.harvard.edu/metaphlan/ + documentation: https://github.com/biobakery/MetaPhlAn + doi: "10.1038/s41587-023-01688-w" + licence: ["MIT License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profiles: + type: file + description: List of per-sample MetaPhlAn4 taxonomic abundance tables + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Combined MetaPhlAn4 table + pattern: "*.txt" + +authors: + - "@jfy133" + - "@LilyAnderssonLee" diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index c3809244a..5cfe9cd75 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -43,7 +43,7 @@ workflow METAGENOMICS { // - if ( params.metagenomics_postprocessing_tool || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool)) { + if ( params.metagenomics_postprocessing_tool || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 0893cf1a1..fd8148cdf 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -1,13 +1,8 @@ -// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! 
:)
-// https://github.com/nf-core/modules/tree/master/subworkflows
-// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace:
-// https://nf-co.re/join
-// TODO nf-core: A subworkflow SHOULD import at least two modules
-
-include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main'
-include { AMPS } from '../../modules/nf-core/amps/main'
-include { KRAKENPARSE } from '../../modules/local/krakenparse'
-include { KRAKENMERGE } from '../../modules/local/krakenmerge'
+include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main'
+include { AMPS } from '../../modules/nf-core/amps/main'
+include { KRAKENPARSE } from '../../modules/local/krakenparse'
+include { KRAKENMERGE } from '../../modules/local/krakenmerge'
+include { METAPHLAN_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan/mergemetaphlantables/main'

 workflow METAGENOMICS_POSTPROCESSING {

     take:
-    ch_postprocessing_input // different between kraken and malt
+    ch_postprocessing_input // different between each profiling --> postprocessing tool,
+                            // defined in metagenomics profiling subworkflow

     main:
     ch_versions = Channel.empty()
     ch_results = Channel.empty()
     ch_multiqc_files = Channel.empty()

-    if ( params.metagenomics_postprocessing_tool == 'maltextract') {
+    if ( params.metagenomics_postprocessing_tool == 'maltextract' ) {

         MALTEXTRACT ( ch_postprocessing_input, params.metagenomics_maltextract_taxon_list, params.metagenomics_maltextract_ncbi_dir )

@@ -31,7 +26,7 @@ workflow METAGENOMICS_POSTPROCESSING {

     }

-    else {
+    else if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) {

         KRAKENPARSE ( ch_postprocessing_input )

@@ -50,6 +45,13 @@ workflow METAGENOMICS_POSTPROCESSING {

     }

+    else if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' ) {
+        METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input )
+
+        ch_versions = ch_versions.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.versions.first() )
+        ch_results = ch_results.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt )
+    }
+
     emit:
     versions = ch_versions
     results = ch_results
diff --git a/workflows/eager.nf b/workflows/eager.nf
index d26fdec9d..cab215ab3 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -47,6 +47,8 @@ if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metageno
 if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && ( params.metagenomics_profiling_tool != 'kraken2' && params.metagenomics_profiling_tool != 'krakenuniq' ) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") }

+if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' &&
params.metagenomics_profiling_tool != 'metaphlan' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'mergemetaphlantables' can only be run with --metagenomics_profiling_tool 'metaphlan'") } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Report possible warnings From bf0b5c5f695c9b5e08126a80ad8c8be0074aa60c Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 23 Jun 2023 08:59:14 +0000 Subject: [PATCH 037/198] added documentation for metagenomics params --- nextflow_schema.json | 95 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index eb4e90bd3..f89e6fe1c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -326,7 +326,7 @@ "type": "boolean", "description": "Specify to skip read-pair merging.", "fa_icon": "fas fa-forward", - "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA, or when you want to utilise mate-pair 'spatial' information..\n\n⚠️ If you run this and also with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will be NOT be mapped when aligning with either bwa or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> ⚠️ If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" + "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA, or when you want to utilise mate-pair 'spatial' information..\n\n\u26a0\ufe0f If you run this and also with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will be NOT be mapped when aligning with either bwa or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> \u26a0\ufe0f If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" }, "preprocessing_excludeunmerged": { "type": "boolean", @@ -372,7 +372,7 @@ "type": "integer", "default": 0, "description": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation.", - "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation. 
By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n⚠️ when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`\n", + "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`\n", "fa_icon": "fas fa-cut" }, "preprocessing_trim3p": { @@ -380,7 +380,7 @@ "default": 0, "description": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation.", "fa_icon": "fas fa-cut", - "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. 
By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n⚠️ when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`\n" + "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`\n" }, "preprocessing_savepreprocessedreads": { "type": "boolean", @@ -539,12 +539,12 @@ "default": 4, "fa_icon": "fas fa-flag", "description": "Specify the SAM format flag of reads to remove during BAM filtering for downstream genomic steps. 
Generally not recommended to change.", - "help_text": "You can use this to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file to for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> ⚠️ Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies tool parameter(s):\n> - SAMtools: `-F`" + "help_text": "You can use this to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file to for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> \u26a0\ufe0f Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies tool parameter(s):\n> - SAMtools: `-F`" }, "bamfiltering_retainunmappedgenomicbam": { "type": "boolean", "description": "Specify to retain unmapped reads in the BAM file used for downstream genomic analyses.", - "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> ⚠️ This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", + "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", "fa_icon": "fas fa-piggy-bank" }, "bamfiltering_generateunmappedfastq": { @@ -586,7 +586,7 @@ "description": "Specify which type of reads to go into metagenomic screening.", "enum": ["unmapped", "mapped", "all"], "fa_icon": "fas fa-hand-pointer", - "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction), or all reads.\n\n> ⚠️ If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`" + "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction), or all reads.\n\n> \u26a0\ufe0f If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! 
- as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`" }, "run_metagenomics_complexityfiltering": { "type": "boolean", @@ -754,6 +754,87 @@ "description": "Define group sizes for running multiple fastq files into malt.", "fa_icon": "fas fa-barcode", "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." + }, + "metagenomics_postprocessing_tool": { + "type": "string", + "description": "Activate post-processing of metagenomics profiling tool selected.", + "help_text": "Activate the corresponding post-processing tool for your metagenomics profiling software. \n\nmalt --> maltextract\nkrakenuniq/kraken2 --> krakenmerge\nmetaphlan --> mergemetaphlantables\n\nNote: Postprocessing is automatically carried out when using `kraken2` and `krakenuniq` ", + "default": "null", + "fa_icon": "fab fa-buromobelexperte" + }, + "metagenomics_maltextract_taxon_list": { + "type": "string", + "default": null, + "help_text": "\\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.", + "fa_icon": "fas fa-align-left" + }, + "metagenomics_maltextract_ncbi_dir": { + "type": "string", + "default": null, + "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nOnly when `--metagenomic_tool malt` is also supplied.", + "fa_icon": "fab fa-buffer" + }, + "metagenomics_maltextract_filter": { + "type": "string", + "default": "def_anc", + "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", + "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-f`", + "fa_icon": "fas fa-filter", + "enum": ["def_anc", "default", "ancient", "scan", "crawl", "srna"] + }, + "metagenomics_maltextract_toppercent": { + "type": "number", + "default": 0.01, + "description": "Specify percent of top alignments to use.", + "help_text": "Specify frequency of top alignments for each read to be considered for each node.\\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\\n\\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\\n> uses a different notation i.e. 
integer (MALT) versus float (MALTExtract)\\n\\nDefault: `0.01`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-a`", + "fa_icon": "fas fa-percent" + }, + "metagenomics_maltextract_destackingoff": { + "type": "string", + "default": "false", + "description": "Turn off destacking.", + "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--destackingOff`", + "fa_icon": "fab fa-stack-overflow" + }, + "metagenomics_maltextract_downsamplingoff": { + "type": "string", + "default": "false", + "description": "Turn off downsampling.", + "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--downSampOff`", + "fa_icon": "fas fa-angle-double-down" + }, + "metagenomics_maltextract_duplicateremovaloff": { + "type": "string", + "default": "false", + "description": "Turn off duplicate removal.", + "help_text": "\\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`", + "fa_icon": "fas fa-copy" + }, + "metagenomics_maltextract_matches": { + "type": "string", + "default": "false", + "description": "Turn on exporting alignments of hits in BLAST format.", + "help_text": "\\nExport alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", + "fa_icon": "fas fa-equals" + }, + "metagenomics_maltextract_megansummary": { + "type": "string", + "default": "false", + "description": "Turn on export of MEGAN summary files.", + "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" + }, + "metagenomics_maltextract_percentidentity": { + "type": "integer", + "default": 85, + "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", + "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. 
Default: `85`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" + }, + "metagenomics_maltextract_topalignment": { + "type": "string", + "default": "false", + "description": "Turn on using top alignments per read after filtering.", + "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--useTopAlignment`", + "fa_icon": "fas fa-bahai" } }, "fa_icon": "fas fa-search" @@ -773,7 +854,7 @@ "type": "string", "default": "markduplicates", "description": "Specify which tool to use for deduplication.", - "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` (Peltzer et al. 2016) is offered. The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> ⚠️ DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.", + "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` (Peltzer et al. 2016) is offered. The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> \u26a0\ufe0f DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.", "enum": ["markduplicates", "dedup"], "fa_icon": "fas fa-layer-group" } From 0ac84b4558536270833da29b3045e78fca16dda0 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 23 Jun 2023 11:11:04 +0200 Subject: [PATCH 038/198] Fix AMPS output channel names --- subworkflows/local/metagenomics_postprocessing.nf | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 0893cf1a1..fd53592bb 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -1,8 +1,4 @@ -// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/subworkflows -// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A subworkflow SHOULD import at least two modules +// TODO: publish the files in ch_results directly? 
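On the `TODO: publish the files in ch_results directly?` question just added above — in this pipeline's usual idiom, result files are published per-process through `publishDir` rules in `conf/modules.config` rather than by routing them through a results channel. A hedged sketch of what that could look like for AMPS; the output path, mode, and patterns here are assumptions for illustration, not the pipeline's actual configuration:

```nextflow
// Hypothetical conf/modules.config fragment -- illustrative only.
process {
    withName: AMPS {
        publishDir = [
            path: { "${params.outdir}/metagenomics_screening/postprocessing/amps/" },
            mode: params.publish_dir_mode,
            pattern: '*.{tsv,json,pdf}'
        ]
    }
}
```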
include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' include { AMPS } from '../../modules/nf-core/amps/main' @@ -23,11 +19,11 @@ workflow METAGENOMICS_POSTPROCESSING { MALTEXTRACT ( ch_postprocessing_input, params.metagenomics_maltextract_taxon_list, params.metagenomics_maltextract_ncbi_dir ) - AMPS ( MALTEXTRACT.out.results, params.taxon_list, params.metagenomics_maltextract_filter ) + AMPS ( MALTEXTRACT.out.results, params.metagenomics_maltextract_taxon_list, params.metagenomics_maltextract_filter ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) - ch_results = ch_results.mix( AMPS.out.results.summary_pdf, AMPS.out.tsv, AMPS.out.summary_pdf ) - ch_multiqc_files = ch_multiqc_files.mix( AMPS.out.results.json ) + ch_results = ch_results.mix( AMPS.out.candidate_pdfs, AMPS.out.tsv, AMPS.out.summary_pdf ) + ch_multiqc_files = ch_multiqc_files.mix( AMPS.out.json ) } From 546d25342acad7ea46072311c11215f035577859 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 23 Jun 2023 09:17:45 +0000 Subject: [PATCH 039/198] metaphlan postprocessing implementation updates --- subworkflows/local/metagenomics_postprocessing.nf | 7 ++++--- subworkflows/local/metagenomics_profiling.nf | 15 ++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index fd8148cdf..130b1300a 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -7,7 +7,8 @@ include { METAPHLAN_MERGEMETAPHLANTABLES } from '../modules/nf-core/metaphlan/me workflow METAGENOMICS_POSTPROCESSING { take: - ch_postprocessing_input // different between kraken and malt + ch_postprocessing_input // different between each profiling --> postprocessing tool, + // defined in metagenomics profiling subworkflow main: ch_versions = Channel.empty() @@ -26,7 +27,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - elif ( params.metagenomics_postprocessing_tool == 'krakenmerge' ) { + elif ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { KRAKENPARSE ( ch_postprocessing_input ) @@ -46,7 +47,7 @@ workflow METAGENOMICS_POSTPROCESSING { } elif ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' ) { - METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input ) + METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input , params.metagenomics_profiling_database ) ch_versions = ch_versions.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.versions.first() ) ch_results = ch_results.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt ) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 957c06f48..aa9e2f1cd 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -95,8 +95,9 @@ workflow METAGENOMICS_PROFILING { else if ( params.metagenomics_profiling_tool == 'metaphlan' ) { METAPHLAN_METAPHLAN ( reads , database ) - ch_versions = ch_versions.mix( METAPHLAN_METAPHLAN.out.versions.first() ) - ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN_METAPHLAN.out.profile ) + ch_versions = ch_versions.mix( METAPHLAN_METAPHLAN.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN_METAPHLAN.out.profile ) + ch_postprocessing_input = ch_postprocessing_input.mix( METAPHLAN_METAPHLAN.out.profile ) } @@ -146,10 +147,10 @@ workflow 
METAGENOMICS_PROFILING { } emit: - versions = ch_versions // channel: [ versions.yml ] - classifications = ch_raw_classifications - profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom - postprocessing_input = ch_postprocessing_input - mqc = ch_multiqc_files + versions = ch_versions // channel: [ versions.yml ] + classifications = ch_raw_classifications + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom + postprocessing_input = ch_postprocessing_input // channel: [ val(meta), [ inputs_for_postprocessing_tools ] ] // see info at metagenomics_postprocessing + mqc = ch_multiqc_files } From c47b47edcccce394471d4d99a222c057f5c44569 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 23 Jun 2023 11:20:07 +0200 Subject: [PATCH 040/198] Minor schema update --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f89e6fe1c..0afa28765 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -824,7 +824,7 @@ "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" }, "metagenomics_maltextract_percentidentity": { - "type": "integer", + "type": "number", "default": 85, "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. 
Default: `85`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" From 1f733e0b37603c725a78ace2b46b114b516a0bb4 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 23 Jun 2023 09:24:33 +0000 Subject: [PATCH 041/198] removed unused results channel for metagenomics --- conf/modules.config | 7 +++++++ subworkflows/local/metagenomics_postprocessing.nf | 6 ++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f99f503bf..f6b4d363d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -569,4 +569,11 @@ process { ] } + withName: METAPHLAN_MERGEMETAPHLANTABLES { + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/postprocessing/mergemetaphlantables/" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + } } diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 130b1300a..cdadbbf96 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -22,7 +22,6 @@ workflow METAGENOMICS_POSTPROCESSING { AMPS ( MALTEXTRACT.out.results, params.taxon_list, params.metagenomics_maltextract_filter ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) - ch_results = ch_results.mix( AMPS.out.results.summary_pdf, AMPS.out.tsv, AMPS.out.summary_pdf ) ch_multiqc_files = ch_multiqc_files.mix( AMPS.out.results.json ) } @@ -41,7 +40,6 @@ workflow METAGENOMICS_POSTPROCESSING { KRAKENMERGE ( ch_list_of_kraken_parse_reads.collect() , ch_list_of_kraken_parse_kmer.collect() ) ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first(), KRAKENMERGE.out.versions.first() ) - ch_results = ch_results.mix( KRAKENMERGE.out.read_count_table, KRAKENMERGE.out.kmer_duplication_table ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENMERGE.out.read_count_table, KRAKENMERGE.out.kmer_duplication_table ) } @@ -50,12 +48,12 @@ workflow METAGENOMICS_POSTPROCESSING { METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input , params.metagenomics_profiling_database ) ch_versions = ch_versions.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.versions.first() ) - ch_results = ch_results.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt ) + ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt ) + } emit: versions = ch_versions - results = ch_results mqc = ch_multiqc_files } From ee08000292e86373646cee2be399b674879c8428 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 23 Jun 2023 09:39:10 +0000 Subject: [PATCH 042/198] linting fixes --- modules/local/krakenmerge.nf | 32 -------------------------------- modules/local/krakenparse.nf | 12 ------------ nextflow_schema.json | 12 ++++++------ workflows/eager.nf | 2 +- 4 files changed, 7 insertions(+), 51 deletions(-) diff --git a/modules/local/krakenmerge.nf b/modules/local/krakenmerge.nf index 4e9fc168c..a0925a966 100644 --- a/modules/local/krakenmerge.nf +++ b/modules/local/krakenmerge.nf @@ -1,36 +1,11 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. 
-// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. -// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. - process KRAKENMERGE { label 'process_single' - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. conda "conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : 'quay.io/biocontainers/pandas:1.5.2' }" input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf // TODO check if this works path kraken_parse_reads @@ -45,13 +20,6 @@ process KRAKENMERGE { task.ext.when == null || task.ext.when script: - - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO default value for KRAKEN_PARSE min reads shared with MALTEXTRACT, but recommended defaults in tools is 50 vs 1, respectively: add check and warning? def read_out = "kraken_read_count_table.csv" def kmer_out = "kraken_kmer_duplication.csv" """ diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf index 1dc875302..3e25fabda 100644 --- a/modules/local/krakenparse.nf +++ b/modules/local/krakenparse.nf @@ -7,12 +7,6 @@ process KRAKENPARSE { 'https://depot.galaxyproject.org/singularity/python:3.8.3' : 'quay.io/biocontainers/python:3.8.3' }" input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. 
indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. tuple val(meta), path(report) output: @@ -25,12 +19,6 @@ process KRAKENPARSE { script: - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO default value for KRAKEN_PARSE min reads shared with MALTEXTRACT, but recommended defaults in tools is 50 vs 1, respectively: add check and warning? def read_out = "${meta.id}.read_kraken_parsed.csv" def kmer_out = "${meta.id}.kmer_kraken_parsed.csv" """ diff --git a/nextflow_schema.json b/nextflow_schema.json index 0afa28765..db577f359 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -790,35 +790,35 @@ "fa_icon": "fas fa-percent" }, "metagenomics_maltextract_destackingoff": { - "type": "string", + "type": "boolean", "default": "false", "description": "Turn off destacking.", "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--destackingOff`", "fa_icon": "fab fa-stack-overflow" }, "metagenomics_maltextract_downsamplingoff": { - "type": "string", + "type": "boolean", "default": "false", "description": "Turn off downsampling.", "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--downSampOff`", "fa_icon": "fas fa-angle-double-down" }, "metagenomics_maltextract_duplicateremovaloff": { - "type": "string", + "type": "boolean", "default": "false", "description": "Turn off duplicate removal.", "help_text": "\\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`", "fa_icon": "fas fa-copy" }, "metagenomics_maltextract_matches": { - "type": "string", + "type": "boolean", "default": "false", "description": "Turn on exporting alignments of hits in BLAST format.", "help_text": "\\nExport alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", "fa_icon": "fas fa-equals" }, "metagenomics_maltextract_megansummary": { - "type": "string", + "type": "boolean", "default": "false", "description": "Turn on export of MEGAN summary files.", "help_text": "Export 'minimal' summary files (i.e. 
without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" @@ -830,7 +830,7 @@ "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" }, "metagenomics_maltextract_topalignment": { - "type": "string", + "type": "boolean", "default": "false", "description": "Turn on using top alignments per read after filtering.", "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--useTopAlignment`", diff --git a/workflows/eager.nf b/workflows/eager.nf index cab215ab3..428507248 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -45,7 +45,7 @@ if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { exi if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") } -if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && ( ! params.metagenomics_profiling_tool != 'kraken2' || ! params.metagenomics_profiling_tool != 'krakenuniq' ) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") } +if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") } if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' && ! 
params.metagenomics_profiling_tool != 'metaphlan' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'mergemetaphlantables' can only be run with --metagenomics_profiling_tool 'metaphlan'") } From 49230827fe61541d28d545c038b5bf16f1b32ff5 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 23 Jun 2023 09:49:14 +0000 Subject: [PATCH 043/198] added metaphlan to readme docs --- README.md | 2 +- workflows/eager.nf | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d409a3f4f..570ab08cd 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Additional functionality contained by the pipeline currently includes: #### Metagenomic Screening - Low-sequenced complexity filtering (`BBduk` or `PRINSEQ++`) -- Taxonomic binner with alignment (`MALT`) +- Taxonomic binner with alignment (`MALT` or `MetaPhlAn 4`) - Taxonomic binner without alignment (`Kraken2`) - aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) diff --git a/workflows/eager.nf b/workflows/eager.nf index 428507248..7ece609fd 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -24,6 +24,14 @@ if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input sample */ if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } + +// TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? +if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} + +// Metagenomics failing parameter combinations +// TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing +if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") } + if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { // entropy score was set but dust method picked. If no dust-score provided, assume it was an error and fail if (params.metagenomics_prinseq_dustscore == 0.5) { @@ -37,12 +45,6 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri } } -// TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? -if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} - -// TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing -if ( params.run_metagenomics && ! 
params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") } - if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") } if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") } From aa0d80a83314a09867c971593146f932a91300ff Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 23 Jun 2023 12:03:09 +0200 Subject: [PATCH 044/198] streamline maltextract channels --- .../local/metagenomics_postprocessing.nf | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index cefc59a82..5507879bf 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -2,7 +2,7 @@ include { MALTEXTRACT } from '../../modules/nf-core/maltextra include { AMPS } from '../../modules/nf-core/amps/main' include { KRAKENPARSE } from '../../modules/local/krakenparse' include { KRAKENMERGE } from '../../modules/local/krakenmerge' -include { METAPHLAN_MERGEMETAPHLANTABLES } from '../modules/nf-core/metaphlan/mergemetaphlantables/main' +include { METAPHLAN_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan/mergemetaphlantables/main' workflow METAGENOMICS_POSTPROCESSING { @@ -17,9 +17,16 @@ workflow METAGENOMICS_POSTPROCESSING { if ( params.metagenomics_postprocessing_tool == 'maltextract' ) { - MALTEXTRACT ( ch_postprocessing_input, params.metagenomics_maltextract_taxon_list, params.metagenomics_maltextract_ncbi_dir ) + //maltextract doesnt accepts a meta param in the first input channel, so remove it + ch_maltextract_input = ch_postprocessing_input.map{it[1]} - AMPS ( MALTEXTRACT.out.results, params.metagenomics_maltextract_taxon_list, params.metagenomics_maltextract_filter ) + tax_list = Channel.fromPath(params.metagenomics_maltextract_taxon_list) + ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbi_dir) + maltex_filter = Channel.fromPath(params.metagenomics_maltextract_filter) + + MALTEXTRACT ( ch_maltextract_input, tax_list, ncbi_dir) + + AMPS ( MALTEXTRACT.out.results, tax_list, maltex_filter ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) ch_results = ch_results.mix( AMPS.out.candidate_pdfs, AMPS.out.tsv, AMPS.out.summary_pdf ) @@ -27,7 +34,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - elif ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { + else if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { KRAKENPARSE ( ch_postprocessing_input ) @@ -46,7 +53,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - elif ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' ) { + else if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' ) { METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input , 
params.metagenomics_profiling_database ) ch_versions = ch_versions.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.versions.first() ) From 87e8c8dba7653dae9455942f31e47df5b2afce07 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Jul 2023 09:30:47 +0000 Subject: [PATCH 045/198] merged dev to metagenomics --- workflows/eager.nf | 282 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 223 insertions(+), 59 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 7ece609fd..909f69690 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -1,37 +1,25 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' -// Validate input parameters -WorkflowEager.initialise(params, log) - -// TODO nf-core: Add all file path parameters for the pipeline to the list below -// Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) -// Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Check failing parameter combinations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ +WorkflowEager.initialise(params, log) +// Check failing parameter combinations if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } - -// TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? -if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} - -// Metagenomics failing parameter combinations -// TODO add any other metagenomics screening parameters checks for eg complexity filtering, post-processing -if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") } - +if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } +if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } +if ( params.genotyping_source == 'rescaled' && ! 
params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { // entropy score was set but dust method picked. If no dust-score provided, assume it was an error and fail if (params.metagenomics_prinseq_dustscore == 0.5) { @@ -44,23 +32,18 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } +if( params.run_bedtools_coverage ){ + if( !params.mapstats_bedtools_featurefile ) { + exit 1, "[nf-core/eager] ERROR: you have turned on bedtools coverage, but not specified a BED or GFF file with --mapstats_bedtools_featurefile. Please validate your parameters." + } +} -if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") } - -if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") } - -if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' && ! params.metagenomics_profiling_tool != 'metaphlan' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'mergemetaphlantables' can only be run with --metagenomics_profiling_tool 'metaphlan'") } - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Report possible warnings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ +// TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? +if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} +// Report possible warnings if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. Adapter trimming will be skipped!") -if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && params.metagenomics_min_support_reads == 1 ) log.warn("[nf-core/eager] Warning: The default value for krakenmerge minimum reads for outputing a node has not been changed from the default. This default is set for MALT and maltextract. Consider updating to the default value for krakenmerge (50 reads) by setting --metagenomics_min_support_reads 50") - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -83,13 +66,16 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // TODO rename to active: index_reference, filter_bam etc. 
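A reading aid for this merge commit: the three `genotyping_source` guards added near the top pair with a channel-selection ternary that appears towards the end of `workflows/eager.nf` in this same diff. Condensed here for readability (an excerpt of the diff's own code, not standalone):

    // How --genotyping_source picks the BAM channel handed on to genotyping,
    // falling back to the deduplicated BAMs when no damage manipulation ran:
    ch_bams_for_genotyping =
        params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled :
        params.genotyping_source == 'pmd'      ? MANIPULATE_DAMAGE.out.filtered :
        params.genotyping_source == 'trimmed'  ? MANIPULATE_DAMAGE.out.trimmed  :
        ch_dedupped_bams

Each guard simply ensures that the `MANIPULATE_DAMAGE` output it would select will actually have been produced.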
-include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing' -include { PREPROCESSING } from '../subworkflows/local/preprocessing' -include { MAP } from '../subworkflows/local/map' -include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' -include { DEDUPLICATE } from '../subworkflows/local/deduplicate' -include { METAGENOMICS } from '../subworkflows/local/metagenomics' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing' +include { PREPROCESSING } from '../subworkflows/local/preprocessing' +include { MAP } from '../subworkflows/local/map' +include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' +include { DEDUPLICATE } from '../subworkflows/local/deduplicate' +include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate_damage' +include { METAGENOMICS } from '../subworkflows/local/metagenomics' +include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' +include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -100,15 +86,20 @@ include { METAGENOMICS } from '../subworkflows/local/metagenomics' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' -include { PRESEQ_CCURVE } from '../modules/nf-core/preseq/ccurve/main' -include { PRESEQ_LCEXTRAP } from '../modules/nf-core/preseq/lcextrap/main' -include { FALCO } from '../modules/nf-core/falco/main' -include { MTNUCRATIO } from '../modules/nf-core/mtnucratio/main' +include { FASTQC } from '../modules/nf-core/fastqc/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' +include { PRESEQ_CCURVE } from '../modules/nf-core/preseq/ccurve/main' +include { PRESEQ_LCEXTRAP } from '../modules/nf-core/preseq/lcextrap/main' +include { FALCO } from '../modules/nf-core/falco/main' +include { MTNUCRATIO } from '../modules/nf-core/mtnucratio/main' +include { HOST_REMOVAL } from '../modules/local/host_removal' +include { ENDORSPY } from '../modules/nf-core/endorspy/main' +include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTATS_BAM_INPUT } from '../modules/nf-core/samtools/flagstat/main' +include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_DEPTH ; BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_BREADTH } from '../modules/nf-core/bedtools/coverage/main' +include { SAMTOOLS_VIEW_GENOME } from '../modules/local/samtools_view_genome.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -145,14 +136,22 @@ workflow EAGER { if ( params.preprocessing_tool == 'fastp' && !adapterlist.extension.matches(".*(fa|fasta|fna|fas)") ) error "[nf-core/eager] ERROR: fastp adapter list requires a `.fasta` format and extension (or fa, fas, fna). 
Check input: --preprocessing_adapterlist ${params.preprocessing_adapterlist}" } + // Contamination estimation + hapmap_file = file(params.contamination_estimation_angsd_hapmap, checkIfExists:true) + // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // + INPUT_CHECK ( - ch_input + file(params.input) ) ch_versions = ch_versions.mix( INPUT_CHECK.out.versions ) + // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") + // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ + // ! There is currently no tooling to help you write a sample sheet schema + // // SUBWORKFLOW: Indexing of reference files // @@ -190,8 +189,14 @@ workflow EAGER { // // SUBWORKFLOW: Reference mapping // + ch_reference_for_mapping = REFERENCE_INDEXING.out.reference + .map{ + meta, fasta, fai, dict, index, circular_target, mitochondrion -> + [ meta, index ] + } + + MAP ( ch_reads_for_mapping, ch_reference_for_mapping ) - MAP ( ch_reads_for_mapping, REFERENCE_INDEXING.out.reference.map{meta, fasta, fai, dict, index -> [meta, index]} ) ch_versions = ch_versions.mix( MAP.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( MAP.out.mqc.collect{it[1]}.ifEmpty([]) ) @@ -208,6 +213,16 @@ workflow EAGER { ch_bams_from_input = INPUT_CHECK.out.bams.join( SAMTOOLS_INDEX.out.bai ) } + + // + // MODULE: flagstats of user supplied input BAMs + // + ch_bam_bai_input = INPUT_CHECK.out.bams + .join(SAMTOOLS_INDEX.out.bai) + + SAMTOOLS_FLAGSTATS_BAM_INPUT ( ch_bam_bai_input ) + ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTATS_BAM_INPUT.out.versions ) + // // SUBWORKFLOW: bam filtering (length, mapped/unmapped, quality etc.) // @@ -238,7 +253,7 @@ workflow EAGER { ch_fasta_for_deduplication = REFERENCE_INDEXING.out.reference .multiMap{ - meta, fasta, fai, dict, index -> + meta, fasta, fai, dict, index, circular_target, mitochondrion -> fasta: [ meta, fasta ] fasta_fai: [ meta, fai ] } @@ -255,6 +270,40 @@ workflow EAGER { ch_dedupped_flagstat = Channel.empty() } + // + // MODULE: remove reads mapping to the host from the raw fastq + // + if ( params.run_host_removal ) { + // Preparing bam channel for host removal to be combined with the input fastq channel + // The bam channel consist of [meta, bam, bai] and in the meta we have in addition 'single_end' always set as TRUE and 'reference' set + // To be able to join it with fastq channel, we need to remove them from the meta (done in map) and stored in new_meta + ch_bam_for_host_removal= MAP.out.bam.join(MAP.out.bai) + .map{ + meta, bam, bai -> + new_meta = meta.clone().findAll{ it.key !in [ 'single_end', 'reference' ] } + [ new_meta, meta, bam, bai ] + } + // Preparing fastq channel for host removal to be combined with the bam channel + // The meta of the fastq channel contains additional fields when compared to the meta from the bam channel: lane, colour_chemistry, + // and not necessarily matching single_end. 
Those fields are dropped of the meta in the map and stored in new_meta + ch_fastqs_for_host_removal= INPUT_CHECK.out.fastqs.map{ + meta, fastqs -> + new_meta = meta.clone().findAll{ it.key !in [ 'lane', 'colour_chemistry', 'single_end' ] } + [ new_meta, meta, fastqs ] + } + // We join the bam and fastq channel with now matching metas (new_meta) referred as meta_join + // and remove the meta_join from the final channel, keeping the original metas for the bam and the fastqs + ch_input_for_host_removal = ch_bam_for_host_removal.join(ch_fastqs_for_host_removal) + .map{ + meta_join, meta_bam, bam, bai, meta_fastq, fastqs -> + [ meta_bam, bam, bai, meta_fastq, fastqs] + } + + HOST_REMOVAL ( ch_input_for_host_removal ) + + ch_versions = ch_versions.mix( HOST_REMOVAL.out.versions ) + } + // // Section: Metagenomics // @@ -263,9 +312,7 @@ workflow EAGER { METAGENOMICS ( ch_bamfiltered_for_metagenomics ) ch_versions = ch_versions.mix( METAGENOMICS.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS.out.ch_multiqc_files ) - } - // // MODULE: MTNUCRATIO // @@ -282,6 +329,44 @@ workflow EAGER { ch_versions = ch_versions.mix( MTNUCRATIO.out.versions ) } + // + // MODULE: ENDORSPY (raw, filtered, deduplicated) + // + + ch_flagstat_for_endorspy_raw = MAP.out.flagstat + .mix( SAMTOOLS_FLAGSTATS_BAM_INPUT.out.flagstat ) + + if ( params.run_bamfiltering & !params.skip_deduplication ) { + ch_for_endorspy = ch_flagstat_for_endorspy_raw + .join (FILTER_BAM.out.flagstat) + .join (DEDUPLICATE.out.flagstat) + } else if ( params.run_bamfiltering & params.skip_deduplication ) { + ch_for_endorspy = ch_flagstat_for_endorspy_raw + .join (FILTER_BAM.out.flagstat) + .map{ + meta, flags_raw, flags_filtered -> + [ meta, flags_raw, flags_filtered, [] ] + } + } else if ( !params.run_bamfiltering & !params.skip_deduplication) { + ch_for_endorspy = ch_flagstat_for_endorspy_raw + .join (DEDUPLICATE.out.flagstat) + . 
map{ + meta, flags_raw, flags_dedup -> + [ meta, flags_raw, [], flags_dedup ] + } + } else { + ch_for_endorspy = ch_flagstat_for_endorspy_raw + .map { + meta, flags_raw -> + [ meta, flags_raw, [], [] ] + } + } + + ENDORSPY ( ch_for_endorspy ) + + ch_versions = ch_versions.mix( ENDORSPY.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( ENDORSPY.out.json.collect{it[1]}.ifEmpty([]) ) + // // MODULE: PreSeq // @@ -296,6 +381,85 @@ workflow EAGER { ch_versions = ch_versions.mix( PRESEQ_LCEXTRAP.out.versions ) } + + // + // MODULE: Bedtools coverage + // + + if ( params.run_bedtools_coverage ) { + + ch_anno_for_bedtools = Channel.fromPath(params.mapstats_bedtools_featurefile, checkIfExists: true).collect() + + ch_dedupped_for_bedtools = ch_dedupped_bams.combine(ch_anno_for_bedtools) + .map{ + meta, bam, bai, anno -> + [meta, anno, bam] + } + + // Running samtools view to get header + SAMTOOLS_VIEW_GENOME(ch_dedupped_bams) + + ch_genome_for_bedtools = SAMTOOLS_VIEW_GENOME.out.genome + + BEDTOOLS_COVERAGE_BREADTH(ch_dedupped_for_bedtools, ch_genome_for_bedtools) + BEDTOOLS_COVERAGE_DEPTH(ch_dedupped_for_bedtools, ch_genome_for_bedtools) + + ch_versions = ch_versions.mix( SAMTOOLS_VIEW_GENOME.out.versions ) + ch_versions = ch_versions.mix( BEDTOOLS_COVERAGE_BREADTH.out.versions ) + ch_versions = ch_versions.mix( BEDTOOLS_COVERAGE_DEPTH.out.versions ) + } + + + // + // SUBWORKFLOW: Calculate Damage + // + + ch_fasta_for_damagecalculation = REFERENCE_INDEXING.out.reference + .multiMap{ + meta, fasta, fai, dict, index, circular_target, mitochondrion -> + fasta: [ meta, fasta ] + fasta_fai: [ meta, fai ] + } + + if ( !params.skip_damage_calculation ) { + CALCULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_damagecalculation.fasta, ch_fasta_for_damagecalculation.fasta_fai ) + ch_versions = ch_versions.mix( CALCULATE_DAMAGE.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix(CALCULATE_DAMAGE.out.mqc.collect{it[1]}.ifEmpty([])) + + } + + // + // SUBWORKFLOW: Contamination estimation + // + + if ( params.run_contamination_estimation_angsd ) { + contamination_input = ch_dedupped_bams + ch_hapmap = Channel.of( [ hapmap_file ] ) + hapmap_input = REFERENCE_INDEXING.out.reference + .combine( ch_hapmap ) + .map { + meta, fasta, fai, dict, index, circular_target, mitochondrion, hapmap -> + [ meta, hapmap ] + } + + ESTIMATE_CONTAMINATION( contamination_input, hapmap_input ) + ch_versions = ch_versions.mix( ESTIMATE_CONTAMINATION.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( ESTIMATE_CONTAMINATION.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + // + // SUBWORKFLOW: aDNA Damage Manipulation + // + + if ( params.run_mapdamage_rescaling || params.run_pmd_filtering || params.run_trim_bam ) { + MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta ) + ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions ) + ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? 
MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams + } else { + ch_bams_for_genotyping = ch_dedupped_bams + } + // // MODULE: MultiQC // @@ -307,13 +471,13 @@ workflow EAGER { workflow_summary = WorkflowEager.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowEager.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + methods_description = WorkflowEager.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + //ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) // Replaced with custom mixing MULTIQC ( ch_multiqc_files.collect(), From 810163253e65ef8cd2b257bd86432ba7b9c2a6ef Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Jul 2023 14:04:26 +0000 Subject: [PATCH 046/198] fixed metagenomics metaphlan, test runs --- conf/modules.config | 2 +- modules.json | 22 ++++++------ modules/nf-core/metaphlan/metaphlan/main.nf | 6 ++-- workflows/eager.nf | 38 +++++++++++++++------ 4 files changed, 44 insertions(+), 24 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 3a36f2a28..acd15f607 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -341,7 +341,7 @@ process { withName: SAMTOOLS_FASTQ_MAPPED { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - params.metagenomicscreening_input == 'all' ? '' : '-F 4', + params.metagenomics_input == 'all' ? 
'' : '-F 4', ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_mapped" } publishDir = [ diff --git a/modules.json b/modules.json index 79545267b..5107e22b8 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "amps": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "angsd/contamination": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -25,11 +30,6 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, - "amps": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "bbmap/bbduk": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -140,6 +140,11 @@ "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, + "mapdamage2": { + "branch": "master", + "git_sha": "0591cad3d725d5c21337f72e638507abf709f75e", + "installed_by": ["modules"] + }, "metaphlan/mergemetaphlantables": { "branch": "master", "git_sha": "9aa59197c0fb35c29e315bcd10c0fc9e1afc70a8", @@ -147,7 +152,7 @@ }, "metaphlan/metaphlan": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "1038d3de36263159b4138324a646105941ac271a", "installed_by": ["modules"] }, "mtnucratio": { @@ -195,11 +200,6 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "samtools/faidx": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "samtools/fastq": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/nf-core/metaphlan/metaphlan/main.nf b/modules/nf-core/metaphlan/metaphlan/main.nf index 15bd42858..24533571e 100644 --- a/modules/nf-core/metaphlan/metaphlan/main.nf +++ b/modules/nf-core/metaphlan/metaphlan/main.nf @@ -23,12 +23,13 @@ process METAPHLAN_METAPHLAN { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_type = ("$input".endsWith(".fastq.gz") || "$input".endsWith(".fq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" + def input_type = "$input" =~ /.*\.(fastq|fq)/ ? "--input_type fastq" : "$input" =~ /.*\.(fasta|fna|fa)/ ? "--input_type fasta" : "$input".endsWith(".bowtie2out.txt") ? "--input_type bowtie2out" : "--input_type sam" def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input" def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? 
'' : "--bowtie2out ${prefix}.bowtie2out.txt" """ - BT2_DB=`find -L "${metaphlan_db_latest}" -name "*rev.1.bt2l" -exec dirname {} \\;` + BT2_DB=`find -L "${metaphlan_db_latest}" -name "*rev.1.bt2*" -exec dirname {} \\;` + BT2_DB_INDEX=`find -L ${metaphlan_db_latest} -name "*.rev.1.bt2*" | sed 's/\\.rev.1.bt2.*\$//' | sed 's/.*\\///'` metaphlan \\ --nproc $task.cpus \\ @@ -37,6 +38,7 @@ process METAPHLAN_METAPHLAN { $args \\ $bowtie2_out \\ --bowtie2db \$BT2_DB \\ + --index \$BT2_DB_INDEX \\ --biom ${prefix}.biom \\ --output_file ${prefix}_profile.txt diff --git a/workflows/eager.nf b/workflows/eager.nf index cf4326632..deda186dc 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -32,6 +32,22 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } +if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { + exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") +} + +if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) { + exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") +} + +if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { + exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") +} + +if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' && ! params.metagenomics_profiling_tool == 'metaphlan' ) { + exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'mergemetaphlantables' can only be run with --metagenomics_profiling_tool 'metaphlan'") +} + if( params.run_bedtools_coverage ){ if( !params.mapstats_bedtools_featurefile ) { exit 1, "[nf-core/eager] ERROR: you have turned on bedtools coverage, but not specified a BED or GFF file with --mapstats_bedtools_featurefile. Please validate your parameters." @@ -44,6 +60,8 @@ if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmer // Report possible warnings if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. Adapter trimming will be skipped!") +if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && params.metagenomics_min_support_reads == 1 ) log.warn("[nf-core/eager] Warning: The default value for krakenmerge minimum reads for outputing a node has not been changed from the default. This default is set for MALT and maltextract. Consider updating to the default value for krakenmerge (50 reads) by setting --metagenomics_min_support_reads 50") + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -66,16 +84,16 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // TODO rename to active: index_reference, filter_bam etc. 
-include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing' -include { PREPROCESSING } from '../subworkflows/local/preprocessing' -include { MAP } from '../subworkflows/local/map' -include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' -include { DEDUPLICATE } from '../subworkflows/local/deduplicate' -include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate_damage' -include { METAGENOMICS } from '../subworkflows/local/metagenomics' -include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' -include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing' +include { PREPROCESSING } from '../subworkflows/local/preprocessing' +include { MAP } from '../subworkflows/local/map' +include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' +include { DEDUPLICATE } from '../subworkflows/local/deduplicate' +include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate_damage' +include { METAGENOMICS } from '../subworkflows/local/metagenomics' +include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' +include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 4f3394858e687d66de046cd465a0c7180816bc27 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 4 Aug 2023 11:23:18 +0200 Subject: [PATCH 047/198] Tests&Fixes: Krakenextract: uniq prefix, faulty exit condition --- conf/modules.config | 23 +++++++++++-------- modules/local/krakenmerge.nf | 1 - modules/local/krakenparse.nf | 6 ++--- .../local/metagenomics_postprocessing.nf | 3 ++- workflows/eager.nf | 4 ---- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index acd15f607..ca75976ed 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -781,6 +781,17 @@ process { ] } + // + // MT-NUCLEAR RATIO + // + withName: MTNUCRATIO { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + publishDir = [ + enabled: false + ] + } + + // // METAGENOMIC SCREENING // @@ -813,16 +824,6 @@ process { ] } - // - // MT-NUCLEAR RATIO - // - withName: MTNUCRATIO { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - publishDir = [ - enabled: false - ] - } - withName: MALT_RUN { ext.args = [ "-m ${params.metagenomics_malt_mode}", @@ -901,6 +902,8 @@ process { publishDir = [ enabled: false ] + tag = { "${meta.sample_id}_${meta.library_id}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}" } } withName: KRAKENMERGE { diff --git a/modules/local/krakenmerge.nf b/modules/local/krakenmerge.nf index a0925a966..c7278fe26 100644 --- a/modules/local/krakenmerge.nf +++ b/modules/local/krakenmerge.nf @@ -7,7 +7,6 @@ process KRAKENMERGE { 'quay.io/biocontainers/pandas:1.5.2' }" input: - // TODO check if this works path kraken_parse_reads path kraken_parse_kmers diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf index 3e25fabda..9c1314cb6 100644 --- a/modules/local/krakenparse.nf +++ b/modules/local/krakenparse.nf @@ -18,9 +18,9 @@ process KRAKENPARSE { task.ext.when == null || task.ext.when script: - - def read_out = "${meta.id}.read_kraken_parsed.csv" - def kmer_out = "${meta.id}.kmer_kraken_parsed.csv" + 
def prefix = task.ext.prefix ?: "${meta.id}" + def read_out = "${prefix}.read_kraken_parsed.csv" + def kmer_out = "${prefix}.kmer_kraken_parsed.csv" """ kraken_parse.py \\ -c ${params.metagenomics_min_support_reads} \\ diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index a4b39ee73..e38fc50c0 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -34,7 +34,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - else if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { + else if ( ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { KRAKENPARSE ( ch_postprocessing_input ) @@ -45,6 +45,7 @@ workflow METAGENOMICS_POSTPROCESSING { meta, kmer_out -> [ kmer_out ] } + KRAKENMERGE ( ch_list_of_kraken_parse_reads.collect() , ch_list_of_kraken_parse_kmer.collect() ) ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first(), KRAKENMERGE.out.versions.first() ) diff --git a/workflows/eager.nf b/workflows/eager.nf index deda186dc..812ef5328 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -40,10 +40,6 @@ if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metageno exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'") } -if ( params.metagenomics_postprocessing_tool == 'krakenmerge' || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { - exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'krakenmerge' can only be run with --metagenomics_profiling_tool 'kraken2' or 'krakenuniq'") -} - if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' && ! 
params.metagenomics_profiling_tool == 'metaphlan' ) { exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'mergemetaphlantables' can only be run with --metagenomics_profiling_tool 'metaphlan'") } From 2f4f36b1e46ebcfa2ff45a0a54824c7d515e33a0 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 4 Aug 2023 09:59:41 +0000 Subject: [PATCH 048/198] metaphlan working --- subworkflows/local/metagenomics.nf | 1 - subworkflows/local/metagenomics_postprocessing.nf | 3 ++- subworkflows/local/metagenomics_profiling.nf | 11 ++++++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 5cfe9cd75..c8b8a5f8a 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -42,7 +42,6 @@ workflow METAGENOMICS { // Run the post profiling subworkflow (optionally run for malt, mandatory for kraken2/krakenuniq) // - if ( params.metagenomics_postprocessing_tool || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index e38fc50c0..6a8667d55 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -54,7 +54,8 @@ workflow METAGENOMICS_POSTPROCESSING { } else if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' ) { - METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input , params.metagenomics_profiling_database ) + + METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input.map{ [[id:"metaphlan_profiles_all_samples_merged"], it[1]] }.groupTuple() ) ch_versions = ch_versions.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt ) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index aa9e2f1cd..980ed49b1 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -94,7 +94,16 @@ workflow METAGENOMICS_PROFILING { else if ( params.metagenomics_profiling_tool == 'metaphlan' ) { - METAPHLAN_METAPHLAN ( reads , database ) + reads = reads + .map { + meta, reads -> + [meta + [id: "${meta.id}_${meta.damage_treatment}"] , reads] + } + .combine(database) + metaphlan_reads = reads.map{ meta, reads, database -> [meta, reads] } + metaphlan_db = reads.map{ meta, reads, database -> [database] } + + METAPHLAN_METAPHLAN ( metaphlan_reads , metaphlan_db ) ch_versions = ch_versions.mix( METAPHLAN_METAPHLAN.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN_METAPHLAN.out.profile ) ch_postprocessing_input = ch_postprocessing_input.mix( METAPHLAN_METAPHLAN.out.profile ) From b876a3cf82454e8fd116b7655e2c75c64413c4e1 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Mon, 7 Aug 2023 08:22:30 +0000 Subject: [PATCH 049/198] added prefix to config for metaphlan --- conf/modules.config | 1 + docs/development/manual_tests.md | 67 ++++++++++++++++++-- subworkflows/local/metagenomics_profiling.nf | 7 +- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ca75976ed..1d06580cd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -868,6 +868,7 @@ process { mode: params.publish_dir_mode, pattern: '*.{biom,txt}' ] + ext.prefix = { 
"${meta.sample_id}_${meta.library_id}_${meta.reference}" } } withName: MALTEXTRACT { diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index bae2ba48b..48ad081e2 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -505,13 +505,50 @@ nextflow run main.nf -profile docker,test --outdir ./results/AR_dedup_merged -du nextflow run main.nf -profile docker,test --input ~/eager_dsl2_testing/input/only_PE/pe_only.tsv --outdir ./results/AR_dedup_merged_PE_only -dump-channels -ansi-log false --preprocessing_tool 'adapterremoval' --deduplication_tool 'dedup' --preprocessing_excludeunmerged -resume ``` -## Metagenomics +### METAGENOMICS -### Complexityfilter +#### Complexityfilter -### Profiling +#### Profiling -#### Krakenuniq +##### metaphlan + +```bash +## metaphlan with default parameters +## Expect: + +nextflow run -resume ./main.nf -profile test,docker --outdir out \ +--run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/ + +# 20230728: Works +``` + +##### krakenuniq + +```bash +nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ +--run_metagenomics --metagenomics_profiling_tool krakenuniq --metagenomics_profiling_database ../runtest/refseq_rel215/kraken/Mito_db_kmer22/ + +# 20230623: Works +``` + +##### kraken2 + +```bash +sudo nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ +--run_metagenomics --metagenomics_profiling_tool kraken2 --metagenomics_profiling_database kraken2_db/ +# 20230623: Works +``` + +##### malt + +```bash +sudo nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ +--run_metagenomics --metagenomics_profiling_tool malt --metagenomics_profiling_database maltdb/maltdb/ +# 20230623: Works +``` + +##### Krakenuniq ```bash ### With saved reads @@ -526,3 +563,25 @@ nextflow run main.nf -profile test,singularity --outdir out --run_metagenomics - # Expected: directory with 1 textfile for each sample: the raw krakenuniq profile nextflow run main.nf -profile test,singularity --outdir out --run_metagenomics --metagenomics_profiling_tool krakenuniq --metagenomics_profiling_database ../runtest/refseq_rel215/kraken/Mito_db_kmer22/ ``` + +#### postprocessing + +##### maltextract + +```bash +nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ +--run_metagenomics --metagenomics_profiling_tool malt --metagenomics_profiling_database maltdb/maltdb/ \ +--metagenomics_postprocessing_tool maltextract \ +--metagenomics_maltextract_ncbi_dir maltextract_ncbi_dir/ \ +--metagenomics_maltextract_taxon_list maltdb/target.txt + +# 20230623: No errors, but postpocessing steps dont finish? 
I need to wait and see how long it takes +``` + +##### mergemetaphlantables + +```bash +nextflow run -resume ./main.nf -profile test,docker --outdir out \ +--run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/ --metagenomics_postprocessing_tool mergemetaphlantables +# 20230804: works +``` diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 980ed49b1..8fad17a59 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -94,12 +94,7 @@ workflow METAGENOMICS_PROFILING { else if ( params.metagenomics_profiling_tool == 'metaphlan' ) { - reads = reads - .map { - meta, reads -> - [meta + [id: "${meta.id}_${meta.damage_treatment}"] , reads] - } - .combine(database) + reads = reads.combine(database) metaphlan_reads = reads.map{ meta, reads, database -> [meta, reads] } metaphlan_db = reads.map{ meta, reads, database -> [database] } From d4a8a61b0ed79a04d055f8d2b847e5ca1e99e8f9 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 9 Aug 2023 16:11:14 +0200 Subject: [PATCH 050/198] Metagenomics: Comprehensive manual tests Added more manual tests to docs/development/manual_tests.md, fixed the prefix in kraken2 --- conf/modules.config | 2 +- docs/development/manual_tests.md | 165 ++++++++++++++++++++++++++----- 2 files changed, 139 insertions(+), 28 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 1d06580cd..8576fb78b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -833,7 +833,6 @@ process { "-mq ${params.metagenomics_malt_max_queries}", "--memoryMode ${params.metagenomics_malt_memory_mode}", params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_min_support_reads}", - params.metagenomics_malt_sam_output ? "-a . -f SAM" : "", params.metagenomics_malt_save_reads ? 
"--alignments ./ -za false" : "" ].join(' ').trim() publishDir = [ @@ -847,6 +846,7 @@ process { ext.args = [ "--report-minimizer-data" ].join(' ').trim() + ext.prefix = { "${meta.sample_id}_${meta.library_id}" } publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/kraken2/" }, mode: params.publish_dir_mode, diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 48ad081e2..125fd87b4 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -509,6 +509,64 @@ nextflow run main.nf -profile docker,test --input ~/eager_dsl2_testing/input/onl #### Complexityfilter +##### Test bbduk + +```bash +#### Use bbduk to remove low complexity reads _without_ saving the intermediate files +## Expect: NO additional directory created, but the files in the profiling directory contain the 'complexity' postfix +nextflow run main.nf -profile test,docker \ + --outdir ./out \ + --run_metagenomics \ + --metagenomics_profiling_tool krakenuniq \ + --metagenomics_profiling_database CUSTOM_KRAKEN_DB \ + --run_metagenomics_complexityfiltering \ + --metagenomics_complexity_tool bbduk +``` + +```bash +#### Use bbduk to remove low complexity reads _with_ saving intermediate files +## Expect: Additional directory created 'metagenomics_screening/complexity_filter/bbduk' that contains the fastq files +## with 'complexity' postfix and a bbduk.log file for each library +nextflow run main.nf -profile test,docker \ + --outdir ./out \ + --run_metagenomics \ + --metagenomics_profiling_tool krakenuniq \ + --metagenomics_profiling_database CUSTOM_KRAKEN_DB \ + --run_metagenomics_complexityfiltering \ + --metagenomics_complexity_tool bbduk \ + --metagenomics_complexity_savefastq +``` + +## Test prinseq + +```bash +#### Use prinseq to remove low complexity reads _without_ saving the intermediate files +## Expect: NO additional directory created, but the files in the profiling directory contain the 'complexity_good_out' postfix + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool krakenuniq \ + --metagenomics_profiling_database CUSTOM_KRAKEN_DB \ + --run_metagenomics_complexityfiltering \ + --metagenomics_complexity_tool prinseq +``` + +```bash +#### Use prinseq to remove low complexity reads _with_ saving the intermediate files +## Expect: Additional directory created 'metagenomics_screening/complexity_filter/prinseq' that contains the fastq files +## with 'complexity_good_out' postfix and a 'complexity.log' file for each library + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool krakenuniq \ + --metagenomics_profiling_database CUSTOM_KRAKEN_DB \ + --run_metagenomics_complexityfiltering \ + --metagenomics_complexity_tool prinseq + --metagenomics_complexity_savefastq +``` + #### Profiling ##### metaphlan @@ -526,42 +584,94 @@ nextflow run -resume ./main.nf -profile test,docker --outdir out \ ##### krakenuniq ```bash -nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ ---run_metagenomics --metagenomics_profiling_tool krakenuniq --metagenomics_profiling_database ../runtest/refseq_rel215/kraken/Mito_db_kmer22/ - -# 20230623: Works +#### Use krakenuniq for metagenomics sequence classification, save only report (default) +## Use a custom Database with the -profile test dataset +## Expect: Directory created 'metagenomics_screening/profiling/krakenuniq' that contains one 'krakenuniq.report' file for +## 
each analyzed library + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool krakenuniq \ + --metagenomics_profiling_database CUSTOM_KRAKEN_DB + +#### Use krakenuniq for metagenomics sequence classification, save fastq files +## Use a custom Database with the -profile test dataset +## Expect: Directory created 'metagenomics_screening/profiling/krakenuniq' that contains: +# - 'krakenuniq.report' file +# - 'krakenuniq.classified.txt' file +# - 'classified.fastq.gz' file +# - 'unclassified.fastq.gz' file +# for each analyzed library + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool krakenuniq \ + --metagenomics_profiling_database CUSTOM_KRAKEN_DB \ + --metagenomics_kraken_save_reads \ + --metagenomics_kraken_save_read_classifications ``` ##### kraken2 ```bash -sudo nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ ---run_metagenomics --metagenomics_profiling_tool kraken2 --metagenomics_profiling_database kraken2_db/ -# 20230623: Works +#### Use kraken2 for metagenomics sequence classification, save only report (default) +## Use a custom database with the -profile test dataset +## Expect: Directory created 'metagenomics_screening/profiling/kraken2' that contains a 'kraken2.report' file +## for each analyzed library + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool kraken2 \ + --metagenomics_profiling_database CUSTOM_KRAKEN2_DB + +#### Use krakenuniq for metagenomics sequence classification, save also fastq files +## Use a custom Database with the -profile test dataset +## Expect: Directory created 'metagenomics_screening/profiling/kraken2' that contains: +# - 'kraken2.report' file +# - 'kraken2.classifiedreads.txt' file +# - 'classified.fastq.gz' file +# - 'unclassified.fastq.gz' file +# for each analyzed library + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool kraken2 \ + --metagenomics_profiling_database CUSTOM_KRAKEN2_DB \ + --metagenomics_kraken_save_reads \ + --metagenomics_kraken_save_read_classifications ``` ##### malt ```bash -sudo nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ ---run_metagenomics --metagenomics_profiling_tool malt --metagenomics_profiling_database maltdb/maltdb/ -# 20230623: Works -``` - -##### Krakenuniq - -```bash -### With saved reads -# Use only the -profile test dataset, provide a custom kraken database -# Expected: directory with 2 fastq-files and 1 textfile for each sample, containing classified, unclassified reads and the raw krakenuniq profile -nextflow run main.nf -profile test,singularity --outdir out --run_metagenomics --metagenomics_profiling_tool krakenuniq --metagenomics_profiling_database ../runtest/refseq_rel215/kraken/Mito_db_kmer22/ --metagenomics_kraken_save_reads -``` - -```bash -### Without saved reads -# Use only the -profile test dataset, provide a custom kraken database -# Expected: directory with 1 textfile for each sample: the raw krakenuniq profile -nextflow run main.nf -profile test,singularity --outdir out --run_metagenomics --metagenomics_profiling_tool krakenuniq --metagenomics_profiling_database ../runtest/refseq_rel215/kraken/Mito_db_kmer22/ +#### Use MALT for metagenomics sequence classification, save only report (default) +## Use a custom database with the -profile test dataset +## Expect: Directory created 
'metagenomics_screening/profiling/malt' that contains a '.rma6' file for each analyzed library +## and a single CUSTOM_MALT_DB-malt-run.log file + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool malt \ + --metagenomics_profiling_database CUSTOM_MALT_DB + +#### Use MALT for metagenomics sequence classification, save reads +## Use a custom database with the -profile test dataset +## Expect: Directory created 'metagenomics_screening/profiling/malt' that contains for each analyzed library: +# - a '.rma6' file +# - a '.blastn.sam' file +# and a single CUSTOM_MALT_DB-malt-run.log file + +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool malt \ + --metagenomics_profiling_database CUSTOM_MALT_DB \ + --metagenomics_malt_save_reads ``` #### postprocessing @@ -569,13 +679,14 @@ nextflow run main.nf -profile test,singularity --outdir out --run_metagenomics - ##### maltextract ```bash +# 20230623: No errors, but postpocessing steps dont finish? I need to wait and see how long it takes + nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ --run_metagenomics --metagenomics_profiling_tool malt --metagenomics_profiling_database maltdb/maltdb/ \ --metagenomics_postprocessing_tool maltextract \ --metagenomics_maltextract_ncbi_dir maltextract_ncbi_dir/ \ --metagenomics_maltextract_taxon_list maltdb/target.txt -# 20230623: No errors, but postpocessing steps dont finish? I need to wait and see how long it takes ``` ##### mergemetaphlantables From 48cc3ba8d9c4943ca9f8861916dfd470ccacce27 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 9 Aug 2023 19:15:52 +0200 Subject: [PATCH 051/198] Test maltextract+AMPS, fix tiny bug in subworkflow, fix publishDir directive --- conf/modules.config | 4 ++-- docs/development/manual_tests.md | 17 ++++++++++------- .../local/metagenomics_postprocessing.nf | 3 ++- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8576fb78b..e928ece74 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -887,7 +887,7 @@ process { publishDir = [ path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, mode: params.publish_dir_mode, - pattern: '*/results/*' + pattern: 'results' ] } @@ -895,7 +895,7 @@ process { publishDir = [ path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, mode: params.publish_dir_mode, - pattern: '*/results/*' + pattern: 'results' ] } diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 125fd87b4..4a2672489 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -679,14 +679,17 @@ nextflow run main.nf -profile test,docker \ ##### maltextract ```bash -# 20230623: No errors, but postpocessing steps dont finish? 
I need to wait and see how long it takes - -nextflow run -resume ../eager3/main.nf -profile test,docker --outdir out \ ---run_metagenomics --metagenomics_profiling_tool malt --metagenomics_profiling_database maltdb/maltdb/ \ ---metagenomics_postprocessing_tool maltextract \ ---metagenomics_maltextract_ncbi_dir maltextract_ncbi_dir/ \ ---metagenomics_maltextract_taxon_list maltdb/target.txt +### Create a SummaryTable from the Malt rma6 files +# Expected: A directory 'metagenomics_screening/postprocessing/maltextract/results' see the docs for the content of this dir +nextflow run main.nf -profile test,docker \ + --outdir out \ + --run_metagenomics \ + --metagenomics_profiling_tool malt \ + --metagenomics_profiling_database CUSTOM_MALT_DB \ + --metagenomics_postprocessing_tool maltextract \ + --metagenomics_maltextract_ncbi_dir NCBI_DIR \ + --metagenomics_maltextract_taxon_list TAXONLISTFILE ``` ##### mergemetaphlantables diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 6a8667d55..1fa4db04e 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -22,10 +22,11 @@ workflow METAGENOMICS_POSTPROCESSING { tax_list = Channel.fromPath(params.metagenomics_maltextract_taxon_list) ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbi_dir) - maltex_filter = Channel.fromPath(params.metagenomics_maltextract_filter) MALTEXTRACT ( ch_maltextract_input, tax_list, ncbi_dir) + maltex_filter = params.metagenomics_maltextract_filter + AMPS ( MALTEXTRACT.out.results, tax_list, maltex_filter ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) From bd35ed63547819dbcf23a125769b60f15a19a728 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 11 Aug 2023 08:03:53 +0000 Subject: [PATCH 052/198] prettier and linting --- CITATIONS.md | 1 + nextflow_schema.json | 40 +++++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index c37bd1f46..09d250333 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -95,6 +95,7 @@ > Jun, G., Wing, M. K., Abecasis, G. R., & Kang, H. M. (2015). An efficient and scalable analysis framework for variant extraction and refinement from population-scale DNA sequence data. Genome Research, 25(6), 918–925. doi: [10.1101/gr.176552.114](https://doi.org/10.1101/gr.176552.114) - [DamageProfiler](https://doi.org/10.1093/bioinformatics/btab190) + > DamageProfiler Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). doi: [10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). Download: https://github.com/Integrative-Transcriptomics/DamageProfiler - [MALT](https://www.nature.com/articles/s41559-017-0446-6) diff --git a/nextflow_schema.json b/nextflow_schema.json index 865bf123b..cf59b6e55 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -752,7 +752,7 @@ "default": "BlastN", "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", "fa_icon": "fas fa-align-left", - "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. 
Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", + "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", "enum": ["BlastN", "BlastP", "BlastX"] }, "metagenomics_malt_alignment_mode": { @@ -760,7 +760,7 @@ "default": "SemiGlobal", "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", "fa_icon": "fas fa-align-center", - "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", + "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", "enum": ["Local", "SemiGlobal"] }, "metagenomics_malt_min_percent_identity": { @@ -768,21 +768,21 @@ "default": 85, "description": "Percent identity value threshold for MALT.", "fa_icon": "fas fa-id-card", - "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" + "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" }, "metagenomics_malt_top_percent": { "type": "integer", "default": 1, "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", "fa_icon": "fas fa-percent", - "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" + "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" }, "metagenomics_malt_min_support_mode": { "type": "string", "default": "percent", "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. 
Options: 'percent', 'reads'.", "fa_icon": "fas fa-drumstick-bite", - "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", + "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", "enum": ["percent", "reads"] }, "metagenomics_malt_min_support_percent": { @@ -790,7 +790,7 @@ "default": 0.01, "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", "fa_icon": "fas fa-percentage", - "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" + "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" }, "metagenomics_min_support_reads": { "type": "integer", @@ -804,14 +804,14 @@ "default": 100, "description": "Specify the maximum number of queries a read can have for MALT.", "fa_icon": "fas fa-phone", - "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" + "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" }, "metagenomics_malt_memory_mode": { "type": "string", "default": "load", "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", "fa_icon": "fas fa-memory", - "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", + "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. 
Default is `'load'`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", "enum": ["load", "page", "map"] }, "metagenomics_malt_sam_output": { @@ -842,21 +842,23 @@ }, "metagenomics_maltextract_taxon_list": { "type": "string", + "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", "default": null, - "help_text": "\\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.", + "help_text": "\\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.", "fa_icon": "fas fa-align-left" }, "metagenomics_maltextract_ncbi_dir": { "type": "string", + "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", "default": null, - "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nOnly when `--metagenomic_tool malt` is also supplied.", + "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nOnly needed when `--metagenomics_profiling_tool malt` is also supplied.", "fa_icon": "fab fa-buffer" }, "metagenomics_maltextract_filter": { "type": "string", "default": "def_anc", "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", - "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-f`", + "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. 
Default: `'def_anc'`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-f`", "fa_icon": "fas fa-filter", "enum": ["def_anc", "default", "ancient", "scan", "crawl", "srna"] }, @@ -864,54 +866,54 @@ "type": "number", "default": 0.01, "description": "Specify percent of top alignments to use.", - "help_text": "Specify frequency of top alignments for each read to be considered for each node.\\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\\n\\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\\n\\nDefault: `0.01`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-a`", + "help_text": "Specify frequency of top alignments for each read to be considered for each node.\\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\\n\\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\\n\\nDefault: `0.01`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-a`", "fa_icon": "fas fa-percent" }, "metagenomics_maltextract_destackingoff": { "type": "boolean", "default": "false", "description": "Turn off destacking.", - "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--destackingOff`", + "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--destackingOff`", "fa_icon": "fab fa-stack-overflow" }, "metagenomics_maltextract_downsamplingoff": { "type": "boolean", "default": "false", "description": "Turn off downsampling.", - "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--downSampOff`", + "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--downSampOff`", "fa_icon": "fas fa-angle-double-down" }, "metagenomics_maltextract_duplicateremovaloff": { "type": "boolean", "default": "false", "description": "Turn off duplicate removal.", - "help_text": "\\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`", + "help_text": "\\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. 
same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`", "fa_icon": "fas fa-copy" }, "metagenomics_maltextract_matches": { "type": "boolean", "default": "false", "description": "Turn on exporting alignments of hits in BLAST format.", - "help_text": "\\nExport alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", + "help_text": "\\nExport alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", "fa_icon": "fas fa-equals" }, "metagenomics_maltextract_megansummary": { "type": "boolean", "default": "false", "description": "Turn on export of MEGAN summary files.", - "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" + "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" }, "metagenomics_maltextract_percentidentity": { "type": "number", "default": 85, "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", - "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" + "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" }, "metagenomics_maltextract_topalignment": { "type": "boolean", "default": "false", "description": "Turn on using top alignments per read after filtering.", - "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\\n\\nOnly when `--metagenomic_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--useTopAlignment`", + "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. 
Default: off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--useTopAlignment`", "fa_icon": "fas fa-bahai" } }, From cabdd540ba034d2a59e417c9b8fd4f69a9a70e11 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 11 Aug 2023 13:06:56 +0200 Subject: [PATCH 053/198] Finish tests for MaltExtract --- conf/modules.config | 2 +- subworkflows/local/metagenomics.nf | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index e928ece74..f8f085e61 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -882,7 +882,7 @@ process { params.metagenomics_maltextract_matches ? "--matches" : "", params.metagenomics_maltextract_megansummary ? "--meganSummary" : "", params.metagenomics_maltextract_topalignment ? "--useTopAlignment" : "", - //"${meta.single_stranded}" ? "--singleStranded" : "" + { meta.strandedness } == "single" ? '--singleStranded' : '' ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index c8b8a5f8a..9e6d73f0e 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -9,10 +9,6 @@ workflow METAGENOMICS { // Define channels ch_multiqc_files = Channel.empty() ch_versions = Channel.empty() - ch_bamfiltered_for_metagenomics = ch_bamfiltered_for_metagenomics - .map{ meta, fastq -> - [meta+['single_end':true], fastq] - } // // Run the complexity filter subworkflow From 2aeacea2d4866001e57a14f3d411f125b55a430c Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 29 Sep 2023 08:26:01 +0000 Subject: [PATCH 054/198] updated gunzip linting --- modules/nf-core/gunzip/main.nf | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf index e7189d2fb..73bf08cde 100644 --- a/modules/nf-core/gunzip/main.nf +++ b/modules/nf-core/gunzip/main.nf @@ -21,10 +21,14 @@ process GUNZIP { def args = task.ext.args ?: '' gunzip = archive.toString() - '.gz' """ - gunzip \\ - -f \\ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ $args \\ - $archive + $archive \\ + > $gunzip cat <<-END_VERSIONS > versions.yml "${task.process}": From 24a476a01126eaa03916063b8bf86a14d11b8d23 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 29 Sep 2023 08:26:27 +0000 Subject: [PATCH 055/198] fixed incorrectly named param --- conf/test_multiref.config | 4 ++-- modules.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/test_multiref.config b/conf/test_multiref.config index 23fcdb942..57bec736c 100644 --- a/conf/test_multiref.config +++ b/conf/test_multiref.config @@ -30,8 +30,8 @@ params { bamfiltering_minreadlength = 30 bamfiltering_mappingquality = 37 - // Metagenomic screening - run_metagenomicscreening = false + // Metagenomics + run_metagenomics = false } diff --git a/modules.json b/modules.json index 5107e22b8..45f6ebabc 100644 --- a/modules.json +++ b/modules.json @@ -117,7 +117,7 @@ }, "gunzip": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", "installed_by": ["modules"] }, "kraken2/kraken2": { From 137ebb26ee15592199925d32839d5adb9561cf4b Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 29 Sep 2023 
08:58:07 +0000 Subject: [PATCH 056/198] added parameter general combo checks --- workflows/eager.nf | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 812ef5328..fd7837908 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -26,14 +26,20 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'dust' mode but provided an entropy score. Please specify a dust filter threshold using the --metagenomics_prinseq_dustscore flag") } } + if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) { // dust score was set but entropy method picked. If no entropy-score provided, assume it was an error and fail if (params.metagenomics_complexity_entropy == 0.3) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } + +if ( params.run_metagenomics && ! params.metagenomics_profiling_tool ) { + exit 1, ("[nf-core/eager] ERROR: --run_metagenomics flagged, but no database provided! Please choose an appropriate metagenomics screening tool by setting --metagenomics_profiling_tool to one of 'malt', 'krakenuniq', 'kraken2', or 'metaphlan'") +} + if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) { - exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database") + exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database. Note this database should correspond to ${params.metagenomics_profiling_tool}") } if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) { From ecd2ec0b498d1d6d506338556e0674089e6a5361 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 29 Sep 2023 08:58:26 +0000 Subject: [PATCH 057/198] added parsing of parameter into malt --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index f8f085e61..c76366b25 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -832,6 +832,7 @@ process { "-id ${params.metagenomics_malt_min_percent_identity}", "-mq ${params.metagenomics_malt_max_queries}", "--memoryMode ${params.metagenomics_malt_memory_mode}", + params.malt_sam_output ? "-a . -f SAM" : "", params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_min_support_reads}", params.metagenomics_malt_save_reads ? 
"--alignments ./ -za false" : "" ].join(' ').trim() From db3e2fde0212981a68fecfa41326658a75d681e1 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 29 Sep 2023 08:58:43 +0000 Subject: [PATCH 058/198] updated descriptions/help for metagenomics params --- nextflow_schema.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index cf59b6e55..fb5bf2691 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -829,9 +829,9 @@ "metagenomics_malt_group_size": { "type": "integer", "default": 0, - "description": "Define group sizes for running multiple fastq files into malt.", + "description": "Define how many fastq files should be submitted in the same malt run. Default value of 0 sends all files at once.", "fa_icon": "fas fa-barcode", - "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default will spawn N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." + "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default (0) will spawn at minimum N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." }, "metagenomics_postprocessing_tool": { "type": "string", @@ -844,7 +844,7 @@ "type": "string", "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", "default": null, - "help_text": "\\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.", + "help_text": "\\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\\n\\nOnly needed when `--metagenomics_profiling_tool malt` is also supplied.", "fa_icon": "fas fa-align-left" }, "metagenomics_maltextract_ncbi_dir": { From c1b484693fcbcfc25b7831e9018b8e564ecc913c Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 6 Oct 2023 08:43:24 +0000 Subject: [PATCH 059/198] hotfix misnamed param --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index c76366b25..802cb1eb1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -832,7 +832,7 @@ process { "-id ${params.metagenomics_malt_min_percent_identity}", "-mq ${params.metagenomics_malt_max_queries}", "--memoryMode ${params.metagenomics_malt_memory_mode}", - params.malt_sam_output ? "-a . -f SAM" : "", + params.metagenomics_malt_sam_output ? "-a . -f SAM" : "", params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_min_support_reads}", params.metagenomics_malt_save_reads ? 
"--alignments ./ -za false" : "" ].join(' ').trim() From a8938a0016ce1a13a88e873c46135a1e6e523779 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 6 Oct 2023 12:17:23 +0200 Subject: [PATCH 060/198] Fix config typo, add single_end param to metagenomics workflow --- conf/modules.config | 2 +- subworkflows/local/metagenomics.nf | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index c76366b25..802cb1eb1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -832,7 +832,7 @@ process { "-id ${params.metagenomics_malt_min_percent_identity}", "-mq ${params.metagenomics_malt_max_queries}", "--memoryMode ${params.metagenomics_malt_memory_mode}", - params.malt_sam_output ? "-a . -f SAM" : "", + params.metagenomics_malt_sam_output ? "-a . -f SAM" : "", params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_min_support_reads}", params.metagenomics_malt_save_reads ? "--alignments ./ -za false" : "" ].join(' ').trim() diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 9e6d73f0e..775275682 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -10,6 +10,12 @@ workflow METAGENOMICS { ch_multiqc_files = Channel.empty() ch_versions = Channel.empty() + // Add single_end parameter to meta. + // Reads were merged before, so single_end is always true! + ch_bamfiltered_for_metagenomics = ch_bamfiltered_for_metagenomics.map{ + meta, bamfiltered -> [meta+['single_end':true], bamfiltered] + } + // // Run the complexity filter subworkflow // @@ -29,6 +35,8 @@ workflow METAGENOMICS { database = Channel.fromPath(params.metagenomics_profiling_database) + + METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions.first() ) From e51f20e6c3b07f03ce7ca6ce1543e7160aacf7d9 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 20 Oct 2023 09:49:57 +0000 Subject: [PATCH 061/198] updated parameter descriptions, names for clarity --- conf/modules.config | 4 +- nextflow.config | 6 +-- nextflow_schema.json | 46 +++++++++---------- subworkflows/local/metagenomics.nf | 2 +- .../local/metagenomics_postprocessing.nf | 4 +- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 802cb1eb1..06f2c8ecd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -876,13 +876,13 @@ process { ext.args = [ "-f ${params.metagenomics_maltextract_filter}", "-a ${params.metagenomics_maltextract_toppercent}", - "--minPI ${params.metagenomics_maltextract_percentidentity}", + "--minPI ${params.metagenomics_maltextract_minpercentidentity}", params.metagenomics_maltextract_destackingoff ? "--destackingOff" : "", params.metagenomics_maltextract_downsamplingoff ? "--downSampOff" : "", params.metagenomics_maltextract_duplicateremovaloff ? "--dupRemOff" : "", params.metagenomics_maltextract_matches ? "--matches" : "", params.metagenomics_maltextract_megansummary ? "--meganSummary" : "", - params.metagenomics_maltextract_topalignment ? "--useTopAlignment" : "", + params.metagenomics_maltextract_usetopalignment ? "--useTopAlignment" : "", { meta.strandedness } == "single" ? 
'--singleStranded' : '' ].join(' ').trim() publishDir = [ diff --git a/nextflow.config b/nextflow.config index be390bb09..28ff65145 100644 --- a/nextflow.config +++ b/nextflow.config @@ -150,7 +150,7 @@ params { metagenomics_malt_max_queries = 100 metagenomics_malt_memory_mode = 'load' metagenomics_malt_group_size = 0 - metagenomics_postprocessing_tool = null + metagenomics_run_postprocessing = true metagenomics_maltextract_taxon_list = null metagenomics_maltextract_ncbi_dir = null metagenomics_maltextract_filter = 'def_anc' @@ -160,8 +160,8 @@ params { metagenomics_maltextract_duplicateremovaloff = false metagenomics_maltextract_matches = false metagenomics_maltextract_megansummary = false - metagenomics_maltextract_percentidentity = 85.0 - metagenomics_maltextract_topalignment = false + metagenomics_maltextract_minpercentidentity = 85.0 + metagenomics_maltextract_usetopalignment = false // Host Removal run_host_removal = false diff --git a/nextflow_schema.json b/nextflow_schema.json index fb5bf2691..7e8bc402d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -654,7 +654,7 @@ "properties": { "run_metagenomics": { "type": "boolean", - "description": "Turn on metagenomic screening of mapped, unmapped, or all reads.", + "description": "Turn on metagenomic screening of mapped, unmapped, or all reads. Requires subsequent specification of `--metagenomics_profiling_tool` and `--metagenomics_profiling_database`.", "fa_icon": "fas fa-power-off", "help_text": "Turns on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline." }, @@ -669,8 +669,8 @@ "run_metagenomics_complexityfiltering": { "type": "boolean", "fa_icon": "fas fa-power-off", - "help_text": "Turns on a subworkflow of the pipeline that filters the fastq-files for complexity before the metagenomics profiling\nUse the metagenomics_complexity_tool parameter to select a method", - "description": "Run a complexity filter on the metagenomics input files before classification. Specifiy the tool to use with the `metagenomics_complexity_tool` parameter, save with `metagenomics_complexity_savefastq`" + "description": "Run a complexity filter on the metagenomics input files before classification. Specifiy the tool to use with the `metagenomics_complexity_tool` parameter, save with `metagenomics_complexity_savefastq`", + "help_text": "Turns on a subworkflow of the pipeline that filters the fastq-files for complexity before the metagenomics profiling\nUse the metagenomics_complexity_tool parameter to select a method" }, "metagenomics_complexity_tool": { "type": "string", @@ -710,7 +710,7 @@ }, "metagenomics_profiling_tool": { "type": "string", - "description": "Specify which tool to use for metagenomic profiling and screening.", + "description": "Specify which tool to use for metagenomic profiling and screening. Required if `--run_metagenomics` flagged.", "enum": ["malt", "metaphlan", "kraken2", "krakenuniq"], "fa_icon": "fas fa-toolbox", "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different reuslts." 
@@ -718,14 +718,14 @@
        "metagenomics_profiling_database": {
            "type": "string",
            "format": "directory-path",
-            "description": "Specify a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory.",
+            "description": "Specify a database directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory. Required if `--run_metagenomics` flagged.",
            "fa_icon": "fas fa-database",
            "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size."
        },
        "metagenomics_kraken_save_reads": {
            "type": "boolean",
            "fa_icon": "fas fa-save",
-            "description": "Turn on saving reads assigned by of KrakenUniq or Kraken2",
+            "description": "Turn on saving reads assigned by KrakenUniq or Kraken2",
            "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`"
        },
        "metagenomics_kraken_save_read_classifications": {
@@ -833,7 +833,7 @@
            "fa_icon": "fas fa-barcode",
            "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default (0) will spawn at minimum N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster."
        },
-        "metagenomics_postprocessing_tool": {
+        "metagenomics_run_postprocessing": {
            "type": "string",
            "description": "Activate post-processing of metagenomics profiling tool selected.",
            "help_text": "Activate the corresponding post-processing tool for your metagenomics profiling software. \n\nmalt --> maltextract\nkrakenuniq/kraken2 --> krakenmerge\nmetaphlan --> mergemetaphlantables\n\nNote: Postprocessing is automatically carried out when using `kraken2` and `krakenuniq` ",
@@ -843,15 +843,15 @@
        "metagenomics_maltextract_taxon_list": {
            "type": "string",
            "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)",
-            "default": null,
-            "help_text": "\\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\\n\\nOnly needed when `--metagenomics_profiling_tool malt` is also supplied.",
+            "default": "None",
+            "help_text": "Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. The `.txt` file should list one taxon per row, and each taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format corresponding to a taxonomic node in your MALT database.\\n\\nNecessary when `--metagenomics_profiling_tool malt` specified and `--metagenomics_run_postprocessing` flagged.",
            "fa_icon": "fas fa-align-left"
        },
        "metagenomics_maltextract_ncbi_dir": {
            "type": "string",
            "description": "Path to directory containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)",
-            "default": null,
-            "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nOnly needed when `--metagenomics_profiling_tool malt` is also supplied.",
+            "default": "None",
+            "help_text": "Path to directory containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nNecessary when `--metagenomics_profiling_tool malt` and `--metagenomics_run_postprocessing` specified.",
            "fa_icon": "fab fa-buffer"
        },
        "metagenomics_maltextract_filter": {
@@ -871,47 +871,47 @@
        },
        "metagenomics_maltextract_destackingoff": {
            "type": "boolean",
-            "default": "false",
+            "default": false,
            "description": "Turn off destacking.",
            "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--destackingOff`",
            "fa_icon": "fab fa-stack-overflow"
        },
        "metagenomics_maltextract_downsamplingoff": {
            "type": "boolean",
-            "default": "false",
+            "default": false,
            "description": "Turn off downsampling.",
            "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--downSampOff`",
            "fa_icon": "fas fa-angle-double-down"
        },
        "metagenomics_maltextract_duplicateremovaloff": {
            "type": "boolean",
-            "default": "false",
+            "default": false,
            "description": "Turn off duplicate removal.",
-            "help_text": "\\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`",
+            "help_text": "Turn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`",
            "fa_icon": "fas fa-copy"
        },
        "metagenomics_maltextract_matches": {
            "type": "boolean",
-            "default": "false",
+            "default": false,
            "description": "Turn on exporting alignments of hits in BLAST format.",
-            "help_text": "\\nExport alignments of hits for each node in BLAST format.
By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", + "help_text": "Export alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", "fa_icon": "fas fa-equals" }, "metagenomics_maltextract_megansummary": { "type": "boolean", - "default": "false", + "default": false, "description": "Turn on export of MEGAN summary files.", "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" }, - "metagenomics_maltextract_percentidentity": { + "metagenomics_maltextract_minpercentidentity": { "type": "number", - "default": 85, - "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", + "default": 85.0, + "description": "Minimum percent identity alignments are required to have to be reported as candidate reads. Recommended to set same as MALT parameter.", "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" }, - "metagenomics_maltextract_topalignment": { + "metagenomics_maltextract_usetopalignment": { "type": "boolean", - "default": "false", + "default": false, "description": "Turn on using top alignments per read after filtering.", "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. 
Default: off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--useTopAlignment`", "fa_icon": "fas fa-bahai" diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 775275682..c9242d32e 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -46,7 +46,7 @@ workflow METAGENOMICS { // Run the post profiling subworkflow (optionally run for malt, mandatory for kraken2/krakenuniq) // - if ( params.metagenomics_postprocessing_tool || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { + if ( params.metagenomics_run_postprocessing || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 1fa4db04e..00341b63c 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -15,7 +15,7 @@ workflow METAGENOMICS_POSTPROCESSING { ch_results = Channel.empty() ch_multiqc_files = Channel.empty() - if ( params.metagenomics_postprocessing_tool == 'maltextract' ) { + if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) { //maltextract doesnt accepts a meta param in the first input channel, so remove it ch_maltextract_input = ch_postprocessing_input.map{it[1]} @@ -54,7 +54,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - else if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' ) { + else if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'metaphlan' ) { METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input.map{ [[id:"metaphlan_profiles_all_samples_merged"], it[1]] }.groupTuple() ) From 38057983db1624ab19c963939bbb40ba9102cedb Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 20 Oct 2023 09:54:59 +0000 Subject: [PATCH 062/198] added parameter checks to top of eager execution --- docs/development/manual_tests.md | 6 +++-- nextflow_schema.json | 2 +- workflows/eager.nf | 41 +++++++++++++++++++------------- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 4a2672489..7cfda0517 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -329,6 +329,8 @@ All possible parameters Tests +## NOTE: metagenomics input generation will now fail pre-pipeline parameter checks, since --run_metagenomics requires the subsequent declaration of --metagenomics_profiling_tool and --metagenomics_profiling_database! 
+

```bash
## Check no BAM filtering
## Expect: full completion of pipeline without any bam filtering execution
@@ -687,7 +689,7 @@ nextflow run main.nf -profile test,docker \
     --run_metagenomics \
     --metagenomics_profiling_tool malt \
     --metagenomics_profiling_database CUSTOM_MALT_DB \
-    --metagenomics_postprocessing_tool maltextract \
+    --metagenomics_run_postprocessing \
     --metagenomics_maltextract_ncbi_dir NCBI_DIR \
     --metagenomics_maltextract_taxon_list TAXONLISTFILE
```
@@ -696,6 +698,6 @@

```bash
nextflow run -resume ./main.nf -profile test,docker --outdir out \
---run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/ --metagenomics_postprocessing_tool mergemetaphlantables
+--run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/ --metagenomics_run_postprocessing
# 20230804: works
```
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 7e8bc402d..58498130d 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -834,7 +834,7 @@
            "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default (0) will spawn at minimum N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster."
        },
        "metagenomics_run_postprocessing": {
-            "type": "string",
+            "type": "boolean",
            "description": "Activate post-processing of metagenomics profiling tool selected.",
            "help_text": "Activate the corresponding post-processing tool for your metagenomics profiling software. \n\nmalt --> maltextract\nkrakenuniq/kraken2 --> krakenmerge\nmetaphlan --> mergemetaphlantables\n\nNote: Postprocessing is automatically carried out when using `kraken2` and `krakenuniq` ",
            "default": "null",
diff --git a/workflows/eager.nf b/workflows/eager.nf
index fd7837908..350d5b5c3 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -27,27 +27,35 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri
    }
 }

-if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) {
-    // dust score was set but entropy method picked. If no entropy-score provided, assume it was an error and fail
-    if (params.metagenomics_complexity_entropy == 0.3) {
-        exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag")
+// metagenomics related parameter checks
+if ( params.run_metagenomics ) {
+    // failing metagenomics parameter combinations/declarations
+    if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) {
+        // dust score was set but entropy method picked. If no entropy-score provided, assume it was an error and fail
+        if (params.metagenomics_complexity_entropy == 0.3) {
+            exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag")
+        }
    }
-}

-if ( params.run_metagenomics && ! params.metagenomics_profiling_tool ) {
-    exit 1, ("[nf-core/eager] ERROR: --run_metagenomics flagged, but no database provided! Please choose an appropriate metagenomics screening tool by setting --metagenomics_profiling_tool to one of 'malt', 'krakenuniq', 'kraken2', or 'metaphlan'")
-}
+    if ( ! params.metagenomics_profiling_tool ) {
+        exit 1, ("[nf-core/eager] ERROR: --run_metagenomics flagged, but no profiling tool selected! Please choose an appropriate metagenomics screening tool by setting --metagenomics_profiling_tool to one of 'malt', 'krakenuniq', 'kraken2', or 'metaphlan' and declare the path to the database directory using `--metagenomics_profiling_database`.")
+    }

-if ( params.run_metagenomics && ! params.metagenomics_profiling_database ) {
-    exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database. Note this database should correspond to ${params.metagenomics_profiling_tool}")
-}
+    if ( ! params.metagenomics_profiling_database ) {
+        exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database. Note this database should correspond to ${params.metagenomics_profiling_tool}")
+    }

-if ( params.metagenomics_postprocessing_tool == 'maltextract' && params.metagenomics_profiling_tool != 'malt' ) {
-    exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'maltextract' can only be run with --metagenomics_profiling_tool 'malt'")
-}
+    if ( params.metagenomics_profiling_tool == 'malt' && params.metagenomics_run_postprocessing ) {
+        if ( ! params.metagenomics_maltextract_ncbi_dir ) {
+            exit 1, ("[nf-core/eager] ERROR: Postprocessing for MALT requires additional parameters specified. Please provide a path to the NCBI directory for MaltExtract using `--metagenomics_maltextract_ncbi_dir`.")
+        }
+        if ( ! params.metagenomics_maltextract_taxon_list ) {
+            exit 1, ("[nf-core/eager] ERROR: Postprocessing for MALT requires additional parameters specified. Please provide a path to the taxon list for MaltExtract using `--metagenomics_maltextract_taxon_list`.")
+        }
+    }

-if ( params.metagenomics_postprocessing_tool == 'mergemetaphlantables' && ! params.metagenomics_profiling_tool == 'metaphlan' ) {
-    exit 1, ("[nf-core/eager] ERROR: --metagenomics_postprocessing_tool 'mergemetaphlantables' can only be run with --metagenomics_profiling_tool 'metaphlan'")
+    // warnings for metagenomics parameter combinations
+    if ( ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) && params.metagenomics_min_support_reads == 1 ) log.warn("[nf-core/eager] Warning: The minimum number of reads required for krakenmerge to output a node is still at the default of 1, which is intended for use with MALT and maltextract. Consider updating to the recommended value for krakenmerge (50 reads) by setting --metagenomics_min_support_reads 50")
 }

 if( params.run_bedtools_coverage ){
@@ -62,7 +70,6 @@ if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmer

 // Report possible warnings
 if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. 
Adapter trimming will be skipped!") -if ( params.metagenomics_postprocessing_tool == 'krakenmerge' && params.metagenomics_min_support_reads == 1 ) log.warn("[nf-core/eager] Warning: The default value for krakenmerge minimum reads for outputing a node has not been changed from the default. This default is set for MALT and maltextract. Consider updating to the default value for krakenmerge (50 reads) by setting --metagenomics_min_support_reads 50") /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From e4b506e2f1a76cd0b1fbc7162e2944b5e40f9cc4 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 3 Nov 2023 09:21:17 +0000 Subject: [PATCH 063/198] fixed linting error --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b098601d2..70933203a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -824,7 +824,7 @@ "type": "boolean", "description": "Activate post-processing of metagenomics profiling tool selected.", "help_text": "Activate the corresponding post-processing tool for your metagenomics profiling software. \n\nmalt --> maltextract\nkrakenuniq/kraken2 --> krakenmerge\nmetaphlan --> mergemetaphlantables\n\nNote: Postprocessing is automatically carried out when using `kraken2` and `krakenuniq` ", - "default": "null", + "default": "true", "fa_icon": "fab fa-buromobelexperte" }, "metagenomics_maltextract_taxon_list": { From a7af65b1bb1a739a4c4b01537807c68d101316d0 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 3 Nov 2023 10:48:10 +0000 Subject: [PATCH 064/198] updated documentation for output files --- conf/modules.config | 2 +- docs/output.md | 128 ++++++++++++++++++++++++++++++++++++++++++- nextflow_schema.json | 2 +- 3 files changed, 127 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 26e319925..ab6599fbb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -854,7 +854,7 @@ process { withName: KRAKEN2_KRAKEN2 { ext.args = [ - "--report-minimizer-data" + params.metagenomics_kraken2_save_minimizers ? "--report-minimizer-data" : "" ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}" } publishDir = [ diff --git a/docs/output.md b/docs/output.md index a6de7067a..7662a85b0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -300,14 +300,14 @@ Please be aware, that intermediate length and mapping quality filtered genomic B You may also receive the files above if metagenomic screening is turned on. -### Metagenomics Screening +### Metagenomics Complexity Filtering #### Bbduk
Output files -- `metagenomic_complexity_filter/` +- `metagenomics_screening/complexity_filter/bbduk` - `*_complexity.fastq.gz`: FASTQ file containing the complexity filtered reads - `*.log`: LOG file containing filter stats @@ -327,7 +327,7 @@ Using complexity-filtered fastq-files as input for metagenomic classifiers can r
Output files -- `metagenomic_complexity_filter/` +- `metagenomics_screening/complexity_filter/prinseq` - `*_complexity_good_out.fastq.gz`: FASTQ file containing the complexity filtered reads - `*_complexity.log`: LOG file containing filter stats @@ -342,6 +342,128 @@ The saved files are the _good_ files, passing the `dust` or `entropy` filter tre **Note:** To save output files, set the `--metagenomics_complexity_savefastq` flag +### Metagenomics Profiling + +#### MALT + +[MALT](https://software-ab.cs.uni-tuebingen.de/download/malt) is a fast replacement for BLASTX, BLASTP and BLASTN, and provides both local and semi-global alignment capabilities. + +
+Output files + +- `metagenomics_screening/profiling/malt/` + - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer + - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_sam_output`) + - `*.log`: LOG file containing runtime log of MALT + +
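+
+For illustration, a minimal run that would produce these files might look like the following sketch (the samplesheet and database paths are placeholders for your own inputs):
+
+```bash
+# Sketch: profile reads with MALT (paths are placeholders)
+nextflow run nf-core/eager -profile docker \
+    --input samplesheet.tsv \
+    --outdir ./results \
+    --run_metagenomics \
+    --metagenomics_profiling_tool malt \
+    --metagenomics_profiling_database /path/to/malt/db
+```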
+
+MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produces direct alignments of sequencing reads against a reference genome. It is often used for metagenomic profiling or pathogen screening, and specifically in nf-core/eager, of off-target reads from genome mapping.
+
+You will receive output for each library. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value.
+
+The main output of MALT is the `.rma6` file format, which can only be loaded into MEGAN and its related tools.
+
+You will only receive the `.sam` files if you supply `--malt_save_reads` parameters to the pipeline.
+
+#### MetaPhlAn
+
+[MetaPhlAn](https://github.com/biobakery/metaphlan) is a computational tool for profiling the composition of microbial communities (Bacteria, Archaea and Eukaryotes) from metagenomic shotgun sequencing data (i.e. not 16S) with species-level resolution via marker genes.
+
+Output files + +- `metagenomics_screening/profiling/metaphlan/` + - `.biom`: taxonomic profile in BIOM format + - `.bowtie2out.txt`: BowTie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn with different parameters) + - `_profile.txt`: MetaPhlAn taxonomic profile including abundance estimates + +
+
+The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn; however, it does not include raw counts by default. The intermediate Bowtie2 output `.bowtie2out.txt` presents a condensed representation of the mapping results of your sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line.
+
+#### Kraken2
+
+[Kraken](https://ccb.jhu.edu/software/kraken2/) is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer.
+
+Output files + +- `metagenomics_screening/profiling/kraken2/` + - `.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample + - `.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample + - `.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 6 column rather than 8 if `--metagenomics_kraken2_save_minimizers` specified. + - `.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample + +
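+
+As a quick sanity check, the tab-separated report can be inspected directly on the command line; for example, a sketch for listing the most abundant clades (the report name is a placeholder):
+
+```bash
+# Sketch: sort a Kraken2-style report by the fraction-abundance column
+sort -t$'\t' -k1,1gr sample_library.report.txt | head -n 10
+```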
+
+The main taxonomic classification file from Kraken2 is the `_combined_reports.txt` or `*report.txt` file. The former provides you the broadest overview of the taxonomic classification results across all samples against a single database, where you get two columns for each sample e.g. `2_all` and `2_lvl`, as well as a summarised column summing up across all samples `tot_all` and `tot_lvl`. The latter gives you the most information for a single sample. The report file is also used for the taxpasta step.
+
+You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--metagenomics_kraken_save_reads` and/or `--metagenomics_kraken_save_readclassifications` parameters to the pipeline.
+
+#### KrakenUniq
+
+[KrakenUniq](https://github.com/fbreitwieser/krakenuniq) (formerly KrakenHLL) is an extension to the fast k-mer-based classification [Kraken](https://github.com/DerrickWood/kraken) with an efficient algorithm for additionally assessing the coverage of unique k-mers found in each species in a dataset.
+
+Output files
+
+- `metagenomics_screening/profiling/krakenuniq/`
+  - `.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
+  - `.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
+  - `.report.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits, with an additional column for k-mer coverage, that allows for more accurate distinguishing between false-positive/true-positive hits
+  - `.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample
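+
+A minimal sketch of an invocation producing these files, using the RAM chunking option to cap memory usage (paths and chunk size are placeholders):
+
+```bash
+# Sketch: KrakenUniq profiling with the database preloaded in 16G chunks
+nextflow run nf-core/eager -profile docker \
+    --input samplesheet.tsv \
+    --outdir ./results \
+    --run_metagenomics \
+    --metagenomics_profiling_tool krakenuniq \
+    --metagenomics_profiling_database /path/to/krakenuniq/db \
+    --metagenomics_krakenuniq_ram_chunk_size 16G
+```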
+
+The main taxonomic classification file from KrakenUniq is the `*report.txt` file. This is an extension of the Kraken2 report with the additional k-mer coverage information that provides more information about the accuracy of hits.
+
+You will only receive the `*.fastq.gz` and `*.classifiedreads.txt` file if you supply `--metagenomics_kraken_save_reads` and/or `--metagenomics_kraken_save_readclassifications` parameters to the pipeline.
+
+:::info
+The output system of KrakenUniq can result in other `stdout` or `stderr` logging information being saved in the report file, therefore you must check your report files before downstream use!
+:::
+
+### Metagenomics Postprocessing
+
+#### maltextract
+
+The output directory for maltextract, as implemented under [HOPS](https://github.com/rhuebler/HOPS), contains the results of applying various heuristics of ancient authenticity and presence to MEGAN read assignments across a given set of candidate taxa.
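+
+A full MALT-plus-MaltExtract run could be sketched as follows (the database, NCBI resource directory, and taxon list paths are placeholders):
+
+```bash
+# Sketch: authenticate MALT assignments with MaltExtract/HOPS
+nextflow run nf-core/eager -profile docker \
+    --input samplesheet.tsv \
+    --outdir ./results \
+    --run_metagenomics \
+    --metagenomics_profiling_tool malt \
+    --metagenomics_profiling_database /path/to/malt/db \
+    --metagenomics_run_postprocessing \
+    --metagenomics_maltextract_ncbi_dir /path/to/ncbi_resources \
+    --metagenomics_maltextract_taxon_list taxa_of_interest.txt
+```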
+Output files + +- `metagenomics_screening/postprocessing/maltextract/` + - `results`: Results output by maltextract + - `default`: Directory containing summary TSV tables for all reads + - `ancient`: Directory contiaining summary TSV tables for reads with evidence of aDNA damage + - `pdf_candidate_profiles`: Direcotry containing directories for each candidate taxon with varying levels of support for presence of genetic material for a given sample + - `count_table.tsv`: TSV containing reads assigned to each node in candidate taxon list across all samples + +
+
+The main files of interest are within the `pdf_candidate_profiles` directory. The file prefixes declare various levels of confidence in a given sample, with stp1 indicating the weakest support that a taxon is ancient and present, stp2 stronger support, and stp3 the strongest. Results are highly dependent upon the taxon being analyzed, as different microbial genera are more prone to cross-mapping and contamination than others.
+
+#### Merge MetaPhlanTables
+
+The output contains a file named `metaphlan_profiles_all_samples_merged.txt`, which provides an overview of the classification results for all samples.
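+
+For illustration, a run along these lines would produce the merged table (the database path is a placeholder):
+
+```bash
+# Sketch: MetaPhlAn profiling with automatic merging of per-sample tables
+nextflow run nf-core/eager -profile docker \
+    --input samplesheet.tsv \
+    --outdir ./results \
+    --run_metagenomics \
+    --metagenomics_profiling_tool metaphlan \
+    --metagenomics_profiling_database /path/to/metaphlan/db \
+    --metagenomics_run_postprocessing
+```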
+Output files + +- `metagenomics_screening/postprocessing/mergemetaphlantables/` + - `metaphlan_profiles_all_samples_merged.txt`: A combined profile of all samples + +#### krakenmerge + +Automatic postprocessing and merging of kraken reports into a single combined profile of all samples. + +
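+
+No additional flag is required to obtain this file when profiling with Kraken2 or KrakenUniq, as this postprocessing runs automatically; the merged table is tab-separated and can be previewed with standard tools, e.g.:
+
+```bash
+# Sketch: preview the merged Kraken report produced by the pipeline
+column -t -s$'\t' combined_reports.txt | head
+```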
+Output files + +- `metagenomics_screening/postprocessing/kraken_merge/` + - `combined_reports.txt`: A combined profile of all samples (as generated by `krakentools`) + ### Deduplication
diff --git a/nextflow_schema.json b/nextflow_schema.json index 70933203a..cad5086c2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -583,7 +583,7 @@ "type": "boolean", "description": "Turn on filtering of reads in BAM files after mapping. By default, only mapped reads retained.", "fa_icon": "fas fa-power-off", - "help_text": "Turns on the filtering subworkflow for mapped BAM files coming out of the read alignment step. Filtering includes removal of unmapped reads, length filtering, and mapping quality filtering.\n\nWhen turning on bam filtering, by default only the mapped/unmapped filter is activated, thus only mapped reads are retained for downstream analyses. See `--bamfiltering_retainunmappedgenomicbam` to retain unmapped reads, if filtering only for length and/or quality is preferred.\n\nNote this subworkflow can also be activated if `--run_metagenomic_screening` is supplied." + "help_text": "Turns on the filtering subworkflow for mapped BAM files coming out of the read alignment step. Filtering includes removal of unmapped reads, length filtering, and mapping quality filtering.\n\nWhen turning on bam filtering, by default only the mapped/unmapped filter is activated, thus only mapped reads are retained for downstream analyses. See `--bamfiltering_retainunmappedgenomicbam` to retain unmapped reads, if filtering only for length and/or quality is preferred.\n\nNote this subworkflow can also be activated if `--run_metagenomics_screening` is supplied." }, "bamfiltering_minreadlength": { "type": "integer", From c76875480b35fba20f76a0fcc2de237493e0b2e4 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 24 Nov 2023 11:48:27 +0100 Subject: [PATCH 065/198] Apply minor suggestions from code review Co-authored-by: James A. Fellows Yates --- docs/development/manual_tests.md | 2 +- docs/output.md | 4 ++-- nextflow.config | 2 +- subworkflows/local/metagenomics.nf | 2 -- subworkflows/local/metagenomics_postprocessing.nf | 4 +--- subworkflows/local/metagenomics_profiling.nf | 1 - workflows/eager.nf | 1 - 7 files changed, 5 insertions(+), 11 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index b6e3fde3f..99518be71 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -520,7 +520,7 @@ nextflow run main.nf -profile docker,test --input ~/eager_dsl2_testing/input/onl ```bash #### Use bbduk to remove low complexity reads _without_ saving the intermediate files -## Expect: NO additional directory created, but the files in the profiling directory contain the 'complexity' postfix +## Expect: NO additional directory created, but the files in the profiling directory contain the 'complexity' suffix nextflow run main.nf -profile test,docker \ --outdir ./out \ --run_metagenomics \ diff --git a/docs/output.md b/docs/output.md index 7662a85b0..0379ef27f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -436,8 +436,8 @@ The output directory for maltextract, as implemented under [HOPS](https://github - `metagenomics_screening/postprocessing/maltextract/` - `results`: Results output by maltextract - `default`: Directory containing summary TSV tables for all reads - - `ancient`: Directory contiaining summary TSV tables for reads with evidence of aDNA damage - - `pdf_candidate_profiles`: Direcotry containing directories for each candidate taxon with varying levels of support for presence of genetic material for a given sample + - `ancient`: Directory containing summary TSV tables for reads with evidence of aDNA 
damage + - `pdf_candidate_profiles`: Directory containing directories for each candidate taxon with varying levels of support for presence of genetic material for a given sample - `count_table.tsv`: TSV containing reads assigned to each node in candidate taxon list across all samples
diff --git a/nextflow.config b/nextflow.config index ba7bf5dc3..7a00e6dc6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -124,7 +124,7 @@ params { // Metagenomic Screening run_metagenomics = false - metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UMAPPED + metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UNMAPPED run_metagenomics_complexityfiltering = false metagenomics_complexity_tool = 'bbduk' metagenomics_complexity_savefastq = false diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index c9242d32e..88a801373 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -35,8 +35,6 @@ workflow METAGENOMICS { database = Channel.fromPath(params.metagenomics_profiling_database) - - METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions.first() ) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 00341b63c..5edb63ed7 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -25,9 +25,7 @@ workflow METAGENOMICS_POSTPROCESSING { MALTEXTRACT ( ch_maltextract_input, tax_list, ncbi_dir) - maltex_filter = params.metagenomics_maltextract_filter - - AMPS ( MALTEXTRACT.out.results, tax_list, maltex_filter ) + AMPS ( MALTEXTRACT.out.results, tax_list, params.metagenomics_maltextract_filter ) ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) ch_results = ch_results.mix( AMPS.out.candidate_pdfs, AMPS.out.tsv, AMPS.out.summary_pdf ) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 8fad17a59..b1898a339 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -16,7 +16,6 @@ workflow METAGENOMICS_PROFILING { database main: - ch_versions = Channel.empty() ch_raw_classifications = Channel.empty() ch_raw_profiles = Channel.empty() diff --git a/workflows/eager.nf b/workflows/eager.nf index 752cf548f..5d8755817 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -71,7 +71,6 @@ if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmer // Report possible warnings if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. 
Adapter trimming will be skipped!")
-

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     CONFIG FILES

From e7e42619fa7a7bab3866fcde965ac46300d260d7 Mon Sep 17 00:00:00 2001
From: Merlin Szymanski <merlin_szymanski@eva.mpg.de>
Date: Fri, 24 Nov 2023 12:49:04 +0100
Subject: [PATCH 066/198] Minor edits from code-review

---
 CITATIONS.md                                 | 22 ++++++++++----------
 README.md                                    |  2 +-
 conf/modules.config                          |  3 ---
 conf/test.config                             |  2 +-
 conf/test_multiref.config                    |  2 +-
 subworkflows/local/metagenomics_profiling.nf |  2 +-
 6 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/CITATIONS.md b/CITATIONS.md
index c69c1a5b9..353f43f94 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -2,11 +2,11 @@

 ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/)

-> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
+> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: [10.1038/s41587-020-0439-x](https://doi.org/10.1038/s41587-020-0439-x). PubMed PMID: 32055031.

 ## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/)

-> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.
+> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: [10.1038/nbt.3820](https://doi.org/10.1038/nbt.3820). PubMed PMID: 28398311.

 ## Pipeline tools

 - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)

-  > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
+  > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: [10.1093/bioinformatics/btw354](https://doi.org/10.1093/bioinformatics/btw354). Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.

 - [Falco](https://doi.org/10.12688%2Ff1000research.21142.2)

@@ -32,7 +32,7 @@

 - [Picard Tools](https://broadinstitute.github.io/picard/)

-  > Broad Institute (2019). Picard Toolkit. GitHub Repository: https://broadinstitute.github.io/picard/
+  > Broad Institute (2019). Picard Toolkit. GitHub Repository: [https://broadinstitute.github.io/picard/](https://broadinstitute.github.io/picard/)

 - [bwa](https://doi.org/10.1093/bioinformatics/btp324)

@@ -96,7 +96,7 @@

 - [QualiMap](https://doi.org/10.1093/bioinformatics/btv566)

-  > QualiMap Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. Download: http://qualimap.bioinfo.cipf.es/
+  > QualiMap Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. 
Download: [http://qualimap.bioinfo.cipf.es/](http://qualimap.bioinfo.cipf.es/) - [DamageProfiler](https://doi.org/10.1093/bioinformatics/btab190) @@ -108,11 +108,11 @@ - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) - > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. + > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: [10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). - [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0) - > Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0 + > Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: [10.1186/s13059-018-1568-0](https://doi.org/10.1186/s13059-018-1568-0) - [MetaPhlAn](https://doi.org/10.1038/s41587-023-01688-w) @@ -126,16 +126,16 @@ - [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) - > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. + > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: [10.1038/s41592-018-0046-7](https://doi.org/10.1038/s41592-018-0046-7). PubMed PMID: 29967506. - [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) - > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. + > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: [10.1093/bioinformatics/btx192](https://doi.org/10.1093/bioinformatics/btx192). PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) - > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: [10.5555/2600239.2600241](https://doi.org/10.5555/2600239.2600241). - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) - > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. 
doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: [10.1371/journal.pone.0177459](https://doi.org/10.1371/journal.pone.0177459). eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/README.md b/README.md index b014ec903..728e15b42 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Additional functionality contained by the pipeline currently includes: - Low-sequenced complexity filtering (`BBduk` or `PRINSEQ++`) - Taxonomic binner with alignment (`MALT` or `MetaPhlAn 4`) -- Taxonomic binner without alignment (`Kraken2`) +- Taxonomic binner without alignment (`Kraken2`,`KrakenUniq`) - aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) #### Functionality Overview diff --git a/conf/modules.config b/conf/modules.config index ab6599fbb..f65abb722 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -359,9 +359,6 @@ process { pattern: '*.fastq.gz', enabled: params.bamfiltering_generatemappedfastq ] - ext.args = [ - params.metagenomics_input == 'all' ? '' : '-F 4', - ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.library_id}_mapped" } } diff --git a/conf/test.config b/conf/test.config index e041d3b2c..5600ef81b 100644 --- a/conf/test.config +++ b/conf/test.config @@ -37,7 +37,7 @@ params { mapstats_bedtools_featurefile = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.gff3' // Metagenomic screening - run_metagenomics = false + run_metagenomics = false } diff --git a/conf/test_multiref.config b/conf/test_multiref.config index 2dbdb0124..a4d20576f 100644 --- a/conf/test_multiref.config +++ b/conf/test_multiref.config @@ -31,7 +31,7 @@ params { bamfiltering_mappingquality = 37 // Metagenomics - run_metagenomics = false + run_metagenomics = false } diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 8fad17a59..b95a2ad30 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -117,7 +117,7 @@ workflow METAGENOMICS_PROFILING { krakenuniq_db, params.metagenomics_krakenuniq_ram_chunk_size, params.metagenomics_kraken_save_reads, - true, + true, // save read assignments params.metagenomics_kraken_save_read_classifications ) From a6bd020f93d0c4f749c2ecb99cdfb46c0bcd56cf Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 1 Dec 2023 10:08:34 +0000 Subject: [PATCH 067/198] minor review suggestions -error calls, param names --- conf/modules.config | 18 ++++----- docs/development/manual_tests.md | 14 +++---- docs/output.md | 10 ++--- modules/local/krakenparse.nf | 2 +- nextflow.config | 32 ++++++++-------- nextflow_schema.json | 38 +++++++++---------- .../local/metagenomics_postprocessing.nf | 4 +- subworkflows/local/metagenomics_profiling.nf | 10 ++--- workflows/eager.nf | 30 +++++++-------- 9 files changed, 79 insertions(+), 79 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f65abb722..9872f5977 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -833,14 +833,14 @@ process { withName: MALT_RUN { ext.args = [ "-m ${params.metagenomics_malt_mode}", - "-at ${params.metagenomics_malt_alignment_mode}", - "-top ${params.metagenomics_malt_top_percent}", - "-id ${params.metagenomics_malt_min_percent_identity}", - "-mq 
${params.metagenomics_malt_max_queries}", - "--memoryMode ${params.metagenomics_malt_memory_mode}", - params.metagenomics_malt_sam_output ? "-a . -f SAM" : "", - params.metagenomics_malt_min_support_mode == "percent" ? "-supp ${params.metagenomics_malt_min_support_percent}" : "-sup ${params.metagenomics_min_support_reads}", - params.metagenomics_malt_save_reads ? "--alignments ./ -za false" : "" + "-at ${params.metagenomics_malt_alignmentmode}", + "-top ${params.metagenomics_malt_toppercent}", + "-id ${params.metagenomics_malt_minpercentidentity}", + "-mq ${params.metagenomics_malt_maxqueries}", + "--memoryMode ${params.metagenomics_malt_memorymode}", + params.metagenomics_malt_savesamoutput ? "-a . -f SAM" : "", + params.metagenomics_malt_minsupportmode == "percent" ? "-supp ${params.metagenomics_malt_minsupportpercent}" : "-sup ${params.metagenomics_minsupportreads}", + params.metagenomics_malt_savereads ? "--alignments ./ -za false" : "" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, @@ -851,7 +851,7 @@ process { withName: KRAKEN2_KRAKEN2 { ext.args = [ - params.metagenomics_kraken2_save_minimizers ? "--report-minimizer-data" : "" + params.metagenomics_kraken2_saveminimizers ? "--report-minimizer-data" : "" ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}" } publishDir = [ diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 99518be71..9310928dd 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -616,8 +616,8 @@ nextflow run main.nf -profile test,docker \ --run_metagenomics \ --metagenomics_profiling_tool krakenuniq \ --metagenomics_profiling_database CUSTOM_KRAKEN_DB \ - --metagenomics_kraken_save_reads \ - --metagenomics_kraken_save_read_classifications + --metagenomics_kraken_savereads \ + --metagenomics_kraken_savereadclassifications ``` ##### kraken2 @@ -648,8 +648,8 @@ nextflow run main.nf -profile test,docker \ --run_metagenomics \ --metagenomics_profiling_tool kraken2 \ --metagenomics_profiling_database CUSTOM_KRAKEN2_DB \ - --metagenomics_kraken_save_reads \ - --metagenomics_kraken_save_read_classifications + --metagenomics_kraken_savereads \ + --metagenomics_kraken_savereadclassifications ``` ##### malt @@ -678,7 +678,7 @@ nextflow run main.nf -profile test,docker \ --run_metagenomics \ --metagenomics_profiling_tool malt \ --metagenomics_profiling_database CUSTOM_MALT_DB \ - --metagenomics_malt_save_reads + --metagenomics_malt_savereads ``` #### postprocessing @@ -695,8 +695,8 @@ nextflow run main.nf -profile test,docker \ --metagenomics_profiling_tool malt \ --metagenomics_profiling_database CUSTOM_MALT_DB \ --metagenomics_run_postprocessing \ - --metagenomics_maltextract_ncbi_dir NCBI_DIR \ - --metagenomics_maltextract_taxon_list TAXONLISTFILE + --metagenomics_maltextract_ncbidir NCBI_DIR \ + --metagenomics_maltextract_taxonlist TAXONLISTFILE ``` ##### mergemetaphlantables diff --git a/docs/output.md b/docs/output.md index 0379ef27f..a06a7baf6 100644 --- a/docs/output.md +++ b/docs/output.md @@ -353,7 +353,7 @@ The saved files are the _good_ files, passing the `dust` or `entropy` filter tre - `metagenomics_screening/profiling/malt/` - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive 
viewer - - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_sam_output`) + - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_savesamoutput`) - `*.log`: LOG file containing runtime log of MALT
@@ -364,7 +364,7 @@ You will receive output for each library. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value.

 The main output of MALT is the `.rma6` file format, which can only be loaded into MEGAN and its related tools.

-You will only receive the `.sam` files if you supply `--malt_save_reads` parameters to the pipeline.
+You will only receive the `.sam` files if you supply `--metagenomics_malt_savesamoutput` parameters to the pipeline.

 #### MetaPhlAn

@@ -392,14 +392,14 @@ [Kraken](https://ccb.jhu.edu/software/kraken2/) is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer.

 - `metagenomics_screening/profiling/kraken2/`
   - `.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
   - `.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
-  - `.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 6 column rather than 8 if `--metagenomics_kraken2_save_minimizers` specified.
+  - `.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 6 column rather than 8 if `--metagenomics_kraken2_saveminimizers` specified.
   - `.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample
 The main taxonomic classification file from Kraken2 is the `_combined_reports.txt` or `*report.txt` file. The former provides you the broadest overview of the taxonomic classification results across all samples against a single database, where you get two columns for each sample e.g. `2_all` and `2_lvl`, as well as a summarised column summing up across all samples `tot_all` and `tot_lvl`. The latter gives you the most information for a single sample. The report file is also used for the taxpasta step.

-You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--metagenomics_kraken_save_reads` and/or `--metagenomics_kraken_save_readclassifications` parameters to the pipeline.
+You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--metagenomics_kraken_savereads` and/or `--metagenomics_kraken_save_readclassifications` parameters to the pipeline.

 #### KrakenUniq
diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf index 9c1314cb6..fd0ea985f 100644 --- a/modules/local/krakenparse.nf +++ b/modules/local/krakenparse.nf @@ -23,7 +23,7 @@ process KRAKENPARSE { def kmer_out = "${prefix}.kmer_kraken_parsed.csv" """ kraken_parse.py \\ - -c ${params.metagenomics_min_support_reads} \\ + -c ${params.metagenomics_minsupportreads} \\ -or $read_out \\ -ok $kmer_out \\ $report diff --git a/nextflow.config b/nextflow.config index 7a00e6dc6..dba36deb8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -133,25 +133,25 @@ params { metagenomics_prinseq_dustscore = 0.5 metagenomics_profiling_tool = null metagenomics_profiling_database = null - metagenomics_krakenuniq_ram_chunk_size = '16G' - metagenomics_kraken_save_reads = false - metagenomics_kraken_save_read_classifications = false - metagenomics_kraken2_save_minimizers = false + metagenomics_krakenuniq_ramchunksize = '16G' + metagenomics_kraken_savereads = false + metagenomics_kraken_savereadclassifications = false + metagenomics_kraken2_saveminimizers = false metagenomics_malt_mode = 'BlastN' - metagenomics_malt_alignment_mode = 'SemiGlobal' - metagenomics_malt_save_reads = false - metagenomics_malt_sam_output = false - metagenomics_malt_min_support_mode = 'percent' - metagenomics_malt_min_support_percent = 0.01 - metagenomics_min_support_reads = 1 - metagenomics_malt_min_percent_identity = 85 - metagenomics_malt_top_percent = 1 - metagenomics_malt_max_queries = 100 - metagenomics_malt_memory_mode = 'load' + metagenomics_malt_alignmentmode = 'SemiGlobal' + metagenomics_malt_savereads = false + metagenomics_malt_savesamoutput = false + metagenomics_malt_minsupportmode = 'percent' + metagenomics_malt_minsupportpercent = 0.01 + metagenomics_minsupportreads = 1 + metagenomics_malt_minpercentidentity = 85 + metagenomics_malt_toppercent = 1 + metagenomics_malt_maxqueries = 100 + metagenomics_malt_memorymode = 'load' metagenomics_malt_group_size = 0 metagenomics_run_postprocessing = true - metagenomics_maltextract_taxon_list = null - metagenomics_maltextract_ncbi_dir = null + metagenomics_maltextract_taxonlist = null + metagenomics_maltextract_ncbidir = null metagenomics_maltextract_filter = 'def_anc' metagenomics_maltextract_toppercent = 0.01 metagenomics_maltextract_destackingoff = false diff --git a/nextflow_schema.json b/nextflow_schema.json index cad5086c2..83ce2d19f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -700,35 +700,35 @@ "description": "Specify which tool to use for metagenomic profiling and screening. Required if `--run_metagenomics` flagged.", "enum": ["malt", "metaphlan", "kraken2", "krakenuniq"], "fa_icon": "fas fa-toolbox", - "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_screening_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different reuslts." + "help_text": "Select which tool to run metagenomics profiling on designated metagenomics_input. These tools behave vastly differently due to performing read profiling using different methods and yield vastly different reuslts.\n\nMALT and MetaPhlAn are alignment based, whereas Kraken2 and KrakenUniq are k-mer based.\n\nMALT has addtional postprocessing available (via `--run_metagenomics_postprocessing`) which can help authenticate alignments to a provided list of taxonomic nodes using established ancientDNA characteristics.\n\nMetaPhlAn performs profiling on the metagenomcis input data. 
This may be used to characterize the metagenomic community of a sample but care must be taken that you are not just looking at the modern metagenome of an ancient sample (for instance, soil microbes on a bone)\n\n Kraken2 and KrakenUniq are metagenomics classifiers that rely on fast k-mer-matching rather than whole-read alignments and are very memory efficient." }, "metagenomics_profiling_database": { "type": "string", "format": "directory-path", "description": "Specify a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory. Required if `--run_metagenomics` flagged.", "fa_icon": "fas fa-database", - "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_screening_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." + "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." }, - "metagenomics_kraken_save_reads": { + "metagenomics_kraken_savereads": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving reads assigned by KrakenUniq or Kraken2", "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`" }, - "metagenomics_kraken_save_read_classifications": { + "metagenomics_kraken_savereadclassifications": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of KrakenUniq or Kraken2 per-read taxonomic assignment file", "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`" }, - "metagenomics_krakenuniq_ram_chunk_size": { + "metagenomics_krakenuniq_ramchunksize": { "type": "string", "default": "16G", "description": "Specify how large to chunk database when loading into memory for KrakenUniq", "fa_icon": "fas fa-database", "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. 
You can specify the amount of RAM to chunk the database to with this parameter, which is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n" }, - "metagenomics_kraken2_save_minimizers": { + "metagenomics_kraken2_saveminimizers": { "type": "boolean", "description": "Turn on saving minimizer information in the kraken2 report, increasing the report to an eight-column layout.", "fa_icon": "fas fa-save", @@ -742,7 +742,7 @@ "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", "enum": ["BlastN", "BlastP", "BlastX"] }, - "metagenomics_malt_alignment_mode": { + "metagenomics_malt_alignmentmode": { "type": "string", "default": "SemiGlobal", "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", @@ -750,21 +750,21 @@ "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST-like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", "enum": ["Local", "SemiGlobal"] }, - "metagenomics_malt_min_percent_identity": { + "metagenomics_malt_minpercentidentity": { "type": "integer", "default": 85, "description": "Percent identity value threshold for MALT.", "fa_icon": "fas fa-id-card", "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" }, - "metagenomics_malt_top_percent": { + "metagenomics_malt_toppercent": { "type": "integer", "default": 1, "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", "fa_icon": "fas fa-percent", "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit score is within\n10% of the best score for that read.\". Default: `1`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" }, - "metagenomics_malt_min_support_mode": { + "metagenomics_malt_minsupportmode": { "type": "string", "default": "percent", "description": "Specify whether to use percent or raw number of reads for the minimum support required for a taxon to be retained for MALT.
Options: 'percent', 'reads'.", @@ -772,28 +772,28 @@ "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", "enum": ["percent", "reads"] }, - "metagenomics_malt_min_support_percent": { + "metagenomics_malt_minsupportpercent": { "type": "number", "default": 0.01, "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", "fa_icon": "fas fa-percentage", "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" }, - "metagenomics_min_support_reads": { + "metagenomics_minsupportreads": { "type": "integer", "default": 1, "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt or kraken. Not compatible with --malt_min_support_mode 'percent'.", "fa_icon": "fas fa-sort-numeric-up-alt", "help_text": "For usage in malt or kraken: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \n For usage in kraken2 or krakenuniq: Specify the number of hits on a clade to retain it in the final report when using kraken_parse. Default: 1. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. \n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" }, - "metagenomics_malt_max_queries": { + "metagenomics_malt_maxqueries": { "type": "integer", "default": 100, "description": "Specify the maximum number of queries a read can have for MALT.", "fa_icon": "fas fa-phone", "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" }, - "metagenomics_malt_memory_mode": { + "metagenomics_malt_memorymode": { "type": "string", "default": "load", "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", @@ -801,13 +801,13 @@ "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", "enum": ["load", "page", "map"] }, - "metagenomics_malt_sam_output": { + "metagenomics_malt_savesamoutput": { "type": "boolean", "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. 
Note this will result in very large file sizes.", "fa_icon": "fas fa-file-alt", "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to the more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" }, - "metagenomics_malt_save_reads": { + "metagenomics_malt_savereads": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of MALT-aligned reads", @@ -827,14 +827,14 @@ "default": "true", "fa_icon": "fab fa-buromobelexperte" }, - "metagenomics_maltextract_taxon_list": { + "metagenomics_maltextract_taxonlist": { "type": "string", "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", "default": "None", - "help_text": "Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format corresponding to a taxonomic node in your MALT database.\\n\\nNecessary when `--metagenomics_profiling_tool malt` specified and `--metagenomics_run_postprocessing` flagged.", + "help_text": "Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. The `.txt` file should contain one taxon per row, and each taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format corresponding to a taxonomic node in your MALT database. An example can be found on the [HOPS GitHub](https://raw.githubusercontent.com/rhuebler/HOPS/external/Resources/default_list.txt).\\n\\nNecessary when `--metagenomics_profiling_tool malt` specified and `--metagenomics_run_postprocessing` flagged.", "fa_icon": "fas fa-align-left" }, - "metagenomics_maltextract_ncbi_dir": { + "metagenomics_maltextract_ncbidir": { "type": "string", "description": "Path to directory containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", "default": "None", diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 5edb63ed7..e42b9bfdd 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -20,8 +20,8 @@ workflow METAGENOMICS_POSTPROCESSING { // maltextract doesn't accept a meta param in the first input channel, so remove it ch_maltextract_input = ch_postprocessing_input.map{it[1]} - tax_list = Channel.fromPath(params.metagenomics_maltextract_taxon_list) - ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbi_dir) + tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist) + ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir) MALTEXTRACT ( ch_maltextract_input, tax_list, ncbi_dir) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 93d143ab0..8fbba0179 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -114,10 +114,10 @@ workflow METAGENOMICS_PROFILING { KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( krakenuniq_reads, krakenuniq_db, - params.metagenomics_krakenuniq_ram_chunk_size, - params.metagenomics_kraken_save_reads, + params.metagenomics_krakenuniq_ramchunksize,
+ params.metagenomics_kraken_savereads, true, // save read assignments - params.metagenomics_kraken_save_read_classifications + params.metagenomics_kraken_savereadclassifications ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) @@ -138,8 +138,8 @@ KRAKEN2_KRAKEN2 ( kraken2_reads, kraken2_db, - params.metagenomics_kraken_save_reads, - params.metagenomics_kraken_save_read_classifications + params.metagenomics_kraken_savereads, + params.metagenomics_kraken_savereadclassifications ) ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) diff --git a/workflows/eager.nf b/workflows/eager.nf index 5d8755817..cc2a2948a 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -16,14 +16,14 @@ log.info logo + paramsSummaryLog(workflow) + citation WorkflowEager.initialise(params, log) // Check failing parameter combinations -if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } -if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } -if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } -if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } +if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { error("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } +if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { error("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } +if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { error("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is run.") } +if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { error("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is run.") } if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { // entropy score was set but dust method picked. If no dust-score provided, assume it was an error and fail if (params.metagenomics_prinseq_dustscore == 0.5) { - exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'dust' mode but provided an entropy score. Please specify a dust filter threshold using the --metagenomics_prinseq_dustscore flag") + error("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'dust' mode but provided an entropy score.
Please specify a dust filter threshold using the --metagenomics_prinseq_dustscore flag") } } @@ -33,40 +33,40 @@ if ( params.run_metagenomics ) { if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) { // dust score was set but entropy method picked. If no entropy-score provided, assume it was an error and fail if (params.metagenomics_complexity_entropy == 0.3) { - exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") + error("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } if ( ! params.metagenomics_profiling_tool ) { - exit 1, ("[nf-core/eager] ERROR: --run_metagenomics flagged, but no profiling tool selected! Please choose an appropriate metagenomics screening tool by setting --metagenomics_profiling_tool to one of 'malt', 'krakenuniq', 'kraken2', or 'metaphlan' and declare the path to the database directory using `--metagenomics_profiling_database`.") + error("[nf-core/eager] ERROR: --run_metagenomics flagged, but no profiling tool selected! Please choose an appropriate metagenomics screening tool by setting --metagenomics_profiling_tool to one of 'malt', 'krakenuniq', 'kraken2', or 'metaphlan' and declare the path to the database directory using `--metagenomics_profiling_database`.") } if ( ! params.metagenomics_profiling_database ) { - exit 1, ("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database. Note this database should correspond to ${params.metagenomics_profiling_tool}") + error("[nf-core/eager] ERROR: Please provide an appropriate database path for metagenomics screening using --metagenomics_profiling_database. Note this database should correspond to ${params.metagenomics_profiling_tool}") } if ( params.metagenomics_profiling_tool == 'malt' && params.metagenomics_run_postprocessing ) { - if ( ! params.metagenomics_maltextract_ncbi_dir ) { - exit 1, ("[nf-core/eager] ERROR: Postprocessing for MALT requires additional parameters specified. Please provide a path to the NBCI directory for MaltExtract using `--metagenomics_maltextract_ncbi_dir`.") + if ( ! params.metagenomics_maltextract_ncbidir ) { + error("[nf-core/eager] ERROR: Postprocessing for MALT requires additional parameters specified. Please provide a path to the NCBI directory for MaltExtract using `--metagenomics_maltextract_ncbidir`.") } - if ( ! params.metagenomics_maltextract_ncbi_dir ) { - exit 1, ("[nf-core/eager] ERROR: Postprocessing for MALT requires additional parameters specified. Please provide a path to the taxon list for MaltExtract using `--metagenomics_maltextract_taxon_list`.") + if ( ! params.metagenomics_maltextract_taxonlist ) { + error("[nf-core/eager] ERROR: Postprocessing for MALT requires additional parameters specified.
Please provide a path to the taxon list for MaltExtract using `--metagenomics_maltextract_taxonlist`.") } } // warnings for metagenomics parameter combinations - if ( params.metagenomics_profiling_tool == 'krakenmerge' && params.metagenomics_min_support_reads == 1 ) log.warn("[nf-core/eager] Warning: The default value for krakenmerge minimum reads for outputing a node has not been changed from the default. The default is set for use with MALT and maltextract. Consider updating to the default value for krakenmerge (50 reads) by setting --metagenomics_min_support_reads 50") + if ( params.metagenomics_profiling_tool == 'krakenmerge' && params.metagenomics_minsupportreads == 1 ) log.warn("[nf-core/eager] Warning: The minimum number of reads required for outputting a node is still at its default of 1, which is tuned for use with MALT and maltextract. Consider the value recommended for krakenmerge (50 reads) by setting --metagenomics_minsupportreads 50") } if( params.run_bedtools_coverage ){ if( !params.mapstats_bedtools_featurefile ) { - exit 1, "[nf-core/eager] ERROR: you have turned on bedtools coverage, but not specified a BED or GFF file with --mapstats_bedtools_featurefile. Please validate your parameters." + error("[nf-core/eager] ERROR: you have turned on bedtools coverage, but not specified a BED or GFF file with --mapstats_bedtools_featurefile. Please validate your parameters.") } } // TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? -if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} +if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { error("[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'.") } // Report possible warnings if ( params.preprocessing_skipadaptertrim && params.preprocessing_adapterlist ) log.warn("[nf-core/eager] --preprocessing_skipadaptertrim will override --preprocessing_adapterlist.
Adapter trimming will be skipped!") From 4e97e96901fbeaa59759da7df357b562cf9e986a Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 1 Dec 2023 11:55:00 +0100 Subject: [PATCH 068/198] Remove krakenparse+krakenmerge, add taxpasta --- conf/modules.config | 16 ++---- modules.json | 10 ++++ modules/local/krakenmerge.nf | 34 ------------ modules/local/krakenparse.nf | 36 ------------ .../nf-core/taxpasta/merge/environment.yml | 7 +++ modules/nf-core/taxpasta/merge/main.nf | 47 ++++++++++++++++ modules/nf-core/taxpasta/merge/meta.yml | 55 +++++++++++++++++++ .../taxpasta/standardise/environment.yml | 7 +++ modules/nf-core/taxpasta/standardise/main.nf | 42 ++++++++++++++ modules/nf-core/taxpasta/standardise/meta.yml | 49 +++++++++++++++++ .../local/metagenomics_postprocessing.nf | 28 +++++----- 11 files changed, 235 insertions(+), 96 deletions(-) delete mode 100644 modules/local/krakenmerge.nf delete mode 100644 modules/local/krakenparse.nf create mode 100644 modules/nf-core/taxpasta/merge/environment.yml create mode 100644 modules/nf-core/taxpasta/merge/main.nf create mode 100644 modules/nf-core/taxpasta/merge/meta.yml create mode 100644 modules/nf-core/taxpasta/standardise/environment.yml create mode 100644 modules/nf-core/taxpasta/standardise/main.nf create mode 100644 modules/nf-core/taxpasta/standardise/meta.yml diff --git a/conf/modules.config b/conf/modules.config index f65abb722..da7737dee 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -359,7 +359,6 @@ process { pattern: '*.fastq.gz', enabled: params.bamfiltering_generatemappedfastq ] - ext.prefix = { "${meta.id}_${meta.library_id}_mapped" } } withName: SAMTOOLS_FLAGSTAT_FILTERED { @@ -906,20 +905,13 @@ process { ] } - withName: KRAKENPARSE{ + withName: TAXPASTA_MERGE { publishDir = [ - enabled: false - ] - tag = { "${meta.sample_id}_${meta.library_id}" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}" } - } - - withName: KRAKENMERGE { - publishDir = [ - path: { "${params.outdir}/metagenomics_screening/postprocessing/kraken_merge/" }, + path: { "${params.outdir}/metagenomics_screening/postprocessing/taxpasta/" }, mode: params.publish_dir_mode, - pattern: '*.csv' + pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}' ] + ext.args = { "--profiler ${meta.profiler} --output taxpasta_table.tsv" } } withName: METAPHLAN_MERGEMETAPHLANTABLES { diff --git a/modules.json b/modules.json index dacedc273..c75929016 100644 --- a/modules.json +++ b/modules.json @@ -239,6 +239,16 @@ "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules", "bam_split_by_region"] + }, + "taxpasta/merge": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "taxpasta/standardise": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] } } }, diff --git a/modules/local/krakenmerge.nf b/modules/local/krakenmerge.nf deleted file mode 100644 index c7278fe26..000000000 --- a/modules/local/krakenmerge.nf +++ /dev/null @@ -1,34 +0,0 @@ -process KRAKENMERGE { - label 'process_single' - - conda "conda-forge::pandas=1.5.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" - input: - - path kraken_parse_reads - path kraken_parse_kmers - - output: - path "kraken_read_count_table.csv" , emit: read_count_table - path "kraken_kmer_duplication.csv" , emit: kmer_duplication_table - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def read_out = "kraken_read_count_table.csv" - def kmer_out = "kraken_kmer_duplication.csv" - """ - merge_kraken_res.py \\ - -or $read_out \\ - -ok $kmer_out \\ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/krakenparse.nf b/modules/local/krakenparse.nf deleted file mode 100644 index 9c1314cb6..000000000 --- a/modules/local/krakenparse.nf +++ /dev/null @@ -1,36 +0,0 @@ -process KRAKENPARSE { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" - input: - tuple val(meta), path(report) - - output: - tuple val(meta), path("*read_kraken_parsed.csv"), emit: read_kraken_parsed - tuple val(meta), path("*kmer_kraken_parsed.csv"), emit: kmer_kraken_parsed - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - def read_out = "${prefix}.read_kraken_parsed.csv" - def kmer_out = "${prefix}.kmer_kraken_parsed.csv" - """ - kraken_parse.py \\ - -c ${params.metagenomics_min_support_reads} \\ - -or $read_out \\ - -ok $kmer_out \\ - $report - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/nf-core/taxpasta/merge/environment.yml b/modules/nf-core/taxpasta/merge/environment.yml new file mode 100644 index 000000000..cfcd405e1 --- /dev/null +++ b/modules/nf-core/taxpasta/merge/environment.yml @@ -0,0 +1,7 @@ +name: taxpasta_merge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::taxpasta=0.6.1 diff --git a/modules/nf-core/taxpasta/merge/main.nf b/modules/nf-core/taxpasta/merge/main.nf new file mode 100644 index 000000000..1321caa7e --- /dev/null +++ b/modules/nf-core/taxpasta/merge/main.nf @@ -0,0 +1,47 @@ +process TAXPASTA_MERGE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/taxpasta:0.6.1--pyhdfd78af_0': + 'biocontainers/taxpasta:0.6.1--pyhdfd78af_0' }" + + + input: + tuple val(meta), path(profiles) + path taxonomy + path samplesheet + + output: + tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: merged_profiles + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // N.B.: Taxpasta requires a --profiler option and will fail without it. + // This must be specified via a `nextflow.config` or `modules.config`, for + // example, as "--profiler kraken2". Additionally, it requires a --output + // option with the output file name. 
The desired format will be parsed from + // the name and should correspond to the output pattern specified above, + // e.g., "--output ${task.ext.prefix}.tsv". + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : '' + def samplesheet_input = samplesheet ? "-s ${samplesheet}" : '' + """ + taxpasta merge \\ + $args \\ + $taxonomy_option \\ + $samplesheet_input \\ + $profiles + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + taxpasta: \$(taxpasta --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/taxpasta/merge/meta.yml b/modules/nf-core/taxpasta/merge/meta.yml new file mode 100644 index 000000000..6945d1053 --- /dev/null +++ b/modules/nf-core/taxpasta/merge/meta.yml @@ -0,0 +1,55 @@ +name: "taxpasta_merge" +description: Standardise and merge two or more taxonomic profiles into a single table +keywords: + - taxonomic profile + - standardise + - standardisation + - metagenomics + - taxonomic profiling + - otu tables + - taxon tables +tools: + - "taxpasta": + description: "TAXonomic Profile Aggregation and STAndardisation" + homepage: "https://taxpasta.readthedocs.io/" + documentation: "https://taxpasta.readthedocs.io/" + tool_dev_url: "https://github.com/taxprofiler/taxpasta" + licence: "['Apache-2.0']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profiles: + type: file + description: A list of taxonomic profiler output files (typically in text format, mandatory) + pattern: "*.{tsv,csv,arrow,parquet,biom}" + - samplesheet: + type: file + description: A samplesheet describing the sample name and a filepath to a taxonomic abundance profile that needs to be relative from the work environment. The profiles must be provided even if you give a samplesheet as argument (optional) + pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet}" + - taxonomy: + type: directory + description: Directory containing at a minimum nodes.dmp and names.dmp files (optional) + pattern: "*/" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - merged_profiles: + type: file + description: Output file with standardised multiple profiles in one go and have all profiles combined into a single table. + pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet,biom}" +authors: + - "@sofstam" + - "@jfy133" +maintainers: + - "@sofstam" + - "@jfy133" diff --git a/modules/nf-core/taxpasta/standardise/environment.yml b/modules/nf-core/taxpasta/standardise/environment.yml new file mode 100644 index 000000000..81b35fc48 --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/environment.yml @@ -0,0 +1,7 @@ +name: taxpasta_standardise +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::taxpasta=0.6.1 diff --git a/modules/nf-core/taxpasta/standardise/main.nf b/modules/nf-core/taxpasta/standardise/main.nf new file mode 100644 index 000000000..83693d4e4 --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/main.nf @@ -0,0 +1,42 @@ +process TAXPASTA_STANDARDISE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/taxpasta:0.6.1--pyhdfd78af_0': + 'biocontainers/taxpasta:0.6.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(profile) + path taxonomy + + output: + tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: standardised_profile + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // N.B.: Taxpasta requires a --profiler option and will fail without it. + // This must be specified via a `nextflow.config` or `modules.config`, for + // example, as "--profiler kraken2". Additionally, it requires a --output + // option with the output file name. The desired format will be parsed from + // the name and should correspond to the output pattern specified above, + // e.g., "--output ${task.ext.prefix}.tsv". + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : '' + """ + taxpasta standardise \\ + $args \\ + $taxonomy_option \\ + $profile + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + taxpasta: \$(taxpasta --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/taxpasta/standardise/meta.yml b/modules/nf-core/taxpasta/standardise/meta.yml new file mode 100644 index 000000000..a902b00e0 --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/meta.yml @@ -0,0 +1,49 @@ +name: "taxpasta_standardise" +description: "Standardise the output of a wide range of taxonomic profilers" +keywords: + - taxonomic profile + - standardise + - standardisation + - metagenomics + - taxonomic profiling + - otu tables + - taxon tables +tools: + - "taxpasta": + description: "TAXonomic Profile Aggregation and STAndardisation" + homepage: "https://taxpasta.readthedocs.io/" + documentation: "https://taxpasta.readthedocs.io/" + tool_dev_url: "https://github.com/taxprofiler/taxpasta" + licence: "['Apache-2.0']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profile: + type: file + description: profiler output file (mandatory) + pattern: "*" + - taxonomy: + type: directory + description: Directory containing at a minimum nodes.dmp and names.dmp files (optional) + pattern: "*/" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - standardised_profile: + type: file + description: Standardised taxonomic profile + pattern: "*.{tsv,csv,arrow,parquet,biom}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 5edb63ed7..097acab3a 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -1,8 +1,8 @@ include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' include { AMPS } from '../../modules/nf-core/amps/main' -include { KRAKENPARSE } from '../../modules/local/krakenparse' -include { KRAKENMERGE } from '../../modules/local/krakenmerge' include { METAPHLAN_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan/mergemetaphlantables/main' +include { TAXPASTA_STANDARDISE } from '../../modules/nf-core/taxpasta/standardise/main' +include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main' workflow METAGENOMICS_POSTPROCESSING { @@ -35,21 +35,21 @@ workflow METAGENOMICS_POSTPROCESSING { else if ( ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { - KRAKENPARSE ( ch_postprocessing_input ) - - ch_list_of_kraken_parse_reads = KRAKENPARSE.out.read_kraken_parsed.map { - meta, read_out -> [ read_out ] + ch_postprocessing_input = ch_postprocessing_input + .map{ + meta, report -> + [report] } - ch_list_of_kraken_parse_kmer = KRAKENPARSE.out.kmer_kraken_parsed.map { - meta, kmer_out -> [ kmer_out ] + .collect() + .map{ + reports -> + [ + ["id":"taxpasta", "profiler":params.metagenomics_profiling_tool], + reports + ] } - - KRAKENMERGE ( ch_list_of_kraken_parse_reads.collect() , ch_list_of_kraken_parse_kmer.collect() ) - - ch_versions = ch_versions.mix( KRAKENPARSE.out.versions.first(), KRAKENMERGE.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKENMERGE.out.read_count_table, KRAKENMERGE.out.kmer_duplication_table ) - + TAXPASTA_MERGE( ch_postprocessing_input, [], [] ) } else if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'metaphlan' ) { From 7f45286a73b420df45eaf845f61232397a0d7102 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 1 Dec 2023 10:57:31 +0000 Subject: [PATCH 069/198] rm duplic of malt_savesamoutput to just savereads --- conf/modules.config | 3 +-- docs/output.md | 4 ++-- nextflow.config | 57 ++++++++++++++++++++++---------------------- nextflow_schema.json | 10 ++------ 4 files changed, 33 insertions(+), 41 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 9872f5977..d423470c8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -838,14 +838,13 @@ process { "-id ${params.metagenomics_malt_minpercentidentity}", "-mq ${params.metagenomics_malt_maxqueries}", "--memoryMode ${params.metagenomics_malt_memorymode}", - params.metagenomics_malt_savesamoutput ? "-a . -f SAM" : "", params.metagenomics_malt_minsupportmode == "percent" ? "-supp ${params.metagenomics_malt_minsupportpercent}" : "-sup ${params.metagenomics_minsupportreads}", params.metagenomics_malt_savereads ? 
"--alignments ./ -za false" : "" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, mode: params.publish_dir_mode, - pattern: '*.{rma6,log,sam}' + pattern: '*.{rma6,log,sam,sam.gz}' ] } diff --git a/docs/output.md b/docs/output.md index a06a7baf6..8564f76d0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -353,7 +353,7 @@ The saved files are the _good_ files, passing the `dust` or `entropy` filter tre - `metagenomics_screening/profiling/malt/` - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer - - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_savesamoutput`) + - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_savereads`) - `*.log`: LOG file containing runtime log of MALT
@@ -364,7 +364,7 @@ You will receive output for each library. This means that if you use TSV input a The main output of MALT is the `.rma6` file format, which can only be loaded into MEGAN and its related tools. -You will only receive the `.sam` files if you supply `--metagenomics_malt_savesamoutput` parameters to the pipeline. +You will only receive the `.sam` files if you supply the `--metagenomics_malt_savereads` parameter to the pipeline. ### MetaPhlAn diff --git a/nextflow.config b/nextflow.config index dba36deb8..262402a41 100644 --- a/nextflow.config +++ b/nextflow.config @@ -123,44 +123,43 @@ params { bamfiltering_savefilteredbams = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening - run_metagenomics = false - metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UNMAPPED - run_metagenomics_complexityfiltering = false - metagenomics_complexity_tool = 'bbduk' - metagenomics_complexity_savefastq = false - metagenomics_complexity_entropy = 0.3 - metagenomics_prinseq_mode = 'entropy' - metagenomics_prinseq_dustscore = 0.5 - metagenomics_profiling_tool = null - metagenomics_profiling_database = null - metagenomics_krakenuniq_ramchunksize = '16G' + run_metagenomics = false + metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UNMAPPED + run_metagenomics_complexityfiltering = false + metagenomics_complexity_tool = 'bbduk' + metagenomics_complexity_savefastq = false + metagenomics_complexity_entropy = 0.3 + metagenomics_prinseq_mode = 'entropy' + metagenomics_prinseq_dustscore = 0.5 + metagenomics_profiling_tool = null + metagenomics_profiling_database = null + metagenomics_krakenuniq_ramchunksize = '16G' metagenomics_kraken_savereads = false - metagenomics_kraken_savereadclassifications = false + metagenomics_kraken_savereadclassifications = false metagenomics_kraken2_saveminimizers = false - metagenomics_malt_mode = 'BlastN' + metagenomics_malt_mode = 'BlastN' metagenomics_malt_alignmentmode = 'SemiGlobal' metagenomics_malt_savereads = false - metagenomics_malt_minsupportmode = 'percent' - metagenomics_malt_minsupportpercent = 0.01 - metagenomics_minsupportreads = 1 - metagenomics_malt_minpercentidentity = 85 + metagenomics_malt_minsupportmode = 'percent' + metagenomics_malt_minsupportpercent = 0.01 + metagenomics_minsupportreads = 1 + metagenomics_malt_minpercentidentity = 85 metagenomics_malt_toppercent = 1 metagenomics_malt_maxqueries = 100 metagenomics_malt_memorymode = 'load' - metagenomics_malt_group_size = 0 - metagenomics_run_postprocessing = true + metagenomics_malt_group_size = 0 + metagenomics_run_postprocessing = false metagenomics_maltextract_taxonlist = null metagenomics_maltextract_ncbidir = null - metagenomics_maltextract_filter = 'def_anc' - metagenomics_maltextract_toppercent = 0.01 - metagenomics_maltextract_destackingoff = false - metagenomics_maltextract_downsamplingoff = false - metagenomics_maltextract_duplicateremovaloff = false - metagenomics_maltextract_matches = false - metagenomics_maltextract_megansummary = false - metagenomics_maltextract_minpercentidentity = 85.0 - metagenomics_maltextract_usetopalignment = false + metagenomics_maltextract_filter = 'def_anc' + metagenomics_maltextract_toppercent = 0.01 + metagenomics_maltextract_destackingoff =
false + metagenomics_maltextract_downsamplingoff = false + metagenomics_maltextract_duplicateremovaloff = false + metagenomics_maltextract_matches = false + metagenomics_maltextract_megansummary = false + metagenomics_maltextract_minpercentidentity = 85.0 + metagenomics_maltextract_usetopalignment = false // Host Removal run_host_removal = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 83ce2d19f..4f5779632 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -801,17 +801,11 @@ "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior to seed look-up; this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look-up prior to entire\ndatabase loading. Note that Page and Map modes do not work properly with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", "enum": ["load", "page", "map"] }, - "metagenomics_malt_savesamoutput": { + "metagenomics_malt_savereads": { "type": "boolean", "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, which are gzipped. Note this will result in very large file sizes.", "fa_icon": "fas fa-file-alt", - "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to the more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" - }, - "metagenomics_malt_savereads": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Turn on saving of MALT-aligned reads", - "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produce by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`" + "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to the more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modified parameters for malt-run: `--alignments`, `-za`" }, "metagenomics_malt_group_size": { "type": "integer", From 8271ac6f65bade709a678dd73c557a9d0f5ac374 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 1 Dec 2023 10:59:13 +0000 Subject: [PATCH 070/198] added gzip for savereads --- conf/modules.config | 4 ++-- nextflow_schema.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d423470c8..b28e09ee3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -839,12 +839,12 @@ "-id ${params.metagenomics_malt_minpercentidentity}", "-mq ${params.metagenomics_malt_maxqueries}", "--memoryMode ${params.metagenomics_malt_memorymode}", params.metagenomics_malt_minsupportmode == "percent" ? "-supp ${params.metagenomics_malt_minsupportpercent}" : "-sup ${params.metagenomics_minsupportreads}", - params.metagenomics_malt_savereads ? "--alignments ./ -za false" : "" + params.metagenomics_malt_savereads ?
"--alignments ./" : "" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, mode: params.publish_dir_mode, - pattern: '*.{rma6,log,sam,sam.gz}' + pattern: '*.{rma6,log,sam.gz}' ] } diff --git a/nextflow_schema.json b/nextflow_schema.json index 4f5779632..a7b249aa4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -805,7 +805,7 @@ "type": "boolean", "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", "fa_icon": "fas fa-file-alt", - "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modified parameters for malt-run: `--alignments`, `-za`" + "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modified parameter for malt-run: `--alignments`" }, "metagenomics_malt_group_size": { "type": "integer", From a17d0c362f9931bdf56ec1f7eba0c154df7c4271 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 1 Dec 2023 12:43:59 +0000 Subject: [PATCH 071/198] linting --- nextflow_schema.json | 1 - 1 file changed, 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index a7b249aa4..c21bb08b5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -818,7 +818,6 @@ "type": "boolean", "description": "Activate post-processing of metagenomics profiling tool selected.", "help_text": "Activate the corresponding post-processing tool for your metagenomics profiling software. 
\n\nmalt --> maltextract\nkrakenuniq/kraken2 --> krakenmerge\nmetaphlan --> mergemetaphlantables\n\nNote: Postprocessing is automatically carried out when using `kraken2` and `krakenuniq` ", - "default": "true", "fa_icon": "fab fa-buromobelexperte" }, "metagenomics_maltextract_taxonlist": { From db427bc4fd3a860cb57573a2d309919ee523ad27 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Jan 2024 11:23:28 +0100 Subject: [PATCH 072/198] Remove Metaphlanmergetables, add taxpasta --- conf/modules.config | 8 --- docs/development/manual_tests.md | 1 + docs/output.md | 32 +++++------ modules.json | 5 -- .../metaphlan/mergemetaphlantables/main.nf | 33 ------------- .../metaphlan/mergemetaphlantables/meta.yml | 45 ----------------- .../taxpasta/standardise/environment.yml | 7 --- modules/nf-core/taxpasta/standardise/main.nf | 42 ---------------- modules/nf-core/taxpasta/standardise/meta.yml | 49 ------------------- nextflow_schema.json | 2 +- .../local/metagenomics_postprocessing.nf | 26 ++++------ 11 files changed, 24 insertions(+), 226 deletions(-) delete mode 100644 modules/nf-core/metaphlan/mergemetaphlantables/main.nf delete mode 100644 modules/nf-core/metaphlan/mergemetaphlantables/meta.yml delete mode 100644 modules/nf-core/taxpasta/standardise/environment.yml delete mode 100644 modules/nf-core/taxpasta/standardise/main.nf delete mode 100644 modules/nf-core/taxpasta/standardise/meta.yml diff --git a/conf/modules.config b/conf/modules.config index b17c0ab1c..fd648c635 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -913,14 +913,6 @@ ext.args = { "--profiler ${meta.profiler} --output taxpasta_table.tsv" } } - withName: METAPHLAN_MERGEMETAPHLANTABLES { - publishDir = [ - path: { "${params.outdir}/metagenomics_screening/postprocessing/mergemetaphlantables/" }, - mode: params.publish_dir_mode, - pattern: '*.txt' - ] - } withName: 'QUALIMAP_BAMQC_WITHBED|QUALIMAP_BAMQC_NOBED' { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } publishDir = [ diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 9310928dd..e301d703b 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -700,6 +700,7 @@ ``` ##### mergemetaphlantables +(update: Jan 2024, removed, parsing with taxpasta) ```bash nextflow run -resume ./main.nf -profile test,docker --outdir out \ diff --git a/docs/output.md b/docs/output.md index 8564f76d0..b4b20b862 100644 --- a/docs/output.md +++ b/docs/output.md @@ -426,6 +426,18 @@ ### Metagenomics Postprocessing +#### taxpasta + +The output created by the `taxpasta merge` command. It combines the results of all the samples analyzed with eager into a single table, providing an overview of the classification results for all samples. +
+Output files + +- `metagenomics_screening/postprocessing/taxpasta/` + - `{metaphlan, krakenuniq, kraken2}_profiles_all_samples_merged.txt` + +
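For reference, the merged table above is produced by a `taxpasta merge` call along these lines, composed from the module command and the `ext.args` set in `conf/modules.config`; the report file names here are hypothetical:

```bash
# Roughly the command TAXPASTA_MERGE composes for kraken2 reports
# (report file names are hypothetical)
taxpasta merge \
    --profiler kraken2 \
    --output taxpasta_table.tsv \
    sample1.kraken2.report.txt sample2.kraken2.report.txt
```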
+ #### maltextract The output directory for maltextract, as implemented under [HOPS](https://github.com/rhuebler/HOPS), which applies various heuristics of ancient authenticity and presence to MEGAN read assignments across a given set of candidate taxa. @@ -444,26 +456,6 @@ The output directory for maltextract, as implemented under [HOPS](https://github The main files of interest are within the `pdf_candidate_profiles` directory. The file prefixes declare various levels of confidence in a given sample, with stp1 being less confidently ancient and present than stp2, and stp2 less than stp3. Results are highly dependent upon the taxon being analyzed, as different microbial genera are more liable to cross mapping and contamination than others. -#### Merge MetaPhlanTables - -The output contains a file named `metaphlan_profiles_all_samples_merged.txt`, which provides an overview of the classification results for all samples. -
-Output files - -- `metagenomics_screening/postprocessing/mergemetaphlantables/` - - `metaphlan_profiles_all_samples_merged.txt`: A combined profile of all samples - -#### krakenmerge - -Automatic postprocessing and merging of kraken reports into a single combined profile of all samples. - -
-Output files - -- `metagenomics_screening/postprocessing/kraken_merge/` - - `combined_reports.txt`: A combined profile of all samples (as generated by `krakentools`) - ### Deduplication
diff --git a/modules.json b/modules.json index c75929016..0b7b3ff72 100644 --- a/modules.json +++ b/modules.json @@ -145,11 +145,6 @@ "git_sha": "0591cad3d725d5c21337f72e638507abf709f75e", "installed_by": ["modules"] }, - "metaphlan/mergemetaphlantables": { - "branch": "master", - "git_sha": "9aa59197c0fb35c29e315bcd10c0fc9e1afc70a8", - "installed_by": ["modules"] - }, "metaphlan/metaphlan": { "branch": "master", "git_sha": "1038d3de36263159b4138324a646105941ac271a", diff --git a/modules/nf-core/metaphlan/mergemetaphlantables/main.nf b/modules/nf-core/metaphlan/mergemetaphlantables/main.nf deleted file mode 100644 index 94c70cd6f..000000000 --- a/modules/nf-core/metaphlan/mergemetaphlantables/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process METAPHLAN_MERGEMETAPHLANTABLES { - label 'process_single' - - conda "bioconda::metaphlan=4.0.6" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/metaphlan:4.0.6--pyhca03a8a_0' : - 'quay.io/biocontainers/metaphlan:4.0.6--pyhca03a8a_0' }" - - input: - tuple val(meta), path(profiles) - - output: - tuple val(meta), path("${prefix}.txt") , emit: txt - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - """ - merge_metaphlan_tables.py \\ - $args \\ - -o ${prefix}.txt \\ - ${profiles} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - metaphlan: \$(metaphlan --version 2>&1 | awk '{print \$3}') - END_VERSIONS - """ -} diff --git a/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml b/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml deleted file mode 100644 index 3c93964b9..000000000 --- a/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: "metaphlan_mergemetaphlantables" -description: Merges output abundance tables from MetaPhlAn4 -keywords: - - metagenomics - - classification - - merge - - table - - profiles -tools: - - metaphlan4: - description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance - homepage: https://huttenhower.sph.harvard.edu/metaphlan/ - documentation: https://github.com/biobakery/MetaPhlAn - doi: "10.1038/s41587-023-01688-w" - licence: ["MIT License"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - profiles: - type: file - description: List of per-sample MetaPhlAn4 taxonomic abundance tables - pattern: "*" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - txt: - type: file - description: Combined MetaPhlAn4 table - pattern: "*.txt" - -authors: - - "@jfy133" - - "@LilyAnderssonLee" diff --git a/modules/nf-core/taxpasta/standardise/environment.yml b/modules/nf-core/taxpasta/standardise/environment.yml deleted file mode 100644 index 81b35fc48..000000000 --- a/modules/nf-core/taxpasta/standardise/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: taxpasta_standardise -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::taxpasta=0.6.1 diff --git a/modules/nf-core/taxpasta/standardise/main.nf b/modules/nf-core/taxpasta/standardise/main.nf deleted file mode 100644 index 83693d4e4..000000000 --- a/modules/nf-core/taxpasta/standardise/main.nf +++ /dev/null @@ -1,42 +0,0 @@ -process TAXPASTA_STANDARDISE { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/taxpasta:0.6.1--pyhdfd78af_0': - 'biocontainers/taxpasta:0.6.1--pyhdfd78af_0' }" - - input: - tuple val(meta), path(profile) - path taxonomy - - output: - tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: standardised_profile - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - // N.B.: Taxpasta requires a --profiler option and will fail without it. - // This must be specified via a `nextflow.config` or `modules.config`, for - // example, as "--profiler kraken2". Additionally, it requires a --output - // option with the output file name. The desired format will be parsed from - // the name and should correspond to the output pattern specified above, - // e.g., "--output ${task.ext.prefix}.tsv". - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : '' - """ - taxpasta standardise \\ - $args \\ - $taxonomy_option \\ - $profile - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - taxpasta: \$(taxpasta --version) - END_VERSIONS - """ -} diff --git a/modules/nf-core/taxpasta/standardise/meta.yml b/modules/nf-core/taxpasta/standardise/meta.yml deleted file mode 100644 index a902b00e0..000000000 --- a/modules/nf-core/taxpasta/standardise/meta.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: "taxpasta_standardise" -description: "Standardise the output of a wide range of taxonomic profilers" -keywords: - - taxonomic profile - - standardise - - standardisation - - metagenomics - - taxonomic profiling - - otu tables - - taxon tables -tools: - - "taxpasta": - description: "TAXonomic Profile Aggregation and STAndardisation" - homepage: "https://taxpasta.readthedocs.io/" - documentation: "https://taxpasta.readthedocs.io/" - tool_dev_url: "https://github.com/taxprofiler/taxpasta" - licence: "['Apache-2.0']" -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - profile: - type: file - description: profiler output file (mandatory) - pattern: "*" - - taxonomy: - type: directory - description: Directory containing at a minimum nodes.dmp and names.dmp files (optional) - pattern: "*/" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - standardised_profile: - type: file - description: Standardised taxonomic profile - pattern: "*.{tsv,csv,arrow,parquet,biom}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@Midnighter" -maintainers: - - "@Midnighter" diff --git a/nextflow_schema.json b/nextflow_schema.json index c21bb08b5..7f359a23f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -817,7 +817,7 @@ "metagenomics_run_postprocessing": { "type": "boolean", "description": "Activate post-processing of metagenomics profiling tool selected.", - "help_text": "Activate the corresponding post-processing tool for your metagenomics profiling software. \n\nmalt --> maltextract\nkrakenuniq/kraken2 --> krakenmerge\nmetaphlan --> mergemetaphlantables\n\nNote: Postprocessing is automatically carried out when using `kraken2` and `krakenuniq` ", + "help_text": "Activate the corresponding post-processing tool for your metagenomics profiling software. \n\nmalt --> maltextract\nkrakenuniq/kraken2/metaphlan --> taxpasta\n\nNote: Postprocessing is automatically carried out when using `kraken2` and `krakenuniq` ", "fa_icon": "fab fa-buromobelexperte" }, "metagenomics_maltextract_taxonlist": { diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index de504741d..9c211ae3f 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -1,8 +1,6 @@ -include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' -include { AMPS } from '../../modules/nf-core/amps/main' -include { METAPHLAN_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan/mergemetaphlantables/main' -include { TAXPASTA_STANDARDISE } from '../../modules/nf-core/taxpasta/standardise/main' -include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main' +include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' +include { AMPS } from '../../modules/nf-core/amps/main' +include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main' workflow METAGENOMICS_POSTPROCESSING { @@ -33,7 +31,7 @@ workflow METAGENOMICS_POSTPROCESSING { } - else if ( ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { + else if ( ['kraken2', 'krakenuniq', 'metaphlan'].contains(params.metagenomics_profiling_tool) ) { ch_postprocessing_input = ch_postprocessing_input .map{ @@ -44,21 +42,17 @@ workflow METAGENOMICS_POSTPROCESSING { .map{ reports -> [ - ["id":"taxpasta", "profiler":params.metagenomics_profiling_tool], + [ + "id":"${params.metagenomics_profiling_tool}_profiles_all_samples_merged_taxpasta", + "profiler":params.metagenomics_profiling_tool + ], reports ] } TAXPASTA_MERGE( ch_postprocessing_input, [], [] ) - } - - else if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'metaphlan' ) { - - METAPHLAN_MERGEMETAPHLANTABLES ( ch_postprocessing_input.map{ [[id:"metaphlan_profiles_all_samples_merged"], it[1]] }.groupTuple() ) - - ch_versions = ch_versions.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt ) - + ch_versions = TAXPASTA_MERGE.out.versions + ch_multiqc_files = TAXPASTA_MERGE.out.merged_profiles } emit: From 0b59da9665e0ce538dc6dff4dadd8a98accc5516 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Jan 2024 12:02:29 +0100 Subject: [PATCH 073/198] Remove 
unnecessary remnants of taxprofiler code --- modules.json | 10 ++-- .../nf-core/megan/rma2info/environment.yml | 7 +++ modules/nf-core/megan/rma2info/main.nf | 38 ++++++++++++++ modules/nf-core/megan/rma2info/meta.yml | 50 +++++++++++++++++++ subworkflows/local/metagenomics.nf | 4 +- .../local/metagenomics_postprocessing.nf | 1 + subworkflows/local/metagenomics_profiling.nf | 43 ++++++---------- 7 files changed, 119 insertions(+), 34 deletions(-) create mode 100644 modules/nf-core/megan/rma2info/environment.yml create mode 100644 modules/nf-core/megan/rma2info/main.nf create mode 100644 modules/nf-core/megan/rma2info/meta.yml diff --git a/modules.json b/modules.json index 0b7b3ff72..a0e427f94 100644 --- a/modules.json +++ b/modules.json @@ -145,6 +145,11 @@ "git_sha": "0591cad3d725d5c21337f72e638507abf709f75e", "installed_by": ["modules"] }, + "megan/rma2info": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "metaphlan/metaphlan": { "branch": "master", "git_sha": "1038d3de36263159b4138324a646105941ac271a", @@ -239,11 +244,6 @@ "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] - }, - "taxpasta/standardise": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/megan/rma2info/environment.yml b/modules/nf-core/megan/rma2info/environment.yml new file mode 100644 index 000000000..28a3f6f8e --- /dev/null +++ b/modules/nf-core/megan/rma2info/environment.yml @@ -0,0 +1,7 @@ +name: megan_rma2info +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::megan=6.21.7 diff --git a/modules/nf-core/megan/rma2info/main.nf b/modules/nf-core/megan/rma2info/main.nf new file mode 100644 index 000000000..851fa9ea4 --- /dev/null +++ b/modules/nf-core/megan/rma2info/main.nf @@ -0,0 +1,38 @@ +process MEGAN_RMA2INFO { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/megan:6.24.20--h9ee0642_0': + 'biocontainers/megan:6.24.20--h9ee0642_0' }" + + input: + tuple val(meta), path(rma6) + val(megan_summary) + + output: + tuple val(meta), path("*.txt.gz") , emit: txt + tuple val(meta), path("*.megan"), optional: true, emit: megan_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def summary = megan_summary ? 
"-es ${prefix}.megan" : "" + """ + rma2info \\ + -i ${rma6} \\ + -o ${prefix}.txt.gz \\ + ${summary} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megan: \$(echo \$(rma2info 2>&1) | grep version | sed 's/.*version //g;s/, built.*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/megan/rma2info/meta.yml b/modules/nf-core/megan/rma2info/meta.yml new file mode 100644 index 000000000..af3dd96c1 --- /dev/null +++ b/modules/nf-core/megan/rma2info/meta.yml @@ -0,0 +1,50 @@ +name: "megan_rma2info" +description: Analyses an RMA file and exports information in text format +keywords: + - megan + - rma6 + - classification + - conversion +tools: + - "megan": + description: "A tool for studying the taxonomic content of a set of DNA reads" + homepage: "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/" + documentation: "https://software-ab.cs.uni-tuebingen.de/download/megan6/welcome.html" + tool_dev_url: "https://github.com/husonlab/megan-ce" + doi: "10.1371/journal.pcbi.1004957" + licence: "['GPL >=3']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - rma6: + type: file + description: RMA6 file from MEGAN or MALT + pattern: "*.rma6" + - megan_summary: + type: boolean + description: Specify whether to generate an MEGAN summary file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Compressed text file + pattern: "*.txt.gz" + - megan_summary: + type: file + description: Optionally generated MEGAN summary file + pattern: "*.megan" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 88a801373..476cf156f 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -37,7 +37,7 @@ workflow METAGENOMICS { METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) - ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions.first() ) + ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) // @@ -48,7 +48,7 @@ workflow METAGENOMICS { METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) - ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions.first() ) + ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_POSTPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) } diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 9c211ae3f..9b366e73b 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -1,6 +1,7 @@ include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' include { AMPS } from '../../modules/nf-core/amps/main' include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main' +include { MEGAN_RMA2INFO } from '../../modules/nf-core/megan/rma2info/main' workflow METAGENOMICS_POSTPROCESSING { diff --git a/subworkflows/local/metagenomics_profiling.nf 
b/subworkflows/local/metagenomics_profiling.nf index 8fbba0179..ce32bc657 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -16,11 +16,9 @@ workflow METAGENOMICS_PROFILING { database main: - ch_versions = Channel.empty() - ch_raw_classifications = Channel.empty() - ch_raw_profiles = Channel.empty() - ch_multiqc_files = Channel.empty() - ch_postprocessing_input = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_postprocessing_input = Channel.empty() /* PREPARE PROFILER INPUT CHANNELS & RUN PROFILING @@ -85,10 +83,9 @@ workflow METAGENOMICS_PROFILING { [ meta_new, rma ] } - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) - ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan ) - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log ) - ch_postprocessing_input = ch_postprocessing_input.mix( ch_maltrun_for_megan ) + ch_versions = MALT_RUN.out.versions.first() + ch_multiqc_files = MALT_RUN.out.log + ch_postprocessing_input = ch_maltrun_for_megan } else if ( params.metagenomics_profiling_tool == 'metaphlan' ) { @@ -98,13 +95,12 @@ workflow METAGENOMICS_PROFILING { metaphlan_db = reads.map{ meta, reads, database -> [database] } METAPHLAN_METAPHLAN ( metaphlan_reads , metaphlan_db ) - ch_versions = ch_versions.mix( METAPHLAN_METAPHLAN.out.versions.first() ) - ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN_METAPHLAN.out.profile ) - ch_postprocessing_input = ch_postprocessing_input.mix( METAPHLAN_METAPHLAN.out.profile ) + ch_versions = METAPHLAN_METAPHLAN.out.versions.first() + ch_postprocessing_input = METAPHLAN_METAPHLAN.out.profile } - if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { + else if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { // run kraken uniq per sample, to preserve the meta-data reads = reads.combine(database) @@ -120,12 +116,9 @@ workflow METAGENOMICS_PROFILING { params.metagenomics_kraken_savereadclassifications ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) - ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) - ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) - ch_postprocessing_input = ch_postprocessing_input.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_versions = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() + ch_multiqc_files = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report + ch_postprocessing_input = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report } else if ( params.metagenomics_profiling_tool == 'kraken2' ) { @@ -142,17 +135,13 @@ workflow METAGENOMICS_PROFILING { params.metagenomics_kraken_savereadclassifications ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) - ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) - ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report ) - ch_postprocessing_input = ch_postprocessing_input.mix( KRAKEN2_KRAKEN2.out.report ) + ch_multiqc_files = KRAKEN2_KRAKEN2.out.report + ch_versions = KRAKEN2_KRAKEN2.out.versions.first() + ch_postprocessing_input = KRAKEN2_KRAKEN2.out.report } emit: - versions = ch_versions 
// channel: [ versions.yml ] - classifications = ch_raw_classifications - profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom + versions = ch_versions // channel: [ versions.yml ] postprocessing_input = ch_postprocessing_input // channel: [ val(meta), [ inputs_for_postprocessing_tools ] ] // see info at metagenomics_postprocessing mqc = ch_multiqc_files From ad944abdfc9fd5b15dd9ba0ccf5c328205144857 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Jan 2024 13:02:26 +0100 Subject: [PATCH 074/198] include taxpasta also for malt --- .../local/metagenomics_postprocessing.nf | 49 ++++++++++--------- subworkflows/local/metagenomics_profiling.nf | 8 +-- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 9b366e73b..76ceccc0e 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -11,9 +11,9 @@ workflow METAGENOMICS_POSTPROCESSING { main: ch_versions = Channel.empty() - ch_results = Channel.empty() ch_multiqc_files = Channel.empty() + // For MALT we have an additional step that includes maltextract+amps if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) { //maltextract doesnt accepts a meta param in the first input channel, so remove it @@ -26,36 +26,39 @@ workflow METAGENOMICS_POSTPROCESSING { AMPS ( MALTEXTRACT.out.results, tax_list, params.metagenomics_maltextract_filter ) + + //Also, prepare Malt for taxpasta by running rma2info + + MEGAN_RMA2INFO( ch_postprocessing_input, true ) + ch_postprocessing_input = MEGAN_RMA2INFO.out.txt + ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) - ch_results = ch_results.mix( AMPS.out.candidate_pdfs, AMPS.out.tsv, AMPS.out.summary_pdf ) ch_multiqc_files = ch_multiqc_files.mix( AMPS.out.json ) - } - else if ( ['kraken2', 'krakenuniq', 'metaphlan'].contains(params.metagenomics_profiling_tool) ) { + // Run taxpasta for everything! - ch_postprocessing_input = ch_postprocessing_input - .map{ - meta, report -> - [report] - } - .collect() - .map{ - reports -> + ch_postprocessing_input = ch_postprocessing_input + .map{ + meta, report -> + [report] + } + .collect() + .map{ + reports -> + [ [ - [ - "id":"${params.metagenomics_profiling_tool}_profiles_all_samples_merged_taxpasta", - "profiler":params.metagenomics_profiling_tool - ], - reports - ] - } - - TAXPASTA_MERGE( ch_postprocessing_input, [], [] ) - ch_versions = TAXPASTA_MERGE.out.versions - ch_multiqc_files = TAXPASTA_MERGE.out.merged_profiles + "id":"${params.metagenomics_profiling_tool}_profiles_all_samples_merged_taxpasta", + "profiler":params.metagenomics_profiling_tool == 'malt' ? 
'megan6' : params.metagenomics_profiling_tool + ], + reports + ] } + TAXPASTA_MERGE( ch_postprocessing_input, [], [] ) + ch_versions = ch_versions.mix(TAXPASTA_MERGE.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(TAXPASTA_MERGE.out.merged_profiles) + emit: versions = ch_versions mqc = ch_multiqc_files diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index ce32bc657..2fdc9f8d1 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -77,10 +77,10 @@ workflow METAGENOMICS_PROFILING { // re-extract meta from file names, use filename without rma to // ensure we keep paired-end information in downstream filenames // when no pair-merging - def meta_new = meta.clone() - meta_new['db_name'] = meta.id - meta_new['id'] = rma.baseName - [ meta_new, rma ] + [ + meta+['db_name':meta.id, 'id': rma.baseName], + rma + ] } ch_versions = MALT_RUN.out.versions.first() From a9faf597f8326b66187d99e55eadaf7bc3b9c589 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 26 Jan 2024 11:32:38 +0000 Subject: [PATCH 075/198] resolution for malt postprocessing submisison --- docs/development/manual_tests.md | 42 +++++++++++++++++++ .../local/metagenomics_postprocessing.nf | 12 +++++- subworkflows/local/metagenomics_profiling.nf | 10 ++++- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index e301d703b..15f1db2b8 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -697,9 +697,51 @@ nextflow run main.nf -profile test,docker \ --metagenomics_run_postprocessing \ --metagenomics_maltextract_ncbidir NCBI_DIR \ --metagenomics_maltextract_taxonlist TAXONLISTFILE + + + +# for generating test data +mkdir testing && cd testing +git clone https://github.com/rhuebler/HOPS.git +mkdir test_data && cd test_data +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/000/ERR8958750/ERR8958750_1.fastq.gz -o ERR8958750_1.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/000/ERR8958750/ERR8958750_2.fastq.gz -o ERR8958750_2.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/001/ERR8958751/ERR8958751_1.fastq.gz -o ERR8958751_1.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/001/ERR8958751/ERR8958751_2.fastq.gz -o ERR8958751_2.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/002/ERR8958752/ERR8958752_1.fastq.gz -o ERR8958752_1.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/002/ERR8958752/ERR8958752_2.fastq.gz -o ERR8958752_2.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/003/ERR8958753/ERR8958753_1.fastq.gz -o ERR8958753_1.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/003/ERR8958753/ERR8958753_2.fastq.gz -o ERR8958753_2.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/004/ERR8958754/ERR8958754_1.fastq.gz -o ERR8958754_1.fastq.gz +curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/004/ERR8958754/ERR8958754_2.fastq.gz -o ERR8958754_2.fastq.gz +ls -1 | while read file; do +zcat $file | head -n 4000 > ${file}_reduced.fastq +done + +echo "sample_id library_id lane colour_chemistry pairment strandedness damage_treatment r1 r2 bam bam_reference_id +HOP001 ERR8958750 0 4 paired double half /workspace/eager/testing/test_data/ERR8958750_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958750_2.fastq.gz_reduced.fastq.gz NA NA +HOP001 ERR8958751 0 2 paired double half 
/workspace/eager/testing/test_data/ERR8958751_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958751_2.fastq.gz_reduced.fastq.gz NA NA +HOP001 ERR8958752 0 2 paired double half /workspace/eager/testing/test_data/ERR8958752_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958752_2.fastq.gz_reduced.fastq.gz NA NA +HOP001 ERR8958753 0 2 paired double half /workspace/eager/testing/test_data/ERR8958753_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958753_2.fastq.gz_reduced.fastq.gz NA NA +HOP001 ERR8958754 0 2 paired double none /workspace/eager/testing/test_data/ERR8958754_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958754_2.fastq.gz_reduced.fastq.gz NA NA" | sed 's/ /\t/g' > test.tsv + +nextflow run ../main.nf -profile docker \ + --input test.tsv \ + --outdir ./out \ + --run_metagenomics \ + --metagenomics_profiling_tool malt \ + --metagenomics_profiling_database /workspace/eager/testing/HOPS/Test_Data/Test_Database/ \ + --metagenomics_run_postprocessing \ + --metagenomics_maltextract_ncbidir HOPS/Resources \ + --metagenomics_maltextract_taxonlist HOPS/Resources/default_list.txt \ + --fasta Mammoth_MT_Krause.fasta \ + --skip_damage_calculation \ + --skip_qualimap \ + --metagenomics_malt_group_size 3 ``` ##### mergemetaphlantables + (update: Jan 2024, removed, parsing with taxpasta) ```bash diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 76ceccc0e..cd9fa6a98 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -17,7 +17,11 @@ workflow METAGENOMICS_POSTPROCESSING { if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) { //maltextract doesnt accepts a meta param in the first input channel, so remove it - ch_maltextract_input = ch_postprocessing_input.map{it[1]} + ch_maltextract_input = ch_postprocessing_input.first().map { + meta, rma, rma_collected -> + rma_collected + } + ch_maltextract_input.view() tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist) ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir) @@ -28,8 +32,12 @@ workflow METAGENOMICS_POSTPROCESSING { //Also, prepare Malt for taxpasta by running rma2info + ch_rma2info_input = ch_postprocessing_input.map { + meta, rma, rma_collected -> + [ meta, rma ] + } - MEGAN_RMA2INFO( ch_postprocessing_input, true ) + MEGAN_RMA2INFO( ch_rma2info_input, true ) ch_postprocessing_input = MEGAN_RMA2INFO.out.txt ch_versions = ch_versions.mix( MALTEXTRACT.out.versions.first(), AMPS.out.versions.first() ) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 2fdc9f8d1..5784a8e0d 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -83,9 +83,17 @@ workflow METAGENOMICS_PROFILING { ] } + ch_maltrun_for_maltextract = MALT_RUN.out.rma6.map { + id,rma6 -> rma6 + } + .collect() + .toList() + + ch_maltrun_for_postprocessing = ch_maltrun_for_megan.combine(ch_maltrun_for_maltextract) + ch_versions = MALT_RUN.out.versions.first() ch_multiqc_files = MALT_RUN.out.log - ch_postprocessing_input = ch_maltrun_for_megan + ch_postprocessing_input = ch_maltrun_for_postprocessing } else if ( params.metagenomics_profiling_tool == 'metaphlan' ) { From a5a93839a31d35599f52432cf03f3587c13e4944 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 26 Jan 2024 11:33:31 +0000 
Subject: [PATCH 076/198] updated manual test prep --- docs/development/manual_tests.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 15f1db2b8..400ef8e5e 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -717,6 +717,7 @@ curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/004/ERR8958754/ERR8958754_2.fa ls -1 | while read file; do zcat $file | head -n 4000 > ${file}_reduced.fastq done +cd .. echo "sample_id library_id lane colour_chemistry pairment strandedness damage_treatment r1 r2 bam bam_reference_id HOP001 ERR8958750 0 4 paired double half /workspace/eager/testing/test_data/ERR8958750_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958750_2.fastq.gz_reduced.fastq.gz NA NA From f67ba4b3996be68a4be9b8751d0b4cd455de2933 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 9 Feb 2024 10:50:31 +0000 Subject: [PATCH 077/198] working outputting of logs --- CITATION.cff | 50 +++--- assets/email_template.html | 144 +++++++++++++----- conf/modules.config | 1 + docs/development/manual_tests.md | 6 + docs/output.md | 2 +- nextflow_schema.json | 2 +- .../local/metagenomics_postprocessing.nf | 1 - subworkflows/local/metagenomics_profiling.nf | 17 ++- 8 files changed, 148 insertions(+), 75 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 3bbef8b81..70a3487b2 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -25,28 +25,28 @@ doi: 10.7717/peerj.10947 date-released: 2022-08-02 url: https://github.com/nf-core/eager prefered-citation: - type: article - authors: - - family-names: Fellows Yates - given-names: James A. - - family-names: Lamnidis - given-names: Thiseas C. - - family-names: Borry - given-names: Maxime - - family-names: Andrades Valtueña - given-names: Aida - - family-names: Fagernãs - given-names: Zandra - - family-names: Clayton - given-names: Stephen - - family-names: Garcia - given-names: Maxime U. - - family-names: Neukamm - given-names: Judith - - family-names: Peltzer - given-names: Alexander - doi: 10.7717/peerj.10947 - start: e10947 - title: "Reproducible, portable, and efficient ancient genome reconstruction with nf-core/eager" - year: 2021 - url: https://dx.doi.org/10.1038/10.7717/peerj.10947 + type: article + authors: + - family-names: Fellows Yates + given-names: James A. + - family-names: Lamnidis + given-names: Thiseas C. + - family-names: Borry + given-names: Maxime + - family-names: Andrades Valtueña + given-names: Aida + - family-names: Fagernãs + given-names: Zandra + - family-names: Clayton + given-names: Stephen + - family-names: Garcia + given-names: Maxime U. + - family-names: Neukamm + given-names: Judith + - family-names: Peltzer + given-names: Alexander + doi: 10.7717/peerj.10947 + start: e10947 + title: "Reproducible, portable, and efficient ancient genome reconstruction with nf-core/eager" + year: 2021 + url: https://dx.doi.org/10.1038/10.7717/peerj.10947 diff --git a/assets/email_template.html b/assets/email_template.html index 36bfc9c8d..cf5efd36c 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,53 +1,113 @@ - - - - + + + + - - nf-core/eager Pipeline Report - - -
[Remainder of this hunk: the HTML markup of assets/email_template.html was lost in text extraction, leaving only the template's text content, so the diff body is summarised rather than reproduced. The change appears to be a formatting-only re-indentation (4-space to 2-space, prettier-style) of the e-mail report template; the recoverable report content is identical in the old and new versions: an "nf-core/eager v${version}" header, "Run Name: $runName", either the failure banner "nf-core/eager execution completed unsuccessfully!" with the task exit status ($exitStatus) and the full ${errorReport}, or the success banner "nf-core/eager execution completed successfully!", followed by "The workflow was completed at $dateComplete (duration: $duration)", the launching $commandLine, a "Pipeline Configuration:" summary table of $k/$v pairs, and a footer linking to https://github.com/nf-core/eager.]
+ diff --git a/conf/modules.config b/conf/modules.config index fd648c635..2f75ff812 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -845,6 +845,7 @@ process { mode: params.publish_dir_mode, pattern: '*.{rma6,log,sam.gz}' ] + ext.prefix = { "${meta.label}_${meta.id}-run" } } withName: KRAKEN2_KRAKEN2 { diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 400ef8e5e..19c1e43da 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -702,7 +702,12 @@ nextflow run main.nf -profile test,docker \ # for generating test data mkdir testing && cd testing +wget https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta git clone https://github.com/rhuebler/HOPS.git +cd HOPS/Test_Data/Test_Database +unzip table0.db.zip +unzip table0.idx.zip +cd ../../.. mkdir test_data && cd test_data curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/000/ERR8958750/ERR8958750_1.fastq.gz -o ERR8958750_1.fastq.gz curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/000/ERR8958750/ERR8958750_2.fastq.gz -o ERR8958750_2.fastq.gz @@ -716,6 +721,7 @@ curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/004/ERR8958754/ERR8958754_1.fa curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR895/004/ERR8958754/ERR8958754_2.fastq.gz -o ERR8958754_2.fastq.gz ls -1 | while read file; do zcat $file | head -n 4000 > ${file}_reduced.fastq +gzip ${file}_reduced.fastq done cd .. diff --git a/docs/output.md b/docs/output.md index b4b20b862..bb4480f9f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -354,7 +354,7 @@ The saved files are the _good_ files, passing the `dust` or `entropy` filter tre - `metagenomics_screening/profiling/malt/` - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_savereads`) - - `*.log`: LOG file containing runtime log of MALT + - `*.log`: LOG file containing runtime log of MALT. NOTE: If you are running parallel malt runs with `--metagenomics_malt_group_size` set above 0, your log files will be labelled with the name of _one_ of the input files run for each of the parallel executions.
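The `--metagenomics_malt_group_size` batching documented above is driven by Nextflow's `collate` operator. As a point of reference, here is a minimal, self-contained sketch of that batching behaviour — the channel contents and the `malt_db` id are invented for illustration, and this is not pipeline code (the real subworkflow additionally re-labels each batch with the first library id of the group):

```nextflow
workflow {
    // Toy stand-ins for the per-library reads channel
    ch_reads = Channel.of(
        [ [id:'lib1'], 'lib1.fastq.gz' ],
        [ [id:'lib2'], 'lib2.fastq.gz' ],
        [ [id:'lib3'], 'lib3.fastq.gz' ],
        [ [id:'lib4'], 'lib4.fastq.gz' ],
        [ [id:'lib5'], 'lib5.fastq.gz' ]
    )

    def group_size = 2 // stand-in for params.metagenomics_malt_group_size

    ch_reads
        .map { meta, reads -> reads }               // drop per-library meta
        .collate( group_size )                      // bins of at most group_size files
        .map { reads -> [ [id:'malt_db'], reads ] } // one shared meta per batch
        .view()
    // => [[id:malt_db], [lib1.fastq.gz, lib2.fastq.gz]]
    // => [[id:malt_db], [lib3.fastq.gz, lib4.fastq.gz]]
    // => [[id:malt_db], [lib5.fastq.gz]]
}
```

Because every file in a batch shares one meta map, only one input file name survives into the per-run log name — exactly the caveat the `docs/output.md` note above documents.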
diff --git a/nextflow_schema.json b/nextflow_schema.json index 7f359a23f..fdfff9ef7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -812,7 +812,7 @@ "default": 0, "description": "Define how many fastq files should be submitted in the same malt run. Default value of 0 sends all files at once.", "fa_icon": "fas fa-barcode", - "help_text": "Very large fastq files or many fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default (0) will spawn at minimum N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster." + "help_text": "Very many (large) fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default (0) will spawn at minimum N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster since the overhead of loading a database is high." }, "metagenomics_run_postprocessing": { "type": "boolean", diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index cd9fa6a98..15ade81d3 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -21,7 +21,6 @@ workflow METAGENOMICS_POSTPROCESSING { meta, rma, rma_collected -> rma_collected } - ch_maltextract_input.view() tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist) ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 5784a8e0d..0888361af 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -37,21 +37,28 @@ workflow METAGENOMICS_PROFILING { // unnecessary. Set as database name to prevent `null` job ID and prefix. if ( params.metagenomics_malt_group_size > 0 ) { + ch_labels_for_malt_tmp = reads + .map { meta, reads -> meta } + .collate(params.metagenomics_malt_group_size) + .map(meta -> meta.first().library_id ) + ch_input_for_malt_tmp = reads .map { meta, reads -> reads } .collate( params.metagenomics_malt_group_size ) //collate into bins of defined lengths .map{ reads -> // add new meta with db-name as id - [[id: file(params.metagenomics_profiling_database).getBaseName() ], reads] + [[label: file(params.metagenomics_profiling_database).getBaseName() ], reads] } - .combine(database) //combine with database + .combine(database) + .merge(ch_labels_for_malt_tmp) //combine with database .multiMap{ // and split apart again - meta, reads, database -> - reads: [meta, reads] + meta, reads, database, ids -> + reads: [meta + ['id':ids], reads] database: database } + ch_input_for_malt = ch_input_for_malt_tmp.reads database = ch_input_for_malt_tmp.database } @@ -63,7 +70,7 @@ workflow METAGENOMICS_PROFILING { .map{ // make sure id is db_name for publishing purposes. 
reads -> - [[id: file(params.metagenomics_profiling_database).getBaseName() ], reads] + [[label: file(params.metagenomics_profiling_database).getBaseName(), id: 'all' ], reads] } } From e9af20173b643892aea0b6402e56a9014c7b0ea6 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 16 Feb 2024 11:15:44 +0000 Subject: [PATCH 078/198] working taxpasta, working maltlog merging --- conf/modules.config | 20 +++++ modules.json | 5 ++ modules/nf-core/cat/cat/environment.yml | 7 ++ modules/nf-core/cat/cat/main.nf | 79 ++++++++++++++++++++ modules/nf-core/cat/cat/meta.yml | 36 +++++++++ subworkflows/local/metagenomics_profiling.nf | 16 ++++ 6 files changed, 163 insertions(+) create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/main.nf create mode 100644 modules/nf-core/cat/cat/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 2f75ff812..97147425c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -848,6 +848,15 @@ process { ext.prefix = { "${meta.label}_${meta.id}-run" } } + withName: CAT_CAT_MALT { + ext.prefix = { "${meta.databasename}_runtime_log_concatenated.log" } + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, + mode: params.publish_dir_mode, + pattern: '*.{log}' + ] + } + withName: KRAKEN2_KRAKEN2 { ext.args = [ params.metagenomics_kraken2_saveminimizers ? "--report-minimizer-data" : "" @@ -897,6 +906,17 @@ process { ] } + withName: 'MEGAN_RMA2INFO' { + tag = {"${meta.id}"} + ext.args = "-c2c Taxonomy" + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/postprocessing/megan_summaries/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt.gz,megan}' + ] + } + withName: AMPS { publishDir = [ path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, diff --git a/modules.json b/modules.json index a0e427f94..dc7301d2f 100644 --- a/modules.json +++ b/modules.json @@ -75,6 +75,11 @@ "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["fastq_align_bwaaln"] }, + "cat/cat": { + "branch": "master", + "git_sha": "81f27e75847087865299cc46605deb3b09b4e0a2", + "installed_by": ["modules"] + }, "cat/fastq": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 000000000..17a04ef23 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 000000000..adbdbd7ba --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,79 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // choose appropriate concatenation tool depending on input and output format + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} + +// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz +def getFileSuffix(filename) { + def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ + return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) +} + diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 000000000..00a8db0bc --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 0888361af..01a0ca3fe 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -8,6 +8,7 @@ include { MALT_RUN } from '../../modules/nf-core/malt/run/ include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main' include { METAPHLAN_METAPHLAN } from '../../modules/nf-core/metaphlan/metaphlan/main' +include { CAT_CAT as CAT_CAT_MALT } from '../../modules/nf-core/cat/cat/main' workflow METAGENOMICS_PROFILING { @@ -96,6 +97,21 @@ workflow METAGENOMICS_PROFILING { .collect() .toList() + // Recombine log files for outputting if parallel execution was run + if ( params.metagenomics_malt_group_size > 0 ) { + ch_log_for_cat = + MALT_RUN.out.log + .map { + meta,log -> log + } + .collect() + .map { + log -> [['databasename': file(params.metagenomics_profiling_database).getBaseName()], log] + } + + CAT_CAT_MALT ( ch_log_for_cat ) + } + ch_maltrun_for_postprocessing = ch_maltrun_for_megan.combine(ch_maltrun_for_maltextract) ch_versions = MALT_RUN.out.versions.first() From f02baf20eacba1863d9bf738304a3bef00dfa0f8 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 23 Feb 2024 10:51:15 +0000 Subject: [PATCH 079/198] process name prefix fix for cat_cat --- conf/modules.config | 2 +- subworkflows/local/metagenomics_profiling.nf | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 97147425c..8287435a4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -849,7 +849,7 @@ process { } withName: CAT_CAT_MALT { - ext.prefix = { "${meta.databasename}_runtime_log_concatenated.log" } + ext.prefix = { "${meta.id}_runtime_log_concatenated.log" } publishDir = [ path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 01a0ca3fe..bb0715509 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -32,6 +32,11 @@ workflow METAGENOMICS_PROFILING { if ( params.metagenomics_profiling_tool == 'malt' ) { + // Optional parallel run of malt available: + // If parallel execution, split into groups with meta id of the first library id of group + // Merging of maltlog will be done by concatenation + + // If no parallel execution (default): // Reset entire input meta for MALT to just database name, // as we don't run run on a per-sample basis due to huge databases // so all samples are in one run and so sample-specific metadata @@ -106,7 +111,7 @@ workflow METAGENOMICS_PROFILING { } .collect() .map { - log -> [['databasename': file(params.metagenomics_profiling_database).getBaseName()], log] + log -> [['id': file(params.metagenomics_profiling_database).getBaseName()], log] } CAT_CAT_MALT ( ch_log_for_cat ) From c8a83f57d8cbc9f7ee50307f21844e0bc8468f20 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Mon, 18 Mar 2024 15:01:53 +0100 Subject: [PATCH 080/198] remove unnecessary .first() declaration from versions channel --- subworkflows/local/metagenomics_complexityfilter.nf | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/metagenomics_complexityfilter.nf b/subworkflows/local/metagenomics_complexityfilter.nf index 2ec5f01a3..8524e1992 100644 --- a/subworkflows/local/metagenomics_complexityfilter.nf +++ b/subworkflows/local/metagenomics_complexityfilter.nf @@ -13,13 +13,13 @@ workflow METAGENOMICS_COMPLEXITYFILTER { if (params.metagenomics_complexity_tool == 'bbduk') { BBMAP_BBDUK( ch_bamfiltered_for_metagenomics, [] ) - ch_versions = BBMAP_BBDUK.out.versions.first() + ch_versions = BBMAP_BBDUK.out.versions ch_reads_for_metagenomics = BBMAP_BBDUK.out.reads } else if ( params.metagenomics_complexity_tool == 'prinseq' ) { // check if e.g. dustscore is set but entropy enabled PRINSEQPLUSPLUS ( ch_bamfiltered_for_metagenomics ) - ch_versions = PRINSEQPLUSPLUS.out.versions.first() + ch_versions = PRINSEQPLUSPLUS.out.versions ch_reads_for_metagenomics = PRINSEQPLUSPLUS.out.good_reads } From bd0c0896b106cfb7613e13663f30172a7080efba Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Mon, 18 Mar 2024 15:04:57 +0100 Subject: [PATCH 081/198] Ignore AMPS if it fails AMPS fails to create a heatmap and crashes if there are too few numbers of input sequences. There is nothing we can do besides expecting a failure here... Needs to be updated as soon as AMPS is updated --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index 8287435a4..38fd3940e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -923,6 +923,7 @@ process { mode: params.publish_dir_mode, pattern: 'results' ] + errorStrategy = 'ignore' // required as it fails the run for low reads: https://github.com/rhuebler/HOPS/issues/9 } withName: TAXPASTA_MERGE { From ecd7a4de667c7158a9dbbf2b0e37dc6d5fd8a59e Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Mon, 18 Mar 2024 16:11:34 +0100 Subject: [PATCH 082/198] Run krakenuniq with all samples at once As discussed here https://github.com/nf-core/eager/pull/1019#discussion_r1397057636 --- .../local/metagenomics_postprocessing.nf | 2 +- subworkflows/local/metagenomics_profiling.nf | 33 +++++++++++-------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 15ade81d3..b69c2e920 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -48,7 +48,7 @@ workflow METAGENOMICS_POSTPROCESSING { ch_postprocessing_input = ch_postprocessing_input .map{ meta, report -> - [report] + report } .collect() .map{ diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index bb0715509..65e211fa4 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -13,8 +13,8 @@ include { CAT_CAT as CAT_CAT_MALT } from '../../modules/nf-core/cat/cat/m workflow METAGENOMICS_PROFILING { take: - reads - database + ch_reads + ch_database main: ch_versions = Channel.empty() @@ -137,36 +137,41 @@ workflow METAGENOMICS_PROFILING { } else if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { - // run kraken uniq per sample, to preserve the meta-data + // run krakenuniq once for all samples - reads = reads.combine(database) - krakenuniq_reads = reads.map{ meta, reads, database -> [meta, reads] } - krakenuniq_db = reads.map{ meta, reads, database -> [database] } + ch_reads = ch_reads.map{ meta, file -> file } + 
.collect() + .map{files -> [ + ['single_end':true], files + ]} KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( - krakenuniq_reads, - krakenuniq_db, + ch_reads, + ch_database, params.metagenomics_krakenuniq_ramchunksize, params.metagenomics_kraken_savereads, true, // save read assignments params.metagenomics_kraken_savereadclassifications ) - ch_versions = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() + ch_versions = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions ch_multiqc_files = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ch_postprocessing_input = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report + + KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report.view() } else if ( params.metagenomics_profiling_tool == 'kraken2' ) { // run kraken2 per sample + // it lacks the option of krakenuniq - reads = reads.combine(database) - kraken2_reads = reads.map{meta, reads, database -> [meta, reads]} - kraken2_db = reads.map{meta, reads, database -> [database]} + ch_reads = ch_reads.combine(ch_database) + ch_kraken2_reads = ch_reads.map{meta, reads, database -> [meta, reads]} + ch_kraken2_db = ch_reads.map{meta, reads, database -> [database]} KRAKEN2_KRAKEN2 ( - kraken2_reads, - kraken2_db, + ch_kraken2_reads, + ch_kraken2_db, params.metagenomics_kraken_savereads, params.metagenomics_kraken_savereadclassifications ) From 5fd2740e4b36ad7addc5dbcb597df139aee0809f Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Tue, 19 Mar 2024 11:47:03 +0100 Subject: [PATCH 083/198] fix channel renaming --- subworkflows/local/metagenomics_postprocessing.nf | 2 +- subworkflows/local/metagenomics_profiling.nf | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index b69c2e920..44a647982 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -16,7 +16,7 @@ workflow METAGENOMICS_POSTPROCESSING { // For MALT we have an additional step that includes maltextract+amps if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) { - //maltextract doesnt accepts a meta param in the first input channel, so remove it + //maltextract doesnt accept a meta param in the first input channel, so remove it ch_maltextract_input = ch_postprocessing_input.first().map { meta, rma, rma_collected -> rma_collected diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 65e211fa4..aed12ef3a 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -43,12 +43,12 @@ workflow METAGENOMICS_PROFILING { // unnecessary. Set as database name to prevent `null` job ID and prefix. 
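A recurring pattern in this subworkflow is pairing the reads channel with the single database channel and then splitting the tuple back apart so each process input arrives on its own channel — done with two `map`s for Kraken2 above, and with `multiMap` for MALT. A minimal sketch of the `multiMap` variant, with invented sample and database names (not pipeline code):

```nextflow
workflow {
    ch_reads    = Channel.of( [ [id:'s1'], 's1.fastq.gz' ], [ [id:'s2'], 's2.fastq.gz' ] )
    ch_database = Channel.value( 'kraken2_db' )

    ch_input = ch_reads
        .combine( ch_database )            // -> [ meta, reads, database ] per sample
        .multiMap { meta, reads, database ->
            reads:    [ meta, reads ]      // first process input
            database: database             // second process input, kept in lockstep
        }

    ch_input.reads.view()
    ch_input.database.view()
}
```

Keeping the two outputs in lockstep this way avoids the channel-cardinality surprises a bare single-element queue channel for the database would cause from the second sample onwards.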
if ( params.metagenomics_malt_group_size > 0 ) { - ch_labels_for_malt_tmp = reads + ch_labels_for_malt_tmp = ch_reads .map { meta, reads -> meta } .collate(params.metagenomics_malt_group_size) .map(meta -> meta.first().library_id ) - ch_input_for_malt_tmp = reads + ch_input_for_malt_tmp = ch_reads .map { meta, reads -> reads } .collate( params.metagenomics_malt_group_size ) //collate into bins of defined lengths .map{ @@ -56,7 +56,7 @@ workflow METAGENOMICS_PROFILING { // add new meta with db-name as id [[label: file(params.metagenomics_profiling_database).getBaseName() ], reads] } - .combine(database) + .combine(ch_database) .merge(ch_labels_for_malt_tmp) //combine with database .multiMap{ // and split apart again @@ -66,11 +66,11 @@ workflow METAGENOMICS_PROFILING { } ch_input_for_malt = ch_input_for_malt_tmp.reads - database = ch_input_for_malt_tmp.database + ch_database = ch_input_for_malt_tmp.database } else { - ch_input_for_malt = reads + ch_input_for_malt = ch_reads .map { meta, reads -> reads } .collect() .map{ @@ -81,7 +81,7 @@ workflow METAGENOMICS_PROFILING { } // Run MALT - MALT_RUN ( ch_input_for_malt, database ) + MALT_RUN ( ch_input_for_malt, ch_database ) ch_maltrun_for_megan = MALT_RUN.out.rma6 .transpose() From 8b954f15227f1ddb3b3a762684cc564e96bdd366 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Tue, 19 Mar 2024 15:28:30 +0100 Subject: [PATCH 084/198] Update MaltExtract module --- modules.json | 2 +- modules/nf-core/maltextract/environment.yml | 7 +++++++ modules/nf-core/maltextract/main.nf | 10 +++++----- modules/nf-core/maltextract/meta.yml | 12 ++++++++---- subworkflows/local/metagenomics_postprocessing.nf | 9 ++++++--- 5 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 modules/nf-core/maltextract/environment.yml diff --git a/modules.json b/modules.json index dc7301d2f..38c729e85 100644 --- a/modules.json +++ b/modules.json @@ -142,7 +142,7 @@ }, "maltextract": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "8840ece9ee7528480dec95796e017be02ada0dc0", "installed_by": ["modules"] }, "mapdamage2": { diff --git a/modules/nf-core/maltextract/environment.yml b/modules/nf-core/maltextract/environment.yml new file mode 100644 index 000000000..f87a299c2 --- /dev/null +++ b/modules/nf-core/maltextract/environment.yml @@ -0,0 +1,7 @@ +name: maltextract +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::hops=0.35 diff --git a/modules/nf-core/maltextract/main.nf b/modules/nf-core/maltextract/main.nf index d44b54c60..663cf638c 100644 --- a/modules/nf-core/maltextract/main.nf +++ b/modules/nf-core/maltextract/main.nf @@ -2,19 +2,19 @@ process MALTEXTRACT { label 'process_medium' - conda "bioconda::hops=0.35" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/hops:0.35--hdfd78af_1' : - 'quay.io/biocontainers/hops:0.35--hdfd78af_1' }" + 'biocontainers/hops:0.35--hdfd78af_1' }" input: - path rma6 + tuple val(meta), path(rma6) path taxon_list path ncbi_dir output: - path "results" , emit: results - path "versions.yml" , emit: versions + tuple val(meta), path("results") , emit: results + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/nf-core/maltextract/meta.yml b/modules/nf-core/maltextract/meta.yml index c365a7c5e..f7648d807 100644 --- a/modules/nf-core/maltextract/meta.yml +++ b/modules/nf-core/maltextract/meta.yml @@ -22,8 +22,12 @@ tools: tool_dev_url: https://github.com/rhuebler/hops doi: "10.1186/s13059-019-1903-0" licence: ["GPL 3"] - input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] - rma6: type: file description: RMA6 files from MALT @@ -36,7 +40,6 @@ input: type: directory description: Directory containing NCBI taxonomy map and tre files pattern: "${ncbi_dir}/" - output: - versions: type: file @@ -45,7 +48,8 @@ output: - results: type: directory description: Directory containing MaltExtract text results files - pattern: "*.rma6" - + pattern: "results/" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 44a647982..ed93946b6 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -16,18 +16,21 @@ workflow METAGENOMICS_POSTPROCESSING { // For MALT we have an additional step that includes maltextract+amps if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) { - //maltextract doesnt accept a meta param in the first input channel, so remove it ch_maltextract_input = ch_postprocessing_input.first().map { meta, rma, rma_collected -> - rma_collected + [meta, rma_collected] } tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist) ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir) + //#TODO: Branch of here + MALTEXTRACT ( ch_maltextract_input, tax_list, ncbi_dir) - AMPS ( MALTEXTRACT.out.results, tax_list, params.metagenomics_maltextract_filter ) + ch_amps_input = MALTEXTRACT.out.results.map{ it[1] } + + AMPS ( ch_amps_input, tax_list, params.metagenomics_maltextract_filter ) //Also, prepare Malt for taxpasta by running rma2info From b8977437975f999780475618a6232a637a1bb0c3 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 10:52:57 +0100 Subject: [PATCH 085/198] WIP: Malt-profiling: Keep strandedness information for malt_group_size 0 --- subworkflows/local/metagenomics_profiling.nf | 43 ++++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index aed12ef3a..1d2b1e2be 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -42,6 +42,11 @@ workflow METAGENOMICS_PROFILING { // so all samples are in one run and so sample-specific metadata // unnecessary. Set as database name to prevent `null` job ID and prefix. 
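The hunk below splits the reads channel by library strandedness with Nextflow's `branch` operator, where the final predicate acts as a fall-through default. A toy, runnable sketch of the same idiom (sample values invented):

```nextflow
workflow {
    Channel.of(
        [ [id:'a', strandedness:'single'], 'a.fastq.gz' ],
        [ [id:'b', strandedness:'double'], 'b.fastq.gz' ]
    )
    .branch {
        ss: it[0].strandedness == 'single'
        ds: true    // fall-through: everything not single-stranded
    }
    .set { ch_reads_stranded }

    ch_reads_stranded.ss.view { "single-stranded: ${it}" }
    ch_reads_stranded.ds.view { "double-stranded: ${it}" }
}
```

Branching once, rather than filtering twice, keeps a single pass over the channel and guarantees every element lands in exactly one of the two downstream MALT runs.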
+ ch_reads.branch{ + ss: it[0].strandedness == 'single' + ds: true + }.set { ch_reads_stranded } + if ( params.metagenomics_malt_group_size > 0 ) { ch_labels_for_malt_tmp = ch_reads .map { meta, reads -> meta } @@ -70,18 +75,38 @@ workflow METAGENOMICS_PROFILING { } else { - ch_input_for_malt = ch_reads - .map { meta, reads -> reads } - .collect() - .map{ - // make sure id is db_name for publishing purposes. - reads -> - [[label: file(params.metagenomics_profiling_database).getBaseName(), id: 'all' ], reads] - } + // group the double-stranded entries + // reduce the meta to the bare minimum common-information + ch_malt_input_ds = ch_reads_stranded.ds.map{ meta, reads -> + [ + [label: file(params.metagenomics_profiling_database).getBaseName(), id: 'all_ds', strandedness:'double' ], + reads + ] + } + .groupTuple(by:0) + // group the single-stranded entries + // reduce the meta to the bare minimum common-information + ch_malt_input_ss = ch_reads_stranded.ss.map{ meta, reads -> + [ + [label: file(params.metagenomics_profiling_database).getBaseName(), id: 'all_ss', strandedness:'single' ], + reads + ] + } + .groupTuple(by:0) + // combine to one channel again (to run MALT twice) + ch_input_for_malt = ch_malt_input_ds.concat(ch_malt_input_ss) + // combine with the database + ch_input_for_malt = ch_input_for_malt.combine(ch_database) } // Run MALT - MALT_RUN ( ch_input_for_malt, ch_database ) + // Split Channels into reads and database + ch_input_for_malt = ch_input_for_malt.multiMap{ meta, reads, database -> + reads: [meta, reads] + database: database + } + + MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database ) ch_maltrun_for_megan = MALT_RUN.out.rma6 .transpose() From ad4ab3644bd8794104bba40a8dc189770167a181 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 11:46:40 +0100 Subject: [PATCH 086/198] Run MALT by group_size and strandedness --- subworkflows/local/metagenomics_profiling.nf | 72 ++++++-------------- 1 file changed, 19 insertions(+), 53 deletions(-) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 1d2b1e2be..50247fa24 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -42,70 +42,36 @@ workflow METAGENOMICS_PROFILING { // so all samples are in one run and so sample-specific metadata // unnecessary. Set as database name to prevent `null` job ID and prefix. 
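The replacement below collapses both code paths into a single `groupTuple` keyed on a rebuilt meta map. Because the key embeds `n++ % group_size`, files are dealt round-robin into at most `group_size` buckets per strandedness, rather than into buckets of `group_size` files as `collate` previously produced. A self-contained sketch with invented libraries, assuming a group size of 2:

```nextflow
workflow {
    def group_size = 2   // stand-in for params.metagenomics_malt_group_size
    def n = 0

    Channel.of(
        [ [library_id:'l1', strandedness:'double'], 'l1.fastq.gz' ],
        [ [library_id:'l2', strandedness:'double'], 'l2.fastq.gz' ],
        [ [library_id:'l3', strandedness:'double'], 'l3.fastq.gz' ],
        [ [library_id:'l4', strandedness:'single'], 'l4.fastq.gz' ]
    )
    .map { meta, reads ->
        [ [ strandedness: meta.strandedness,
            id: "${meta.strandedness}_${n++ % group_size}" ], reads ]
    }
    .groupTuple( by: 0 )
    .view()
    // Emission order may vary:
    // => [[strandedness:double, id:double_0], [l1.fastq.gz, l3.fastq.gz]]
    // => [[strandedness:double, id:double_1], [l2.fastq.gz]]
    // => [[strandedness:single, id:single_1], [l4.fastq.gz]]
}
```

Note the counter is shared across strandedness groups, so bucket suffixes need not be contiguous within one strandedness — harmless for execution, but worth knowing when reading the per-run log names.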
- ch_reads.branch{ - ss: it[0].strandedness == 'single' - ds: true - }.set { ch_reads_stranded } - if ( params.metagenomics_malt_group_size > 0 ) { - ch_labels_for_malt_tmp = ch_reads - .map { meta, reads -> meta } - .collate(params.metagenomics_malt_group_size) - .map(meta -> meta.first().library_id ) - - ch_input_for_malt_tmp = ch_reads - .map { meta, reads -> reads } - .collate( params.metagenomics_malt_group_size ) //collate into bins of defined lengths - .map{ - reads -> - // add new meta with db-name as id - [[label: file(params.metagenomics_profiling_database).getBaseName() ], reads] - } - .combine(ch_database) - .merge(ch_labels_for_malt_tmp) //combine with database - .multiMap{ - // and split apart again - meta, reads, database, ids -> - reads: [meta + ['id':ids], reads] - database: database - } - - ch_input_for_malt = ch_input_for_malt_tmp.reads - ch_database = ch_input_for_malt_tmp.database - } + def label = file(params.metagenomics_profiling_database).getBaseName() + def n = 0 - else { - // group the double-stranded entries - // reduce the meta to the bare minimum common-information - ch_malt_input_ds = ch_reads_stranded.ds.map{ meta, reads -> - [ - [label: file(params.metagenomics_profiling_database).getBaseName(), id: 'all_ds', strandedness:'double' ], - reads - ] - } - .groupTuple(by:0) - // group the single-stranded entries - // reduce the meta to the bare minimum common-information - ch_malt_input_ss = ch_reads_stranded.ss.map{ meta, reads -> + //replace the meta in a way that groupTuple splits the entries + //by strandedness and metagenomics_malt_group_size + + ch_input_for_malt = ch_reads.map{ meta, reads -> + [ [ - [label: file(params.metagenomics_profiling_database).getBaseName(), id: 'all_ss', strandedness:'single' ], - reads - ] - } - .groupTuple(by:0) - // combine to one channel again (to run MALT twice) - ch_input_for_malt = ch_malt_input_ds.concat(ch_malt_input_ss) - // combine with the database - ch_input_for_malt = ch_input_for_malt.combine(ch_database) + label: label, + strandedness:meta.strandedness, + id:"${meta.strandedness}_${params.metagenomics_malt_group_size > 0 ? 
n++%params.metagenomics_malt_group_size : 'all'}" + ], + reads + ] } + .groupTuple(by:0) + + // We might have multiple chunks in the reads_channel + // each of which requires a database + ch_input_for_malt = ch_input_for_malt.combine(ch_database).view() - // Run MALT // Split Channels into reads and database ch_input_for_malt = ch_input_for_malt.multiMap{ meta, reads, database -> reads: [meta, reads] database: database } + // Run MALT MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database ) ch_maltrun_for_megan = MALT_RUN.out.rma6 From 358185878136d7374199fef34c9a2211671b9634 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 13:22:46 +0100 Subject: [PATCH 087/198] include taxpasta standardise for single sample runs --- conf/modules.config | 9 ++++ modules.json | 5 ++ .../taxpasta/standardise/environment.yml | 7 +++ modules/nf-core/taxpasta/standardise/main.nf | 42 ++++++++++++++++ modules/nf-core/taxpasta/standardise/meta.yml | 49 +++++++++++++++++++ subworkflows/local/metagenomics.nf | 1 + .../local/metagenomics_postprocessing.nf | 43 ++++++++++------ subworkflows/local/metagenomics_profiling.nf | 7 ++- 8 files changed, 145 insertions(+), 18 deletions(-) create mode 100644 modules/nf-core/taxpasta/standardise/environment.yml create mode 100644 modules/nf-core/taxpasta/standardise/main.nf create mode 100644 modules/nf-core/taxpasta/standardise/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 38fd3940e..3ebf875c2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -935,6 +935,15 @@ process { ext.args = { "--profiler ${meta.profiler} --output taxpasta_table.tsv" } } + withName: TAXPASTA_STANDARDISE { + publishDir = [ + path: { "${params.outdir}/metagenomics_screening/postprocessing/taxpasta/" }, + mode: params.publish_dir_mode, + pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}' + ] + ext.args = { "--profiler ${meta.profiler} --output taxpasta_table.tsv" } + } + withName: 'QUALIMAP_BAMQC_WITHBED|QUALIMAP_BAMQC_NOBED' { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } publishDir = [ diff --git a/modules.json b/modules.json index 38c729e85..072693419 100644 --- a/modules.json +++ b/modules.json @@ -249,6 +249,11 @@ "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] + }, + "taxpasta/standardise": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/taxpasta/standardise/environment.yml b/modules/nf-core/taxpasta/standardise/environment.yml new file mode 100644 index 000000000..81b35fc48 --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/environment.yml @@ -0,0 +1,7 @@ +name: taxpasta_standardise +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::taxpasta=0.6.1 diff --git a/modules/nf-core/taxpasta/standardise/main.nf b/modules/nf-core/taxpasta/standardise/main.nf new file mode 100644 index 000000000..83693d4e4 --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/main.nf @@ -0,0 +1,42 @@ +process TAXPASTA_STANDARDISE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/taxpasta:0.6.1--pyhdfd78af_0': + 'biocontainers/taxpasta:0.6.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(profile) + path taxonomy + + output: + tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: standardised_profile + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // N.B.: Taxpasta requires a --profiler option and will fail without it. + // This must be specified via a `nextflow.config` or `modules.config`, for + // example, as "--profiler kraken2". Additionally, it requires a --output + // option with the output file name. The desired format will be parsed from + // the name and should correspond to the output pattern specified above, + // e.g., "--output ${task.ext.prefix}.tsv". + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : '' + """ + taxpasta standardise \\ + $args \\ + $taxonomy_option \\ + $profile + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + taxpasta: \$(taxpasta --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/taxpasta/standardise/meta.yml b/modules/nf-core/taxpasta/standardise/meta.yml new file mode 100644 index 000000000..a902b00e0 --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/meta.yml @@ -0,0 +1,49 @@ +name: "taxpasta_standardise" +description: "Standardise the output of a wide range of taxonomic profilers" +keywords: + - taxonomic profile + - standardise + - standardisation + - metagenomics + - taxonomic profiling + - otu tables + - taxon tables +tools: + - "taxpasta": + description: "TAXonomic Profile Aggregation and STAndardisation" + homepage: "https://taxpasta.readthedocs.io/" + documentation: "https://taxpasta.readthedocs.io/" + tool_dev_url: "https://github.com/taxprofiler/taxpasta" + licence: "['Apache-2.0']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profile: + type: file + description: profiler output file (mandatory) + pattern: "*" + - taxonomy: + type: directory + description: Directory containing at a minimum nodes.dmp and names.dmp files (optional) + pattern: "*/" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - standardised_profile: + type: file + description: Standardised taxonomic profile + pattern: "*.{tsv,csv,arrow,parquet,biom}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 476cf156f..19dc06c29 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -12,6 +12,7 @@ workflow METAGENOMICS { // Add single_end parameter to meta. // Reads were merged before, so single_end is always true! 
+ ch_bamfiltered_for_metagenomics = ch_bamfiltered_for_metagenomics.map{ meta, bamfiltered -> [meta+['single_end':true], bamfiltered] } diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index ed93946b6..a670ba953 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -1,7 +1,8 @@ -include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' -include { AMPS } from '../../modules/nf-core/amps/main' -include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main' -include { MEGAN_RMA2INFO } from '../../modules/nf-core/megan/rma2info/main' +include { MALTEXTRACT } from '../../modules/nf-core/maltextract/main' +include { AMPS } from '../../modules/nf-core/amps/main' +include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main' +include { TAXPASTA_STANDARDISE } from '../../modules/nf-core/taxpasta/standardise/main' +include { MEGAN_RMA2INFO } from '../../modules/nf-core/megan/rma2info/main' workflow METAGENOMICS_POSTPROCESSING { @@ -13,6 +14,7 @@ workflow METAGENOMICS_POSTPROCESSING { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() + // For MALT we have an additional step that includes maltextract+amps if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) { @@ -24,8 +26,6 @@ workflow METAGENOMICS_POSTPROCESSING { tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist) ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir) - //#TODO: Branch of here - MALTEXTRACT ( ch_maltextract_input, tax_list, ncbi_dir) ch_amps_input = MALTEXTRACT.out.results.map{ it[1] } @@ -46,29 +46,44 @@ workflow METAGENOMICS_POSTPROCESSING { ch_multiqc_files = ch_multiqc_files.mix( AMPS.out.json ) } - // Run taxpasta for everything! + // Run taxpasta for everything! + ch_report_count = ch_postprocessing_input.count() ch_postprocessing_input = ch_postprocessing_input .map{ meta, report -> - report - } - .collect() - .map{ - reports -> [ [ "id":"${params.metagenomics_profiling_tool}_profiles_all_samples_merged_taxpasta", "profiler":params.metagenomics_profiling_tool == 'malt' ? 'megan6' : params.metagenomics_profiling_tool ], - reports + report ] } + .groupTuple(by:0) + .combine(ch_report_count) + .branch{ + standardise: it[2] == 1 + merge: true + } - TAXPASTA_MERGE( ch_postprocessing_input, [], [] ) + ch_standardise_input = ch_postprocessing_input.standardise.map{ meta, reports, count -> + [meta, reports] + } + + TAXPASTA_STANDARDISE( ch_standardise_input, [] ) + ch_versions = ch_versions.mix(TAXPASTA_STANDARDISE.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(TAXPASTA_STANDARDISE.out.standardised_profile) + + ch_merge_input = ch_postprocessing_input.merge.map{ meta, reports, count -> + [meta, reports] + } + + TAXPASTA_MERGE( ch_merge_input, [], [] ) ch_versions = ch_versions.mix(TAXPASTA_MERGE.out.versions) ch_multiqc_files = ch_multiqc_files.mix(TAXPASTA_MERGE.out.merged_profiles) + emit: versions = ch_versions mqc = ch_multiqc_files diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 50247fa24..cd8c32c38 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -54,7 +54,7 @@ workflow METAGENOMICS_PROFILING { [ label: label, strandedness:meta.strandedness, - id:"${meta.strandedness}_${params.metagenomics_malt_group_size > 0 ? 
n++%params.metagenomics_malt_group_size : 'all'}" + id:"${meta.strandedness}stranded_${params.metagenomics_malt_group_size > 0 ? n++%params.metagenomics_malt_group_size : 'all'}" ], reads ] @@ -63,7 +63,7 @@ workflow METAGENOMICS_PROFILING { // We might have multiple chunks in the reads_channel // each of which requires a database - ch_input_for_malt = ch_input_for_malt.combine(ch_database).view() + ch_input_for_malt = ch_input_for_malt.combine(ch_database) // Split Channels into reads and database ch_input_for_malt = ch_input_for_malt.multiMap{ meta, reads, database -> @@ -102,7 +102,7 @@ workflow METAGENOMICS_PROFILING { } .collect() .map { - log -> [['id': file(params.metagenomics_profiling_database).getBaseName()], log] + log -> [['id': label], log] } CAT_CAT_MALT ( ch_log_for_cat ) @@ -149,7 +149,6 @@ workflow METAGENOMICS_PROFILING { ch_multiqc_files = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ch_postprocessing_input = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report - KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report.view() } else if ( params.metagenomics_profiling_tool == 'kraken2' ) { From 210f60cfa5ccbc569d0dc300aa732816fc7fa38f Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 14:44:39 +0100 Subject: [PATCH 088/198] Run maltextract for each malt run --- conf/modules.config | 4 +- .../local/metagenomics_postprocessing.nf | 54 +++++++++++++----- subworkflows/local/metagenomics_profiling.nf | 57 +++++++------------ 3 files changed, 63 insertions(+), 52 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 3ebf875c2..cf1c1352c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -897,10 +897,10 @@ process { params.metagenomics_maltextract_matches ? "--matches" : "", params.metagenomics_maltextract_megansummary ? "--meganSummary" : "", params.metagenomics_maltextract_usetopalignment ? "--useTopAlignment" : "", - { meta.strandedness } == "single" ? '--singleStranded' : '' + { meta.strandedness } == "single" ? 
'--singleStranded' : '',
            ].join(' ').trim()
            publishDir = [
-                path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" },
+                path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/${meta.id}/" },
                mode: params.publish_dir_mode,
                pattern: 'results'
            ]
diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf
index ed93946b6..fde207549 100644
--- a/subworkflows/local/metagenomics_postprocessing.nf
+++ b/subworkflows/local/metagenomics_postprocessing.nf
@@ -14,29 +14,54 @@ workflow METAGENOMICS_POSTPROCESSING {
    ch_versions      = Channel.empty()
    ch_multiqc_files = Channel.empty()

-    // For MALT we have an additional step that includes maltextract+amps
    if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) {

-        ch_maltextract_input = ch_postprocessing_input.first().map {
-            meta, rma, rma_collected ->
-            [meta, rma_collected]
-        }
-
        tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist)
        ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir)

-        MALTEXTRACT ( ch_maltextract_input, tax_list, ncbi_dir)
-
-        ch_amps_input = MALTEXTRACT.out.results.map{ it[1] }
+        // MALT could have been executed multiple times (group_size parameter and strandedness)
+        // We want to combine the chunks, but run MaltExtract on double- and single-stranded libraries individually
+        ch_strandedness = ch_postprocessing_input
+            .transpose()
+            .map{ meta, reads ->
+                [
+                    meta+['id':"${meta.strandedness}stranded"],
+                    reads
+                ]
+            }
+            .groupTuple(by:0)
+
+        // there could now be two entries in the channel, so combine with the tax_list and ncbi
+        ch_maltextract_input = ch_strandedness
+            .combine(tax_list)
+            .combine(ncbi_dir)
+            .multiMap{
+                rma6:[it[0],it[1]]
+                tax_list:it[2]
+                ncbi_dir:it[3]
+            }

-        AMPS ( ch_amps_input, tax_list, params.metagenomics_maltextract_filter )
+        // Run MaltExtract
+        MALTEXTRACT ( ch_maltextract_input.rma6, ch_maltextract_input.tax_list, ch_maltextract_input.ncbi_dir)

+        // Now we need to run AMPS for each MaltExtract output
+        ch_amps_input = MALTEXTRACT.out.results.map{ it[1] }

-        //Also, prepare Malt for taxpasta by running rma2info
-        ch_rma2info_input = ch_postprocessing_input.map {
-            meta, rma, rma_collected ->
-            [ meta, rma ]
+        AMPS ( ch_amps_input, ch_maltextract_input.tax_list, params.metagenomics_maltextract_filter )
+
+        // Now, prepare the MALT rma6 output for taxpasta by running rma2info
+        ch_rma2info_input = ch_postprocessing_input
+            .transpose()
+            .map {
+                meta, rma ->
+                // re-extract meta from file names, use filename without rma to
+                // ensure we keep paired-end information in downstream filenames
+                // when no pair-merging
+                [
+                    meta+['db_name':meta.id, 'id': rma.baseName],
+                    rma
+                ]
            }

        MEGAN_RMA2INFO( ch_rma2info_input, true )
diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf
index cd8c32c38..42134ffce 100644
--- a/subworkflows/local/metagenomics_profiling.nf
+++ b/subworkflows/local/metagenomics_profiling.nf
@@ -21,6 +21,15 @@ workflow METAGENOMICS_PROFILING {
    ch_multiqc_files        = Channel.empty()
    ch_postprocessing_input = Channel.empty()

+    //FOR TESTING
+    def x=1
+    ch_reads = ch_reads.map{ meta, reads ->
+        [
+            meta + ['strandedness': x++%2==0 ? 
'single' : 'double' ],
+            reads
+        ]
+    }
+
    /*
        PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
    */
@@ -60,6 +69,7 @@ workflow METAGENOMICS_PROFILING {
            ]
        }
        .groupTuple(by:0)
+        .view()

        // We might have multiple chunks in the reads_channel
        // each of which requires a database
@@ -74,45 +84,22 @@ workflow METAGENOMICS_PROFILING {
        // Run MALT
        MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database )

-        ch_maltrun_for_megan = MALT_RUN.out.rma6
-            .transpose()
-            .map {
-                meta, rma ->
-                // re-extract meta from file names, use filename without rma to
-                // ensure we keep paired-end information in downstream filenames
-                // when no pair-merging
-                [
-                    meta+['db_name':meta.id, 'id': rma.baseName],
-                    rma
-                ]
-            }
-
-        ch_maltrun_for_maltextract = MALT_RUN.out.rma6.map {
-            id,rma6 -> rma6
-        }
-        .collect()
-        .toList()
-
-        // Recombine log files for outputting if parallel execution was run
-        if ( params.metagenomics_malt_group_size > 0 ) {
-            ch_log_for_cat =
-                MALT_RUN.out.log
-                    .map {
-                        meta,log -> log
-                    }
-                    .collect()
-                    .map {
-                        log -> [['id': label], log]
-                    }
-
-            CAT_CAT_MALT ( ch_log_for_cat )
-        }
+        // Recombine log files for outputting
+        ch_log_for_cat =
+            MALT_RUN.out.log
+                .map {
+                    meta,log -> log
+                }
+                .collect()
+                .map {
+                    log -> [['id': label], log]
+                }

-        ch_maltrun_for_postprocessing = ch_maltrun_for_megan.combine(ch_maltrun_for_maltextract)
+        CAT_CAT_MALT ( ch_log_for_cat )

        ch_versions             = MALT_RUN.out.versions.first()
        ch_multiqc_files        = MALT_RUN.out.log
-        ch_postprocessing_input = ch_maltrun_for_postprocessing
+        ch_postprocessing_input = MALT_RUN.out.rma6

    }

    else if ( params.metagenomics_profiling_tool == 'metaphlan' ) {

From f6dc0033917f35c37e36b6cfd6786c5a969198ea Mon Sep 17 00:00:00 2001
From: Merlin Szymanski
Date: Wed, 20 Mar 2024 15:05:31 +0100
Subject: [PATCH 089/198] Fix grouping of reads by malt_group_size for malt run

---
 subworkflows/local/metagenomics_profiling.nf | 26 +++++++++-----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf
index 42134ffce..4e3b5e23a 100644
--- a/subworkflows/local/metagenomics_profiling.nf
+++ b/subworkflows/local/metagenomics_profiling.nf
@@ -21,15 +21,6 @@ workflow METAGENOMICS_PROFILING {
    ch_multiqc_files        = Channel.empty()
    ch_postprocessing_input = Channel.empty()

-    //FOR TESTING
-    def x=1
-    ch_reads = ch_reads.map{ meta, reads ->
-        [
-            meta + ['strandedness': x++%2==0 ? 'single' : 'double' ],
-            reads
-        ]
-    }
-
    /*
        PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
    */
@@ -53,26 +44,33 @@ workflow METAGENOMICS_PROFILING {

    def label = file(params.metagenomics_profiling_database).getBaseName()

+
+    // For the next step we need the number of groups for the specified number of input files
+    ch_groups = params.metagenomics_malt_group_size > 0 ? ch_reads.collate(params.metagenomics_malt_group_size).count() : Channel.of(1)
+    // this is for enumerating the channel entries in the ch_reads channel
    def n = 0

    //replace the meta in a way that groupTuple splits the entries
    //by strandedness and metagenomics_malt_group_size
+    //NOTE: known limitations
+    // this method splits the entries into groups of size malt_group_size, but if there is a mix of ds and ss entries
+    // the groups are split again by ds and ss with groupTuple. 
So they might end up smaller than malt_group_size + // could be prevented by branching early and running the lower part twice for ss and ds individually + // but this is an edge-case and might never be relevant... + ch_input_for_malt = ch_reads.combine(ch_groups).map{ meta, reads, n_groups -> [ [ label: label, strandedness:meta.strandedness, - id:"${meta.strandedness}stranded_${params.metagenomics_malt_group_size > 0 ? n++%params.metagenomics_malt_group_size : 'all'}" + id:"${meta.strandedness}stranded_${n++%n_groups}" ], reads ] } .groupTuple(by:0) - .view() // We might have multiple chunks in the reads_channel - // each of which requires a database + // each of which requires the database ch_input_for_malt = ch_input_for_malt.combine(ch_database) // Split Channels into reads and database From a5c39bf6b3d03cca908f93377fb131b1323ae8d0 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 15:22:24 +0100 Subject: [PATCH 090/198] Code cleanup done --- subworkflows/local/metagenomics_profiling.nf | 48 ++++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 4e3b5e23a..0cf7d258c 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -74,10 +74,11 @@ workflow METAGENOMICS_PROFILING { ch_input_for_malt = ch_input_for_malt.combine(ch_database) // Split Channels into reads and database - ch_input_for_malt = ch_input_for_malt.multiMap{ meta, reads, database -> - reads: [meta, reads] - database: database - } + ch_input_for_malt = ch_input_for_malt + .multiMap{ meta, reads, database -> + reads: [meta, reads] + database: database + } // Run MALT MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.database ) @@ -102,11 +103,14 @@ workflow METAGENOMICS_PROFILING { else if ( params.metagenomics_profiling_tool == 'metaphlan' ) { - reads = reads.combine(database) - metaphlan_reads = reads.map{ meta, reads, database -> [meta, reads] } - metaphlan_db = reads.map{ meta, reads, database -> [database] } + ch_metaphlan_input = ch_reads + .combine(ch_database) + .multiMap{ meta, reads, db -> + reads: [meta, reads] + database: db + } - METAPHLAN_METAPHLAN ( metaphlan_reads , metaphlan_db ) + METAPHLAN_METAPHLAN ( ch_metaphlan_input.reads , ch_metaphlan_input.database ) ch_versions = METAPHLAN_METAPHLAN.out.versions.first() ch_postprocessing_input = METAPHLAN_METAPHLAN.out.profile @@ -115,14 +119,17 @@ workflow METAGENOMICS_PROFILING { else if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { // run krakenuniq once for all samples - ch_reads = ch_reads.map{ meta, file -> file } - .collect() - .map{files -> [ - ['single_end':true], files - ]} + ch_krakenuniq_input = ch_reads + .map{ meta, file -> + [ + ['single_end':true], + file + ] + } + .groupTuple(by:0) KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( - ch_reads, + ch_krakenuniq_input, ch_database, params.metagenomics_krakenuniq_ramchunksize, params.metagenomics_kraken_savereads, @@ -140,13 +147,16 @@ workflow METAGENOMICS_PROFILING { // run kraken2 per sample // it lacks the option of krakenuniq - ch_reads = ch_reads.combine(ch_database) - ch_kraken2_reads = ch_reads.map{meta, reads, database -> [meta, reads]} - ch_kraken2_db = ch_reads.map{meta, reads, database -> [database]} + ch_kraken2_input = ch_reads + .combine(ch_database) + .multiMap{ meta, reads, db -> + reads: [meta, reads] + database: db + } KRAKEN2_KRAKEN2 ( - ch_kraken2_reads, - ch_kraken2_db, + 
ch_kraken2_input.reads, + ch_kraken2_input.database, params.metagenomics_kraken_savereads, params.metagenomics_kraken_savereadclassifications ) From 89478bd86c0409ffebd34cd78f21f6bf96f837dd Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 15:37:12 +0100 Subject: [PATCH 091/198] Testing: Fix taxpasta merge w/ krakenuniq --- .../local/metagenomics_postprocessing.nf | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index fde207549..f1626b8fe 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -72,25 +72,27 @@ workflow METAGENOMICS_POSTPROCESSING { } // Run taxpasta for everything! - ch_report_count = ch_postprocessing_input.count() + // We need to know how many reports we have, so that we can run either taxpasta standardise or taxpasta merge + ch_report_count = ch_postprocessing_input.transpose().count() ch_postprocessing_input = ch_postprocessing_input - .map{ - meta, report -> - [ + .transpose() + .map{ + meta, report -> [ - "id":"${params.metagenomics_profiling_tool}_profiles_all_samples_merged_taxpasta", - "profiler":params.metagenomics_profiling_tool == 'malt' ? 'megan6' : params.metagenomics_profiling_tool - ], - report - ] - } - .groupTuple(by:0) - .combine(ch_report_count) - .branch{ - standardise: it[2] == 1 - merge: true - } + [ + "id":"${params.metagenomics_profiling_tool}_profiles_all_samples_merged_taxpasta", + "profiler":params.metagenomics_profiling_tool == 'malt' ? 'megan6' : params.metagenomics_profiling_tool + ], + report + ] + } + .groupTuple(by:0) + .combine(ch_report_count) + .branch{ + standardise: it[2] == 1 + merge: true + } ch_standardise_input = ch_postprocessing_input.standardise.map{ meta, reports, count -> [meta, reports] From c3fbbd640158ace81885c6eff9b40a71a4ebf947 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 16:26:12 +0100 Subject: [PATCH 092/198] resolve merge-conflict in manual_test.md --- docs/development/manual_tests.md | 226 +++++++++++++++++++++++-------- 1 file changed, 169 insertions(+), 57 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index dac9131c5..b08736550 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -350,8 +350,6 @@ All possible parameters Tests -## NOTE: metagenomics input generation will now fail pre-pipeline parameter checks, since --run_metagenomics requires the subsequent declaration of --metagenomics_profiling_tool and --metagenomics_profiling_database! 
- ```bash ## Check no BAM filtering ## Expect: full completion of pipeline without any bam filtering execution @@ -636,27 +634,7 @@ nextflow run main.nf -profile test,docker \ --metagenomics_kraken_savereadclassifications ``` -<<<<<<< HEAD ##### kraken2 -======= -#### With mapDamage2 - -```bash -## mapDamage2 with default parameters -## Expect: mapdamage directory with 3pGtoA_freq, 5pCtoT_freq, dnacomp_genome, dnacomp, Fragmisincorporation_plot, Length_plot, lgdistribution, misincorporation, Runtime_log -nextflow run main.nf -profile test,docker --outdir ./results --damagecalculation_tool mapdamage -``` - -```bash -## mapDamage2 with downsampling to 100 reads -## Expect: mapdamage directory with 3pGtoA_freq, 5pCtoT_freq, dnacomp_genome, dnacomp, Fragmisincorporation_plot, Length_plot, lgdistribution, misincorporation, Runtime_log -nextflow run main.nf -profile test,docker --outdir ./results --damagecalculation_tool mapdamage --damagecalculation_mapdamage_downsample 100 -``` - -### ESTIMATE CONTAMINATION - -#### With ANGSD ->>>>>>> eager/dev ```bash #### Use kraken2 for metagenomics sequence classification, save only report (default) @@ -719,42 +697,7 @@ nextflow run main.nf -profile test,docker \ #### postprocessing -<<<<<<< HEAD ##### maltextract -======= -```bash -## PMD filtering with default parameters -## Expect: damage_manipulation directory with a bam and bai and flagstat per library (9 files total). -nextflow run . -profile test,docker --run_pmd_filtering -resume --outdir ./results -## number of reads in each file after filtering: -# JK2782_JK2782_TGGCCGATCAACGA_BAM_pmdfiltered.bam: 70 -# JK2782_JK2782_TGGCCGATCAACGA_pmdfiltered.bam: 180 -# JK2802_JK2802_AGAATAACCTACCA_pmdfiltered.bam: 55 - - -## PMD filtering with changed parameters -## Expect: damage_manipulation directory with a bam and bai and flagstat per library (9 files total). Commands checked to ensure parameter gets propagated. -nextflow run . -profile test,docker --run_pmd_filtering -resume --outdir ./results --damage_manipulation_pmdtools_threshold 4 -## number of reads in each file after filtering: -# JK2782_JK2782_TGGCCGATCAACGA_BAM_pmdfiltered.bam: 64 -# JK2782_JK2782_TGGCCGATCAACGA_pmdfiltered.bam: 137 -# JK2802_JK2802_AGAATAACCTACCA_pmdfiltered.bam: 30 -``` - -```bash -## PMD filtering with fasta masking -## Expect: damage_manipulation directory with *.masked.fa and bam and bai and flagstat per library -nextflow run . -profile test_humanbam,docker --run_pmd_filtering --damage_manipulation_pmdtools_reference_mask https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz -resume --outdir ./results -``` - -```bash -## PMD filtering with fasta masking for 1 of 2 references -## Expect: damage_manipulation directory with hs37d5_chr21-MT.masked.fa and bam and bai and flagstat per library and reference (22 files total). hs37d5_chr21-MT first masked with 1240K.pos.list_hs37d5.0based.bed.gz from reference sheet, PMD filtering run with masked reference fasta for hs37d5 and non-masked reference fasta for Mammoth_MT -nextflow run . 
-profile test_multiref,docker --run_pmd_filtering --outdir ./results -``` - -## BAM trimming ->>>>>>> eager/dev ```bash ### Create a SummaryTable from the Malt rma6 files @@ -827,6 +770,175 @@ nextflow run -resume ./main.nf -profile test,docker --outdir out \ --run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/ --metagenomics_run_postprocessing # 20230804: works ``` +## Mapping statistics + +### ENDOSPY + +All possible paramters + +``` +// BAM Filtering +params.run_bamfiltering +//Deduplication +params.skip_deduplication +``` + +Tests + +```{bash} +##Check if mapping + filtering + deduplication is done, meaning params.run_bamfiltering is true and params.skip_deduplication is false +##Expect: a json for each of the of the libraries with all the stats calculates (percent on target raw, percent on target modified, percent on target postdedup, clonality and percent duplicates) +##Checked: there is 3 jsons with all the stats calculates + +nextflow run ../main.nf -profile docker,test --outdir results_endorspy_all -w results_endorspy_all/work --run_bamfiltering + +##Check if mapping only has been performed, meaning params.run_bamfiltering is false and params.skip_deduplication is true +##Expect: a json for each of the of the libraries with only percent on target raw +##Checked: there is 3 jsons with only Percent on target (%) + +nextflow run ../main.nf -profile docker,test --outdir results_endorspy_map_only -w results_endorspy_map_only/work --skip_deduplication --run_bamfiltering false + +##Check if mapping and run_bamfiltering done but no dedepup, meaning params.run_bamfiltering is true and params.skip_deduplication is true +##Checked: there is 3 jsons with only Percent on target (%) and Percent on target modified (%) calculated + +nextflow run ../main.nf -profile docker,test --outdir results_endorspy_map_filtering_nodedup -w results_endorspy_map_filtering_nodedup/work --run_bamfiltering --skip_deduplication + +##Check if mapping and dedup done but no bam filtering, meaning params.run_bamfiltering is false and params.skip_deduplication is true +##Checked: there is 3 jsons with Percent on target (%), Percent on target postdedup (%), Clonality and Percent Duplicates (%) + +nextflow run ../main.nf -profile docker,test --outdir results_endorspy_map_nofiltering_dedup -w results_endorspy_map_nofiltering_dedup/work --run_bamfiltering false + +##Check if mapping + filtering + deduplication is done (meaning params.run_bamfiltering is true and params.skip_deduplication is false) and multiple reference used +##Expect: a json for each of the of the libraries with all the stats calculates (percent on target raw, percent on target modified, percent on target postdedup, clonality and percent duplicates) for each of the references +##Checked: there is 6 jsons with all the stats calculates: one for each of the references (2) for each of the samples (3 samples in total). All the stats were calculated. + +nextflow run ../main.nf -profile docker,test_multiref --outdir results_endorspy_all_multiref -w results_endorspy_all_multiref/work --run_bamfiltering +``` + +### CALCULATE DAMAGE + +#### With DamageProfiler + +```bash +## DamageProfiler with default parameters +## Expect:damageprofiler directory with txt, pdf, svg for each library (19 files total per library). 
+nextflow run main.nf -profile test,conda --outdir ./results -resume +``` + +#### With mapDamage2 + +```bash +## mapDamage2 with default parameters +## Expect: mapdamage directory with 3pGtoA_freq, 5pCtoT_freq, dnacomp_genome, dnacomp, Fragmisincorporation_plot, Length_plot, lgdistribution, misincorporation, Runtime_log +nextflow run main.nf -profile test,docker --outdir ./results --damagecalculation_tool mapdamage +``` + +```bash +## mapDamage2 with downsampling to 100 reads +## Expect: mapdamage directory with 3pGtoA_freq, 5pCtoT_freq, dnacomp_genome, dnacomp, Fragmisincorporation_plot, Length_plot, lgdistribution, misincorporation, Runtime_log +nextflow run main.nf -profile test,docker --outdir ./results --damagecalculation_tool mapdamage --damagecalculation_mapdamage_downsample 100 +``` + +### ESTIMATE CONTAMINATION + +#### With ANGSD + +```bash +## ANGSD contamination estimation with default parameters +## Expect: contamination_estimation/angsd directory with txt for each library and 'nuclear_contamination.txt' summary table. +nextflow run main.nf -profile test,humanbam --outdir ./results --run_contamination_angsd -resume + +## ANGSD contamination estimation with quality filters reduced +## Expect: contamination_estimation/angsd directory with txt for each library and 'nuclear_contamination.txt' summary table. +nextflow run main.nf -profile test,humanbam --outdir ./results --run_contamination_angsd --angsd_minq 0 --angsd_mapq 0 -resume +``` + +### MANIPULATE DAMAGE + +## Rescaling + +```bash +## Rescaling with default parameters +## Expect: damage_manipulation directory with a bam and bai per library (4 files total, cause one sample is full UDG), and 2 results_* directories with 6 Stats_out_MCMC_* files each. +nextflow run . -profile test,docker --run_mapdamage_rescaling -resume --outdir ./results + +## Rescaling with changed rescale lengths +## Expect: damage_manipulation directory with a bam and bai per library (4 files total, cause one sample is full UDG), and 2 results_* directories with 6 Stats_out_MCMC_* files each. +## Commands checked to ensure parameter gets propagated (Yes, together with default --seq-length of 12.) +nextflow run . -profile test,docker --run_mapdamage_rescaling --damage_manipulation_rescale_length_5p 3 --damage_manipulation_rescale_length_3p 3 -resume --outdir ./results +``` + +## PMD Filtering + +```bash +## PMD filtering with default parameters +## Expect: damage_manipulation directory with a bam and bai and flagstat per library (9 files total). +nextflow run . -profile test,docker --run_pmd_filtering -resume --outdir ./results +## number of reads in each file after filtering: +# JK2782_JK2782_TGGCCGATCAACGA_BAM_pmdfiltered.bam: 70 +# JK2782_JK2782_TGGCCGATCAACGA_pmdfiltered.bam: 180 +# JK2802_JK2802_AGAATAACCTACCA_pmdfiltered.bam: 55 + + +## PMD filtering with changed parameters +## Expect: damage_manipulation directory with a bam and bai and flagstat per library (9 files total). Commands checked to ensure parameter gets propagated. +nextflow run . -profile test,docker --run_pmd_filtering -resume --outdir ./results --damage_manipulation_pmdtools_threshold 4 +## number of reads in each file after filtering: +# JK2782_JK2782_TGGCCGATCAACGA_BAM_pmdfiltered.bam: 64 +# JK2782_JK2782_TGGCCGATCAACGA_pmdfiltered.bam: 137 +# JK2802_JK2802_AGAATAACCTACCA_pmdfiltered.bam: 30 +``` + +```bash +## PMD filtering with fasta masking +## Expect: damage_manipulation directory with *.masked.fa and bam and bai and flagstat per library +nextflow run . 
-profile test_humanbam,docker --run_pmd_filtering --damage_manipulation_pmdtools_reference_mask https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz -resume --outdir ./results +``` + +```bash +## PMD filtering with fasta masking for 1 of 2 references +## Expect: damage_manipulation directory with hs37d5_chr21-MT.masked.fa and bam and bai and flagstat per library and reference (22 files total). hs37d5_chr21-MT first masked with 1240K.pos.list_hs37d5.0based.bed.gz from reference sheet, PMD filtering run with masked reference fasta for hs37d5 and non-masked reference fasta for Mammoth_MT +nextflow run . -profile test_multiref,docker --run_pmd_filtering --outdir ./results +``` + +## BAM trimming + +```bash +## BAM trimming with default parameters (0bp trim) +## Expect: damage_manipulation directory with a bam and bai per library. No trimming actually done. (6 files total. full UDG still goes through module but trimming is 0bp) +nextflow run . -profile test,docker --run_trim_bam -resume --outdir ./results + +## BAM trimming with changed parameters +## Expect: damage_manipulation directory with a bam and bai per library. Trimming is done. 0 bp for full UDG, 1-2bp for half, 5-7 for none. (6 files total) +## Giving different on each side to make sure arguments are passed correctly. +nextflow run . -profile test,docker \ + -resume \ + --outdir ./results \ + --run_trim_bam \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 +``` + +## All together + +```bash +## All together with default parameters + non-0 trimming. +## Expect: damage_manipulation directory with _pmdfiltered, and _pmdfiltered_trimmed bams and bai per library, plus pmd_filtered flagstat files. (5 * 3 = 15 files total). +## Also _rescaled bam/bai for libraries that are not full-UDG. (15 + 4 = 19 files total), and 2 results_* directories with 6 Stats_out_MCMC_* files each. +## Number of reads in each file after trimming should match filtered flagstat. +nextflow run . 
-profile test,docker \ + -resume \ + --outdir ./results \ + --run_mapdamage_rescaling \ + --run_pmd_filtering \ + --run_trim_bam \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 +``` ### LIBRARY_MERGE From ecff009a096d1147a4c6cc2d41541c2f6ac778f7 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 17:00:18 +0100 Subject: [PATCH 093/198] Update kraken2 module --- modules.json | 2 +- .../nf-core/kraken2/kraken2/environment.yml | 8 +++++ modules/nf-core/kraken2/kraken2/main.nf | 31 +++++++++++++++++-- modules/nf-core/kraken2/kraken2/meta.yml | 7 +++-- 4 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 modules/nf-core/kraken2/kraken2/environment.yml diff --git a/modules.json b/modules.json index 9265d092a..b6db441a5 100644 --- a/modules.json +++ b/modules.json @@ -172,7 +172,7 @@ }, "kraken2/kraken2": { "branch": "master", - "git_sha": "7c695e0147df1157413e06246d9b0094617d3e6b", + "git_sha": "653218e79ffa76fde20319e9062f8b8da5cf7555", "installed_by": ["modules"] }, "krakenuniq/preloadedkrakenuniq": { diff --git a/modules/nf-core/kraken2/kraken2/environment.yml b/modules/nf-core/kraken2/kraken2/environment.yml new file mode 100644 index 000000000..63be419bd --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/environment.yml @@ -0,0 +1,8 @@ +name: kraken2_kraken2 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kraken2=2.1.2 + - conda-forge::pigz=2.6 diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf index 5901064e7..92cd9c34f 100644 --- a/modules/nf-core/kraken2/kraken2/main.nf +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -2,10 +2,10 @@ process KRAKEN2_KRAKEN2 { tag "$meta.id" label 'process_high' - conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : - 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + 'biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" input: tuple val(meta), path(reads) @@ -55,4 +55,31 @@ process KRAKEN2_KRAKEN2 { pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) END_VERSIONS """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq.gz" : "${prefix}.classified_1.fastq.gz ${prefix}.classified_2.fastq.gz" + def unclassified = meta.single_end ? "${prefix}.unclassified.fastq.gz" : "${prefix}.unclassified_1.fastq.gz ${prefix}.unclassified_2.fastq.gz" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? 
"pigz -p $task.cpus *.fastq" : "" + + """ + touch ${prefix}.kraken2.report.txt + if [ "$save_output_fastqs" == "true" ]; then + touch $classified + touch $unclassified + fi + if [ "$save_reads_assignment" == "true" ]; then + touch ${prefix}.kraken2.classifiedreads.txt + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + } diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml index 7129fe3a0..7909ffe7e 100644 --- a/modules/nf-core/kraken2/kraken2/meta.yml +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -28,12 +28,12 @@ input: type: directory description: Kraken2 database - save_output_fastqs: - type: boolean + type: string description: | If true, optional commands are added to save classified and unclassified reads as fastq files - save_reads_assignment: - type: boolean + type: string description: | If true, an optional command is added to save a file reporting the taxonomic classification of each input read @@ -73,3 +73,6 @@ output: authors: - "@joseespinosa" - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" From a7b163ce3a50ad3f865f6b3eccd8a06f3d84310a Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 17:14:58 +0100 Subject: [PATCH 094/198] linting and update nextflow_schema.json --- assets/email_template.html | 81 ++++++++++++++++---------------------- nextflow_schema.json | 13 ++---- 2 files changed, 36 insertions(+), 58 deletions(-) diff --git a/assets/email_template.html b/assets/email_template.html index 4aac0879c..2b66b754d 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,68 +1,53 @@ - - + nf-core/eager Pipeline Report - -
+
- + -

nf-core/eager ${version}

-

Run Name: $runName

+

nf-core/eager ${version}

+

Run Name: $runName

- <% if (!success){ out << """ -
-

nf-core/eager execution completed unsuccessfully!

-

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

-

The full error message was:

-
${errorReport}
-
- """ - } else { - out << """ -
- nf-core/eager execution completed successfully! +<% if (!success){ + out << """ +
+

nf-core/eager execution completed unsuccessfully!

+

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

+

The full error message was:

+
${errorReport}
+
+ """ +} else { + out << """ +
+ nf-core/eager execution completed successfully!
""" - } - %> +} +%> -

The workflow was completed at $dateComplete (duration: $duration)

-

The command used to launch the workflow was as follows:

-
$commandLine
+

The workflow was completed at $dateComplete (duration: $duration)

+

The command used to launch the workflow was as follows:

+
$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> " - - - " }.join("\n") %> - -
- $k -
$v
-
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> "" }.join("\n") %> + +
$k
$v
-

nf-core/eager

-

https://github.com/nf-core/eager

+

nf-core/eager

+

https://github.com/nf-core/eager

-
+
- - \ No newline at end of file + diff --git a/nextflow_schema.json b/nextflow_schema.json index 8d7aa2803..4e989d26b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -831,14 +831,14 @@ "metagenomics_maltextract_taxonlist": { "type": "string", "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", - "default": "None", + "default": null, "help_text": "Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format corresponding to a taxonomic node in your MALT database. An example can be found on the [HOPS github](https://raw.githubusercontent.com/rhuebler/HOPS/external/Resources/default_list.txt).\\n\\nNecessary when `--metagenomics_profiling_tool malt` specified and `--metagenomics_run_postprocessing` flagged.", "fa_icon": "fas fa-align-left" }, "metagenomics_maltextract_ncbidir": { "type": "string", "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", - "default": "None", + "default": null, "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nNecessary when `--metagenomics_profiling_tool malt` and `--metagenomics_run_postprocessing` specified.", "fa_icon": "fab fa-buffer" }, @@ -859,47 +859,41 @@ }, "metagenomics_maltextract_destackingoff": { "type": "boolean", - "default": false, "description": "Turn off destacking.", "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--destackingOff`", "fa_icon": "fab fa-stack-overflow" }, "metagenomics_maltextract_downsamplingoff": { "type": "boolean", - "default": false, "description": "Turn off downsampling.", "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--downSampOff`", "fa_icon": "fas fa-angle-double-down" }, "metagenomics_maltextract_duplicateremovaloff": { "type": "boolean", - "default": false, "description": "Turn off duplicate removal.", "help_text": "Turn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`", "fa_icon": "fas fa-copy" }, "metagenomics_maltextract_matches": { "type": "boolean", - "default": false, "description": "Turn on exporting alignments of hits in BLAST format.", "help_text": "Export alignments of hits for each node in BLAST format. 
By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", "fa_icon": "fas fa-equals" }, "metagenomics_maltextract_megansummary": { "type": "boolean", - "default": false, "description": "Turn on export of MEGAN summary files.", "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" }, "metagenomics_maltextract_minpercentidentity": { "type": "number", - "default": 85.0, + "default": 85, "description": "Minimum percent identity alignments are required to have to be reported as candidate reads. Recommended to set same as MALT parameter.", "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" }, "metagenomics_maltextract_usetopalignment": { "type": "boolean", - "default": false, "description": "Turn on using top alignments per read after filtering.", "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--useTopAlignment`", "fa_icon": "fas fa-bahai" @@ -1502,7 +1496,6 @@ "properties": { "run_sexdeterrmine": { "type": "boolean", - "default": false, "fa_icon": "fas fa-transgender-alt", "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", "help_text": "Specify to run the optional process of sex determination." From 973f16863c0f330e729862034069a6f8aee11e24 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Wed, 20 Mar 2024 18:48:10 +0100 Subject: [PATCH 095/198] Linting again... 
---
 docs/development/manual_tests.md | 1 +
 nextflow.config                  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md
index 34023a775..0f0115c8b 100644
--- a/docs/development/manual_tests.md
+++ b/docs/development/manual_tests.md
@@ -770,6 +770,7 @@ nextflow run -resume ./main.nf -profile test,docker --outdir out \
 --run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/ --metagenomics_run_postprocessing
 # 20230804: works
 ```
+
 ## Mapping statistics

 ### ENDOSPY
diff --git a/nextflow.config b/nextflow.config
index 5af188208..74f37fb24 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -137,7 +137,7 @@ params {
     metagenomics_prinseq_dustscore               = 0.5
     metagenomics_profiling_tool                  = null
     metagenomics_profiling_database              = null
-    metagenomics_krakenuniq_ramchunksize         = '16G'
+    metagenomics_krakenuniq_ramchunksize         = "16G"
     metagenomics_kraken_savereads                = false
     metagenomics_kraken_savereadclassifications  = false

From 76b322a15d3de1121c1cf984772d47bd7125c952 Mon Sep 17 00:00:00 2001
From: Merlin Szymanski
Date: Fri, 12 Apr 2024 12:57:59 +0200
Subject: [PATCH 096/198] Add kraken2 test-profile, add UNTAR module

---
 conf/test_kraken2.config                     | 33 ++++++++++
 modules.json                                 |  5 ++
 modules/nf-core/untar/environment.yml        | 11 ++++
 modules/nf-core/untar/main.nf                | 63 ++++++++++++++++++++
 modules/nf-core/untar/meta.yml               | 46 ++++++++++++++
 nextflow.config                              |  3 +-
 nextflow_schema.json                         |  2 +-
 subworkflows/local/metagenomics_profiling.nf | 20 +++++++
 8 files changed, 181 insertions(+), 2 deletions(-)
 create mode 100644 conf/test_kraken2.config
 create mode 100644 modules/nf-core/untar/environment.yml
 create mode 100644 modules/nf-core/untar/main.nf
 create mode 100644 modules/nf-core/untar/meta.yml

diff --git a/conf/test_kraken2.config b/conf/test_kraken2.config
new file mode 100644
index 000000000..2176ea58f
--- /dev/null
+++ b/conf/test_kraken2.config
@@ -0,0 +1,33 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test for
+    metagenomics kraken2.
+
+    Use as follows:
+    nextflow run nf-core/eager -profile test_kraken2,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Kraken2 test profile'
+    config_profile_description = 'Minimal test dataset to check the metagenomics kraken2 pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv'
+
+    // Genome references
+    fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'
+
+    // Metagenomics
+    run_metagenomics = true
+    metagenomics_profiling_tool = 'kraken2'
+    metagenomics_profiling_database = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/databases/kraken/eager_test.tar.gz'
+}
diff --git a/modules.json b/modules.json
index b6db441a5..7f9ef2066 100644
--- a/modules.json
+++ b/modules.json
@@ -324,6 +324,11 @@
             "branch": "master",
             "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
             "installed_by": ["modules"]
+        },
+        "untar": {
+            "branch": "master",
+            "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
+            "installed_by": ["modules"]
         }
     }
 },
diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml
new file mode 100644
index 000000000..0c9cbb101
--- /dev/null
+++ b/modules/nf-core/untar/environment.yml
@@ -0,0 +1,11 @@
+name: untar
+
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  - conda-forge::grep=3.11
+  - conda-forge::sed=4.7
+  - conda-forge::tar=1.34
diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf
new file mode 100644
index 000000000..8a75bb957
--- /dev/null
+++ b/modules/nf-core/untar/main.nf
@@ -0,0 +1,63 @@
+process UNTAR {
+    tag "$archive"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
+        'nf-core/ubuntu:20.04' }"
+
+    input:
+    tuple val(meta), path(archive)
+
+    output:
+    tuple val(meta), path("$prefix"), emit: untar
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, ""))
+
+    """
+    mkdir $prefix
+
+    ## Ensures --strip-components only applied when top level of tar contents is a directory
+    ## If just files or multiple directories, place all in prefix
+    if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then
+        tar \\
+            -C $prefix --strip-components 1 \\
+            -xavf \\
+            $args \\
+            $archive \\
+            $args2
+    else
+        tar \\
+            -C $prefix \\
+            -xavf \\
+            $args \\
+            $archive \\
+            $args2
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 000000000..a9a2110f5 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,46 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 74f37fb24..11a264036 100644 --- a/nextflow.config +++ b/nextflow.config @@ -378,7 +378,8 @@ profiles { test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } test_humanbam { includeConfig 'conf/test_humanbam.config' } - test_multiref { includeConfig 'conf/test_multiref.config' } + test_multiref { includeConfig 'conf/test_multiref.config' } + test_kraken2 { includeConfig 'conf/test_kraken2.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index 4e989d26b..f24a80378 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -712,7 +712,7 @@ }, "metagenomics_profiling_database": { "type": "string", - "format": "directory-path", + "format": "path", "description": "Specify a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory. Required if `--run_metagenomics` flagged.", "fa_icon": "fas fa-database", "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." 
diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 0cf7d258c..2afc7280e 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -9,6 +9,7 @@ include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/k include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main' include { METAPHLAN_METAPHLAN } from '../../modules/nf-core/metaphlan/metaphlan/main' include { CAT_CAT as CAT_CAT_MALT } from '../../modules/nf-core/cat/cat/main' +include { UNTAR } from '../../modules/nf-core/untar/main' workflow METAGENOMICS_PROFILING { @@ -21,6 +22,25 @@ workflow METAGENOMICS_PROFILING { ch_multiqc_files = Channel.empty() ch_postprocessing_input = Channel.empty() + /* + UNTAR THE DATABASE IF NECESSARY + */ + + ch_database = ch_database + .branch{ + untar: it ==~ /.*.tar.gz/ + base:true + } + + // untar the database + ch_untar_input = ch_database.untar.map{ [[], it] } + + UNTAR( ch_untar_input ) + ch_untar_output = UNTAR.out.untar.map{ it[1] } + + // back to the original database channel... + ch_database = ch_database.base.mix(ch_untar_output) + /* PREPARE PROFILER INPUT CHANNELS & RUN PROFILING */ From 01943597c12982a8dedf313ac68a9804aeff2b35 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Apr 2024 13:30:22 +0200 Subject: [PATCH 097/198] Apply first suggestions from code review Co-authored-by: James A. Fellows Yates --- docs/output.md | 4 ++-- subworkflows/local/metagenomics.nf | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/output.md b/docs/output.md index 6eef71667..dba2e5b86 100644 --- a/docs/output.md +++ b/docs/output.md @@ -438,9 +438,9 @@ the output created by the `taxpasta merge` command. It combines the results of a
-#### maltextract
+#### maltExtract

-The output directory for maltextract, as implemented under [HOPS](https://github.com/rhuebler/HOPS), which applies various heuristics of ancient authenticity and presence to megan read assignments across a given set of candidate taxon.
+The output directory for maltExtract, as implemented under [HOPS](https://github.com/rhuebler/HOPS), which applies various heuristics of ancient authenticity and presence to MEGAN read assignments across a given set of candidate taxa.
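+For orientation, the taxon list supplied via `--metagenomics_maltextract_taxonlist` is a plain-text file with one NCBI taxonomy name per line; a hypothetical example (the names below are placeholders, not pipeline defaults):
+
+```
+Yersinia pestis
+Salmonella enterica
+Mycobacterium leprae
+```
+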
Output files diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 19dc06c29..a37348d4f 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -37,7 +37,6 @@ workflow METAGENOMICS { database = Channel.fromPath(params.metagenomics_profiling_database) METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) - ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) From dbc67276da89bf43b7b776ad3f4cbc806fda7941 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Apr 2024 13:31:49 +0200 Subject: [PATCH 098/198] (Review) Remove HOST_REMOVAL duplicates --- workflows/eager.nf | 68 +--------------------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 2d4bbb6b2..db5b6b56f 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -260,40 +260,7 @@ workflow EAGER { // // MODULE: remove reads mapping to the host from the raw fastq // - if ( params.run_host_removal ) { - // Preparing bam channel for host removal to be combined with the input fastq channel - // The bam channel consist of [meta, bam, bai] and in the meta we have in addition 'single_end' always set as TRUE and 'reference' set - // To be able to join it with fastq channel, we need to remove them from the meta (done in map) and stored in new_meta - ch_bam_for_host_removal= MAP.out.bam.join(MAP.out.bai) - .map{ - meta, bam, bai -> - new_meta = meta.clone().findAll{ it.key !in [ 'single_end', 'reference' ] } - [ new_meta, meta, bam, bai ] - } - // Preparing fastq channel for host removal to be combined with the bam channel - // The meta of the fastq channel contains additional fields when compared to the meta from the bam channel: lane, colour_chemistry, - // and not necessarily matching single_end. Those fields are dropped of the meta in the map and stored in new_meta - ch_fastqs_for_host_removal= ch_samplesheet_fastqs.map{ - meta, fastqs -> - new_meta = meta.clone().findAll{ it.key !in [ 'lane', 'colour_chemistry', 'single_end' ] } - [ new_meta, meta, fastqs ] - } - // We join the bam and fastq channel with now matching metas (new_meta) referred as meta_join - // and remove the meta_join from the final channel, keeping the original metas for the bam and the fastqs - ch_input_for_host_removal = ch_bam_for_host_removal.join(ch_fastqs_for_host_removal) - .map{ - meta_join, meta_bam, bam, bai, meta_fastq, fastqs -> - [ meta_bam, bam, bai, meta_fastq, fastqs] - } - HOST_REMOVAL ( ch_input_for_host_removal ) - - ch_versions = ch_versions.mix( HOST_REMOVAL.out.versions ) - } - - // - // MODULE: remove reads mapping to the host from the raw fastq - // if ( params.run_host_removal ) { // Preparing bam channel for host removal to be combined with the input fastq channel // The bam channel consist of [meta, bam, bai] and in the meta we have in addition 'single_end' always set as TRUE and 'reference' set @@ -307,7 +274,7 @@ workflow EAGER { // Preparing fastq channel for host removal to be combined with the bam channel // The meta of the fastq channel contains additional fields when compared to the meta from the bam channel: lane, colour_chemistry, // and not necessarily matching single_end. 
Those fields are dropped of the meta in the map and stored in new_meta - ch_fastqs_for_host_removal= INPUT_CHECK.out.fastqs.map{ + ch_fastqs_for_host_removal= ch_samplesheet_fastqs.map{ meta, fastqs -> new_meta = meta.clone().findAll{ it.key !in [ 'lane', 'colour_chemistry', 'single_end' ] } [ new_meta, meta, fastqs ] @@ -325,39 +292,6 @@ workflow EAGER { ch_versions = ch_versions.mix( HOST_REMOVAL.out.versions ) } - // - // MODULE: remove reads mapping to the host from the raw fastq - // - if ( params.run_host_removal ) { - // Preparing bam channel for host removal to be combined with the input fastq channel - // The bam channel consist of [meta, bam, bai] and in the meta we have in addition 'single_end' always set as TRUE and 'reference' set - // To be able to join it with fastq channel, we need to remove them from the meta (done in map) and stored in new_meta - ch_bam_for_host_removal= MAP.out.bam.join(MAP.out.bai) - .map{ - meta, bam, bai -> - new_meta = meta.clone().findAll{ it.key !in [ 'single_end', 'reference' ] } - [ new_meta, meta, bam, bai ] - } - // Preparing fastq channel for host removal to be combined with the bam channel - // The meta of the fastq channel contains additional fields when compared to the meta from the bam channel: lane, colour_chemistry, - // and not necessarily matching single_end. Those fields are dropped of the meta in the map and stored in new_meta - ch_fastqs_for_host_removal= INPUT_CHECK.out.fastqs.map{ - meta, fastqs -> - new_meta = meta.clone().findAll{ it.key !in [ 'lane', 'colour_chemistry', 'single_end' ] } - [ new_meta, meta, fastqs ] - } - // We join the bam and fastq channel with now matching metas (new_meta) referred as meta_join - // and remove the meta_join from the final channel, keeping the original metas for the bam and the fastqs - ch_input_for_host_removal = ch_bam_for_host_removal.join(ch_fastqs_for_host_removal) - .map{ - meta_join, meta_bam, bam, bai, meta_fastq, fastqs -> - [ meta_bam, bam, bai, meta_fastq, fastqs] - } - - HOST_REMOVAL ( ch_input_for_host_removal ) - - ch_versions = ch_versions.mix( HOST_REMOVAL.out.versions ) - } // // Section: Metagenomics From 28f687c1bf9bfcc5e894d9b742168fd8556bf1fd Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Apr 2024 13:43:20 +0200 Subject: [PATCH 099/198] Add citations for megan, hops and taxpasta --- CITATIONS.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index 99e5be3f2..0ce614b0a 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -130,6 +130,14 @@ > Vågene, Å.J., Herbig, A., Campana, M.G., Nelly, M., García, R., Warinner, C., Sabin, S., Spyrou, M.A., Valtueña, A.A., Huson, D., Tuross, N., Bos, K.I. & Krause, J. (2018). Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nat Ecol Evol 2, 520–528. doi: [10.1038/s41559-017-0446-6](https://doi.org/10.1038/s41559-017-0446-6) +- [HOPS](https://doi.org/10.1186/s13059-019-1903-0) + + > Hübler, R., Key, F.M., Warinner, C. et al. (2019). HOPS: automated detection and authentication of pathogen DNA in archaeological remains. Genome Biol 20, 280. doi: [10.1186/s13059-019-1903-0](https://doi.org/10.1186/s13059-019-1903-0) + +- [MEGAN](https://doi.org/10.1101/gr.5969107) + + > Daniel H. Huson, Alexander F. Auch, Ji Qi, and Stephan C. Schuster (2007). MEGAN analysis of metagenomic data. Genome Res. 
17(3): 377–386, Published in Advance January 25, 2007, doi: [10.1101/gr.5969107](https://doi.org/10.1101/gr.5969107) + - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: [10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). @@ -142,6 +150,10 @@ > Blanco-Míguez, A., Beghini, F., Cumbo, F. et al. Extending and improving metagenomic taxonomic profiling with uncharacterized species using MetaPhlAn 4. Nat Biotechnol (2023). doi: [10.1038/s41587-023-01688-w](https://doi.org/10.1038/s41587-023-01688-w) +- [TAXPASTA](https://doi.org/10.21105/joss.05627) + > Beber et al., (2023). TAXPASTA: TAXonomic Profile Aggregation and STAndardisation. Journal of Open Source Software, 8(87), 5627, doi: [10.21105/joss.05627](https://doi.org/10.21105/joss.05627) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) From 46a9cbfd50d6e2d5c12f88c7fa9d7892ff1e6755 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Apr 2024 13:55:14 +0200 Subject: [PATCH 100/198] (Review) Remove kraken_parse and kraken_merge scripts --- bin/kraken_parse.py | 96 ----------------------------------------- bin/merge_kraken_res.py | 74 ------------------------------- nextflow_schema.json | 2 +- 3 files changed, 1 insertion(+), 171 deletions(-) delete mode 100755 bin/kraken_parse.py delete mode 100755 bin/merge_kraken_res.py diff --git a/bin/kraken_parse.py b/bin/kraken_parse.py deleted file mode 100755 index abfe28961..000000000 --- a/bin/kraken_parse.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python - -# Written by Maxime Borry and released under the MIT license. -# See git repository (https://github.com/nf-core/eager) for full license text. - -import argparse -import csv - - -def _get_args(): - """This function parses and return arguments passed in""" - parser = argparse.ArgumentParser( - prog="kraken_parse", formatter_class=argparse.RawDescriptionHelpFormatter, description="Parsing kraken" - ) - parser.add_argument("krakenReport", help="path to kraken report file") - parser.add_argument( - "-c", dest="count", default=50, help="Minimum number of hits on clade to report it. Default = 50" - ) - parser.add_argument( - "-or", dest="readout", default=None, help="Read count output file. Default = .read_kraken_parsed.csv" - ) - parser.add_argument( - "-ok", dest="kmerout", default=None, help="Kmer Output file. 
Default = .kmer_kraken_parsed.csv" - ) - - args = parser.parse_args() - - infile = args.krakenReport - countlim = int(args.count) - readout = args.readout - kmerout = args.kmerout - - return (infile, countlim, readout, kmerout) - - -def _get_basename(file_name): - if ("/") in file_name: - basename = file_name.split("/")[-1].split(".")[0] - else: - basename = file_name.split(".")[0] - return basename - - -def parse_kraken(infile, countlim): - """ - INPUT: - infile (str): path to kraken report file - countlim (int): lowest count threshold to report hit - OUTPUT: - resdict (dict): key=taxid, value=readCount - """ - with open(infile, "r") as f: - read_dict = {} - kmer_dict = {} - csvreader = csv.reader(f, delimiter="\t") - for line in csvreader: - if line[0].startswith("#") or line[0] == "%": - continue - reads = int(line[1]) - if reads >= countlim: - taxid = line[6] - kmer = line[3] - unique_kmer = line[4] - try: - kmer_duplicity = float(kmer) / float(unique_kmer) - except ZeroDivisionError: - kmer_duplicity = 0 - read_dict[taxid] = reads - kmer_dict[taxid] = kmer_duplicity - - return (read_dict, kmer_dict) - - -def write_output(resdict, infile, outfile): - with open(outfile, "w") as f: - basename = _get_basename(infile) - f.write(f"TAXID,{basename}\n") - for akey in resdict.keys(): - f.write(f"{akey},{resdict[akey]}\n") - - -if __name__ == "__main__": - INFILE, COUNTLIM, readout, kmerout = _get_args() - - if not readout: - read_outfile = _get_basename(INFILE) + ".read_kraken_parsed.csv" - else: - read_outfile = readout - if not kmerout: - kmer_outfile = _get_basename(INFILE) + ".kmer_kraken_parsed.csv" - else: - kmer_outfile = kmerout - - read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM) - write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile) - write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile) diff --git a/bin/merge_kraken_res.py b/bin/merge_kraken_res.py deleted file mode 100755 index f13a38ee2..000000000 --- a/bin/merge_kraken_res.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 - -# Written by Maxime Borry and released under the MIT license. -# Modifications for DSL2 compliance and integration into eager DSL2 release by @ilight1542 -# See git repository (https://github.com/nf-core/eager) for full license text. - -import argparse -import os -import pandas as pd -import numpy as np - - -def _get_args(): - """This function parses and return arguments passed in""" - parser = argparse.ArgumentParser( - prog="merge_kraken_res", - formatter_class=argparse.RawDescriptionHelpFormatter, - description="Merging csv count files in one table", - ) - parser.add_argument( - "-or", - dest="readout", - default="kraken_read_count_table.csv", - help="Read count output file. Default = kraken_read_count_table.csv", - ) - parser.add_argument( - "-ok", - dest="kmerout", - default="kraken_kmer_unicity_table.csv", - help="Kmer unicity output file. 
Default = kraken_kmer_unicity_table.csv", - ) - args = parser.parse_args() - - readout = args.readout - kmerout = args.kmerout - - return (readout, kmerout) - - -def get_csv(): - tmp = [i for i in os.listdir() if ".csv" in i] - kmer = [i for i in tmp if ".kmer_" in i] - read = [i for i in tmp if ".read_" in i] - return (read, kmer) - - -def _get_basename(file_name): - if ("/") in file_name: - basename = file_name.split("/")[-1].split(".")[0] - else: - basename = file_name.split(".")[0] - return basename - - -def merge_csv(all_csv): - df = pd.read_csv(all_csv[0], index_col=0) - for i in range(1, len(all_csv)): - df_tmp = pd.read_csv(all_csv[i], index_col=0) - df = pd.merge(left=df, right=df_tmp, on="TAXID", how="outer", validate="1:1") - df.fillna(0, inplace=True) - return df - - -def write_csv(pd_dataframe, outfile): - pd_dataframe.to_csv(outfile) - - -if __name__ == "__main__": - READOUT, KMEROUT = _get_args() - reads, kmers = get_csv() - read_df = merge_csv(reads) - kmer_df = merge_csv(kmers) - write_csv(read_df, READOUT) - write_csv(kmer_df, KMEROUT) diff --git a/nextflow_schema.json b/nextflow_schema.json index 4e989d26b..449758f21 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -792,7 +792,7 @@ "default": 1, "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt or kraken. Not compatible with --malt_min_support_mode 'percent'.", "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "For usage in malt or kraken: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \n For usage in kraken2 or krakenuniq: Specify the number of hits on a clade to retain it in the final report when using kraken_parse. Default: 1. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. \n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" + "help_text": "For usage in malt: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'.Default: 1. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. \n\n> Modifies MALT parameter: `-sup` \n" }, "metagenomics_malt_maxqueries": { "type": "integer", From e2ade6d22ea35665a56f2ea5b2348ff82c6a4156 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Apr 2024 14:24:25 +0200 Subject: [PATCH 101/198] Apply minor suggestions from code review Missed in the first screening Co-authored-by: James A. Fellows Yates --- conf/test_humanbam.config | 2 +- docs/output.md | 2 +- nextflow_schema.json | 2 +- subworkflows/local/metagenomics_postprocessing.nf | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/test_humanbam.config b/conf/test_humanbam.config index a6dfa5c88..a1d30952f 100644 --- a/conf/test_humanbam.config +++ b/conf/test_humanbam.config @@ -48,5 +48,5 @@ params { bamfiltering_mappingquality = 37 // Metagenomic screening - run_metagenomic = false + run_metagenomics = false } diff --git a/docs/output.md b/docs/output.md index dba2e5b86..f66ca8f17 100644 --- a/docs/output.md +++ b/docs/output.md @@ -428,7 +428,7 @@ The output system of KrakenUniq can result in other `stdout` or `stderr` logging #### taxpasta -the output created by the `taxpasta merge` command. It combines the results of all the samples analyzed with eager. The file provides an overview of the classification results for all samples combined +the output created by the `taxpasta merge` command. 
It combines the results of all the samples analyzed with a given metagenomic classifier by nf-core/eager. The file provides an overview of the classification results for all samples combined.
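For reference, the merge is conceptually equivalent to calling taxpasta on the per-sample reports directly (a sketch; the `--profiler` value must match the classifier actually used, and the report file names here are placeholders):

```bash
taxpasta merge \
    --profiler kraken2 \
    --output all_samples_merged.tsv \
    sample1.kraken2.report.txt sample2.kraken2.report.txt
```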
Output files diff --git a/nextflow_schema.json b/nextflow_schema.json index 449758f21..695393feb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -734,7 +734,7 @@ "default": "16G", "description": "Specify how large to chunk database when loading into memory for KrakenUniq", "fa_icon": "fas fa-database", - "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database to with this parameter, and is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n" + "help_text": "nf-core/eager utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database to with this parameter, and is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n" }, "metagenomics_kraken2_saveminimizers": { "type": "boolean", diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index f1626b8fe..1a0a5c10b 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -26,7 +26,7 @@ workflow METAGENOMICS_POSTPROCESSING { .transpose() .map{ meta, reads -> [ - meta+['id':"${meta.strandedness}stranded"], + meta + [ 'id': "${meta.strandedness}stranded" ], reads ] } @@ -43,7 +43,7 @@ workflow METAGENOMICS_POSTPROCESSING { } //RUN MaltExtract - MALTEXTRACT ( ch_maltextract_input.rma6, ch_maltextract_input.tax_list, ch_maltextract_input.ncbi_dir) + MALTEXTRACT ( ch_maltextract_input.rma6, ch_maltextract_input.tax_list, ch_maltextract_input.ncbi_dir ) // now we need to run AMPS for each MALTEXTRACT output ch_amps_input = MALTEXTRACT.out.results.map{ it[1] } @@ -59,7 +59,7 @@ workflow METAGENOMICS_POSTPROCESSING { // ensure we keep paired-end information in downstream filenames // when no pair-merging [ - meta+['db_name':meta.id, 'id': rma.baseName], + meta + ['db_name': meta.id, 'id': rma.baseName ], rma ] } From a2f62702ffcbdbf370b6e98aa93250ab5d0d8cd3 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 12 Apr 2024 15:11:51 +0200 Subject: [PATCH 102/198] (Review) Apply suggestions from code review --- conf/modules.config | 24 +++++++++---------- docs/development/manual_tests.md | 10 -------- .../local/metagenomics_postprocessing.nf | 2 +- subworkflows/local/metagenomics_profiling.nf | 7 +++--- 4 files changed, 17 insertions(+), 26 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7b567e81c..66c94fc70 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -856,7 +856,7 @@ process { ext.prefix = { "${meta.sample_id}_${meta.library_id}_complexity" } publishDir = [ [ 
- path: { "${params.outdir}/metagenomics_screening/complexity_filter/prinseq" }, + path: { "${params.outdir}/metagenomics/complexity_filter/prinseq" }, mode: params.publish_dir_mode, pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}', enabled: params.metagenomics_complexity_savefastq @@ -869,7 +869,7 @@ process { ext.args = { "entropymask=f entropy=${params.metagenomics_complexity_entropy}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_complexity" } publishDir = [ - path: { "${params.outdir}/metagenomics_screening/complexity_filter/bbduk/" }, + path: { "${params.outdir}/metagenomics/complexity_filter/bbduk/" }, mode: params.publish_dir_mode, pattern: '*.{fastq.gz,log}', enabled: params.metagenomics_complexity_savefastq @@ -888,7 +888,7 @@ process { params.metagenomics_malt_savereads ? "--alignments ./" : "" ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, + path: { "${params.outdir}/metagenomics/profiling/malt/" }, mode: params.publish_dir_mode, pattern: '*.{rma6,log,sam.gz}' ] @@ -898,7 +898,7 @@ process { withName: CAT_CAT_MALT { ext.prefix = { "${meta.id}_runtime_log_concatenated.log" } publishDir = [ - path: { "${params.outdir}/metagenomics_screening/profiling/malt/" }, + path: { "${params.outdir}/metagenomics/profiling/malt/" }, mode: params.publish_dir_mode, pattern: '*.{log}' ] @@ -910,7 +910,7 @@ process { ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}" } publishDir = [ - path: { "${params.outdir}/metagenomics_screening/profiling/kraken2/" }, + path: { "${params.outdir}/metagenomics/profiling/kraken2/" }, mode: params.publish_dir_mode, pattern: '*.{txt,fastq.gz}' ] @@ -918,7 +918,7 @@ process { withName: ".*KRAKENUNIQ_PRELOADEDKRAKENUNIQ" { publishDir = [ - path: { "${params.outdir}/metagenomics_screening/profiling/krakenuniq/" }, + path: { "${params.outdir}/metagenomics/profiling/krakenuniq/" }, mode: params.publish_dir_mode, pattern: '*.{txt,fastq.gz}' ] @@ -926,7 +926,7 @@ process { withName: METAPHLAN_METAPHLAN { publishDir = [ - path: { "${params.outdir}/metagenomics_screening/profiling/metaphlan/" }, + path: { "${params.outdir}/metagenomics/profiling/metaphlan/" }, mode: params.publish_dir_mode, pattern: '*.{biom,txt}' ] @@ -947,7 +947,7 @@ process { { meta.strandedness } == "single" ? 
'--singleStranded' : '', ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/${meta.id}/" }, + path: { "${params.outdir}/metagenomics/postprocessing/maltextract/${meta.id}/" }, mode: params.publish_dir_mode, pattern: 'results' ] @@ -958,7 +958,7 @@ process { ext.args = "-c2c Taxonomy" ext.prefix = { "${meta.id}" } publishDir = [ - path: { "${params.outdir}/metagenomics_screening/postprocessing/megan_summaries/" }, + path: { "${params.outdir}/metagenomics/postprocessing/megan_summaries/" }, mode: params.publish_dir_mode, pattern: '*.{txt.gz,megan}' ] @@ -966,7 +966,7 @@ process { withName: AMPS { publishDir = [ - path: { "${params.outdir}/metagenomics_screening/postprocessing/maltextract/" }, + path: { "${params.outdir}/metagenomics/postprocessing/maltextract/" }, mode: params.publish_dir_mode, pattern: 'results' ] @@ -975,7 +975,7 @@ process { withName: TAXPASTA_MERGE { publishDir = [ - path: { "${params.outdir}/metagenomics_screening/postprocessing/taxpasta/" }, + path: { "${params.outdir}/metagenomics/postprocessing/taxpasta/" }, mode: params.publish_dir_mode, pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}' ] @@ -984,7 +984,7 @@ process { withName: TAXPASTA_STANDARDISE { publishDir = [ - path: { "${params.outdir}/metagenomics_screening/postprocessing/taxpasta/" }, + path: { "${params.outdir}/metagenomics/postprocessing/taxpasta/" }, mode: params.publish_dir_mode, pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}' ] diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 0f0115c8b..1b0ac787b 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -761,16 +761,6 @@ nextflow run ../main.nf -profile docker \ --metagenomics_malt_group_size 3 ``` -##### mergemetaphlantables - -(update: Jan 2024, removed, parsing with taxpasta) - -```bash -nextflow run -resume ./main.nf -profile test,docker --outdir out \ ---run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/ --metagenomics_run_postprocessing -# 20230804: works -``` - ## Mapping statistics ### ENDOSPY diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 1a0a5c10b..e4078229a 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -32,7 +32,7 @@ workflow METAGENOMICS_POSTPROCESSING { } .groupTuple(by:0) - // could no be two entries in the channel, so combine with the tax_list and ncbi + // could now be two entries in the channel, so combine with the tax_list and ncbi ch_maltextract_input = ch_strandedness .combine(tax_list) .combine(ncbi_dir) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 0cf7d258c..d74195803 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -45,8 +45,9 @@ workflow METAGENOMICS_PROFILING { def label = file(params.metagenomics_profiling_database).getBaseName() - // For the next step we need the number of groups for the spezified number of input files - ch_groups = params.metagenomics_malt_group_size > 0 ? 
ch_reads.collate(params.metagenomics_malt_group_size).count() : Channel.of(1) + // For the next step we need the number of analysis-groups for the specified number of input files + // since we work with channels, we need a channel that stores that information + ch_tmp_groups = params.metagenomics_malt_group_size > 0 ? ch_reads.collate(params.metagenomics_malt_group_size).count() : Channel.of(1) // this is for enumerating the channel-entries in the ch_reads channel def n = 0 @@ -57,7 +58,7 @@ workflow METAGENOMICS_PROFILING { // the groups are split again by ds and ss with groupTuple. So they might end up smaller than malt_group_size // could be prevented by branching early and running the lower part twice for ss and ds individually // but this is an edge-case and might never be relevant... - ch_input_for_malt = ch_reads.combine(ch_groups).map{ meta, reads, n_groups -> + ch_input_for_malt = ch_reads.combine(ch_tmp_groups).map{ meta, reads, n_groups -> [ [ label: label, From 1ef0ee963725d19f7aa1ae5a6c631a10255131fe Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 26 Apr 2024 08:50:43 +0000 Subject: [PATCH 103/198] added config for malt testing --- conf/test_malt.config | 33 +++++++++++++++++++++++++++++++++ nextflow.config | 1 + 2 files changed, 34 insertions(+) create mode 100644 conf/test_malt.config diff --git a/conf/test_malt.config b/conf/test_malt.config new file mode 100644 index 000000000..b53333d9d --- /dev/null +++ b/conf/test_malt.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test for + metagenomics malt. 
+ + Use as follows: + nextflow run nf-core/eager -profile test_malt, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'malt test profile' + config_profile_description = 'Minimal test dataset to check the metagenomics malt pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv' + + // Genome references + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + + // Metagenomics + run_metagenomics = true + metagenomics_profiling_tool = 'malt' + metagenomics_profiling_database = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/databases/malt/eager_test.tar.gz' +} diff --git a/nextflow.config b/nextflow.config index 11a264036..edaaa691b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -380,6 +380,7 @@ profiles { test_humanbam { includeConfig 'conf/test_humanbam.config' } test_multiref { includeConfig 'conf/test_multiref.config' } test_kraken2 { includeConfig 'conf/test_kraken2.config' } + test_malt { includeConfig 'conf/test_malt.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 34135235c15b8c65b2f81064bd8ac3278ef076be Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 26 Apr 2024 10:52:51 +0200 Subject: [PATCH 104/198] Add krakenuniq test profile --- conf/test_krakenuniq.config | 33 +++++++++++++++++++++++++++++++++ nextflow.config | 13 +++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 conf/test_krakenuniq.config diff --git a/conf/test_krakenuniq.config b/conf/test_krakenuniq.config new file mode 100644 index 000000000..9528c336a --- /dev/null +++ b/conf/test_krakenuniq.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test for + metagenomics krakenuniq. 
+ + Use as follows: + nextflow run nf-core/eager -profile test_krakenuniq, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'KrakenUniq test profile' + config_profile_description = 'Minimal test dataset to check the metagenomics krakenuniq pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv' + + // Genome references + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + + // Metagenomics + run_metagenomics = true + metagenomics_profiling_tool = 'krakenuniq' + metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/data/database/krakenuniq/testdb-krakenuniq.tar.gz' +} diff --git a/nextflow.config b/nextflow.config index edaaa691b..09459ba6c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -375,12 +375,13 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } - test_humanbam { includeConfig 'conf/test_humanbam.config' } - test_multiref { includeConfig 'conf/test_multiref.config' } - test_kraken2 { includeConfig 'conf/test_kraken2.config' } - test_malt { includeConfig 'conf/test_malt.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_humanbam { includeConfig 'conf/test_humanbam.config' } + test_multiref { includeConfig 'conf/test_multiref.config' } + test_kraken2 { includeConfig 'conf/test_kraken2.config' } + test_malt { includeConfig 'conf/test_malt.config' } + test_krakenuniq { includeConfig 'conf/test_krakenuniq.config'} } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From b0b59df65f01c5747e9e866d6abd76462b9c8a97 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 24 May 2024 11:59:08 +0200 Subject: [PATCH 105/198] Add metaphlan testprofile --- conf/test_metaphlan.config | 33 +++++++++++++++++++++++++++++++++ nextflow.config | 1 + 2 files changed, 34 insertions(+) create mode 100644 conf/test_metaphlan.config diff --git a/conf/test_metaphlan.config b/conf/test_metaphlan.config new file mode 100644 index 000000000..b44d3f592 --- /dev/null +++ b/conf/test_metaphlan.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test for + metagenomics metaphlan. 
+ + Use as follows: + nextflow run nf-core/eager -profile test_metaphlan, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Metaphlan3 test profile' + config_profile_description = 'Minimal test dataset to check the metagenomics metaphlan3 pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv' + + // Genome references + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + + // Metagenomics + run_metagenomics = true + metagenomics_profiling_tool = 'metaphlan' + metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/409834b927c3a4e9314691b1125acee1434f7dd8/data/delete_me/metaphlan4_database.tar.gz' +} diff --git a/nextflow.config b/nextflow.config index 09459ba6c..82269ae53 100644 --- a/nextflow.config +++ b/nextflow.config @@ -382,6 +382,7 @@ profiles { test_kraken2 { includeConfig 'conf/test_kraken2.config' } test_malt { includeConfig 'conf/test_malt.config' } test_krakenuniq { includeConfig 'conf/test_krakenuniq.config'} + test_metaphlan { includeConfig 'conf/test_metaphlan.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 5d4309fb03b84fa80bd9e54fedd4a6acd9152f51 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 14 Jun 2024 10:35:21 +0200 Subject: [PATCH 106/198] stage input channels in eager subworkflow, not in downstream swfs --- subworkflows/local/metagenomics.nf | 9 +++++---- subworkflows/local/metagenomics_postprocessing.nf | 13 +++++-------- workflows/eager.nf | 14 +++++++++++++- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index a37348d4f..1af725f30 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -4,6 +4,9 @@ include { METAGENOMICS_POSTPROCESSING } from './metagenomics_postprocessing' workflow METAGENOMICS { take: ch_bamfiltered_for_metagenomics + take: ch_database + take: ch_tax_list + take: ch_ncbi_dir main: // Define channels @@ -34,9 +37,7 @@ workflow METAGENOMICS { // Run the profiling subworkflow // - database = Channel.fromPath(params.metagenomics_profiling_database) - - METAGENOMICS_PROFILING( ch_reads_for_metagenomics, database ) + METAGENOMICS_PROFILING( ch_reads_for_metagenomics, ch_database ) ch_versions = ch_versions.mix( METAGENOMICS_PROFILING.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) @@ -46,7 +47,7 @@ workflow METAGENOMICS { if ( params.metagenomics_run_postprocessing || ['kraken2', 'krakenuniq'].contains(params.metagenomics_profiling_tool) ) { - METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input ) + METAGENOMICS_POSTPROCESSING ( METAGENOMICS_PROFILING.out.postprocessing_input, ch_tax_list, ch_ncbi_dir ) ch_versions = ch_versions.mix( METAGENOMICS_POSTPROCESSING.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS_POSTPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index e4078229a..fd85050db 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ 
b/subworkflows/local/metagenomics_postprocessing.nf @@ -6,9 +6,9 @@ include { MEGAN_RMA2INFO } from '../../modules/nf-core/megan/rma2info/main workflow METAGENOMICS_POSTPROCESSING { - take: - ch_postprocessing_input // different between each profiling --> postprocessing tool, - // defined in metagenomics profiling subworkflow + take: ch_postprocessing_input // different between each profiling --> postprocessing tool, defined in metagenomics profiling subworkflow + take: ch_tax_list + take: ch_ncbi_dir main: ch_versions = Channel.empty() @@ -17,9 +17,6 @@ workflow METAGENOMICS_POSTPROCESSING { // For MALT we have an additional step that includes maltextract+amps if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ) { - tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist) - ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir) - // Malt could have been executed multiple times (group_size paramter and strandedness) // We want to combine the chunks, but run MaltExtract on double and singlestranded individually ch_strandedness = ch_postprocessing_input @@ -34,8 +31,8 @@ workflow METAGENOMICS_POSTPROCESSING { // could now be two entries in the channel, so combine with the tax_list and ncbi ch_maltextract_input = ch_strandedness - .combine(tax_list) - .combine(ncbi_dir) + .combine(ch_tax_list) + .combine(ch_ncbi_dir) .multiMap{ rma6:[it[0],it[1]] tax_list:it[2] diff --git a/workflows/eager.nf b/workflows/eager.nf index db5b6b56f..b34d2fd7d 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -298,7 +298,19 @@ workflow EAGER { // if ( params.run_metagenomics ) { - METAGENOMICS ( ch_bamfiltered_for_metagenomics ) + + ch_database = Channel.fromPath(params.metagenomics_profiling_database) + + // this is for MALT + ch_tax_list = Channel.empty() + ch_ncbi_dir = Channel.empty() + + if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' ){ + ch_tax_list = Channel.fromPath(params.metagenomics_maltextract_taxonlist, checkIfExists:true) + ch_ncbi_dir = Channel.fromPath(params.metagenomics_maltextract_ncbidir, checkIfExists:true) + } + + METAGENOMICS ( ch_bamfiltered_for_metagenomics, ch_database, ch_tax_list, ch_ncbi_dir ) ch_versions = ch_versions.mix( METAGENOMICS.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( METAGENOMICS.out.ch_multiqc_files ) } From 714a7a5b90db672e1f2d152a2fef82c14a1be95a Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 14 Jun 2024 10:52:29 +0200 Subject: [PATCH 107/198] Skip the maltextract 'results' dir, save directly to process folder --- conf/modules.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 66c94fc70..9f09abba5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -947,9 +947,10 @@ process { { meta.strandedness } == "single" ? 
'--singleStranded' : '', ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/metagenomics/postprocessing/maltextract/${meta.id}/" }, + path: { "${params.outdir}/metagenomics/postprocessing/maltextract/" }, mode: params.publish_dir_mode, - pattern: 'results' + pattern: 'results', + saveAs: { "${meta.id}" } ] } From 010ffcd02b5f2e8542e24257dd5db671c1711bf4 Mon Sep 17 00:00:00 2001 From: Judith Ballesteros Date: Fri, 14 Jun 2024 11:49:23 +0200 Subject: [PATCH 108/198] Starting subworkflow for circularmapper --- modules.json | 10 +++ .../circulargenerator/environment.yml | 9 +++ .../circularmapper/circulargenerator/main.nf | 50 ++++++++++++ .../circularmapper/circulargenerator/meta.yml | 55 +++++++++++++ .../realignsamfile/environment.yml | 7 ++ .../circularmapper/realignsamfile/main.nf | 57 +++++++++++++ .../circularmapper/realignsamfile/meta.yml | 58 +++++++++++++ subworkflows/local/circularmapper.nf | 81 +++++++++++++++++++ 8 files changed, 327 insertions(+) create mode 100644 modules/nf-core/circularmapper/circulargenerator/environment.yml create mode 100644 modules/nf-core/circularmapper/circulargenerator/main.nf create mode 100644 modules/nf-core/circularmapper/circulargenerator/meta.yml create mode 100644 modules/nf-core/circularmapper/realignsamfile/environment.yml create mode 100644 modules/nf-core/circularmapper/realignsamfile/main.nf create mode 100644 modules/nf-core/circularmapper/realignsamfile/meta.yml create mode 100644 subworkflows/local/circularmapper.nf diff --git a/modules.json b/modules.json index 7b4d35fce..fc368c50a 100644 --- a/modules.json +++ b/modules.json @@ -95,6 +95,16 @@ "git_sha": "02fd5bd7275abad27aad32d5c852e0a9b1b98882", "installed_by": ["modules"] }, + "circularmapper/circulargenerator": { + "branch": "master", + "git_sha": "5890d9e73aaa803fc6be94b1822539b4204d8cff", + "installed_by": ["modules"] + }, + "circularmapper/realignsamfile": { + "branch": "master", + "git_sha": "5890d9e73aaa803fc6be94b1822539b4204d8cff", + "installed_by": ["modules"] + }, "damageprofiler": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, diff --git a/modules/nf-core/circularmapper/circulargenerator/environment.yml b/modules/nf-core/circularmapper/circulargenerator/environment.yml new file mode 100644 index 000000000..f1e1201ef --- /dev/null +++ b/modules/nf-core/circularmapper/circulargenerator/environment.yml @@ -0,0 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "circularmapper_circulargenerator" + +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::circularmapper=1.93.5 diff --git a/modules/nf-core/circularmapper/circulargenerator/main.nf b/modules/nf-core/circularmapper/circulargenerator/main.nf new file mode 100644 index 000000000..b1664d032 --- /dev/null +++ b/modules/nf-core/circularmapper/circulargenerator/main.nf @@ -0,0 +1,50 @@ +// This module does the following: +// creating a modified reference genome, with an elongation of a specified amount of bases +process CIRCULARMAPPER_CIRCULARGENERATOR { + + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/circularmapper:1.93.5--h2a3209d_3': + 'biocontainers/circularmapper:1.93.5--h2a3209d_3' }" + + input: + tuple val(meta), path(reference) + val(elong) + + output: + tuple val(meta), path("*_${elong}.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + circulargenerator -e ${elong} \ + -i ${reference} \ + -s ${prefix} \ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + circulargenerator: \$(circulargenerator -h | grep 'usage' | sed 's/usage: CircularGenerator//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_${elong}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + circulargenerator: \$(circulargenerator -h | grep 'usage' | sed 's/usage: CircularGenerator//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/circularmapper/circulargenerator/meta.yml b/modules/nf-core/circularmapper/circulargenerator/meta.yml new file mode 100644 index 000000000..3e6a51ada --- /dev/null +++ b/modules/nf-core/circularmapper/circulargenerator/meta.yml @@ -0,0 +1,55 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "circularmapper_circulargenerator" +description: A method to improve mappings on circular genomes, using the BWA mapper. +keywords: + - sort + - example + - genomics +tools: + - "circulargenerator": + description: "Creating a modified reference genome, with an elongation of the an specified amount of bases" + homepage: "https://github.com/apeltzer/CircularMapper" + documentation: "https://github.com/apeltzer/CircularMapper/blob/master/docs/contents/userguide.rst" + tool_dev_url: "https://github.com/apeltzer/CircularMapper" + doi: "no DOI available" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - reference: + type: file + description: Genome fasta file + pattern: "*.fasta" + + - elong: + type: integer + description: The number of bases that the ends of the target chromosome in the reference genome should be elongated by + +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + + - fasta: + type: file + description: Genome fasta file + pattern: "*.fasta" + +authors: + - "@apalleja" +maintainers: + - "" diff --git a/modules/nf-core/circularmapper/realignsamfile/environment.yml b/modules/nf-core/circularmapper/realignsamfile/environment.yml new file mode 100644 index 000000000..d9beb5ae1 --- /dev/null +++ b/modules/nf-core/circularmapper/realignsamfile/environment.yml @@ -0,0 +1,7 @@ +name: circularmapper_realignsamfile +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::circularmapper=1.93.5 diff --git a/modules/nf-core/circularmapper/realignsamfile/main.nf b/modules/nf-core/circularmapper/realignsamfile/main.nf new file mode 100644 index 000000000..579815df0 --- /dev/null +++ b/modules/nf-core/circularmapper/realignsamfile/main.nf @@ -0,0 +1,57 @@ +process CIRCULARMAPPER_REALIGNSAMFILE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/circularmapper:1.93.5--h4a94de4_1': + 'biocontainers/circularmapper:1.93.5--h4a94de4_1' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + val(elongation_factor) + + output: + tuple val(meta), path("*_realigned.bam") , emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.93.5' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + realignsamfile \\ + -Xmx${task.memory.toGiga()}g \\ + ${args} \\ + -e ${elongation_factor} \\ + -i ${bam} \\ + -r ${fasta} + + ## realignsamfile has a hardcoded output name. Rename if necessary to use prefix. + if [[ "${bam.getBaseName()}_realigned.bam" != "${prefix}_realigned.bam" ]]; then + mv ${bam.getBaseName()}_realigned.bam ${prefix}_realigned.bam + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + CircularMapper: ${VERSION} + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.93.5' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch ${prefix}_realigned.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + CircularMapper: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/circularmapper/realignsamfile/meta.yml b/modules/nf-core/circularmapper/realignsamfile/meta.yml new file mode 100644 index 000000000..bc4173754 --- /dev/null +++ b/modules/nf-core/circularmapper/realignsamfile/meta.yml @@ -0,0 +1,58 @@ +name: "circularmapper_realignsamfile" +description: Realign reads mapped with BWA to elongated reference genome +keywords: + - realign + - circular + - map + - reference + - fasta + - bam + - short-read + - bwa +tools: + - "circularmapper": + description: "A method to improve mappings on circular genomes such as Mitochondria." 
+ homepage: "https://circularmapper.readthedocs.io/en/latest/index.html" + documentation: "https://circularmapper.readthedocs.io/en/latest/index.html" + tool_dev_url: "https://github.com/apeltzer/CircularMapper/" + doi: "10.1186/s13059-016-0918-z" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - bam: + type: file + description: BAM/SAM file + pattern: "*.{bam,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - fasta: + type: file + description: Input elongated genome fasta + - elongation_factor: + type: integer + description: The elongation factor used when running circulargenerator, i.e. the number of bases that the ends of the target chromosome in the reference genome was elongated by +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - bam: + type: file + description: Realigned BAM file + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@shyama-mama" + - "@jbv2" + - "@TCLamnidis" diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf new file mode 100644 index 000000000..9d6abe311 --- /dev/null +++ b/subworkflows/local/circularmapper.nf @@ -0,0 +1,81 @@ +// +// Run circularmapper +// + +include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' +include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main' +include { BWA_ALN as BWA_ALN_CIRCULARMAPPER } from '../../modules/nf-core/bwa/aln/main' +include { BWA_INDEX as BWA_INDEX_CIRCULARMAPPER } from '../../modules/nf-core/bwa/index/main' +include { BWA_SAMSE as BWA_SAMSE_CIRCULARMAPPER } from '../../../modules/nf-core/bwa/samse/main' + +workflow CIRCULARMAPPER { + + take: + fasta_reference // channel (mandatory): [ val(meta), path(reference) ] + eval // channel (mandatory): val(elongation value) + fastq_reads // channel (mandatory): [ val(meta), path(reads) ] + + main: + + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.run_circularmapper ) { + + ch_reference = fasta_reference + ch_eval = eval + + CIRCULARMAPPER_CIRCULARGENERATOR(ch_reference, ch_eval) + ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() ) + + BWA_INDEX_CIRCULARMAPPER(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta) + ch_versions = ch_versions.mix( BWA_INDEX_CIRCULARMAPPER.out.versions.first() ) + + ch_input_bwa_aln = fastq_reads + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .groupTuple(by:0) + .combine( BWA_INDEX_CIRCULARMAPPER.out.index, by: 0 ) // [ [meta], fastq, bai ] + .multiMap { + combo_meta, metas, fastq, ref_bai, bai -> + def ids = metas.collect { meta -> meta.id } + fastqs: [ combo_meta + [id: ids], fastq ] + bai: [ ref_bai, bai ] + } + + BWA_ALN_CIRCULARMAPPER(ch_input_bwa_aln) + ch_versions = ch_versions.mix( BWA_ALN_CIRCULARMAPPER.out.versions.first() ) + + ch_input_bwa_samse = ch_input_bwa_aln + .combine( BWA_ALN_CIRCULARMAPPER.out.sai, by: 0 ) // [ [meta], fastq, bai, sai ] + .multiMap { + metas, fastq, ref_bai, bai, ref_sai, sai -> + fastqs: [ metas, fastq, sai ] + bai: [ ref_bai, bai ] + } + 
+ BWA_SAMSE_CIRCULARMAPPER(ch_input_bwa_samse) + ch_versions = ch_versions.mix( BWA_SAMSE_CIRCULARMAPPER.out.versions.first() ) + + ch_input_realignsamfile = BWA_SAMSE_CIRCULARMAPPER.out.bam + .combine(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta, by: 0) + .combine(ch_eval) + .multiMap { + ref_bam, bam, ref_fasta, fasta, ch_eval -> + bam: [ ref_bam, bam ] + fasta: [ ref_fasta, fasta ] + eval: [ ch_eval ] + } + + CIRCULARMAPPER_REALIGNSAMFILE(ch_input_realignsamfile) + ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) + + emit: + + bam = CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] + versions = ch_versions // channel: [ path(versions.yml) ] + + } +} From 3bbb28c65ec164b3c4f8a640576970a1c4f5529f Mon Sep 17 00:00:00 2001 From: Judith Ballesteros Date: Fri, 21 Jun 2024 11:58:59 +0200 Subject: [PATCH 109/198] adding parameters and outputs. Missing to add subworkflow to the main workflow --- CITATIONS.md | 4 ++++ conf/modules.config | 20 ++++++++++++++++++++ docs/output.md | 11 +++++++++++ nextflow.config | 4 ++++ nextflow_schema.json | 19 ++++++++++++++----- subworkflows/local/circularmapper.nf | 2 +- 6 files changed, 54 insertions(+), 6 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 3715b56a8..8c742dc34 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -126,6 +126,10 @@ > Sex.DetERRmine.py Lamnidis, T.C. et al., 2018. Ancient Fennoscandian genomes reveal origin and spread of Siberian ancestry in Europe. Nature communications, 9(1), p.5018. Available at: http://dx.doi.org/10.1038/s41467-018-07483-5. Download: https://github.com/TCLamnidis/Sex.DetERRmine + - [CircularMapper](https://doi.org/10.1186/s13059-016-0918-z) + + > Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. doi: [10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/conf/modules.config b/conf/modules.config index b03153561..c4f01939b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -529,6 +529,26 @@ process { ] } + withName: CIRCULARMAPPER_CIRCULARGENERATOR { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/mapping/circularmapper/" }, + mode: params.publish_dir_mode, + pattern: '*[0-9].fasta' + ] + } + + withName: CIRCULARMAPPER_REALIGNSAMFILE { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/mapping/circularmapper/" }, + mode: params.publish_dir_mode, + pattern: '*_realigned.bam' + ] + } + // // DEDUPLICATION // diff --git a/docs/output.md b/docs/output.md index 9237f0c55..b29959f56 100644 --- a/docs/output.md +++ b/docs/output.md @@ -634,3 +634,14 @@ When using pileupCaller for genotyping, single-stranded and double-stranded libr
[ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) is a software for analyzing next generation sequencing data. It can estimate genotype likelihoods and allele frequencies from next-generation sequencing data. The output provided is a bgzipped genotype likelihood file, containing likelihoods across all samples per reference. Users can specify the model used for genotype likelihood estimation, as well as the output format. For more information on the available options, see the [ANGSD](https://www.popgen.dk/angsd/index.php/Genotype_Likelihoods). + +#### CircularMapper + +
+Output files + +- `mapping/circularmapper` + + - `*realigned.bam`: BAM file realigned to the extended reference + +
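A quick, hedged sanity check on the realigned output (the file name below is a placeholder following the `${meta.sample_id}_${meta.library_id}_${meta.reference}` prefix convention set in `conf/modules.config`):

```bash
# Inspect the realigned BAM header and basic mapping statistics
samtools view -H mapping/circularmapper/sample1_library1_mtref_realigned.bam | grep '^@SQ'
samtools flagstat mapping/circularmapper/sample1_library1_mtref_realigned.bam
```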
diff --git a/nextflow.config b/nextflow.config index 602331d5e..d7b93a3aa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -179,6 +179,10 @@ params { mapstats_preseq_cval = 0.95 mapstats_preseq_defects_mode = false + //Circular Mapper + run_circularmapper = true + elongation_factor = 500 + // Damage Calculation options skip_damagecalculation = false damagecalculation_tool = 'damageprofiler' diff --git a/nextflow_schema.json b/nextflow_schema.json index 9b1102705..237c676ee 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1162,6 +1162,18 @@ "snpcapture_bed": { "type": "string", "description": "Path to snp capture in BED format. Provided file can also be gzipped." + }, + "run_circularmapper": { + "type": "boolean", + "default": true, + "fa_icon": "fas fa-check-circle" + }, + "elongation_factor": { + "type": "integer", + "default": 500, + "description": "Specify the number of bases to extend reference by (circularmapper only)", + "help_text": "The number of bases to extend the reference genome with. By default this is set to 500 if not specified otherwise.", + "fa_icon": "fas fa-external-link-alt" } }, "fa_icon": "fas fa-search" @@ -1312,7 +1324,7 @@ }, "contamination_estimation_angsd_hapmap": { "type": "string", - "default": "${projectDir}/assets/angsd_resources/HapMapChrX.gz", + "default": "/Users/judith_ballesteros/Documents/GitHub/eager/assets/angsd_resources/HapMapChrX.gz", "description": "Path to HapMap file of chromosome for contamination estimation..", "help_text": "The haplotype map, or \"HapMap\", records the location of haplotype blocks and their tag SNPs.", "fa_icon": "fas fa-map" @@ -1394,14 +1406,11 @@ { "$ref": "#/definitions/host_removal" }, - { - "$ref": "#/definitions/human_sex_determination" - }, { "$ref": "#/definitions/contamination_estimation" }, { - "$ref": "#/definitions/contamination_estimation" + "$ref": "#/definitions/human_sex_determination" } ] } diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index 9d6abe311..1d98dc4e5 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -74,7 +74,7 @@ workflow CIRCULARMAPPER { emit: - bam = CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] + bam = CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] versions = ch_versions // channel: [ path(versions.yml) ] } From 31f590da8db4c285c9f8d0901b96e4eadd5da6c9 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 28 Jun 2024 11:08:20 +0200 Subject: [PATCH 110/198] Fix copy-paste error in bamfiltering module --- subworkflows/local/bamfiltering.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index 41c7f7f16..95f984860 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -116,7 +116,7 @@ workflow FILTER_BAM { if ( ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads } else if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { - ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads + ch_fastq_for_metagenomics = CAT_FASTQ_MAPPED.out.reads } else if ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) { ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_UNMAPPED.out.other } 
else if ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' )) { From 25ec1dadbf6283bb1752e383691b8c0feedea769 Mon Sep 17 00:00:00 2001 From: Judith Ballesteros Date: Fri, 28 Jun 2024 11:42:31 +0200 Subject: [PATCH 111/198] adding circularmapper to map.nf --- nextflow.config | 1 - nextflow_schema.json | 19 +++++++------------ subworkflows/local/circularmapper.nf | 4 +--- subworkflows/local/map.nf | 21 +++++++++++++++++++++ 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/nextflow.config b/nextflow.config index d7b93a3aa..63663b004 100644 --- a/nextflow.config +++ b/nextflow.config @@ -180,7 +180,6 @@ params { mapstats_preseq_defects_mode = false //Circular Mapper - run_circularmapper = true elongation_factor = 500 // Damage Calculation options diff --git a/nextflow_schema.json b/nextflow_schema.json index 237c676ee..8678793dc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -603,6 +603,13 @@ "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", "help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n> Modifies Bowtie2 parameter: `--maxins`", "fa_icon": "fas fa-exchange-alt" + }, + "elongation_factor": { + "type": "integer", + "default": 500, + "description": "Specify the number of bases to extend reference by (circularmapper only)", + "help_text": "The number of bases to extend the reference genome with. By default this is set to 500 if not specified otherwise.", + "fa_icon": "fas fa-external-link-alt" } }, "fa_icon": "fas fa-layer-group" @@ -1162,18 +1169,6 @@ "snpcapture_bed": { "type": "string", "description": "Path to snp capture in BED format. Provided file can also be gzipped." - }, - "run_circularmapper": { - "type": "boolean", - "default": true, - "fa_icon": "fas fa-check-circle" - }, - "elongation_factor": { - "type": "integer", - "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only)", - "help_text": "The number of bases to extend the reference genome with. 
By default this is set to 500 if not specified otherwise.", - "fa_icon": "fas fa-external-link-alt" } }, "fa_icon": "fas fa-search" diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index 1d98dc4e5..73578bda1 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -6,7 +6,7 @@ include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/ci include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main' include { BWA_ALN as BWA_ALN_CIRCULARMAPPER } from '../../modules/nf-core/bwa/aln/main' include { BWA_INDEX as BWA_INDEX_CIRCULARMAPPER } from '../../modules/nf-core/bwa/index/main' -include { BWA_SAMSE as BWA_SAMSE_CIRCULARMAPPER } from '../../../modules/nf-core/bwa/samse/main' +include { BWA_SAMSE as BWA_SAMSE_CIRCULARMAPPER } from '../../modules/nf-core/bwa/samse/main' workflow CIRCULARMAPPER { @@ -20,7 +20,6 @@ workflow CIRCULARMAPPER { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - if ( params.run_circularmapper ) { ch_reference = fasta_reference ch_eval = eval @@ -77,5 +76,4 @@ workflow CIRCULARMAPPER { bam = CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] versions = ch_versions // channel: [ path(versions.yml) ] - } } diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 7aa267d37..f738161c5 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -12,6 +12,7 @@ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MEM } from '../../modules/nf include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BT2 } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LANES } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MAPPED } from '../../modules/nf-core/samtools/flagstat/main' +include { CIRCULARMAPPER } from '../../subworkflows/local/circularmapper' workflow MAP { take: @@ -113,8 +114,28 @@ workflow MAP { SAMTOOLS_INDEX_BT2 ( ch_mapped_lane_bam ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX_BT2.out.versions.first()) ch_mapped_lane_bai = params.fasta_largeref ? SAMTOOLS_INDEX_BT2.out.csi : SAMTOOLS_INDEX_BT2.out.bai + + } else if ( params.mapping_tool == 'circularmapper' ) { + ch_eval = params.elongation_factor + + ch_input_for_circularmapper = reads + .combine(index.map{ meta, index, fasta -> [ meta, fasta ] }) + .combine(ch_eval) + .multiMap { + meta, reads, meta2, fasta, eval -> + reads: [ meta, reads ] + index: [ meta2, fasta ] + elon: [ eval ] + } + CIRCULARMAPPER(ch_input_for_circularmapper) + ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions.first() ) + ch_mapped_bam = CIRCULARMAPPER.out.bam + ch_mapped_bai = Channel.empty() // Circularmapper doesn't give a bai + + } + // Only run merge lanes if we have more than one BAM to merge! 
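// Sketch only (assumption, not part of this patch): `multiMap` above splits the combined
// channel into three named sub-channels, addressed as properties of its output, so a
// subworkflow declaring three `take:` inputs would be invoked with each sub-channel
// separately. The take: order shown here is illustrative — the authoritative order is
// whatever subworkflows/local/circularmapper.nf declares:
//
//     CIRCULARMAPPER(
//         ch_input_for_circularmapper.index,  // channel: [ val(meta), path(fasta) ]
//         ch_input_for_circularmapper.elon,   // channel: [ val(elongation_factor) ]
//         ch_input_for_circularmapper.reads   // channel: [ val(meta), path(reads) ]
//     )
//
// If `.combine( ch_eval )` rejects the plain parameter value, wrapping it first with
// `ch_eval = Channel.value( params.elongation_factor )` makes it an explicit value channel.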
ch_input_for_lane_merge = ch_mapped_lane_bam .map { From 68f724359e2983cb400afafac395a17504f0c4d0 Mon Sep 17 00:00:00 2001 From: Judith Ballesteros Date: Fri, 28 Jun 2024 11:45:58 +0200 Subject: [PATCH 112/198] removing local path --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 8678793dc..7193b1a3f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1319,7 +1319,7 @@ }, "contamination_estimation_angsd_hapmap": { "type": "string", - "default": "/Users/judith_ballesteros/Documents/GitHub/eager/assets/angsd_resources/HapMapChrX.gz", + "default": "${projectDir}/assets/angsd_resources/HapMapChrX.gz", "description": "Path to HapMap file of chromosome for contamination estimation..", "help_text": "The haplotype map, or \"HapMap\", records the location of haplotype blocks and their tag SNPs.", "fa_icon": "fas fa-map" From 3fac93d0b28b95081ef32d26ca1f6de9aa93c542 Mon Sep 17 00:00:00 2001 From: Judith Ballesteros Date: Fri, 28 Jun 2024 12:03:53 +0200 Subject: [PATCH 113/198] fixing schema --- nextflow_schema.json | 563 +++++++++++++++++++++---------------------- 1 file changed, 280 insertions(+), 283 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 06737bf47..b59b73117 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,14 +19,14 @@ "mimetype": "text/csv", "pattern": "^\\S+\\.(c|t)sv$", "schema": "assets/schema_input.json", - "description": "Path to tab- or comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a tab- or comma-separated file with 11 columns, and a header row. See [usage docs](https://nf-co.re/eager/usage#samplesheet-input).", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/eager/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, "convert_inputbam": { "type": "boolean", "description": "Specify to convert input BAM files back to FASTQ for remapping.", "help_text": "This parameter tells the pipeline to convert the BAM files listed in the `--input` TSV or CSV sheet back to FASTQ format to allow re-preprocessing and mapping.\n\nCan be useful when you want to ensure consistent mapping parameters across all libraries when incorporating public data, however be careful of biases that may come from re-processing again (the BAM files may already be clipped, or only mapped reads with different settings are included so you may not have all reads from the original publication).", "fa_icon": "fas fa-undo-alt" }, "outdir": { @@ -62,32 +62,31 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "errorMessage": "The path to the reference FASTA file must not contain spaces and must have file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz' or '.fna.gz'.", "description": "Path to FASTA genome file.", "help_text": "This parameter is *mandatory* if `--genome` or `--fasta_sheet` are not specified. If you don't supply a mapper index (e.g. for BWA), this will be generated for you automatically. Combine with `--save_reference` to save mapper index for future runs.", "fa_icon": "far fa-file-code" }, "fasta_fai": { "type": "string", "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied, it will be made for you.", "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by samtools faidx and has a file suffix of `.fai`.", "fa_icon": "fas fa-address-book" }, "fasta_dict": { "type": "string", "description": "Path to Picard sequence dictionary file (typically ending in '.dict'). If not supplied, it will be made for you.", "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.", "fa_icon": "fas fa-address-book" }, "fasta_mapperindexdir": { "type": "string", "description": "Path to directory containing index files of the FASTA for a given mapper.", "help_text": "For most people this will likely be the same directory that contains the file you provided to `--fasta`.\n\nIf you want to use pre-existing `bwa index` indices, the directory should contain files ending in '.amb' '.ann' '.bwt'. If you want to use pre-existing `bowtie2 build` indices, the directory should contain files ending in '.1.bt2', '.2.bt2', '.rev.1.bt2'.\n\nIn any case do not include the files themselves in the path.
nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa index`/`bowtie2 build` file suffixes. If not supplied, the indices will be generated for you.\n\n" }, "save_reference": { "type": "boolean", "description": "Specify to save any pipeline-generated reference genome indices in the results directory.", "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates will be saved in `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> Modifies SAMtools index command: `-c`", "fa_icon": "fas fa-save" }, "fasta_sheet": { @@ -126,8 +125,8 @@ }, "fasta_circular_target": { "type": "string", "description": "Specify the FASTA header of the target chromosome to extend. Only applies when using `circularmapper`.", "help_text": "The entry (chromosome, contig, etc.)
in your FASTA reference that you'd like to be treated as circular.\n\nApplies only when providing a single FASTA file via `--fasta` (NOT multi-reference input - see reference TSV/CSV input).\n\n> Modifies tool parameter(s):\n> - circulargenerator `-s`\n", "fa_icon": "fas fa-bullseye" } } @@ -336,7 +335,7 @@ "preprocessing": { "title": "Preprocessing", "type": "object", "description": "Removal of adapters, paired-end merging, poly-G removal etc.", "default": "", "properties": { "sequencing_qc_tool": { @@ -349,8 +348,8 @@ }, "skip_preprocessing": { "type": "boolean", "description": "Specify to skip all preprocessing steps (adapter removal, paired-end merging, poly-G trimming etc).", "help_text": "Specify to skip all preprocessing steps (adapter removal, paired-end merging, poly-G trimming etc).\n\nThis will also mean you will only get one set of FastQC results (of the input reads).", "fa_icon": "fas fa-forward" }, "preprocessing_tool": { @@ -365,18 +364,18 @@ "type": "boolean", "description": "Specify to skip read-pair merging.", "fa_icon": "fas fa-forward", "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during alignment of reads against the reference.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA, or when you want to utilise mate-pair 'spatial' information.\n\n\u26a0\ufe0f If you run this with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will NOT be mapped when aligning with either bwa or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> \u26a0\ufe0f If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one.
This merged file is _not_ saved in the results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" }, "preprocessing_excludeunmerged": { "type": "boolean", "description": "Specify to exclude pairs that did not overlap sufficiently for merging (i.e., keep merged reads only).", "fa_icon": "fas fa-trash-alt", "help_text": "Specify to exclude pairs that did not overlap sufficiently for merging (i.e., keep merged reads only); in other words, singletons (i.e. reads missing a pair) or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nMost ancient DNA molecules are very short, and the majority are expected to merge. Specifying this parameter can sometimes be useful when dealing with ultra-short aDNA reads to reduce the number of longer-reads you may have in your library that are derived from modern contamination. It can also speed up run time of mapping steps.\n\nYou may want to use this if you want to ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality and/or are longer). It is highly recommended when using the 'dedup' deduplication tool." }, "preprocessing_skipadaptertrim": { "type": "boolean", "description": "Specify to skip removal of adapters.", "help_text": "Specify to turn off trimming of adapters from reads.\n\nYou may wish to do this if you are using public data (e.g.
ENA, SRA), which _should_ have had all library artefacts removed from the reads.\n\nThis will override any other adapter parameters provided (i.e., `--preprocessing_adapterlist` and/or `--preprocessing_adapter{1,2}` will be ignored)!\n\n> Modifies AdapterRemoval parameter: `--adapter1` and `--adapter2` (sets both to an empty string)\n> Applies fastp parameter: `--disable_adapter_trimming`", "fa_icon": "fas fa-forward" }, "preprocessing_adapter1": { @@ -387,93 +386,93 @@ }, "preprocessing_adapter2": { "type": "string", "description": "Specify the nucleotide sequence for the reverse read/R2.", "fa_icon": "fas fa-grip-lines", "help_text": "Specify a nucleotide sequence for the reverse read/R2.\n\nIf not modified by the user, the default for the particular preprocessing tool will be used. To turn off adapter trimming use `--preprocessing_skipadaptertrim`.\n\n> Modifies AdapterRemoval parameter: `--adapter2`\n> Modifies fastp parameter: `--adapter_sequence_r2`" }, "preprocessing_adapterlist": { "type": "string", "description": "Specify a list of all possible adapters to trim. Overrides --preprocessing_adapter1/2. Formats: .txt (AdapterRemoval) or .fasta (fastp).", "help_text": "Allows you to supply a file with a list of adapter (combinations) to remove from all files.\n\nOverrides the `--preprocessing_adapter1`/`--preprocessing_adapter2` parameters.\n\nNote that the two tools have slightly different behaviours.\n\nFor AdapterRemoval this consists of a two column table with a `.txt` extension: first column represents forward strand, second column for reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. Only adapters in this list will be screened for and removed. See AdapterRemoval documentation for more information.\n\nFor fastp this consists of a standard FASTA format with a `.fasta`/`.fa`/`.fna`/`.fas` extension. The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped.
fastp will first perform auto-detection and removal of adapters, and then _additionally_ remove adapters present in the FASTA file one by one.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`\n> Modifies fastp parameter: `--adapter_fasta`", "fa_icon": "fas fa-list" }, "preprocessing_minlength": { "type": "integer", "default": 25, "description": "Specify the minimum length reads must have to be retained.", "help_text": "Specify the minimum length reads must have to be retained.\n\nReads smaller than this length after trimming are discarded and not included in downstream analyses. Typically in ancient DNA, users will set this to 30 or for very old samples around 25 bp - reads any shorter than this often are not specific enough to provide useful information.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n> Modifies fastp parameter: `--length_required`", "fa_icon": "fas fa-ruler" }, "preprocessing_trim5p": { "type": "integer", "default": 0, "description": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation.", "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation.
By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore, this is more suitable for hard-removal of damage prior to mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`\n", "fa_icon": "fas fa-cut" }, "preprocessing_trim3p": { "type": "integer", "default": 0, "description": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation.", "fa_icon": "fas fa-cut", "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation.
By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore, this is more suitable for hard-removal of damage prior to mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`\n" }, "preprocessing_savepreprocessedreads": { "type": "boolean", "description": "Specify to save the preprocessed reads in the results directory.", "fa_icon": "fas fa-save", "help_text": "Specify to save the preprocessed reads in FASTQ format in the results directory.\n\nThis can be useful for re-analysing FASTQ files manually, or uploading to public data repositories such as ENA/SRA (provided you don't perform length filtering or merging)." }, "preprocessing_fastp_complexityfilter": { "type": "boolean", "description": "Specify to turn on sequence complexity filtering of reads with fastp.", "help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using fastp.\n\nThis can be useful for trimming poly-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs or NovaSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n\n> Modifies fastp parameter: `--trim_poly_g`", "fa_icon": "fas fa-cut" }, "preprocessing_fastp_complexityfilter_threshold": { "type": "integer", "default": 10, "description": "Specify the complexity threshold that must be reached or exceeded to retain reads.", "help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming.\n\n> Modifies fastp parameter: `--poly_g_min_len`", "fa_icon": "fas fa-ruler" }, "preprocessing_adapterremoval_preserve5p": { "type": "boolean", "description": "Skip AdapterRemoval base trimming (n, quality) of 5 prime end.", "help_text": "Turns off quality based trimming at the 5p end of reads when any of the AdapterRemoval quality or N trimming options are used.
Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. For more information see the AdapterRemoval [documentation](https://adapterremoval.readthedocs.io/en/stable/manpage.html#cmdoption-adapterremoval-preserve5p).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`", "fa_icon": "fas fa-shield-alt" }, "preprocessing_adapterremoval_skipqualitytrimming": { "type": "boolean", - "description": "Specify to skip AdapterRemoval quality and N trimming at the ends of reads.", - "help_text": "Turns off AdapterRemoval quality trimming from ends of reads.\n\nThis can be useful to reduce runtime when running public data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimqualities` ", + "description": "Skip AdapterRemoval quality and N trimming from ends of reads.", + "help_text": "Turns off AdapterRemoval quality trimming from ends of reads. \n\nThis can be useful to reduce runtime when running public data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimqualities` ", "fa_icon": "fas fa-forward" }, "preprocessing_adapterremoval_trimbasequalitymin": { "type": "integer", "default": 20, "description": "Specify AdapterRemoval minimum base quality for trimming off bases.", - "help_text": "Defines the minimum read quality per base that is required for a base to be kept by AdapterRemoval. Individual bases at the ends of reads falling below this threshold will be clipped off.\n\n> Modifies AdapterRemoval parameter: `--minquality`", - "fa_icon": "fas fa-filter" + "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off.\n\n> Modifies AdapterRemoval parameter: `--minquality`", + "fa_icon": "fas fa-ruler-vertical" }, "preprocessing_adapterremoval_skipntrimming": { "type": "boolean", - "description": "Specify to skip AdapterRemoval N trimming (quality trimming only).", - "help_text": "Turns off AdapterRemoval N trimming from ends of reads.\n\nThis can be useful to reduce runtime when running publicly available data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimns` ", + "description": "Skip AdapterRemoval N trimming (quality trimming only).", + "help_text": "Turns off AdapterRemoval N trimming from ends of reads. \n\nThis can be useful to reduce runtime when running public data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimns` ", "fa_icon": "fas fa-forward" }, "preprocessing_adapterremoval_adapteroverlap": { "type": "integer", "default": 1, "description": "Specify the AdapterRemoval minimum adapter overlap required for trimming.", - "fa_icon": "fas fa-filter", + "fa_icon": "fas fa-ruler-horizontal", "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before AdapterRemoval trims adapters sequences from reads.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" }, "preprocessing_adapterremoval_qualitymax": { "type": "integer", "default": 41, - "description": "Specify the AdapterRemoval maximum Phred score used in input FASTQ files.", - "help_text": "Specify maximum Phred score of the quality field of FASTQ files.\n\nThe quality-score range can vary depending on the machine and version (e.g. 
see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), and this allows you to increase from the default AdapterRemoval value of 41.\n\nNote that while this can theoretically provide you with more confident and precise base call information, many downstream tools only accept FASTQ files with Phred scores limited to a max of 41, and therefore increasing the default for this parameter may make the resulting preprocessed files incompatible with some downstream tools.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", + "description": "Specify the AdapterRemoval maximum Phred score used in input FASTQ files", + "help_text": "Specify maximum Phred score of the quality field of FASTQ files. \n\nThe quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), and this allows you to increase from the default AdapterRemoval value of 41. \n\nNote that while this theoretically can provide you with more confident and precise base call information, many downstream tools only accept FASTQ files with Phred scores limited to a max of 41, and therefore increasing the default for this parameter may make the resulting preprocessed files incompatible with some downstream tools.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", "fa_icon": "fas fa-tachometer-alt" } }, @@ -487,9 +486,9 @@ "properties": { "run_fastq_sharding": { "type": "boolean", - "description": "Specify to turn on FASTQ sharding.", + "description": "Turn on FastQ sharding.", "fa_icon": "fas fa-power-off", - "help_text": "Sharding will split the FASTQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FASTQ files." + "help_text": "Sharding will split the FastQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FastQ files." }, "fastq_shard_size": { "type": "integer", @@ -503,99 +502,99 @@ "default": "bwaaln", "enum": ["bwaaln", "bwamem", "bowtie2", "circularmapper"], "description": "Specify which mapper to use.", - "help_text": "Specify which mapping tool to use. Options are BWA aln ('`bwaaln`'), BWA mem ('`bwamem`'), circularmapper ('`circularmapper`'), or Bowtie 2 ('`bowtie2`'). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing an extend-remap procedure (see [Peltzer et al 2016](https://doi.org/10.1186/s13059-016-0918-z) for details). Bowtie 2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming).\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)", - "fa_icon": "fas fa-hammer" + "help_text": "Specify which mapping tool to use. Options are BWA aln ('`bwaaln`'), BWA mem ('`bwamem`'), circularmapper ('`circularmapper`'), or bowtie2 ('`bowtie2`'). 
BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing an extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming).\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)", "fa_icon": "fas fa-layer-group" }, "fasta_largeref": { "type": "boolean", "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.", "help_text": "This parameter is required to be set for large reference genomes. If your reference genome is larger than 3.5GB, the `samtools index` calls in the pipeline need to generate `.csi` indices instead of `.bai` indices to compensate for the size of the reference genome (with samtools: `-c`). This parameter is not required for smaller references (including the human hg19 or grch37/grch38 references), but >4GB genomes have been shown to need `.csi` indices.", "fa_icon": "fas fa-address-book" }, "mapping_bwaaln_n": { "type": "number", "default": 0.01, "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.", "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. Default is set following recommendations from [Oliva et al. 2021](https://doi.org/10.1093/bib/bbab076) who tested when aligning to human reference genomes.
\n\nIf you're uncertain what to set, check out this [Shiny App](https://apeltzer.shinyapps.io/bwa-mismatches/) for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`", "fa_icon": "fas fa-sort-numeric-down" }, "mapping_bwaaln_k": { "type": "integer", "default": 2, "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.", "help_text": "Configures the bwa aln `-k` parameter for the maximum edit distance during the seeding phase of the mapping algorithm.\n\n> Modifies BWA aln parameter: `-k`", "fa_icon": "fas fa-people-arrows" }, "mapping_bwaaln_l": { "type": "integer", "default": 1024, "description": "Specify the -l parameter for BWA aln, i.e. the length of seeds to be used.", "help_text": "Configures the length of the seed used in bwa aln `-l`. Default is set to be 'turned off' at the recommendation of [Oliva et al. 2021](https://doi.org/10.1093/bib/bbab076) who tested when aligning to human reference genomes. Seeding is 'turned off' by specifying an arbitrarily long number to force the entire read to act as the seed.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`", "fa_icon": "fas fa-ruler-horizontal" }, "mapping_bwaaln_o": { "type": "integer", "default": 2, "description": "Specify the -o parameter for BWA aln, i.e. the number of gaps allowed.", "help_text": "Configures the number of gaps used in bwa aln. Default is set to bwa default.\n\n> Modifies BWA aln parameter: `-o`", "fa_icon": "fas fa-people-arrows" }, "mapping_bwamem_k": { "type": "integer", "default": 19, "description": "Specify the -k parameter for BWA mem, i.e. the minimum seed length.", "help_text": "Configures the minimum seed length used in BWA-MEM. Default is set to BWA default.\n\n> Modifies BWA-MEM parameter: `-k`", "fa_icon": "fas fa-seedling" }, "mapping_bwamem_r": { "type": "number", "default": 1.5, "description": "Specify the -r parameter for BWA mem, i.e.
the re-seeding threshold.", "help_text": "Configures the re-seeding threshold used in BWA-MEM. Default is set to BWA default.\n\n> Modifies BWA-MEM parameter: `-r`", "fa_icon": "fas fa-angle-double-left" }, "mapping_bowtie2_alignmode": { "type": "string", "default": "local", "description": "Specify the bowtie2 alignment mode.", "help_text": "The type of read alignment to use. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned.\nDefault is set following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://www.frontiersin.org/articles/10.3389/fevo.2020.00105/full)\n\n> Modifies Bowtie2 presets: `--local`, `--end-to-end`", "fa_icon": "fas fa-arrows-alt-h", "enum": ["local", "end-to-end"] }, "mapping_bowtie2_sensitivity": { "type": "string", "default": "sensitive", "description": "Specify the level of sensitivity for the bowtie2 alignment mode.", "help_text": "The Bowtie2 'preset' to use. These strings apply to both --mapping_bowtie2_alignmode options. See the Bowtie2 manual for actual settings.\nDefault is set following [Poullet and Orlando (2020)](https://www.frontiersin.org/articles/10.3389/fevo.2020.00105/full), when running damaged-data without UDG treatment.\n\n> Modifies the Bowtie2 parameters: `--fast`, `--very-fast`, `--sensitive`, `--very-sensitive`, `--fast-local`, `--very-fast-local`, `--sensitive-local`, `--very-sensitive-local`", "fa_icon": "fas fa-microscope", "enum": ["fast", "very-fast", "sensitive", "very-sensitive"] }, "mapping_bowtie2_n": { "type": "integer", "default": 0, "description": "Specify the -N parameter for bowtie2 (mismatches in seed).
This will override defaults from alignmode/sensitivity.", "help_text": "The number of mismatches allowed in the seed during the seed-and-extend procedure of Bowtie2. This will override any values set with --mapping_bowtie2_sensitivity. Can either be 0 or 1.\n\n> Modifies Bowtie2 parameter: `-N`", "fa_icon": "fas fa-sort-numeric-down" }, "mapping_bowtie2_l": { "type": "integer", "default": 20, "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.", "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with --mapping_bowtie2_sensitivity.\n\n> Modifies Bowtie2 parameter: `-L`", "fa_icon": "fas fa-ruler-horizontal" }, "mapping_bowtie2_trim5": { "type": "integer", "default": 0, "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.", "help_text": "Number of bases to trim at the 5' (left) end of read prior to alignment. May be useful when left-over sequencing artefacts of in-line barcodes are present.\n\n> Modifies Bowtie2 parameter: `--trim5`", "fa_icon": "fas fa-cut" }, "mapping_bowtie2_trim3": { "type": "integer", "default": 0, "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.", "help_text": "Number of bases to trim at the 3' (right) end of read prior to alignment. May be useful when left-over sequencing artefacts of in-line barcodes are present.\n\n> Modifies Bowtie2 parameter: `--trim3`", "fa_icon": "fas fa-cut" }, "mapping_bowtie2_maxins": { @@ -604,13 +603,7 @@ "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", "help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n> Modifies Bowtie2 parameter: `--maxins`", "fa_icon": "fas fa-exchange-alt" - }, - "elongation_factor": { - "type": "integer", - "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only)", - "help_text": "The number of bases to extend the reference genome with. By default this is set to 500 if not specified otherwise.", - "fa_icon": "fas fa-external-link-alt" + } }, "fa_icon": "fas fa-layer-group" }, @@ -622,54 +615,54 @@ "properties": { "run_bamfiltering": { "type": "boolean", "description": "Turn on filtering of reads in BAM files after mapping.
By default, only mapped reads retained.", "fa_icon": "fas fa-power-off", "help_text": "Turns on the filtering subworkflow for mapped BAM files coming out of the read alignment step. Filtering includes removal of unmapped reads, length filtering, and mapping quality filtering.\n\nWhen turning on BAM filtering, by default only the mapped/unmapped filter is activated, thus only mapped reads are retained for downstream analyses. See `--bamfiltering_retainunmappedgenomicbam` to retain unmapped reads, if filtering only for length and/or quality is preferred.\n\nNote this subworkflow can also be activated if `--run_metagenomic_screening` is supplied." }, "bamfiltering_minreadlength": { "type": "integer", "default": 0, "description": "Specify the minimum read length mapped reads should have for downstream genomic analysis.", "help_text": "You can use this to remove mapped reads that fall below a certain length after mapping.\n\nThis can be useful to get more realistic 'endogenous DNA' or 'on target read' percentages.\n\nIf used _instead of_ minimum length read filtering at AdapterRemoval, you can get more realistic endogenous DNA estimates when most of your reads are very short (e.g. in single-stranded libraries or samples with highly degraded DNA).
In these cases, the default minimum length filter at earlier adapter clipping/read merging will remove a very large amount of your reads in your library (including valid reads), thus making an artificially small denominator for a typical endogenous DNA calculation.\n\nTherefore, by retaining all of your reads until _after_ mapping (i.e., turning off the adapter clipping/read merging filter), you can generate more 'real' endogenous DNA estimates immediately after mapping (with a better denominator). Then after estimating this, filter using this parameter to retain only 'useful' reads (i.e., those long enough to provide higher confidence of their mapped position) for downstream analyses.\n\nBy specifying `0`, no length filtering is performed.\n\nNote that by default the output BAM files of this step are _not_ stored in the results directory (as it is assumed that deduplicated BAM files are preferred). See `--bamfiltering_savefilteredbams` if you wish to save these.\n\n> Modifies tool parameter(s):\n> - filter_bam_fragment_length.py: `-l`", "fa_icon": "fas fa-ruler-horizontal" }, "bamfiltering_mappingquality": { "type": "integer", "default": 0, "description": "Specify the minimum mapping quality reads should have for downstream genomic analysis.", "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis.\n\nBy default all reads are retained, and this option is therefore set to 0 to ensure no quality filtering is performed.\n\nNote that by default the output BAM files of this step are _not_ stored in the results directory (as it is assumed that deduplicated BAM files are preferred). See `--bamfiltering_savefilteredbams` if you wish to save these.\n\n> Modifies tool parameter(s):\n> - samtools view `-q`", "fa_icon": "fas fa-thermometer-full" }, "bamfilter_genomicbamfilterflag": { "type": "integer", "default": 4, "fa_icon": "fas fa-flag", "description": "Specify the SAM format flag of reads to remove during BAM filtering for downstream genomic steps.
Generally not recommended to change.", + "help_text": "You can use this to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad Institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> \u26a0\ufe0f Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies tool parameter(s):\n> - SAMtools: `-F`" }, "bamfiltering_retainunmappedgenomicbam": { "type": "boolean", "description": "Specify to retain unmapped reads in the BAM file used for downstream genomic analyses.", - "help_text": "Specify to retain unmapped reads (optionally also length-filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomicscreening_input` is set to `all`.\n\n> ⚠️ This will likely slow down run time of downstream pipeline steps!\n\n> Modifies samtools view parameters: `-f 4` / `-F 4`", + "help_text": "You can use this parameter to retain unmapped reads (optionally also length-filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomicscreening_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", "fa_icon": "fas fa-piggy-bank" }, "bamfiltering_generateunmappedfastq": { "type": "boolean", - "description": "Specify to generate FASTQ files containing only unmapped reads from the aligner generated BAM files.", - "help_text": "Specify to turn on the generation and saving of FASTQs of only the unmapped reads from the mapping step in the results directory.\n\nThis can be useful if you wish to do other analysis of the unmapped reads independently of the pipeline.\n\nNote: the reads in these FASTQ files have _not_ undergone length of quality filtering\n\n> Modifies samtools fastq parameter: `-f 4`", + "description": "Generate FASTQ files containing only unmapped reads from the aligner generated BAM files.", + "help_text": "This turns on the generation and saving of FASTQs of only the unmapped reads from the mapping step in the results directory, using `samtools fastq`.\n\nThis could be useful if you wish to do other analysis of the unmapped reads independently of the pipeline.\n\nNote: the reads in these FASTQ files have _not_ undergone length or quality filtering\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4`", "fa_icon": "fas fa-file-alt" }, "bamfiltering_generatemappedfastq": { "type": "boolean", - "description": "Specify to generate FASTQ files containing only mapped reads from the aligner generated BAM files.", - "help_text": "Specify to turn on the generation and saving of FASTQs of only the mapped reads from the mapping step in the results directory.\n\nThis can be useful if you wish to do other analysis of the mapped reads independently of the pipeline, such as remapping with different parameters (whereby only including mapped reads will speed up computation time during the re-mapping due to reduced input data).\n\nNote the reads in these FASTQ files have _not_ undergone length of quality filtering\n\n> Modifies samtools fastq parameter: `-F 4`", + "description": "Generate FASTQ files containing only mapped reads from the aligner generated BAM files.", + "help_text": "This
turns on the generation and saving of FASTQs of only the mapped reads from the mapping step in the results directory, using `samtools fastq`.\n\nThis could be useful if you wish to do other analysis of the mapped reads independently of the pipeline, such as remapping with different parameters (whereby only including mapped reads will speed up computation time during the re-mapping due to reduced input data).\n\nNote the reads in these FASTQ files have _not_ undergone length or quality filtering\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-F 4`", "fa_icon": "far fa-file-alt" }, "bamfiltering_savefilteredbams": { "type": "boolean", - "description": "Specify to save the intermediate filtered genomic BAM files in the results directory.", - "help_text": "Specify to save intermediate length- and/or quality-filtered genomic BAM files in the results directory.", - "fa_icon": "fas fa-save" + "description": "Save in the results directory the intermediate filtered genomic BAM files that are sent for downstream genomic analyses.", + "help_text": "This saves intermediate length- and/or quality-filtered genomic BAM files in the results directory.", + "fa_icon": "far fa-save" } }, "fa_icon": "fas fa-filter" @@ -677,64 +670,64 @@ "metagenomics": { "title": "Metagenomics", "type": "object", - "description": "Options related to metagenomic screening.", + "description": "Options related to metagenomic screening.", "default": "", "properties": { "run_metagenomicscreening": { "type": "boolean", - "description": "Specify to turn on metagenomic screening of mapped, unmapped or all reads.", + "description": "Turn on metagenomic screening of mapped, unmapped, or all reads.", "fa_icon": "fas fa-power-off", - "help_text": "Specify to turn on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline." + "help_text": "Turns on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline." }, "metagenomicscreening_input": { "type": "string", "default": "unmapped", - "description": "Specify which type of reads to use for metagenomic screening.", + "description": "Specify which type of reads to go into metagenomic screening.", "enum": ["unmapped", "mapped", "all"], "fa_icon": "fas fa-hand-pointer", - "help_text": "Specify to select which mapped reads will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction) or all reads.\n\n> ⚠️ If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one.
This merged file is _not_ saved in results directory.\n\n> Modifies samtools fastq parameters: `-f 4` / `-F 4`" + "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g. when doing competitive mapping against a genomic reference of multiple genomes, to which LCA correction is to be applied), or all reads.\n\n> \u26a0\ufe0f If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`" }, "run_metagenomics_complexityfiltering": { "type": "boolean", "fa_icon": "fas fa-power-off", - "help_text": "Specify to turn on a subworkflow of the pipeline that filters the FASTQ files for complexity before the metagenomics profiling.\nUse the metagenomics_complexity_tool parameter to select a method.", - "description": "Specify to run a complexity filter on the metagenomics input files before classification." + "help_text": "Turns on a subworkflow of the pipeline that filters the FASTQ files for complexity before the metagenomics profiling.\nUse the `metagenomics_complexity_tool` parameter to select a method.", + "description": "Run a complexity filter on the metagenomics input files before classification. Specify the tool to use with the `metagenomics_complexity_tool` parameter, save with `metagenomics_complexity_savefastq`." }, "metagenomics_complexity_savefastq": { "type": "boolean", "fa_icon": "fas fa-save", - "description": "Specify to save FASTQ files containing the complexity-filtered reads before metagenomic classification.", - "help_text": "Specify to save the complexity-filtered FASTQ files to the results directory." + "description": "Save FASTQ files containing the complexity-filtered reads (before metagenomic classification).", + "help_text": "Save the complexity-filtered FASTQ files to the results directory." }, "metagenomics_complexity_tool": { "type": "string", "default": "bbduk", - "description": "Specify which tool to use for trimming, filtering or reformatting of FASTQ reads that go into metagenomics screening.", + "description": "Specify which tool to use for trimming, filtering, or reformatting of FASTQ reads that go into metagenomics screening.", "enum": ["bbduk", "prinseq"], - "fa_icon": "fas fa-hammer", - "help_text": "Specify to select which tool is used to generate a final set of reads for the metagenomic classifier after any necessary trimming, filtering or reformatting of the reads.\n\nThis intermediate file is not saved in the results directory unless marked with `--metagenomics_complexity_savefastq`." + "fa_icon": "fas fa-hand-pointer", + "help_text": "You can select which tool is used to generate a final set of reads for the metagenomic classifier after any necessary trimming, filtering or reformatting of the reads.\n\nThis intermediate file is not saved in the results directory, unless marked with `--metagenomics_complexity_savefastq`."
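For orientation, the three `metagenomicscreening_input` modes correspond roughly to `samtools fastq` calls of the following shape (a minimal sketch; the BAM and FASTQ names are hypothetical, and only the `-f 4`/`-F 4` flags come from the help texts above):

```bash
samtools fastq -f 4 library.bam > unmapped.fastq   # unmapped reads only (default)
samtools fastq -F 4 library.bam > mapped.fastq     # mapped reads only
samtools fastq library.bam > all_reads.fastq       # all reads
```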
}, "metagenomics_complexity_entropy": { "type": "number", "fa_icon": "fas fa-sort-numeric-up", - "description": "Specify the entropy threshold under which a sequencing read will be complexity-filtered out.", + "description": "Specify the entropy threshold under which a sequencing read will be complexity-filtered out. This should be between 0 and 1.", "default": 0.3, - "help_text": "Specify the minimum 'entropy' value for complexity filtering for the BBDuk or PRINSEQ++ tools.\n\nThis value will only be used for PRINSEQ++ if `--metagenomics_prinseq_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation existing within the read. Higher values correspond to more variety and thus will likely result in more specific matching to a taxon's reference genome. The trade-off here is fewer reads (or abundance information) available for having a confident identification.\n\n> Modifies parameters:\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`" + "help_text": "Specify the minimum 'entropy' value for complexity filtering for the BBDuk or PRINSEQ++ tools.\n\nThis value will only be used for PRINSEQ++ if `--metagenomics_prinseq_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation that exists within the read. Higher values correspond to more variety, and thus will likely result in more specific matching to a taxon's reference genome. The trade-off here is fewer reads (or abundance information) available for having a confident identification.\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`" }, "metagenomics_prinseq_mode": { "type": "string", "default": "entropy", "enum": ["entropy", "dust"], - "fa_icon": "fas fa-toggle-on", - "description": "Specify the complexity filter mode for PRINSEQ++.", - "help_text": "Specify the complexity filter mode for PRINSEQ++.\n\nUse the selected mode together with the correct flag:\n'dust' requires the `--metagenomics_prinseq_dustscore` parameter set\n'entropy' requires the `--metagenomics_complexity_entropy` parameter set\n\n> Modifies parameters:\n> - PRINSEQ++: `-lc_entropy`\n> - PRINSEQ++: `-lc_dust`" + "fa_icon": "fas fa-check-square", + "description": "Specify the complexity filter mode for PRINSEQ++.", + "help_text": "Specify the complexity filter mode for PRINSEQ++.\n\nUse the selected mode together with the correct flag:\n'dust' requires the `--metagenomics_prinseq_dustscore` parameter set\n'entropy' requires the `--metagenomics_complexity_entropy` parameter set\n\n> Sets one of the tool parameter(s):\n> - PRINSEQ++: `-lc_entropy`\n> - PRINSEQ++: `-lc_dust`" }, "metagenomics_prinseq_dustscore": { "type": "number", "default": 0.5, - "fa_icon": "fas fa-filter", - "description": "Specify the minimum dust score for PRINTSEQ++ complexity filtering.", - "help_text": "Specify the minimum dust score below which low-complexity reads will be removed. A dust score is based on how often different tri-nucleotides occur along a read.\n\n> Modifies PRINSEQ++ parameter: `--lc_dust`" + "fa_icon": "fas fa-head-side-mask", + "description": "Specify the minimum dust score for PRINSEQ++ complexity filtering.", + "help_text": "Specify the minimum dust score below which low-complexity reads will be removed.
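As a rough illustration of the two complexity filters described above (a sketch with hypothetical file names; only `entropy=`, `-lc_entropy`, and `-lc_dust` are taken from the help texts, the remaining I/O flags are the tools' usual ones and may differ between versions):

```bash
# BBDuk: discard reads whose sequence entropy falls below 0.3
bbduk.sh in=reads.fastq.gz out=filtered.fastq.gz entropy=0.3

# PRINSEQ++: the equivalent entropy filter, or alternatively a DUST filter
prinseq++ -fastq reads.fastq.gz -lc_entropy=0.3 -out_name filtered
prinseq++ -fastq reads.fastq.gz -lc_dust=0.5 -out_name filtered
```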
A DUST score is based on how often different tri-nucleotides occur along a read.\n\n> Modifies tool parameter(s):\n> - PRINSEQ++: `--lc_dust`" } }, "fa_icon": "fas fa-search" @@ -754,9 +747,9 @@ "type": "string", "default": "markduplicates", "description": "Specify which tool to use for deduplication.", - "help_text": "Specify which duplicate read removal tool to use. While `markduplicates` is set by default, an ancient DNA specific read deduplication tool `dedup` is offered (see [Peltzer et al. 2016](https://doi.org/10.1186/s13059-016-0918-z) for details). The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> ⚠️ DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.", + "help_text": "Sets the duplicate read removal tool. Alternatively, an ancient DNA-specific read deduplication tool, `dedup` (Peltzer et al. 2016), is offered. The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> \u26a0\ufe0f DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.", "enum": ["markduplicates", "dedup"], - "fa_icon": "fas fa-hammer" + "fa_icon": "fas fa-layer-group" } }, "fa_icon": "fas fa-clone" @@ -770,127 +763,127 @@ "properties": { "run_mapdamage_rescaling": { "type": "boolean", - "fa_icon": "fas fa-power-off", - "description": "Specify to turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", - "help_text": "Specify to turn on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single-stranded, this will automatically use the `--single-stranded` mode.\nThis process will ameliorate the effects of aDNA damage, but also increase reference-bias.\n\n**This functionality does not have any MultiQC output.**\n ⚠️ Rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies mapDamage2 parameter: `--rescale`" + "fa_icon": "fas fa-map", + "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", + "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single-stranded, this will automatically use the `--single-stranded` mode.\nThis process will ameliorate the effects of aDNA damage, but also increase reference-bias.\n\n**This functionality does not have any MultiQC output.**\n\u26a0\ufe0f Rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library.
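A minimal sketch of the kind of mapDamage2 call these rescaling options feed into (paths hypothetical; `--rescale` and `--single-stranded` are the flags named in the help texts):

```bash
# Probabilistically rescale damage-derived T>C mismatches in the BAM
mapDamage -i library.bam -r reference.fasta --rescale
# for single-stranded libraries, --single-stranded would be added
```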
If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies the `--rescale` parameter of mapDamage2" }, "damage_manipulation_rescale_seqlength": { "type": "integer", "default": 12, - "description": "Specify the length of read sequence to use from each side for rescaling.", - "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale at both ends. This can be overridden by `--rescalelength*p`.\n\n> Modifies mapDamage2 parameter: `--seq-length`", + "description": "Length of read sequence to use from each side for rescaling. Can be overridden by `--rescalelength*p`.", + "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale at both ends.\n\n> Modifies the `--seq-length` parameter of mapDamage2.", "fa_icon": "fas fa-ruler-horizontal" }, "damage_manipulation_rescale_length_5p": { "type": "integer", "default": 0, - "description": "Specify the length of read for mapDamage2 to rescale from 5 prime end.", - "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. This overrides `--rescale_seqlength`.\n\n> Modifies mapDamage2 parameter: `--rescale-length-5p`", + "description": "Length of read for mapDamage2 to rescale from 5p end. Only used if not 0, otherwise `--rescale_seqlength` is used.", + "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. Overrides `--rescale_seqlength`.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2.", "fa_icon": "fas fa-balance-scale-right" }, "damage_manipulation_rescale_length_3p": { "type": "integer", "default": 0, - "description": "Specify the length of read for mapDamage2 to rescale from 3 prime end.", - "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. This overrides `--rescale_seqlength`.\n\n> Modifies mapDamage2 parameter `--rescale-length-3p`", + "description": "Length of read for mapDamage2 to rescale from 3p end. Only used if not 0, otherwise `--rescale_seqlength` is used.", + "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. Overrides `--rescale_seqlength`.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2.", "fa_icon": "fas fa-balance-scale-left" }, "run_pmd_filtering": { "type": "boolean", - "description": "Specify to turn on PMDtools filtering.", - "help_text": "Specify to run PMDtools for damage-based read filtering in sequencing libraries.", + "description": "Turn on PMDtools filtering.", + "help_text": "Runs PMDtools for damage-based read filtering in sequencing libraries.", "fa_icon": "fas fa-power-off" }, "damage_manipulation_pmdtools_threshold": { "type": "integer", "default": 3, "fa_icon": "far fa-chart-bar", - "description": "Specify PMD score threshold for PMDtools.", - "help_text": "Specify the PMDScore threshold to use when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream analysis.\n\n> Modifies PMDtools parameter: `--threshold`" + "description": "Specify PMDScore threshold for PMDtools.", + "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage.
Only reads which surpass this damage score are considered for downstream DNA analysis.\n\n> Modifies PMDtools parameter: `--threshold`" }, "damage_manipulation_pmdtools_masked_reference": { "type": "string", "fa_icon": "fas fa-mask", - "help_text": "Specify a FASTA file to use as reference for `samtools calmd` prior to PMD filtering.\nSetting the SNPs that are part of the used capture set as `N` can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition.", - "description": "Specify a masked FASTA file with positions to be used with PMDtools.", + "help_text": "Supplying a FASTA file will use this file as reference for `samtools calmd` prior to PMD filtering.\nSetting the SNPs that are part of the used capture set as `N` can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition.", + "description": "Specify a masked FASTA file with positions to be used with PMDtools.", "pattern": "^\\S+\\.fa(sta)?$", "format": "file-path" }, "damage_manipulation_pmdtools_reference_mask": { "type": "string", "fa_icon": "fas fa-mask", - "help_text": "Specify a BED file to activate masking of the reference FASTA at the contained sites prior to running PMDtools. Positions that are in the provided BED file will be replaced by Ns in the reference genome.\nThis can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a transition SNP to be counted as damage. Masking of the reference is done using `bedtools maskfasta`.", - "description": "Specify a BED file to be used to mask the reference FASTA prior to running PMDtools.", + "help_text": "Supplying a BED file to this parameter activates masking of the reference FASTA at the contained sites prior to running PMDtools. Positions that are in the provided BED file will be replaced by Ns in the reference genome.\nThis can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a transition SNP to be counted as damage. Masking of the reference is done using `bedtools maskfasta`.", + "description": "Specify a BED file to be used to mask the reference FASTA prior to running PMDtools.", "pattern": "^\\S+\\.bed(\\.gz)?$", "format": "file-path" }, "run_trim_bam": { "type": "boolean", - "fa_icon": "fas fa-power-off", - "description": "Specify to turn on BAM trimming for non-UDG or half-UDG libraries.", - "help_text": "Specify to turn on the BAM trimming of [n] bases from reads in the deduplicated BAM file. Damage assessment in PMDtools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only affect libraries with 'damage_treatment' of 'none' or 'half'. Complete UDG treatment ('full') should have removed all damage during library construction, so trimming of 0 bp is performed. The amount of bases that will be trimmed off from each side of the molecule should be set separately for libraries depending on their 'strandedness' and 'damage_treatment'.\n\n> Note: additional artefacts such as barcodes or adapters should be removed prior to mapping and not in this step." + "fa_icon": "fas fa-eraser", + "description": "Turn on BAM trimming.
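Pieced together from the help texts above, the reference-masking and PMD-filtering chain looks roughly like this (a sketch, not the pipeline's exact command lines; all paths are hypothetical):

```bash
# Mask capture SNP positions in the reference, recompute MD tags against the
# masked reference, then keep only reads with a PMD score of at least 3
bedtools maskfasta -fi reference.fasta -bed capture_snps.bed -fo masked.fasta
samtools calmd -b library.bam masked.fasta > library_calmd.bam
samtools view -h library_calmd.bam \
    | pmdtools --threshold 3 --header \
    | samtools view -b -o library_pmdfiltered.bam -
```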
Will only affect non-UDG or half-UDG libraries.", + "help_text": "Turns on the BAM trimming method. Trims off [n] bases from reads in the deduplicated BAM file. Damage assessment in PMDtools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only affect libraries with 'damage_treatment' of 'none' or 'half'. Complete UDG treatment ('full') should have removed all damage during library construction, so trimming of 0 bp is performed. The amount of bases that will be trimmed off from each side of the molecule should be set separately for libraries depending on their 'strandedness' and 'damage_treatment'.\n\n> Note: additional artefacts such as barcodes or adapters should be removed prior to mapping and not in this step." }, "damage_manipulation_bamutils_trim_double_stranded_none_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of reads for double-stranded non-UDG libraries.", - "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of reads for double-stranded non-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`" + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", + "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`" }, "damage_manipulation_bamutils_trim_double_stranded_none_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of reads for double-stranded non-UDG libraries.", - "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of reads for double-stranded non-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", + "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to 'none'.
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_trim_double_stranded_half_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for double-stranded half-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", - "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for double-stranded half-UDG libraries." + "fa_icon": "fas fa-ruler-combined", + "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries." }, "damage_manipulation_bamutils_trim_double_stranded_half_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for double-stranded half-UDG libraries.", - "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for double-stranded half-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", + "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_trim_single_stranded_none_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded non-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", - "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded non-UDG libraries." 
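For example, the clip values above are passed to bamUtil roughly as follows (a sketch; file names hypothetical, with `-L`/`-R` as documented in the help texts):

```bash
# Hard-mask two bases at each end of every read; trimBam swaps left/right
# automatically for reads aligned to the reverse strand
bam trimBam library.bam library_trimmed.bam -L 2 -R 2
```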
+ "fa_icon": "fas fa-ruler-combined", + "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries." }, "damage_manipulation_bamutils_trim_single_stranded_none_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded non-UDG libraries.", - "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded non-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", + "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_trim_single_stranded_half_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded half-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", - "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded half-UDG libraries." + "fa_icon": "fas fa-ruler-combined", + "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries." }, "damage_manipulation_bamutils_trim_single_stranded_half_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-cut", - "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded half-UDG libraries.", - "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded half-UDG libraries. 
By default, this is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", + "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_softclip": { "type": "boolean", - "fa_icon": "fas fa-mask", - "description": "Specify to turn on soft-trimming instead of hard masking.", - "help_text": "Specify to turn on soft-trimming instead of hard masking of bases. By default, nf-core/eager uses hard trimming, which sets trimmed bases to 'N' with quality '!' in the BAM output. Turn this on to use soft-trimming instead, which masks reads at the read ends using the CIGAR string instead.\n\n> Modifies bamUtil's trimBam parameter: `-c`" + "fa_icon": "fas fa-paint-roller", + "description": "Turn on soft-trimming instead of hard masking.", + "help_text": "By default, nf-core/eager uses hard trimming, which sets trimmed bases to 'N' with quality '!' in the BAM output. Turn this on to use soft-trimming, which masks reads at the read ends using the CIGAR string instead.\n\n> Modifies bamUtil's trimBam parameter: `-c`" } } }, @@ -903,28 +896,28 @@ "run_genotyping": { "type": "boolean", "fa_icon": "fas fa-power-off", - "description": "Specify to turn on genotyping of BAM files.", - "help_text": "Specify to turn on genotyping. `--genotyping_source` and `--genotyping_tool` must also be provided together with this option." + "description": "Turn on genotyping of BAM files.", + "help_text": "Turns on genotyping. `--genotyping_source` and `--genotyping_tool` must also be provided together with this option." }, "genotyping_source": { "type": "string", "description": "Specify which input BAM to use for genotyping.", - "help_text": "Specify which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs. Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", + "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs.
Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", "fa_icon": "fas fa-faucet", "enum": ["raw", "pmd", "trimmed", "rescaled"] }, "genotyping_tool": { "type": "string", - "fa_icon": "fas fa-hammer", + "fa_icon": "fas fa-tools", "enum": ["ug", "hc", "freebayes", "pileupcaller", "angsd"], - "help_text": "Specify which genotyper to use. Current options are: pileupCaller, ANGSD, GATK UnifiedGenotyper (v3.5), GATK HaplotypeCaller (v4) or FreeBayes.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK v3.5 it is officially deprecated by the Broad Institute (but is used here for compatibility with MultiVCFAnalyzer).", - "description": "Specify which genotyper to use." + "help_text": "Specifies which genotyper to use. Current options are: pileupCaller, ANGSD, GATK UnifiedGenotyper (v3.5), GATK HaplotypeCaller (v4), or FreeBayes.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware that GATK 3.5 is officially deprecated by the Broad Institute (but is used here for compatibility with MultiVCFAnalyzer).", + "description": "Specify which genotyper to use between: GATK UnifiedGenotyper, GATK HaplotypeCaller, FreeBayes, pileupCaller, or ANGSD." }, "skip_bcftools_stats": { "type": "boolean", - "fa_icon": "fas fa-forward", - "description": "Specify to skip generation of VCF-based variant calling statistics with bcftools.", - "help_text": "Specify to disable running of `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nThis will automatically include the FASTA reference for INDEL-related statistics." + "fa_icon": "far fa-chart-bar", + "description": "Skip bcftools stats generation for VCF-based variant calling statistics.", + "help_text": "Disables running of `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIf run, `bcftools stats` will automatically include the FASTA reference for INDEL-related statistics." }, "genotyping_reference_ploidy": { "type": "integer", @@ -936,27 +929,27 @@ "genotyping_pileupcaller_min_base_quality": { "type": "integer", "default": 30, - "description": "Specify the base mapping quality to be used for genotyping with pileupCaller.", - "help_text": "Specify the minimum base quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller.\n\n> Modifies samtools mpileup parameter: `-Q`.", + "description": "The minimum base quality to be used for genotyping with pileupCaller.", + "help_text": "The minimum base quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller.\n\n> Modifies samtools mpileup parameter: `-Q`.", "fa_icon": "fas fa-filter" }, "genotyping_pileupcaller_min_map_quality": { "type": "integer", "default": 30, "fa_icon": "fas fa-filter", - "description": "Specify the minimum mapping quality to be used for genotyping with pileupCaller.", - "help_text": "Specify the minimum mapping quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller.\n\n> Modifies samtools mpileup parameter: `-q`."
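The mpileup quality cut-offs above translate into a genotyping call of roughly this shape (a hedged sketch; panel and sample names are hypothetical, and the pileupCaller flag spellings should be checked against the sequenceTools documentation):

```bash
samtools mpileup -B -q 30 -Q 30 -l panel.bed -f reference.fasta library.bam \
    | pileupCaller --randomHaploid --sampleNames sample1 \
        --snpFile panel.snp --eigenstratOut sample1
```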
+ "description": "The minimum mapping quality to be used for genotyping with pileupcaller.", + "help_text": "The minimum mapping quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller. \n\n> Modifies samtools mpileup parameter: `-q`." }, "genotyping_pileupcaller_bedfile": { "type": "string", "fa_icon": "fas fa-bed", - "help_text": "Specify a SNP panel in the form of a BED file of sites at which to generate a pileup for pileupCaller.", + "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate a pileup for pileupCaller.", "format": "file-path", - "description": "Specify the path to SNP panel in BED format for pileupCaller." + "description": "Specify the path to SNP panel in bed format for pileupCaller." }, "genotyping_pileupcaller_snpfile": { "type": "string", - "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/blob/master/CONVERTF/README) format of sites to be called with pileupCaller.", + "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/blob/master/CONVERTF/README) format, pileupCaller will call these sites.", "fa_icon": "fas fa-sliders-h", "format": "file-path", "description": "Specify the path to SNP panel in EIGENSTRAT format for pileupCaller." @@ -965,15 +958,15 @@ "type": "string", "default": "randomHaploid", "fa_icon": "fas fa-toolbox", - "description": "Specify the SNP calling method to use for genotyping with pileupCaller.", - "help_text": "Specify the SNP calling method to use for genotyping. 'randomHaploid' will randomly sample a read overlapping the SNP and produce a homozygous genotype with the allele supported by that read (often called 'pseudohaploid' or 'pseudodiploid'). 'randomDiploid` will randomly sample two reads overlapping the SNP and produce a genotype comprised of the two alleles supported by the two reads. 'majorityCall' will produce a genotype that is homozygous for the allele that appears in the majority of reads overlapping the SNP.\n\n> Modifies pileupCaller parameters: `--randomHaploid` `--randomDiploid` `--majorityCall`", + "description": "Specify the SNP calling method to use for genotyping.", + "help_text": "Specify the SNP calling method to use for genotyping. 'randomHaploid' will randomly sample a read overlapping the SNP, and produce a homozygous genotype with the allele supported by that read (often called 'pseudohaploid' or 'pseudodiploid'). 'randomDiploid` will randomly sample two reads overlapping the SNP and produce a genotype comprised of the two alleles supported by the two reads. 'majorityCall' will produce a genotype that is homozygous for the allele that appears in the majority of reads overlapping the SNP.\n\n> Modifies pileupCaller parameters: `--randomHaploid` `--randomDiploid` `--majorityCall`", "enum": ["randomHaploid", "randomDiploid", "majorityCall"] }, "genotyping_pileupcaller_transitions_mode": { "type": "string", "default": "AllSites", - "description": "Specify the calling mode for transitions with pileupCaller.", - "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively.\n\n> Modifies pileupCaller parameter: `--skipTransitions` `--transitionsMissing`", + "description": "Specify the calling mode for transitions.", + "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. 
\n\n> Modifies pileupCaller parameter: `--skipTransitions` `--transitionsMissing`", "enum": ["AllSites", "TransitionsMissing", "SkipTransitions"], "fa_icon": "fas fa-toggle-on" }, @@ -982,13 +975,13 @@ "default": 30, "fa_icon": "fas fa-balance-scale-right", "description": "Specify GATK phred-scaled confidence threshold.", - "help_text": "Specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" + "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" }, "genotyping_gatk_dbsnp": { "type": "string", - "help_text": "Specify VCF file for output VCF SNP annotation, e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.", + "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.", "fa_icon": "fas fa-pen-alt", - "description": "Specify VCF file for SNP annotation of output VCF files for GATK.", + "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", "pattern": "^\\S+\\.vcf$", "format": "file-path", "mimetype": "VCF" @@ -997,16 +990,16 @@ "type": "integer", "default": 250, "fa_icon": "fas fa-icicles", - "description": "Specify the maximum depth coverage allowed for genotyping with GATK before down-sampling is turned on.", - "help_text": "Specify the maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to this many reads.\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" + "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", + "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to this many reads.\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" }, "genotyping_gatk_ug_out_mode": { "type": "string", "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK UnifiedGenotyper output mode.", + "description": "Specify GATK output mode.", "enum": ["EMIT_VARIANTS_ONLY", "EMIT_ALL_CONFIDENT_SITES", "EMIT_ALL_SITES"], - "help_text": "Specify GATK UnifiedGenotyper output mode to use when producing the output VCF (i.e. produce calls for every site or just confidence sites.)\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", - "fa_icon": "fas fa-toggle-on" + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, this defines the output mode to use when producing the output VCF (i.e. produce calls for every site or just confidence sites.)\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", + "fa_icon": "fas fa-bullhorn" }, "genotyping_gatk_ug_genotype_mode": { "type": "string", @@ -1014,49 +1007,49 @@ "description": "Specify UnifiedGenotyper likelihood model.", "enum": ["SNP", "INDEL", "BOTH", "GENERALPLOIDYSNP", "GENERALPLOIDYINDEL"], "fa_icon": "fas fa-project-diagram", - "help_text": "Specify GATK UnifiedGenotyper likelihood model, i.e. 
whether to call only SNPs or INDELS etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, this sets which likelihood model to follow, i.e. whether to call only SNPs or INDELs etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" }, "genotyping_gatk_ug_keeprealignbam": { "type": "boolean", - "fa_icon": "fas fa-save", + "fa_icon": "far fa-save", "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", - "help_text": "Specify to output the BAMs that have realigned reads (with GATK (v3) IndelRealigner) around possible variants for improved genotyping with GATK UnifiedGenotyper in addition to the standard VCF output.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, providing this parameter will output the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping in addition to the standard VCF output.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." }, "genotyping_gatk_ug_defaultbasequalities": { "type": "integer", "default": -1, - "description": "Specify to supply a default base quality if a read is missing a base quality score.", - "help_text": "Specify a value to set base quality scores for genotyping with GATK UnifiedGenotyper, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to `-1` which is to not set any default quality (turned off).\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`", + "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to `-1`, which is to not set any default quality (turned off).\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`", "fa_icon": "fas fa-redo-alt" }, "genotyping_gatk_hc_out_mode": { "type": "string", "default": "EMIT_VARIANTS_ONLY", - "fa_icon": "fas fa-toggle-on", - "description": "Specify GATK HaplotypeCaller output mode.", - "help_text": "Specify the type of sites that should be included in the output VCF after genotyping with GATK HaplotypeCaller (i.e. produce calls for every site or just confidence sites).\n\n> Modifies GATK HaplotypeCaller parameter: `--output_mode`", + "fa_icon": "fas fa-bullhorn", + "description": "Specify GATK HaplotypeCaller output mode.", + "help_text": "If GATK HaplotypeCaller is selected as the genotyping tool, this sets the type of sites that should be included in the output VCF (i.e. produce calls for every site or just confidence sites).
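Assembled from the UnifiedGenotyper options above, an equivalent GATK 3.5 command might look like this (illustrative only; file names are hypothetical):

```bash
java -jar GenomeAnalysisTK.jar -T UnifiedGenotyper \
    -R reference.fasta -I library.bam -o library.unifiedgenotyper.vcf \
    -stand_call_conf 30 -dcov 250 \
    --output_mode EMIT_VARIANTS_ONLY \
    --genotype_likelihoods_model SNP
```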
\n\n> Modifies GATK HaplotypeCaller parameter: `--output_mode`", "enum": ["EMIT_VARIANTS_ONLY", "EMIT_ALL_CONFIDENT_SITES", "EMIT_ALL_ACTIVE_SITES"] }, "genotyping_gatk_hc_emitrefconf": { "type": "string", "default": "GVCF", - "fa_icon": "fas fa-toggle-on", + "fa_icon": "fas fa-bullhorn", "description": "Specify HaplotypeCaller mode for emitting reference confidence calls.", - "help_text": "Specify GATK HaplotypeCaller mode for emitting reference confidence calls.\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`", + "help_text": "If GATK HaplotypeCaller is selected as the genotyping tool, this sets the mode for emitting reference confidence calls.\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`", "enum": ["NONE", "BP_RESOLUTION", "GVCF"] }, "genotyping_freebayes_min_alternate_count": { "type": "integer", "default": 1, - "description": "Specify minimum required supporting observations of an alternate allele to consider a variant in FreeBayes.", - "help_text": "Specify the minimum count of observations supporting an alternate allele within a single individual in order to evaluate the position during genotyping with FreeBayes.\n\n> Modifies FreeBayes parameter: `-C`", - "fa_icon": "fas fa-filter" + "description": "Specify minimum required supporting observations of an alternate allele to consider a variant.", + "help_text": "Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position.\n\n> Modifies freebayes parameter: `-C`", + "fa_icon": "fas fa-align-center" }, "genotyping_freebayes_skip_coverage": { "type": "integer", "default": 0, - "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in FreeBayes.", - "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than the specified value during genotyping with FreeBayes. This is set to 0 by default, which deactivates this behaviour.\n\n> Modifies FreeBayes parameter: `-g`", + "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified.", + "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than the specified value. 
Setting to 0 (the default) deactivates this behaviour.\n\n> Modifies freebayes parameter: `-g`", "fa_icon": "fab fa-think-peaks" }, "genotyping_angsd_glmodel": { "type": "string", "default": "samtools", "fa_icon": "fas fa-project-diagram", "description": "Specify which ANGSD genotyping likelihood model to use.", - "help_text": "Specify which genotype likelihood model to use in ANGSD.\n\n> Modifies ANGSD parameter: `-GL`", + "help_text": "Specify which genotype likelihood model to use.\n\n> Modifies ANGSD parameter: `-GL`", "enum": ["samtools", "gatk", "soapsnp", "syk"] }, "genotyping_angsd_glformat": { "type": "string", "default": "binary", "fa_icon": "fas fa-text-height", "description": "Specify the formatting of the output VCF for ANGSD genotype likelihood results.", - "help_text": "Specifies what type of genotyping likelihood file format will be output by ANGSD.\n\nThe options refer to the following descriptions respectively:\n\n- `binary`: binary output of all 10 log genotype likelihood\n- `beagle_binary`: beagle likelihood file\n- `binary_three`: binary 3 times likelihood\n- `text`: text output of all 10 log genotype likelihoods.\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlf`", + "help_text": "Specifies what type of genotyping likelihood file format will be output.\n\nThe options refer to the following descriptions respectively:\n\n- `binary`: binary output of all 10 log genotype likelihood\n- `beagle_binary`: beagle likelihood file\n- `binary_three`: binary 3 times likelihood\n- `text`: text output of all 10 log genotype likelihoods.\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlf`", "enum": ["binary", "beagle_binary", "binary_three", "text"] } }, @@ -1089,15 +1082,15 @@ "properties": { "run_mtnucratio": { "type": "boolean", - "description": "Specify to turn on mitochondrial to nuclear ratio calculation.", - "help_text": "Specify to turn on estimation of the ratio of mitochondrial to nuclear reads.", - "fa_icon": "fas fa-power-off" + "description": "Turn on mitochondrial to nuclear ratio calculation.", + "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.", + "fa_icon": "fas fa-balance-scale-left" }, "mitochondrion_header": { "type": "string", "default": "MT", - "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome.", - "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts as the mitochondrial 'chromosome' to base the ratio calculation on. The tool only accepts the first section of the header before the first space. The default chromosome name is based on hs37d5/GrCH37 human reference genome.", + "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", + "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts as the mitochondrial 'chromosome' to base the ratio calculation on. The tool only accepts the first section of the header before the first space.
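For the ANGSD options above, a minimal equivalent command could be the following (a sketch; paths hypothetical; the numeric codes follow ANGSD's encoding, e.g. `-GL 1` for the samtools model and `-doGlf 1` for binary output):

```bash
angsd -i library.bam -GL 1 -doGlf 1 -out sample1
```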
The default chromosome name is based on hs37d5/GrCH37 human reference genome.", "fa_icon": "fas fa-heading" } } }, @@ -1105,19 +1098,19 @@ "mapping_statistics": { "title": "Mapping statistics", "type": "object", - "description": "Options for the calculation of mapping statistics", + "description": "Options for the calculation of mapping statistics.", "default": "", "properties": { "mapstats_skip_preseq": { "type": "boolean", - "help_text": "Specify to turn off the computation of library complexity estimation.", - "description": "Specify to turn off the computation of library complexity estimation with preseq.", + "help_text": "Turns off the computation of library complexity estimation.", + "description": "Turns off the computation of library complexity estimation with preseq.", "fa_icon": "fas fa-forward" }, "mapstats_preseq_mode": { "type": "string", "default": "c_curve", - "help_text": "Specify which mode of preseq to run.\n\nFrom the [preseq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf):\n\nc curve is used to compute the expected complexity curve of a mapped read file with a hypergeometric formula\n\nlc extrap is used to generate the expected yield for theoretical larger experiments and bounds on the number of distinct reads in the library and the associated confidence intervals, which is computed by bootstrapping the observed duplicate counts histogram.", + "help_text": "Specify which mode of preseq to run.\n\nFrom the [preseq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf):\n\n`c_curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric formula.\n\n`lc_extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the number of distinct reads in the library and the associated confidence intervals, which is computed by bootstrapping the observed duplicate counts histogram.", "description": "Specify which mode of preseq to run.", "fa_icon": "fas fa-toggle-on", "enum": ["c_curve", "lc_extrap"] }, "mapstats_preseq_stepsize": { "type": "integer", "default": 1000, - "description": "Specify the step size (i.e., sampling regularity) of preseq.", - "help_text": "Specify the step size of preseq's c_curve and lc_extrap methods. This can be useful when few reads are present and allow preseq to be used for extrapolation of shallow sequencing results.\n\n\n> Modifies preseq parameter:\n> `-s`", + "description": "Specify the step size (i.e., sampling regularity) of preseq.", + "help_text": "Can be used to configure the step size of preseq's `c_curve` and `lc_extrap` methods. This can be useful when few reads are present, and allows preseq to be used for extrapolation of shallow sequencing results.\n\n> Modifies tool parameter(s):\n> - preseq: `-s`", "fa_icon": "fas fa-shoe-prints" }, "mapstats_preseq_terms": { "type": "integer", "default": 100, "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the maximum number of terms that preseq's lc_extrap mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`", - "description": "Specify the maximum number of terms that preseq's lc_extrap mode will use." + "help_text": "Specify the maximum number of terms that lc_extrap mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`", + "description": "Specify the maximum number of terms that lc_extrap mode will use."
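Put together, the preseq options above correspond to invocations of roughly this shape (a sketch; paths are hypothetical, and `-B` assumes a sorted BAM as input):

```bash
preseq c_curve -B -s 1000 -o complexity_curve.txt library.bam
preseq lc_extrap -B -s 1000 -x 100 -e 10000000000 -n 100 -c 0.95 \
    -o extrapolated_yield.txt library.bam
```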
}, "mapstats_preseq_maxextrap": { "type": "integer", "default": 10000000000, "fa_icon": "fas fa-ban", - "help_text": "Specify the maximum extrapolation that preseq's lc_extrap mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`", - "description": "Specify the maximum extrapolation to use for preseq's lc_extrap mode." + "help_text": "Specify the maximum extrapolation that lc_extrap mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`", + "description": "Specify the maximum extrapolation (lc_extrap mode only)" }, "mapstats_preseq_bootstrap": { "type": "integer", "default": 100, "fa_icon": "fab fa-bootstrap", - "help_text": "Specify the number of bootstraps preseq's lc_extrap mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`", - "description": "Specify number of bootstraps to perform in preseq's lc_extrap mode." + "help_text": "Specify the number of bootstraps lc_extrap mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`", + "description": "Specify number of bootstraps to perform (lc_extrap mode only)" }, "mapstats_preseq_cval": { "type": "number", "default": 0.95, "fa_icon": "fas fa-check-circle", - "help_text": "Specify the allowed level of confidence intervals used for prerseq's lc_extrap mode.\n\n> Modifies preseq lc_extrap parameter: `-c`", - "description": "Specify confidence interval level for preseq's lc_extrap mode." + "help_text": "Specify the allowed level of confidence intervals used for lc_extrap mode.\n\n> Modifies preseq lc_extrap parameter: `-c`", + "description": "Specify confidence interval level (lc_extrap mode only)" }, "mapstats_preseq_defects_mode": { "type": "boolean", - "description": "Specify to turn on preseq defects mode to extrapolate without testing for defects in lc_extrap mode.", - "help_text": "Specify to activate defects mode of `preseq lc_extrap`, which runs the extrapolation without testing for defects.\n\n> Modifies preseq lc_extrap parameter: `-D`", - "fa_icon": "fas fa-power-off" + "description": "Turns on defects mode to extrapolate without testing for defects (lc_extrap mode only).", + "help_text": "Activates defects mode of `lc_extrap`, which does the extrapolation without testing for defects.\n\n> Modifies preseq lc_extrap parameter: `-D`", + "fa_icon": "fab fa-creative-commons-sampling-plus" }, "skip_qualimap": { - "type": "boolean", - "description": "Specify to turn off coverage calculation with Qualimap.", - "fa_icon": "fas fa-forward" + "type": "boolean" }, "snpcapture_bed": { "type": "string", - "description": "Specify path to SNP capture positions in BED format for coverage calculations with Qualimap." + "description": "Path to snp capture in BED format. Provided file can also be gzipped." + }, + "elongation_factor": { + "type": "integer", + "default": 500, + "description": "Specify the number of bases to extend reference by (circularmapper only)", + "help_text": "The number of bases to extend the reference genome with. 
By default this is set to 500 if not specified otherwise.", + "fa_icon": "fas fa-external-link-alt" } }, "fa_icon": "fas fa-search" @@ -1181,92 +1179,91 @@ "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", "default": "", "fa_icon": "fas fa-chart-line", - "help_text": "More documentation can be found at the follow links for:\n\n[DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane merging. BAM Trimming is only performed on non-UDG and half-UDG treated data.", + "help_text": "More documentation can be seen in the follow links for:\n\n[DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane merging. BAM Trimming is only performed on non-UDG and half-UDG treated data.", "properties": { "skip_damagecalculation": { "type": "boolean", "fa_icon": "fas fa-forward", - "help_text": "Specify to turn off computation of DNA damage profiles.", - "description": "Specify to turn off ancient DNA damage calculation." + "help_text": "Turns off damage calculation to compute DNA damage profiles." }, "damagecalculation_tool": { "type": "string", "default": "damageprofiler", "enum": ["damageprofiler", "mapdamage"], - "fa_icon": "fas fa-hammer", + "fa_icon": "fas fa-tools", "description": "Specify the tool to use for damage calculation.", "help_text": "Specify the tool to be used for damage calculation. DamageProfiler is generally faster than mapDamage2, but the latter has an option to limit the number of reads used. This can significantly speed up the processing of very large files, where the damage estimates are already accurate after processing only a fraction of the input." }, "damagecalculation_yaxis": { "type": "number", "default": 0.3, - "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot.", - "help_text": "Specify the maximum misincorporation frequency that should be displayed in the damage plot.\n\n> Modifies DamageProfiler parameter: `-yaxis_dp_max` or mapDamage2 parameter: `--ymax`", + "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot. 
Set to 0 to 'autoscale'.", + "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the damage plot.\n\n> Modifies DamageProfiler parameter: -yaxis_dp_max or mapDamage2 parameter: --ymax", "fa_icon": "fas fa-ruler-combined" }, "damagecalculation_xaxis": { "type": "integer", "default": 25, "description": "Specify number of bases of each read to be considered for plotting damage estimation.", - "help_text": "Specify the number of bases to be considered for plotting nucleotide misincorporations.\n\n> Modifies DamageProfiler parameter: `-t` or mapDamage2 parameter: `-m`", + "help_text": "Specifies the number of bases to be considered for plotting nucleotide misincorporations.\n\n> Modifies DamageProfiler parameter: -t or mapDamage2 parameter: -m\n", "fa_icon": "far fa-chart-bar" }, "damagecalculation_damageprofiler_length": { "type": "integer", "default": 100, - "description": "Specify the length filter for DamageProfiler.", - "help_text": "Specify the number of bases which are considered for frequency computations.\n\n> Modifies DamageProfiler parameter: `-l`", - "fa_icon": "fas fa-ruler-horizontal" + "description": "Specifies the length filter for DamageProfiler.", + "help_text": "Number of bases which are considered for frequency computations, by default set to 100.\n\n> Modifies DamageProfiler parameter: -l", + "fa_icon": "fas fa-sort-amount-down" }, "damagecalculation_mapdamage_downsample": { "type": "integer", "default": 0, "fa_icon": "fas fa-compress-alt", - "description": "Specify the maximum number of reads to consider for damage calculation with mapDamage.", - "help_text": "Specify the maximum number of reads used for damage calculation in mapDamage2. This can be used to significantly reduce the amount of time required for damage assessment. Note that a too low value can also obtain incorrect results.\n\n> Modifies mapDamage2 parameter: `-n`" + "description": "Specify the maximum number of reads to consider for damage calculation. Default value is 0 (i.e. no downsampling is performed).", + "help_text": "The maximum number of reads used for damage calculation in mapDamage2. Can be used to significantly reduce the amount of time required for damage assessment. Note that too low a value can also produce incorrect results.\n\n> Modifies mapDamage2 parameter: -n\n" } } }, "feature_annotation_statistics": { "title": "Feature Annotation Statistics", "type": "object", - "description": "Options for calculating reference annotation statistics (e.g. gene coverages)", + "description": "Options for getting reference annotation statistics (e.g. gene coverages)", "default": "", "properties": { "run_bedtools_coverage": { "type": "boolean", - "description": "Specify to turn on calculation of number of reads, depth and breadth coverage of features in reference with bedtools.", - "fa_icon": "fas fa-power-off", - "help_text": "Specify to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n\n> Modifies bedtools coverage parameter: `-mean`" + "description": "Turn on ability to calculate no.
reads, depth and breadth coverage of features in reference.", + "fa_icon": "fas fa-chart-area", + "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n\n> Modifies tool parameter(s):\n- bedtools coverage: `-mean`" }, "mapstats_bedtools_featurefile": { "type": "string", - "description": "Specify path to GFF or BED file containing positions of features in reference file for bedtools.", + "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", "fa_icon": "fas fa-file-signature", "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)). Must be in quotes.\n" } }, "fa_icon": "fas fa-scroll", - "help_text": "If you're interested in looking at coverage stats for certain features on your reference such as genes, SNPs etc., you can use the following bedtools module for this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library BAMs that have the same type of UDG treatment." + "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" }, "host_removal": { "title": "Host Removal", "type": "object", - "description": "Options for removing host-mapped reads", + "description": "", "default": "", "properties": { "run_host_removal": { "type": "boolean", - "description": "Specify to turn on creation of pre-adapter-removal and/or read-pair-merging FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data).", - "help_text": "Specify to recreate pre-adapter-removal and/or read-pair-merging FASTQ files but without reads that mapped to reference (e.g. for public upload of privacy-sensitive non-host data)", + "description": "Turn on per-lane creation of pre-adapter-removal and/or read-pair-merging FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", + "help_text": "Recreates pre-adapter-removal and/or read-pair-merging FASTQ files but without reads that mapped to reference (e.g. for public upload of privacy-sensitive non-host data)", "fa_icon": "fas fa-power-off" }, "host_removal_mode": { "type": "string", "default": "remove", - "description": "Specify the host-mapped read removal mode.", - "help_text": "Specify the host-mapped read removal mode.\n\n> Modifies extract_map_reads.py parameter: -m", - "fa_icon": "fas fa-toggle-on", + "description": "Host-mapped read removal mode. 
Remove mapped reads completely from FASTQ (remove) or just mask the host sequence of mapped reads with N (replace).", + "help_text": "Modifies extract_map_reads.py parameter: -m", + "fa_icon": "fas fa-plane-slash", "enum": ["remove", "replace"] } }, @@ -1275,56 +1272,56 @@ "contamination_estimation": { "title": "Contamination estimation", "type": "object", - "description": "Options for the estimation of contamination in human data", + "description": "Options for the estimation of contamination", "default": "", "fa_icon": "fas fa-radiation-alt", "properties": { "run_contamination_estimation_angsd": { "type": "boolean", - "description": "Specify to turn on nuclear contamination estimation for genomes with ANGSD.", - "help_text": "Specify to run nuclear DNA contamination estimation with ANGSD.", + "description": "Turn on nuclear contamination estimation for genomes with ANGSD.", + "help_text": "Specify to run the optional processes for nuclear DNA contamination estimation with ANGSD.", "fa_icon": "fas fa-power-off" }, "contamination_estimation_angsd_chrom_name": { "type": "string", "default": "X", - "description": "Specify the name of the chromosome to be used for contamination estimation with ANGSD.", - "help_text": "Specify the name of the chromosome to be used for contamination estimation with ANGSD as specified in your FASTA/BAM header, e.g. 'X' for hs37d5 or 'chrX' for hg19", + "description": "The name of the chromosome to be used for contamination estimation.", + "help_text": "The name of the chromosome as specified in your FASTA/bam header.\ne.g. 'X' for hs37d5, 'chrX' for HG19", "fa_icon": "fas fa-address-card" }, "contamination_estimation_angsd_range_from": { "type": "integer", "default": 5000000, - "description": "Specify the first position on the chromosome to be used for contamination estimation with ANGSD.", - "help_text": "Specify the beginning of the genetic range that should be utilised for nuclear contamination estimation with ANGSD.", + "description": "The first position on the chromosome to be used for contamination estimation with ANGSD.", + "help_text": "The beginning of the genetic range that should be utilised for nuclear contamination estimation.", "fa_icon": "fas fa-map-marker-alt" }, "contamination_estimation_angsd_range_to": { "type": "integer", "default": 154900000, - "help_text": "Specify the end of the genetic range that should be utilised for nuclear contamination estimation with ANGSD.", - "description": "Specify the last position on the chromosome to be used for contamination estimation with ANGSD.", + "help_text": "The end of the genetic range that should be utilised for nuclear contamination estimation.", + "description": "The last position on the chromosome to be used for contamination estimation with ANGSD.", "fa_icon": "fas fa-map-marker-alt" }, "contamination_estimation_angsd_mapq": { "type": "integer", "default": 30, - "help_text": "Specify the minimum mapping quality reads should have for contamination estimation with ANGSD.\n\n> Modifies ANGSD parameter: `-minMapQ`", + "help_text": "> Modifies angsd parameter: `-minMapQ`", "description": "Specify the minimum mapping quality reads should have for contamination estimation with ANGSD.", - "fa_icon": "fas fa-filter" + "fa_icon": "fas fa-thermometer-full" }, "contamination_estimation_angsd_minq": { "type": "integer", "default": 30, "description": "Specify the minimum base quality reads should have for contamination estimation with ANGSD.", - "help_text": "Specify the minimum base quality reads should have 
for contamination estimation with ANGSD.\n\n> Modifies ANGSD parameter: `-minQ`", - "fa_icon": "fas fa-filter" + "help_text": "> Modifies angsd parameter: `-minQ`", + "fa_icon": "fas fa-ruler-vertical" }, "contamination_estimation_angsd_hapmap": { "type": "string", - "default": "${projectDir}/assets/angsd_resources/HapMapChrX.gz", - "description": "Specify path to HapMap file of chromosome for contamination estimation with ANGSD.", - "help_text": "Specify a path to HapMap file of chromosome for contamination estimation with ANGSD. The haplotype map, or \"HapMap\", records the location of haplotype blocks and their tag SNPs.", + "default": "/Users/judith_ballesteros/Documents/GitHub/eager/assets/angsd_resources/HapMapChrX.gz", + "description": "Path to HapMap file of chromosome for contamination estimation.", + "help_text": "The haplotype map, or \"HapMap\", records the location of haplotype blocks and their tag SNPs.", "fa_icon": "fas fa-map" } } @@ -1332,20 +1329,20 @@ "human_sex_determination": { "title": "Human Sex Determination", "type": "object", - "description": "Options for the calculation of genetic sex of human individuals.", + "description": "Options for the calculation of biological sex of human individuals.", "default": "", "properties": { "run_sexdeterrmine": { "type": "boolean", - "fa_icon": "fas fa-power-off", - "description": "Specify to turn on sex determination for genomes mapped to human reference genomes with Sex.DetERRmine.", - "help_text": "Specify to run genetic sex determination." + "fa_icon": "fas fa-transgender-alt", + "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", + "help_text": "Specify to run the optional process of sex determination." }, "sexdeterrmine_bedfile": { "type": "string", "fa_icon": "fas fa-bed", - "description": "Specify path to SNP panel in BED format for error bar calculation.", - "help_text": "Specify a BED file with SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240k panel in mind." + "description": "Specify path to SNP panel in BED format for error bar calculation. Optional (see documentation).", + "help_text": "Specify an optional BED file of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind."
} }, "fa_icon": "fas fa-transgender-alt", From c98d7f877695aed16956495298e6642a4f7b5687 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 28 Jun 2024 13:02:07 +0200 Subject: [PATCH 114/198] Fix one more bug in bamfilter module --- subworkflows/local/bamfiltering.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index 95f984860..12f9cbd03 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -99,7 +99,7 @@ workflow FILTER_BAM { // TODO: see request https://github.com/nf-core/eager/issues/945 if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { - ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq + ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_MAPPED.out.fastq .mix(SAMTOOLS_FASTQ_MAPPED.out.singleton) .mix(SAMTOOLS_FASTQ_MAPPED.out.other) .groupTuple() From 2e9bb578669df17d2248c67f5f4a95a8be38e569 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Jun 2024 12:38:34 +0000 Subject: [PATCH 115/198] resolved PR comments (formating, redundant info) --- nextflow_schema.json | 58 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fb030fe4a..ce024e5fb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -713,9 +713,9 @@ "metagenomics_profiling_database": { "type": "string", "format": "path", - "description": "Specify a databse directory to run metagenomics profiling on. In the case of kraken2, this can be a tar.gz of the directory. Required if `--run_metagenomics` flagged.", + "description": "Specify a databse directory or .tar.gz file of a database directory to run metagenomics profiling on. Required if `--run_metagenomics` flagged.", "fa_icon": "fas fa-database", - "help_text": "Select which tool to run metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_input. These databases are NOT cross-compatible and need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size." + "help_text": "Specify a metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_input. Databases can be provided both as a directory, or a tar.gz of a directory. Metagenomic databases are NOT compatible across different tools (ie a MALT database is different from a kraken2 database).\n\nAll databases need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size.\n\nModifies tool parameter(s):\n> - krakenuniq: `--db`\n> - kraken2: `--db`\n> - MetaPhlAn: `--bowtie2db` and `--index`\n> - MALT: '-index'" }, "metagenomics_kraken_savereads": { "type": "boolean", @@ -740,22 +740,22 @@ "type": "boolean", "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.", "fa_icon": "fas fa-save", - "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command." 
+ "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\n Modifies kraken2 parameter: `--report-minimizer-data`." }, "metagenomics_malt_mode": { "type": "string", "default": "BlastN", - "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", + "description": "Specify which alignment mode to use for MALT.", "fa_icon": "fas fa-align-left", - "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", + "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-m`\n", "enum": ["BlastN", "BlastP", "BlastX"] }, "metagenomics_malt_alignmentmode": { "type": "string", "default": "SemiGlobal", - "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", + "description": "Specify alignment method for MALT.", "fa_icon": "fas fa-align-center", - "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", + "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-at`", "enum": ["Local", "SemiGlobal"] }, "metagenomics_malt_minpercentidentity": { @@ -763,21 +763,21 @@ "default": 85, "description": "Percent identity value threshold for MALT.", "fa_icon": "fas fa-id-card", - "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" + "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT:`-id`" }, "metagenomics_malt_toppercent": { "type": "integer", "default": 1, "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", "fa_icon": "fas fa-percent", - "help_text": "Specify the top percent value of the LCA algorithm. 
From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" + "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-top`" }, "metagenomics_malt_minsupportmode": { "type": "string", "default": "percent", - "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.", + "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT.", "fa_icon": "fas fa-drumstick-bite", - "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", + "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-sup` and `-supp`", "enum": ["percent", "reads"] }, "metagenomics_malt_minsupportpercent": { @@ -785,35 +785,35 @@ "default": 0.01, "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", "fa_icon": "fas fa-percentage", - "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" + "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-supp`" }, "metagenomics_minsupportreads": { "type": "integer", "default": 1, "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt or kraken. Not compatible with --malt_min_support_mode 'percent'.", "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "For usage in malt: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'.Default: 1. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. \n\n> Modifies MALT parameter: `-sup` \n" + "help_text": "For usage in malt: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'.Default: 1. 
\nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. \n\n> Modifies tool parameter(s):\n> - MALT: `-sup` \n" }, "metagenomics_malt_maxqueries": { "type": "integer", "default": 100, "description": "Specify the maximum number of queries a read can have for MALT.", "fa_icon": "fas fa-phone", - "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" + "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-mq`" }, "metagenomics_malt_memorymode": { "type": "string", "default": "load", - "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", + "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow.", "fa_icon": "fas fa-memory", - "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", + "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `--memoryMode`", "enum": ["load", "page", "map"] }, "metagenomics_malt_savereads": { "type": "boolean", "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", "fa_icon": "fas fa-file-alt", - "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modified parameter for malt-run: `--alignments`" + "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. 
\n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Sets tool parameter(s):\n> - MALT: `--alignments`" }, "metagenomics_malt_group_size": { "type": "integer", @@ -832,21 +832,21 @@ "type": "string", "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", "default": null, - "help_text": "Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format corresponding to a taxonomic node in your MALT database. An example can be found on the [HOPS github](https://raw.githubusercontent.com/rhuebler/HOPS/external/Resources/default_list.txt).\\n\\nNecessary when `--metagenomics_profiling_tool malt` specified and `--metagenomics_run_postprocessing` flagged.", + "help_text": "Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format corresponding to a taxonomic node in your MALT database. An example can be found on the [HOPS github](https://raw.githubusercontent.com/rhuebler/HOPS/external/Resources/default_list.txt).\\n\\nNecessary when `--metagenomics_profiling_tool malt` specified and `--metagenomics_run_postprocessing` flagged.\n\n Modifies tool parameter(s):\n> - MaltExtract: `-t`", "fa_icon": "fas fa-align-left" }, "metagenomics_maltextract_ncbidir": { "type": "string", "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", "default": null, - "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nNecessary when `--metagenomics_profiling_tool malt` and `--metagenomics_run_postprocessing` specified.", + "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nNecessary when `--metagenomics_profiling_tool malt` and `--metagenomics_run_postprocessing` specified.\n\n Modifies tool parameter(s):\n> - MaltExtract: `-r`", "fa_icon": "fab fa-buffer" }, "metagenomics_maltextract_filter": { "type": "string", "default": "def_anc", - "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", - "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-f`", + "description": "Specify which MaltExtract filter to use.", + "help_text": "Specify which MaltExtract filter to use. 
This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `-f`", "fa_icon": "fas fa-filter", "enum": ["def_anc", "default", "ancient", "scan", "crawl", "srna"] }, @@ -854,48 +854,48 @@ "type": "number", "default": 0.01, "description": "Specify percent of top alignments to use.", - "help_text": "Specify frequency of top alignments for each read to be considered for each node.\\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\\n\\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\\n\\nDefault: `0.01`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `-a`", + "help_text": "Specify frequency of top alignments for each read to be considered for each node.\\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\\n\\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\\n\\nDefault: `0.01`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `-a`", "fa_icon": "fas fa-percent" }, "metagenomics_maltextract_destackingoff": { "type": "boolean", "description": "Turn off destacking.", - "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--destackingOff`", + "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\\nremoved (leaving a depth coverage of 1).\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--destackingOff`", "fa_icon": "fab fa-stack-overflow" }, "metagenomics_maltextract_downsamplingoff": { "type": "boolean", "description": "Turn off downsampling.", - "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--downSampOff`", + "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. 
This is to speed up processing, under the assumption that at 10,000 reads the species is a 'true positive'.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--downSampOff`", "fa_icon": "fas fa-angle-double-down" }, "metagenomics_maltextract_duplicateremovaloff": { "type": "boolean", "description": "Turn off duplicate removal.", - "help_text": "Turn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--dupRemOff`", + "help_text": "Turn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--dupRemOff`", "fa_icon": "fas fa-copy" }, "metagenomics_maltextract_matches": { "type": "boolean", "description": "Turn on exporting alignments of hits in BLAST format.", - "help_text": "Export alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--matches`", + "help_text": "Export alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `--matches`", "fa_icon": "fas fa-equals" }, "metagenomics_maltextract_megansummary": { "type": "boolean", "description": "Turn on export of MEGAN summary files.", - "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--meganSummary`" + "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--meganSummary`" }, "metagenomics_maltextract_minpercentidentity": { "type": "number", "default": 85, "description": "Minimum percent identity alignments are required to have to be reported as candidate reads. Recommended to set same as MALT parameter.", - "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--minPI`" + "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allow fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference.
Recommended to set same as MALT parameter or higher. Default: `85`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `--minPI`" }, "metagenomics_maltextract_usetopalignment": { "type": "boolean", "description": "Turn on using top alignments per read after filtering.", - "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\\n\\n> Modifies MaltExtract parameter: `--useTopAlignment`", + "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--useTopAlignment`", "fa_icon": "fas fa-bahai" } }, From 735a104e978331ab12e515edde49d5c90ce54ba5 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Jun 2024 12:42:05 +0000 Subject: [PATCH 116/198] last removal of options: text --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index ce024e5fb..98462cfae 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -846,7 +846,7 @@ "type": "string", "default": "def_anc", "description": "Specify which MaltExtract filter to use.", - "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `-f`", + "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters).\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `-f`", "fa_icon": "fas fa-filter", "enum": ["def_anc", "default", "ancient", "scan", "crawl", "srna"] }, From 098d16456260f785a7a9aefd15d0b5d3b20cf3e7 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 28 Jun 2024 16:39:37 +0200 Subject: [PATCH 117/198] remove redundant 'single_end = true' statement --- subworkflows/local/metagenomics.nf | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 1af725f30..d5b4d3de9 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -13,12 +13,10 @@ workflow METAGENOMICS { ch_multiqc_files = Channel.empty() ch_versions = Channel.empty() - // Add single_end parameter to meta. - // Reads were merged before, so single_end is always true! 
- - ch_bamfiltered_for_metagenomics = ch_bamfiltered_for_metagenomics.map{ - meta, bamfiltered -> [meta+['single_end':true], bamfiltered] - } + // Important Note: + // Due to the bamfilter submodule + // The single_end parameter in the meta is always true! + // keep in mind, in case this is changed in the future // // Run the complexity filter subworkflow From 56dfec92ec28e7f5186ed71ca79995e8cc968851 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 5 Jul 2024 08:28:14 +0000 Subject: [PATCH 118/198] added meta.reference to tool outputs (complexity + kraken2) --- conf/modules.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index fc086ff11..ae618f0bf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -858,7 +858,7 @@ process { params.metagenomics_prinseq_mode == 'dust' ? "-lc_dust=${params.metagenomics_prinseq_dustscore}" : "-lc_entropy=${params.metagenomics_complexity_entropy}", "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0", ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_complexity" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_complexity" } publishDir = [ [ path: { "${params.outdir}/metagenomics/complexity_filter/prinseq" }, @@ -872,7 +872,7 @@ process { withName: ".*BBMAP_BBDUK" { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = { "entropymask=f entropy=${params.metagenomics_complexity_entropy}" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_complexity" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_complexity" } publishDir = [ path: { "${params.outdir}/metagenomics/complexity_filter/bbduk/" }, mode: params.publish_dir_mode, @@ -913,7 +913,7 @@ process { ext.args = [ params.metagenomics_kraken2_saveminimizers ? "--report-minimizer-data" : "" ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/metagenomics/profiling/kraken2/" }, mode: params.publish_dir_mode, From ed9e94845e36dbfbc96dfa55bed05ba6fcb9b11c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Jul 2024 10:29:37 +0200 Subject: [PATCH 119/198] Revert "fixing schema" This reverts commit 3fac93d0b28b95081ef32d26ca1f6de9aa93c542. --- nextflow_schema.json | 563 ++++++++++++++++++++++--------------------- 1 file changed, 283 insertions(+), 280 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b59b73117..06737bf47 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,14 +19,14 @@ "mimetype": "text/csv", "pattern": "^\\S+\\.(c|t)sv$", "schema": "assets/schema_input.json", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/eager/usage#samplesheet-input).", + "description": "Path to tab- or comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. 
It has to be a tab- or comma-separated file with 11 columns, and a header row. See [usage docs](https://nf-co.re/eager/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, "convert_inputbam": { "type": "boolean", "description": "Specify to convert input BAM files back to FASTQ for remapping", - "help_text": "This parameter tells the pipeline to convert the BAM files listed in the `--input` TSV or CSV sheet back to FASTQ format to allow re-preprocessing and mapping\n\nCan be useful when you want to ensure consistent mapping parameters across all libraries when incorporating public data, however be careful of biases that may come from re-processing again (the BAM files may already be clipped, or only mapped reads with different settings are included so you may not have all reads from the original publication).", + "help_text": "This parameter tells the pipeline to convert the BAM files listed in the `--input` TSV or CSV sheet back to FASTQ format to allow re-preprocessing and mapping.\n\nCan be useful when you want to ensure consistent mapping parameters across all libraries when incorporating public data, however be careful of biases that may come from re-processing again (the BAM files may already be clipped, or only mapped reads with different settings are included so you may not have all reads from the original publication).", "fa_icon": "fas fa-undo-alt" }, "outdir": { @@ -62,31 +62,32 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "errorMessage": "The path to the reference FASTA file must not contain spaces and must have file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz' or '.fna.gz'.", - "description": "Path to FASTA genome file.", + "description": "Path to FASTA file of the reference genome.", "help_text": "This parameter is *mandatory* if `--genome` or `--fasta_sheet` are not specified. If you don't supply a mapper index (e.g. for BWA), this will be generated for you automatically. Combine with `--save_reference` to save mapper index for future runs.", "fa_icon": "far fa-file-code" }, "fasta_fai": { "type": "string", - "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied will be made for you.", + "description": "Specify path to samtools FASTA index.", "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by samtools faidx and has a file suffix of `.fai`.", "fa_icon": "fas fa-address-book" }, "fasta_dict": { "type": "string", - "description": "Path to picard sequence dictionary file (typically ending in '.dict'). If not supplied will be made for you.", + "description": "Specify path to Picard sequence dictionary file.", "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.", "fa_icon": "fas fa-address-book" }, "fasta_mapperindexdir": { "type": "string", - "description": "Path to directory containing index files of the FASTA for a given mapper.", - "help_text": "For most people this will likely be the same directory that contains the file you provided to `--fasta`.\n\nIf you want to use pre-existing `bwa index` indices, the directory should contain files ending in '.amb' '.ann' '.bwt'. If you want to use pre-existing `bowtie2 build` indices, the directory should contain files ending in'.1.bt2', '.2.bt2', '.rev.1.bt2'. 
\n\nIn any case do not include the files themselves in the path. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa index`/`bowtie2 build` file suffixes. If not supplied, the indices will be generated for you.\n\n" + "description": "Specify path to directory containing index files of the FASTA for a given mapper.", + "help_text": "For most people this will likely be the same directory that contains the file you provided to `--fasta`.\n\nIf you want to use pre-existing `bwa index` indices, the directory should contain files ending in '.amb' '.ann' '.bwt'. If you want to use pre-existing `bowtie2 build` indices, the directory should contain files ending in'.1.bt2', '.2.bt2', '.rev.1.bt2'.\n\nIn any case do not include the files themselves in the path. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa index`/`bowtie2 build` file suffixes. If not supplied, the indices will be generated for you.\n\n", + "fa_icon": "fas fa-folder-open" }, "save_reference": { "type": "boolean", - "description": "Specify to save any pipeline-generated reference genome indices in the results directory.", - "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you and will be saved in the `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> modifies SAMtools index command: `-c`", + "description": "Specify to save any pipeline-generated reference genome indices in the results directory.", + "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you and will be saved in the `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> Modifies SAMtools index command: `-c`", "fa_icon": "fas fa-save" }, "fasta_sheet": { @@ -125,8 +126,8 @@ }, "fasta_circular_target": { "type": "string", - "description": "Specify the FASTA header of the target chromosome to extend. Only applies when using `circularmapper`.", - "help_text": "The entry (chromosome, contig, etc.) in your FASTA reference that you'd like to be treated as circular.\n\nApplies only when providing a single FASTA file via `--fasta` (NOT multi-reference input - see reference TSV/CSV input).\n\n> Modifies tool parameter(s):\n> - circulargenerator `-s`\n", + "description": "Specify the FASTA header of the target chromosome to extend when using `circularmapper`.", + "help_text": "The entry (chromosome, contig, etc.) 
in your FASTA reference that you'd like to be treated as circular.\n\nApplies only when providing a single FASTA file via `--fasta` (NOT multi-reference input - see reference TSV/CSV input).\n\n> Modifies tool parameter(s):\n> - circulargenerator `-s`", "fa_icon": "fas fa-bullseye" } } @@ -335,7 +336,7 @@ "preprocessing": { "title": "Preprocessing", "type": "object", - "description": "Removal of adapters, paired-end merging, poly-G removal etc.", + "description": "Removal of adapters, paired-end merging, poly-G removal, etc.", "default": "", "properties": { "sequencing_qc_tool": { @@ -348,8 +349,8 @@ }, "skip_preprocessing": { "type": "boolean", - "description": "Specify to skip all preprocessing steps (adapter removal, paired-end merging, poly-g trimming etc).", - "help_text": "Specify to skip all preprocessing steps (adapter removal, paired-end merging, poly-g trimming etc).\n\nThis will also mean you will only get one set of FastQC results (of the input reads).", + "description": "Specify to skip all preprocessing steps (adapter removal, paired-end merging, poly-G trimming, etc).", + "help_text": "Specify to skip all preprocessing steps (adapter removal, paired-end merging, poly-G trimming etc).\n\nThis will also mean you will only get one set of FastQC results (of the input reads).", "fa_icon": "fas fa-forward" }, "preprocessing_tool": { @@ -364,18 +365,18 @@ "type": "boolean", "description": "Specify to skip read-pair merging.", "fa_icon": "fas fa-forward", - "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA, or when you want to utilise mate-pair 'spatial' information..\n\n\u26a0\ufe0f If you run this and also with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will be NOT be mapped when aligning with either bwa or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> \u26a0\ufe0f If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" + "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA or when you want to utilise mate-pair 'spatial' information.\n\n ⚠️ If you run this with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will be NOT be mapped when aligning with either BWA or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> ⚠️ If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. 
This merged file is _not_ saved in the results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`"
},
"preprocessing_excludeunmerged": {
"type": "boolean",
- "description": "Specify to exclude pairs that did not overlap sufficiently for merging (i.e., keep merged reads only).",
+ "description": "Specify to exclude read-pairs that did not overlap sufficiently for merging (i.e., keep merged reads only).",
"fa_icon": "fas fa-trash-alt",
- "help_text": "Specify to exclude pairs that did not overlap sufficiently for merging (i.e., keep merged reads only), in otherwords singletons (i.e. reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nMost ancient DNA molecules are very short, and the majority are expected to merge. Specifying this parameter can sometimes be useful when dealing with ultra-short aDNA reads to reduce the number of longer-reads you may have in your library that are derived from modern contamination. It can also speed up run time of mapping steps.\n\nYou may want to use this if you want ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality and/or are longer). It is highly recommended when using 'dedup' deduplication tool."
+ "help_text": "Specify to exclude read-pairs that did not overlap sufficiently for merging (i.e., keep merged reads only). Singletons (i.e. reads missing a pair) or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nMost ancient DNA molecules are very short, and the majority are expected to merge. Specifying this parameter can sometimes be useful when dealing with ultra-short aDNA reads to reduce the number of longer reads you may have in your library that are derived from modern contamination. It can also speed up the run time of mapping steps.\n\nYou may want to use this if you want to ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality and/or are longer). It is highly recommended when using the 'dedup' deduplication tool."
},
"preprocessing_skipadaptertrim": {
"type": "boolean",
"description": "Specify to skip removal of adapters.",
- "help_text": "Specify to turn off trimming of adapters from reads.\n\nYou may wish to do this if you are using public data (e.g. 
ENA, SRA), that _should_ have all library artefacts from reads.\n\nThis will override any other adapter parameters provided (i.e, `--preprocessing_adapterlist` and or/ `--preprocessing_adapter{1,2}` will be ignored)!\n\n> Modifies AdapterRemoval parameter: `--adapter1` and `--adapter2` (sets both to an empty string)\n> Applies fastp parameter: `--disable_adapter_trimming`",
+ "help_text": "Specify to turn off trimming of adapters from reads.\n\nYou may wish to do this if you are using publicly available data that _should_ already have had all library artefacts removed from the reads.\n\nThis will override any other adapter parameters provided (i.e., `--preprocessing_adapterlist` and/or `--preprocessing_adapter{1,2}` will be ignored)!\n\n> Modifies AdapterRemoval parameter: `--adapter1` and `--adapter2` (sets both to an empty string)\n> Applies fastp parameter: `--disable_adapter_trimming`",
"fa_icon": "fas fa-forward"
},
"preprocessing_adapter1": {
@@ -386,93 +387,93 @@
},
"preprocessing_adapter2": {
"type": "string",
- "description": "Specify the nucleotide sequence for the forward read/R2.",
+ "description": "Specify the nucleotide sequence for the reverse read/R2.",
"fa_icon": "fas fa-grip-lines",
"help_text": "Specify a nucleotide sequence for the forward read/R2.\n\nIf not modified by the user, the default for the particular preprocessing tool will be used. To turn off adapter trimming use `--preprocessing_skipadaptertrim`.\n\n> Modifies AdapterRemoval parameter: `--adapter2`\n> Modifies fastp parameter: `--adapter_sequence_r2`"
},
"preprocessing_adapterlist": {
"type": "string",
- "description": "Specify a list of all possible adapters to trim. Overrides --preprocessing_adapter1/2. Formats: .txt (AdapterRemoval) or .fasta. (fastp).",
- "help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. \n\nOverrides the `--preprocessing_adapter1`/`--preprocessing_adapter2` parameters . \n\nNote that the two tools have slightly different behaviours.\n\nFor AdapterRemoval this consists of a two column table with a `.txt` extension: first column represents forward strand, second column for reverse strand. You must supply all possibly combinations, one per line, and this list is applied to all files. Only Adapters in this list will be screened for and removed. See AdapterRemoval documentation for more information.\n\nFor fastp this consists of a standard FASTA format with a `.fasta`/`.fa`/`.fna`/`.fas` extension. The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. fastp first will perform auto-detection of reads and will be removed , and then _additionally_ adapters present in the FASTA file one by one will be removed.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`\n> Modifies fastp parameter: `--adapter_fasta`",
+ "description": "Specify a list of all possible adapters to trim.",
+ "help_text": "Specify a file with a list of adapter (combinations) to remove from all files.\n\nOverrides the `--preprocessing_adapter1`/`--preprocessing_adapter2` parameters.\n\nNote that the two tools have slightly different behaviours.\n\nFor AdapterRemoval this consists of a two-column table with a `.txt` extension: the first column represents the forward strand, the second column the reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. Only adapters in this list will be screened for and removed. 
See AdapterRemoval documentation for more information.\n\nFor fastp this consists of a standard FASTA format with a `.fasta`/`.fa`/`.fna`/`.fas` extension. The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. fastp will first perform auto-detection and removal of adapters, and then _additionally_ remove adapters present in the FASTA file one by one.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`\n> Modifies fastp parameter: `--adapter_fasta`",
"fa_icon": "fas fa-list"
},
"preprocessing_minlength": {
"type": "integer",
"default": 25,
"description": "Specify the minimum length reads must have to be retained.",
- "help_text": "Specify the minimum length reads must have to be retained. \n\nReads smaller than this length after trimming are discarded and not included in downstream analyses. Typically in ancient DNA users will set this to 30 or for very old samples around 25 bp - reads any shorter that this often are not specific enough to provide useful information.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n> Modifies fastp parameter: `--length_required`",
- "fa_icon": "fas fa-ruler"
+ "help_text": "Specify the minimum length reads must have to be retained.\n\nReads smaller than this length after trimming are discarded and not included in downstream analyses. Typically in ancient DNA, users will set this to 30, or around 25 bp for very old samples - reads any shorter than this often are not specific enough to provide useful information.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n> Modifies fastp parameter: `--length_required`",
+ "fa_icon": "fas fa-ruler-horizontal"
},
"preprocessing_trim5p": {
"type": "integer",
"default": 0,
- "description": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation.",
- "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`\n",
+ "description": "Specify number of bases to hard-trim from 5 prime or front of reads.",
+ "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. 
Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n ⚠️ When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie 2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore, this is more suitable for hard-removal of damage before mapping (however the Bowtie 2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`", "fa_icon": "fas fa-cut" }, "preprocessing_trim3p": { "type": "integer", "default": 0, - "description": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation.", + "description": "Specify number of bases to hard-trim from 3 prime or tail of reads.", "fa_icon": "fas fa-cut", - "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f when this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: this 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore this is more suitable for hard-removal of damage prior mapping (however the Bowtie2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`\n" + "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. 
By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n⚠️ When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie 2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. Therefore, this is more suitable for hard-removal of damage before mapping (however the Bowtie 2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`"
},
"preprocessing_savepreprocessedreads": {
"type": "boolean",
"description": "Specify to save the preprocessed reads in the results directory.",
"fa_icon": "fas fa-save",
- "help_text": "Specify to save the preprocessed reads in FASTQ format the results directory. \n\nThis can be useful for re-analysing in FASTQ files manually, or uploading to public data repositories such as ENA/SRA (provided you don't do length filtering nor merging)."
+ "help_text": "Specify to save the preprocessed reads in FASTQ format in the results directory.\n\nThis can be useful for re-analysing FASTQ files manually, or uploading to public data repositories such as ENA/SRA (provided you don't filter by length or merge paired reads)."
},
"preprocessing_fastp_complexityfilter": {
"type": "boolean",
- "description": "Specify to turn on sequence complexity filtering of reads with fastp.",
+ "description": "Specify to turn on sequence complexity filtering of reads.",
"help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using fastp.\n\nThis can be useful for trimming ploy-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs or NovaSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n\n> Modifies fastp parameter: `--trim_poly_g`",
- "fa_icon": "fas fa-cut"
+ "fa_icon": "fas fa-power-off"
},
"preprocessing_fastp_complexityfilter_threshold": {
"type": "integer",
"default": 10,
"description": "Specify the complexity threshold that must be reached or exceeded to retain reads.",
"help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming.\n\n> Modifies fastp parameter: `--poly_g_min_len`",
- "fa_icon": "fas fa-ruler"
+ "fa_icon": "fas fa-filter"
},
"preprocessing_adapterremoval_preserve5p": {
"type": "boolean",
- "description": "Skip AdapterRemoval base trimming (n, quality) of 5 prime end.",
+ "description": "Skip AdapterRemoval quality and N base trimming at the 5 prime end.",
"help_text": "Turns off quality based trimming at the 5p end of reads when any of the AdapterRemoval quality or N trimming options are used. 
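To make the fastp complexity-filter pairing described above concrete, a minimal sketch follows. The profile and input paths are placeholders; the threshold shown is simply the documented default, and the flag names assume the standard nf-core property-to-CLI-option mapping:

```bash
# Hypothetical sketch only: enable fastp poly-G/complexity filtering,
# keeping the default minimum poly-G run length of 10.
nextflow run nf-core/eager -profile docker \
    --input samplesheet.csv \
    --fasta reference.fasta \
    --outdir ./results \
    --preprocessing_tool fastp \
    --preprocessing_fastp_complexityfilter \
    --preprocessing_fastp_complexityfilter_threshold 10
```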
Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. For more information see the AdapterRemoval [documentation](https://adapterremoval.readthedocs.io/en/stable/manpage.html#cmdoption-adapterremoval-preserve5p).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`", "fa_icon": "fas fa-shield-alt" }, "preprocessing_adapterremoval_skipqualitytrimming": { "type": "boolean", - "description": "Skip AdapterRemoval quality and N trimming from ends of reads.", - "help_text": "Turns off AdapterRemoval quality trimming from ends of reads. \n\nThis can be useful to reduce runtime when running public data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimqualities` ", + "description": "Specify to skip AdapterRemoval quality and N trimming at the ends of reads.", + "help_text": "Turns off AdapterRemoval quality trimming from ends of reads.\n\nThis can be useful to reduce runtime when running public data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimqualities` ", "fa_icon": "fas fa-forward" }, "preprocessing_adapterremoval_trimbasequalitymin": { "type": "integer", "default": 20, "description": "Specify AdapterRemoval minimum base quality for trimming off bases.", - "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off.\n\n> Modifies AdapterRemoval parameter: `--minquality`", - "fa_icon": "fas fa-ruler-vertical" + "help_text": "Defines the minimum read quality per base that is required for a base to be kept by AdapterRemoval. Individual bases at the ends of reads falling below this threshold will be clipped off.\n\n> Modifies AdapterRemoval parameter: `--minquality`", + "fa_icon": "fas fa-filter" }, "preprocessing_adapterremoval_skipntrimming": { "type": "boolean", - "description": "Skip AdapterRemoval N trimming (quality trimming only).", - "help_text": "Turns off AdapterRemoval N trimming from ends of reads. \n\nThis can be useful to reduce runtime when running public data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimns` ", + "description": "Specify to skip AdapterRemoval N trimming (quality trimming only).", + "help_text": "Turns off AdapterRemoval N trimming from ends of reads.\n\nThis can be useful to reduce runtime when running publicly available data that has already been processed.\n\n> Modifies AdapterRemoval parameters: `--trimns` ", "fa_icon": "fas fa-forward" }, "preprocessing_adapterremoval_adapteroverlap": { "type": "integer", "default": 1, "description": "Specify the AdapterRemoval minimum adapter overlap required for trimming.", - "fa_icon": "fas fa-ruler-horizontal", + "fa_icon": "fas fa-filter", "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before AdapterRemoval trims adapters sequences from reads.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" }, "preprocessing_adapterremoval_qualitymax": { "type": "integer", "default": 41, - "description": "Specify the AdapterRemoval maximum Phred score used in input FASTQ files", - "help_text": "Specify maximum Phred score of the quality field of FASTQ files. \n\nThe quality-score range can vary depending on the machine and version (e.g. 
see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), and this allows you to increase from the default AdapterRemoval value of 41. \n\nNote that while this theoretically can provide you with more confident and precise base call information, many downstream tools only accept FASTQ files with Phred scores limited to a max of 41, and therefore increasing the default for this parameter may make the resulting preprocessed files incompatible with some downstream tools.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`",
+ "description": "Specify the AdapterRemoval maximum Phred score used in input FASTQ files.",
+ "help_text": "Specify the maximum Phred score of the quality field of FASTQ files.\n\nThe quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding)), and this allows you to increase from the default AdapterRemoval value of 41.\n\nNote that while this can theoretically provide you with more confident and precise base call information, many downstream tools only accept FASTQ files with Phred scores limited to a max of 41, and therefore increasing the default for this parameter may make the resulting preprocessed files incompatible with some downstream tools.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`",
"fa_icon": "fas fa-tachometer-alt"
}
},
@@ -486,9 +487,9 @@
"properties": {
"run_fastq_sharding": {
"type": "boolean",
- "description": "Turn on FastQ sharding.",
+ "description": "Specify to turn on FASTQ sharding.",
"fa_icon": "fas fa-power-off",
- "help_text": "Sharding will split the FastQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FastQ files."
+ "help_text": "Sharding will split the FASTQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FASTQ files."
},
"fastq_shard_size": {
"type": "integer",
@@ -502,99 +503,99 @@
"default": "bwaaln",
"enum": ["bwaaln", "bwamem", "bowtie2", "circularmapper"],
"description": "Specify which mapper to use.",
- "help_text": "Specify which mapping tool to use. Options are BWA aln ('`bwaaln`'), BWA mem ('`bwamem`'), circularmapper ('`circularmapper`'), or bowtie2 ('`bowtie2`'). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming).\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)",
- "fa_icon": "fas fa-layer-group"
+ "help_text": "Specify which mapping tool to use. Options are BWA aln ('`bwaaln`'), BWA mem ('`bwamem`'), circularmapper ('`circularmapper`'), or Bowtie 2 ('`bowtie2`'). 
BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing an extend-remap procedure (see [Peltzer et al 2016](https://doi.org/10.1186/s13059-016-0918-z) for details). Bowtie 2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming).\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)",
+ "fa_icon": "fas fa-hammer"
},
"fasta_largeref": {
"type": "boolean",
- "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.",
- "help_text": "This parameter is required to be set for large reference genomes. If your reference genome is larger than 3.5GB, the `samtools index` calls in the pipeline need to generate `.csi` indices instead of `.bai` indices to compensate for the size of the reference genome (with samtools: `-c`). This parameter is not required for smaller references (including the human hg19 or grch37/grch38 references), but >4GB genomes have been shown to need `.csi` indices.",
+ "description": "Specify to generate '.csi' BAM indices instead of '.bai' for larger reference genomes.",
+ "help_text": "This parameter is required to be set for large reference genomes. If your reference genome is larger than 3.5GB, the `samtools index` calls in the pipeline need to generate `.csi` indices instead of `.bai` indices to compensate for the size of the reference genome (with samtools: `-c`). This parameter is not required for smaller references (including the human reference genomes hg19 or grch37/grch38).",
"fa_icon": "fas fa-address-book"
},
"mapping_bwaaln_n": {
"type": "number",
"default": 0.01,
- "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.",
- "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. Default is set following recommendations from [Oliva et al. 2021](https://doi.org/10.1093/bib/bbab076) who tested when aligning to human reference genomes. \n\nIf you're uncertain what to set check out this [Shiny App](https://apeltzer.shinyapps.io/bwa-mismatches/) for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`",
+ "description": "Specify the number of allowed mismatches in the alignment for mapping with BWA aln.",
+ "help_text": "Specify how many mismatches are allowed in a read during alignment with BWA aln. Default is set following recommendations from [Oliva et al. 
2021](https://doi.org/10.1093/bib/bbab076) who compared alignment to human reference genomes.\n\nIf you're uncertain what value to use, check out this [Shiny App](https://apeltzer.shinyapps.io/bwa-mismatches/) for more information.\n\n> Modifies BWA aln parameter: `-n`",
"fa_icon": "fas fa-sort-numeric-down"
},
"mapping_bwaaln_k": {
"type": "integer",
"default": 2,
- "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.",
- "help_text": "Configures the bwa aln `-k` parameter for the maximum edit distance during the seeding phase of the mapping algorithm.\n\n> Modifies BWA aln parameter: `-k`",
+ "description": "Specify the maximum edit distance allowed in a seed for mapping with BWA aln.",
+ "help_text": "Specify the maximum edit distance during the seeding phase of the BWA aln mapping algorithm.\n\n> Modifies BWA aln parameter: `-k`",
"fa_icon": "fas fa-people-arrows"
},
"mapping_bwaaln_l": {
"type": "integer",
"default": 1024,
- "description": "Specify the -l parameter for BWA aln i.e. the length of seeds to be used.",
- "help_text": "Configures the length of the seed used in bwa aln `-l`. Default is set to be 'turned off' at the recommendation of [Oliva et al. 2021](https://doi.org/10.1093/bib/bbab076) who tested when aligning to human reference genomes. Seeding is 'turned off' by specifying an arbitrarily long number to force the entire read to act as the seed. \n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`",
+ "description": "Specify the length of seeds to be used for BWA aln.",
+ "help_text": "Specify the length of the seed used in BWA aln. Default is set to be 'turned off' at the recommendation of [Oliva et al. 2021](https://doi.org/10.1093/bib/bbab076) who compared alignment to human reference genomes. Seeding is 'turned off' by specifying an arbitrarily long number to force the entire read to act as the seed.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`",
"fa_icon": "fas fa-ruler-horizontal"
},
"mapping_bwaaln_o": {
"type": "integer",
"default": 2,
- "description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.",
- "help_text": "Configures the number of gaps used in bwa aln. Default is set to bwa default.\n\n> Modifies BWA aln parameter: `-o`",
+ "description": "Specify the number of gaps allowed for alignment with BWA aln.",
+ "help_text": "Specify the number of gaps allowed for mapping with BWA aln. Default is set to BWA default.\n\n> Modifies BWA aln parameter: `-o`",
"fa_icon": "fas fa-people-arrows"
},
"mapping_bwamem_k": {
"type": "integer",
"default": 19,
- "description": "Specify the -k parameter for BWA mem i.e. the minimum seed length",
- "help_text": "Configures the minimum seed length used in BWA-MEM. Default is set to BWA default.\n\n> Modifies BWA-MEM parameter: `-k`",
+ "description": "Specify the minimum seed length for alignment with BWA mem.",
+ "help_text": "Configures the minimum seed length used in BWA mem. Default is set to BWA default.\n\n> Modifies BWA mem parameter: `-k`",
"fa_icon": "fas fa-seedling"
},
"mapping_bwamem_r": {
"type": "number",
"default": 1.5,
- "description": "Specify the -k parameter for BWA mem i.e. the minimum seed length",
- "help_text": "Configures the re-seeding used in BWA-MEM. 
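For orientation, the BWA aln settings described above combine like this in a minimal, hypothetical invocation. All paths and the profile are placeholders, and the values shown are simply the documented defaults made explicit:

```bash
# Hypothetical sketch only: BWA aln with the ancient-DNA-oriented
# defaults spelled out (0.01 mismatch parameter, seeding disabled
# via an arbitrarily long seed length).
nextflow run nf-core/eager -profile docker \
    --input samplesheet.csv \
    --fasta reference.fasta \
    --outdir ./results \
    --mapping_tool bwaaln \
    --mapping_bwaaln_n 0.01 \
    --mapping_bwaaln_l 1024
```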
Default is set to BWA default.\n\n> Modifies BWA-MEM parameter: `-r`",
+ "description": "Specify the re-seeding threshold for alignment with BWA mem.",
+ "help_text": "Configures the re-seeding threshold used in BWA mem. Default is set to BWA default.\n\n> Modifies BWA mem parameter: `-r`",
"fa_icon": "fas fa-angle-double-left"
},
"mapping_bowtie2_alignmode": {
"type": "string",
"default": "local",
- "description": "Specify the bowtie2 alignment mode.",
- "help_text": "The type of read alignment to use. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. \nDefault is set following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://www.frontiersin.org/articles/10.3389/fevo.2020.00105/full)\n\n> Modifies Bowtie2 presets: `--local`, `--end-to-end`",
- "fa_icon": "fas fa-arrows-alt-h",
+ "description": "Specify the Bowtie 2 alignment mode.",
+ "help_text": "Specify the type of read alignment to use with Bowtie 2. 'Local' allows only partial alignment of a read, with ends of reads possibly 'soft-clipped' (i.e. remaining unaligned/ignored), if the soft-clipped alignment provides the best alignment score. 'End-to-end' requires all nucleotides to be aligned.\nDefault is set following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://www.frontiersin.org/articles/10.3389/fevo.2020.00105/full).\n\n> Modifies Bowtie 2 presets: `--local`, `--end-to-end`",
+ "fa_icon": "fas fa-toggle-on",
"enum": ["local", "end-to-end"]
},
"mapping_bowtie2_sensitivity": {
"type": "string",
"default": "sensitive",
- "description": "Specify the level of sensitivity for the bowtie2 alignment mode.",
- "help_text": "The Bowtie2 'preset' to use. These strings apply to both --mapping_bowtie2_alignmode options. See the Bowtie2 manual for actual settings. \nDefault is set following [Poullet and Orlando (2020)](https://www.frontiersin.org/articles/10.3389/fevo.2020.00105/full), when running damaged-data without UDG treatment)\n\nModifies the Bowtie2 parameters: `--fast`, `--very-fast`, `--sensitive`, `--very-sensitive`, `--fast-local`, `--very-fast-local`, `--sensitive-local`, `--very-sensitive-local`",
+ "description": "Specify the level of sensitivity for the Bowtie 2 alignment mode.",
+ "help_text": "Specify the Bowtie 2 'preset' to use. These strings apply to both `--mapping_bowtie2_alignmode` options. See the Bowtie 2 manual for actual settings.\nDefault is set following [Poullet and Orlando (2020)](https://www.frontiersin.org/articles/10.3389/fevo.2020.00105/full), when running damaged data without UDG treatment.\n\n> Modifies the Bowtie 2 parameters: `--fast`, `--very-fast`, `--sensitive`, `--very-sensitive`, `--fast-local`, `--very-fast-local`, `--sensitive-local`, `--very-sensitive-local`",
"fa_icon": "fas fa-microscope",
"enum": ["fast", "very-fast", "sensitive", "very-sensitive"]
},
"mapping_bowtie2_n": {
"type": "integer",
"default": 0,
- "description": "Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity.",
- "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with --mapping_bowtie2_sensitivity. 
Can either be 0 or 1.\n\n> Modifies Bowtie 2 parameter: `-N`",
"fa_icon": "fas fa-sort-numeric-down"
},
"mapping_bowtie2_l": {
"type": "integer",
"default": 20,
- "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.",
- "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with --mapping_bowtie2_sensitivity.\n\n> Modifies Bowtie2 parameter: `-L`",
+ "description": "Specify the length of seed substrings for Bowtie 2.",
+ "help_text": "Specify the length of the seed sub-string to use during seeding of Bowtie 2. This will override any values set with `--mapping_bowtie2_sensitivity`.\n\n> Modifies Bowtie 2 parameter: `-L`",
"fa_icon": "fas fa-ruler-horizontal"
},
"mapping_bowtie2_trim5": {
"type": "integer",
"default": 0,
- "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.",
- "help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present.\n\n> Modifies Bowtie2 parameter: `--trim5`",
+ "description": "Specify the number of bases to trim off from 5 prime end of read before alignment with Bowtie 2.",
+ "help_text": "Specify the number of bases to trim at the 5' (left) end of read before alignment with Bowtie 2. This may be useful when left-over sequencing artefacts of in-line barcodes are present.\n\n> Modifies Bowtie 2 parameter: `--trim5`",
"fa_icon": "fas fa-cut"
},
"mapping_bowtie2_trim3": {
"type": "integer",
"default": 0,
- "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.",
- "help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present.\n\n> Modifies Bowtie2 parameter: `--trim3`",
+ "description": "Specify the number of bases to trim off from 3 prime end of read before alignment with Bowtie 2.",
+ "help_text": "Specify the number of bases to trim at the 3' (right) end of read before alignment with Bowtie 2. This may be useful when left-over sequencing artefacts of in-line barcodes are present.\n\n> Modifies Bowtie 2 parameter: `--trim3`",
"fa_icon": "fas fa-cut"
},
"mapping_bowtie2_maxins": {
@@ -603,7 +604,13 @@
"description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.",
"help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n> Modifies Bowtie2 parameter: `--maxins`",
"fa_icon": "fas fa-exchange-alt"
- }
+ },
+ "elongation_factor": {
+ "type": "integer",
+ "default": 500,
+ "description": "Specify the number of bases to extend the reference by (circularmapper only).",
+ "help_text": "The number of bases to extend the reference genome with. 
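As a sketch of how the new `elongation_factor` option pairs with CircularMapper and `--fasta_circular_target`, consider the following hypothetical invocation. The FASTA header `chrMT`, the file paths and the profile are all illustrative placeholders:

```bash
# Hypothetical sketch only: map against a circular mitochondrial contig,
# extending the reference by the default 500 bp before remapping.
nextflow run nf-core/eager -profile docker \
    --input samplesheet.csv \
    --fasta mtGenome.fasta \
    --outdir ./results \
    --mapping_tool circularmapper \
    --fasta_circular_target 'chrMT' \
    --elongation_factor 500
```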
By default this is set to 500 if not specified otherwise.",
+ "fa_icon": "fas fa-external-link-alt"
+ }
},
"fa_icon": "fas fa-layer-group"
},
@@ -615,54 +622,54 @@
"properties": {
"run_bamfiltering": {
"type": "boolean",
- "description": "Turn on filtering of reads in BAM files after mapping. By default, only mapped reads retained.",
+ "description": "Specify to turn on filtering of reads in BAM files after mapping. By default, only mapped reads are retained.",
"fa_icon": "fas fa-power-off",
- "help_text": "Turns on the filtering subworkflow for mapped BAM files coming out of the read alignment step. Filtering includes removal of unmapped reads, length filtering, and mapping quality filtering.\n\nWhen turning on bam filtering, by default only the mapped/unmapped filter is activated, thus only mapped reads are retained for downstream analyses. See `--bamfiltering_retainunmappedgenomicbam` to retain unmapped reads, if filtering only for length and/or quality is preferred.\n\nNote this subworkflow can also be activated if `--run_metagenomic_screening` is supplied."
+ "help_text": "Specify to turn on the filtering subworkflow for mapped BAM files after the read alignment step. Filtering includes removal of unmapped reads, length filtering, and mapping quality filtering.\n\nWhen turning on BAM filtering, by default only the mapped/unmapped filter is activated, thus only mapped reads are retained for downstream analyses. See `--bamfiltering_retainunmappedgenomicbam` to retain unmapped reads, if filtering only for length and/or quality is preferred.\n\nNote this subworkflow can also be activated if `--run_metagenomic_screening` is supplied."
},
"bamfiltering_minreadlength": {
"type": "integer",
"default": 0,
"description": "Specify the minimum read length mapped reads should have for downstream genomic analysis.",
- "help_text": "You can use this to remove mapped reads that fall below a certain length after mapping.\n\nThis can be useful to get more realistic 'endogenous DNA' or 'on target read' percentages.\n\nIf used _instead of_ minimum length read filtering at AdapterRemoval, you can get more more realistic endogenous DNA estimates when most of your reads are very short (e.g. in single-stranded libraries or samples with highly degraded DNA). In these cases, the default minimum length filter at earlier adapter clipping/read merging will remove a very large amount of your reads in your library (including valid reads), thus making an artificially small denominator for a typical endogenous DNA calculation. \n\nTherefore by retaining all of your reads until _after_ mapping (i.e., turning off the adapter clipping/read merging filter), you can generate more 'real' endogenous DNA estimates immediately after mapping (with a better denominator). Then after estimating this, filter using this parameter to retain only 'useful' reads (i.e., those long enough to provide higher confidence of their mapped position) for downstream analyses.\n\nBy specifying `0`, no length filtering is performed.\n\nNote that by default the output BAM files of this step are _not_ stored in the results directory (as it is assumed that deduplicated BAM files are preferred). 
See `--bamfiltering_savefilteredbams` if you wish to save these.\n\n> Modifies tool parameter(s):\n> - filter_bam_fragment_length.py: `-l`",
- "fa_icon": "fas fa-ruler-horizontal"
+ "help_text": "Specify to remove mapped reads that fall below a certain length threshold after mapping.\n\nThis can be useful to get more realistic 'endogenous DNA' or 'on target read' percentages.\n\nIf used _instead of_ minimum length read filtering at AdapterRemoval, you can get more realistic endogenous DNA estimates when most of your reads are very short (e.g. in single-stranded libraries or samples with highly degraded DNA). In these cases, the default minimum length filter at earlier adapter clipping/read merging will remove a very large amount of your reads in your library (including valid reads), thus making an artificially small denominator for a typical endogenous DNA calculation.\n\nTherefore, by retaining all of your reads until _after_ mapping (i.e., turning off the adapter clipping/read merging filter), you can generate more 'real' endogenous DNA estimates immediately after mapping (with a better denominator). Then after estimating this, filter using this parameter to retain only 'useful' reads (i.e., those long enough to provide higher confidence of their mapped position) for downstream analyses.\n\nBy specifying `0`, no length filtering is performed.\n\nNote that by default the output BAM files of this step are _not_ stored in the results directory (as it is assumed that deduplicated BAM files are preferred). See `--bamfiltering_savefilteredbams` if you wish to save these.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`",
+ "fa_icon": "fas fa-filter"
},
"bamfiltering_mappingquality": {
"type": "integer",
"default": 0,
"description": "Specify the minimum mapping quality reads should have for downstream genomic analysis.",
- "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis.\n\nBy default all reads are retained and is therefore set to 0 to ensure no quality filtering is performed.\n\nNote that by default the output BAM files of this step are _not_ stored in the results directory (as it is assumed that deduplicated BAM files are preferred). See `--bamfiltering_savefilteredbams` if you wish to save these.\n\n> Modifies tool parameter(s):\n> - samtools view `-q`",
- "fa_icon": "fas fa-thermometer-full"
+ "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis.\n\nBy default all reads are retained and this option is therefore set to 0 to ensure no quality filtering is performed.\n\nNote that by default the output BAM files of this step are _not_ stored in the results directory (as it is assumed that deduplicated BAM files are preferred). See `--bamfiltering_savefilteredbams` if you wish to save these.\n\n> Modifies samtools view parameter: `-q`",
+ "fa_icon": "fas fa-filter"
},
"bamfilter_genomicbamfilterflag": {
"type": "integer",
"default": 4,
"fa_icon": "fas fa-flag",
- "description": "Specify the SAM format flag of reads to remove during BAM filtering for downstream genomic steps. 
Generally not recommended to change.",
- "help_text": "You can use this to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file to for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> \u26a0\ufe0f Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies tool parameter(s):\n> - SAMtools: `-F`"
+ "description": "Specify the SAM format flag of reads to remove during BAM filtering for downstream genomic steps.",
+ "help_text": "Specify to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad Institute [here](https://broadinstitute.github.io/picard/explain-flags.html).\n\n> ⚠️ Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies samtools parameter: `-F`"
},
"bamfiltering_retainunmappedgenomicbam": {
"type": "boolean",
"description": "Specify to retain unmapped reads in the BAM file used for downstream genomic analyses.",
- "help_text": "You can use this parameter to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomicscreening_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`",
+ "help_text": "Specify to retain unmapped reads (optionally also length-filtered) in the genomic BAM for downstream analysis. 
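A minimal sketch of the length/quality filtering combination described above follows. The thresholds and paths are illustrative placeholders rather than recommendations, and the flag names assume the standard nf-core property-to-CLI-option mapping:

```bash
# Hypothetical sketch only: retain mapped reads of >= 30 bp with
# mapping quality >= 25, and keep the intermediate filtered BAMs.
nextflow run nf-core/eager -profile docker \
    --input samplesheet.csv \
    --fasta reference.fasta \
    --outdir ./results \
    --run_bamfiltering \
    --bamfiltering_minreadlength 30 \
    --bamfiltering_mappingquality 25 \
    --bamfiltering_savefilteredbams
```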
By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomicscreening_input` is set to `all`.\n\n> ⚠️ This will likely slow down run time of downstream pipeline steps!\n\n> Modifies samtools view parameters: `-f 4` / `-F 4`",
"fa_icon": "fas fa-piggy-bank"
},
"bamfiltering_generateunmappedfastq": {
"type": "boolean",
- "description": "Generate FASTQ files containing only unmapped reads from the aligner generated BAM files.",
- "help_text": "This turns on the generation and saving of FASTQs of only the unmapped reads from the mapping step in the results directory, using `samtools fastq`.\n\nThis could be useful if you wish to do other analysis of the unmapped reads independently of the pipeline.\n\nNote: the reads in these FASTQ files have _not_ undergone length of quality filtering\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4`",
+ "description": "Specify to generate FASTQ files containing only unmapped reads from the aligner generated BAM files.",
+ "help_text": "Specify to turn on the generation and saving of FASTQs of only the unmapped reads from the mapping step in the results directory.\n\nThis can be useful if you wish to do other analysis of the unmapped reads independently of the pipeline.\n\nNote: the reads in these FASTQ files have _not_ undergone length or quality filtering.\n\n> Modifies samtools fastq parameter: `-f 4`",
"fa_icon": "fas fa-file-alt"
},
"bamfiltering_generatemappedfastq": {
"type": "boolean",
- "description": "Generate FASTQ files containing only mapped reads from the aligner generated BAM files .",
- "help_text": "This turns on the generation and saving of FASTQs of only the mapped reads from the mapping step in the results directory, using `samtools fastq`.\n\nThis could be useful if you wish to do other analysis of the mapped reads independently of the pipeline, such as remapping with different parameters (whereby only including mapped reads will speed up computation time during the re-mapping due to reduced input data).\n\nNote the reads in these FASTQ files have _not_ undergone length of quality filtering\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-F 4`",
+ "description": "Specify to generate FASTQ files containing only mapped reads from the aligner generated BAM files.",
+ "help_text": "Specify to turn on the generation and saving of FASTQs of only the mapped reads from the mapping step in the results directory.\n\nThis can be useful if you wish to do other analysis of the mapped reads independently of the pipeline, such as remapping with different parameters (whereby only including mapped reads will speed up computation time during the re-mapping due to reduced input data).\n\nNote that the reads in these FASTQ files have _not_ undergone length or quality filtering.\n\n> Modifies samtools fastq parameter: `-F 4`",
"fa_icon": "far fa-file-alt"
},
"bamfiltering_savefilteredbams": {
"type": "boolean",
- "description": "Save in the results directory the intermediate filtered genomic BAM files that are sent for downstream genomic analyses.",
- "help_text": "This saves intermediate length and/or quality filtered genomic BAM files in the results directory.",
- "fa_icon": "far fa-save"
+ "description": "Specify to save the intermediate filtered genomic BAM files in the results directory.",
+ "help_text": "Specify to save intermediate length- and/or quality-filtered genomic BAM files in the results directory.",
+ "fa_icon": "fas fa-save"
}
},
"fa_icon": "fas fa-filter"
@@ -670,64 
@@
"metagenomics": {
"title": "Metagenomics",
"type": "object",
- "description": "Options to related to metagenomic screening.",
+ "description": "Options related to metagenomic screening.",
"default": "",
"properties": {
"run_metagenomicscreening": {
"type": "boolean",
- "description": "Turn on metagenomic screening of mapped, unmapped, or all reads.",
+ "description": "Specify to turn on metagenomic screening of mapped, unmapped or all reads.",
"fa_icon": "fas fa-power-off",
- "help_text": "Turns on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline."
+ "help_text": "Specify to turn on the metagenomic screening subworkflow of the pipeline, where reads are screened against large databases. Typically used for pathogen screening or microbial community analysis.\n\nIf supplied, this will also turn on the BAM filtering subworkflow of the pipeline."
},
"metagenomicscreening_input": {
"type": "string",
"default": "unmapped",
- "description": "Specify which type of reads to go into metagenomic screening.",
+ "description": "Specify which type of reads to use for metagenomic screening.",
"enum": ["unmapped", "mapped", "all"],
"fa_icon": "fas fa-hand-pointer",
- "help_text": "You can select which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction), or all reads.\n\n> \u26a0\ufe0f If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies tool parameter(s):\n> - samtools fastq: `-f 4` / `-F 4`"
+ "help_text": "Specify which reads coming out of the read alignment step will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g., when doing competitive mapping against a genomic reference of multiple genomes and wishing to apply LCA correction) or all reads.\n\n> ⚠️ If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in the results directory.\n\n> Modifies samtools fastq parameters: `-f 4` / `-F 4`"
},
"run_metagenomics_complexityfiltering": {
"type": "boolean",
"fa_icon": "fas fa-power-off",
- "help_text": "Turns on a subworkflow of the pipeline that filters the fastq-files for complexity before the metagenomics profiling\nUse the metagenomics_complexity_tool parameter to select a method",
- "description": "Run a complexity filter on the metagenomics input files before classification. 
Specifiy the tool to use with the `metagenomics_complexity_tool` parameter, save with `metagenomics_complexity_savefastq`"
+ "help_text": "Specify to turn on a subworkflow of the pipeline that filters the FASTQ files for complexity before the metagenomics profiling.\nUse the `--metagenomics_complexity_tool` parameter to select a method.",
+ "description": "Specify to run a complexity filter on the metagenomics input files before classification."
},
"metagenomics_complexity_savefastq": {
"type": "boolean",
"fa_icon": "fas fa-save",
- "description": "Save FASTQ files containing the complexity filtered reads (before metagenomic classification).",
- "help_text": "Save the complexity-filtered fastq-files to the results directory"
+ "description": "Specify to save FASTQ files containing the complexity-filtered reads before metagenomic classification.",
+ "help_text": "Specify to save the complexity-filtered FASTQ files to the results directory."
},
"metagenomics_complexity_tool": {
"type": "string",
"default": "bbduk",
- "description": "Specify which tool to use for trimming, filtering, or reformatting of fastq reads that go into metagenomics screening.",
+ "description": "Specify which tool to use for trimming, filtering or reformatting of FASTQ reads that go into metagenomics screening.",
"enum": ["bbduk", "prinseq"],
- "fa_icon": "fas fa-hand-pointer",
- "help_text": "You can select which tool is used to generate a final set of reads for the metagenomic classifier after any necessary trimming, filtering or reformatting of the reads.\n\nThis intermediate file is not saved in the results directory, unless marked with `--metagenomics_complexity_savefastq`."
+ "fa_icon": "fas fa-hammer",
+ "help_text": "Specify which tool to use to generate a final set of reads for the metagenomic classifier after any necessary trimming, filtering or reformatting of the reads.\n\nThis intermediate file is not saved in the results directory unless marked with `--metagenomics_complexity_savefastq`."
},
"metagenomics_complexity_entropy": {
"type": "number",
"fa_icon": "fas fa-sort-numeric-up",
- "description": "Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1.",
+ "description": "Specify the entropy threshold under which a sequencing read will be complexity-filtered out.",
"default": 0.3,
- "help_text": "Specify the minimum 'entropy' value for complexity filtering for the BBDuk or PRINSEQ++ tools.\n\nThis value will only be used for PRINSEQ++ if `--metagenomics_prinseq_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation exists within the read. Higher values correspond to more variety, and thus will likely result in more specific matching to a taxon's reference genome. The trade off here is fewer reads (or abundance information) available for having a confident identification.\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`\n\n"
+ "help_text": "Specify the minimum 'entropy' value for complexity filtering for the BBDuk or PRINSEQ++ tools.\n\nThis value will only be used for PRINSEQ++ if `--metagenomics_prinseq_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation existing within the read. Higher values correspond to more variety and thus will likely result in more specific matching to a taxon's reference genome. 
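To illustrate how the screening and complexity-filtering options above chain together, here is a minimal, hypothetical sketch. Paths and profile are placeholders, the entropy value is the documented default, and flag names assume the standard nf-core property-to-CLI-option mapping:

```bash
# Hypothetical sketch only: screen unmapped reads metagenomically after
# BBDuk entropy-based complexity filtering.
nextflow run nf-core/eager -profile docker \
    --input samplesheet.csv \
    --fasta reference.fasta \
    --outdir ./results \
    --run_metagenomicscreening \
    --metagenomicscreening_input 'unmapped' \
    --run_metagenomics_complexityfiltering \
    --metagenomics_complexity_tool bbduk \
    --metagenomics_complexity_entropy 0.3
```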
The trade-off here is fewer reads (or abundance information) available for having a confident identification.\n\n> Modifies parameters:\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`"
},
"metagenomics_prinseq_mode": {
"type": "string",
"default": "entropy",
"enum": ["entropy", "dust"],
- "fa_icon": "fas fa-check-square",
- "description": "Specify the complexity filter mode for PRINSEQ++",
- "help_text": "Specify the complexity filter mode for PRINSEQ++ \n\nUse the selected mode together with the correct flag:\n'dust' requires the `--metagenomics_prinseq_dustscore` parameter set\n'entropy' requires the `--metagenomics_complexity_entropy` parameter set\n\n> Sets one of the tool parameter(s):\n> - PRINSEQ++: `-lc_entropy`\n> - PRINSEQ++: `-lc_dust`"
+ "fa_icon": "fas fa-toggle-on",
+ "description": "Specify the complexity filter mode for PRINSEQ++.",
+ "help_text": "Specify the complexity filter mode for PRINSEQ++.\n\nUse the selected mode together with the correct flag:\n'dust' requires the `--metagenomics_prinseq_dustscore` parameter to be set\n'entropy' requires the `--metagenomics_complexity_entropy` parameter to be set\n\n> Modifies parameters:\n> - PRINSEQ++: `-lc_entropy`\n> - PRINSEQ++: `-lc_dust`"
},
"metagenomics_prinseq_dustscore": {
"type": "number",
"default": 0.5,
- "fa_icon": "fas fa-head-side-mask",
- "description": "Specify the minimum dust score for PRINTSEQ++ complexity filtering",
- "help_text": "Specify the minimum dust score below which low-complexity reads will be removed. A DUST score is based on how often different tri-nucleotides occur along a read.\n\n> Modifies tool parameter(s):\n> - PRINSEQ++: `--lc_dust`"
+ "fa_icon": "fas fa-filter",
+ "description": "Specify the minimum dust score for PRINSEQ++ complexity filtering.",
+ "help_text": "Specify the minimum dust score below which low-complexity reads will be removed. A dust score is based on how often different tri-nucleotides occur along a read.\n\n> Modifies PRINSEQ++ parameter: `--lc_dust`"
}
},
"fa_icon": "fas fa-search"
@@ -747,9 +754,9 @@
"type": "string",
"default": "markduplicates",
"description": "Specify which tool to use for deduplication.",
- "help_text": "Sets the duplicate read removal tool. Alternatively an ancient DNA specific read deduplication tool `dedup` (Peltzer et al. 2016) is offered. The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> \u26a0\ufe0f DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.",
+ "help_text": "Specify which duplicate read removal tool to use. While `markduplicates` is set by default, an ancient DNA specific read deduplication tool `dedup` is offered (see [Peltzer et al. 2016](https://doi.org/10.1186/s13059-016-0918-z) for details). The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> ⚠️ DeDup can only be used on collapsed (i.e. 
merged) reads from paired-end sequencing.", "enum": ["markduplicates", "dedup"], - "fa_icon": "fas fa-layer-group" + "fa_icon": "fas fa-hammer" } }, "fa_icon": "fas fa-clone" @@ -763,127 +770,127 @@ "properties": { "run_mapdamage_rescaling": { "type": "boolean", - "fa_icon": "fas fa-map", - "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", - "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the --single-stranded mode.\nThis process will ameliorate the effects of aDNA damage, but also increase reference-bias. \n\n**This functionality does not have any MultiQC output.**\nwarning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies the `--rescale` parameter of mapDamage2" + "fa_icon": "fas fa-power-off", + "description": "Specify to turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", + "help_text": "Specify to turn on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single-stranded, this will automatically use the `--single-stranded` mode.\nThis process will ameliorate the effects of aDNA damage, but also increase reference-bias.\n\n**This functionality does not have any MultiQC output.**\n ⚠️ Rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies mapDamage2 parameter: `--rescale`" }, "damage_manipulation_rescale_seqlength": { "type": "integer", "default": 12, - "description": "Length of read sequence to use from each side for rescaling. Can be overridden by `--rescalelength*p`.", - "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale at both ends.\n\n> Modifies the `--seq-length` parameter of mapDamage2.", + "description": "Specify the length of read sequence to use from each side for rescaling.", + "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale at both ends. This can be overridden by `--rescalelength*p`.\n\n> Modifies mapDamage2 parameter: `--seq-length`", "fa_icon": "fas fa-ruler-horizontal" }, "damage_manipulation_rescale_length_5p": { "type": "integer", "default": 0, - "description": "Length of read for mapDamage2 to rescale from 5p end. Only used if not 0, otherwise `--rescale_seqlength` used.", - "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. Overrides `--rescale_seqlength`.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2.", + "description": "Specify the length of read for mapDamage2 to rescale from 5 prime end.", + "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. 
This overrides `--rescale_seqlength`.\n\n> Modifies mapDamage2 parameter: `--rescale-length-5p`", "fa_icon": "fas fa-balance-scale-right" }, "damage_manipulation_rescale_length_3p": { "type": "integer", "default": 0, - "description": "Length of read for mapDamage2 to rescale from 3p end. Only used if not 0 otherwise `--rescale_seqlength` used.", - "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. Overrides `--rescale_seqlength`.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2.", + "description": "Specify the length of read for mapDamage2 to rescale from 3 prime end.", + "help_text": "Specify the length in bp from the end of the read that mapDamage should rescale. This overrides `--rescale_seqlength`.\n\n> Modifies mapDamage2 parameter `--rescale-length-3p`", "fa_icon": "fas fa-balance-scale-left" }, "run_pmd_filtering": { "type": "boolean", - "description": "Turn on PMDtools filtering.", - "help_text": "Specifies to run PMDtools for damage based read filtering in sequencing libraries.", + "description": "Specify to turn on PMDtools filtering.", + "help_text": "Specify to run PMDtools for damage-based read filtering in sequencing libraries.", "fa_icon": "fas fa-power-off" }, "damage_manipulation_pmdtools_threshold": { "type": "integer", "default": 3, "fa_icon": "far fa-chart-bar", - "description": "Specify PMDScore threshold for PMDtools.", - "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis.\n\n> Modifies PMDtools parameter: `--threshold`" + "description": "Specify PMD score threshold for PMDtools.", + "help_text": "Specify the PMDScore threshold to use when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream analysis.\n\n> Modifies PMDtools parameter: `--threshold`" }, "damage_manipulation_pmdtools_masked_reference": { "type": "string", "fa_icon": "fas fa-mask", - "help_text": "Supplying a FASTA file will use this file as reference for `samtools calmd` prior to PMD filtering. /nSetting the SNPs that are part of the used capture set as `N` can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition.", - "description": "Specify a masked FASTA file with positions to be used with pmdtools.", + "help_text": "Specify a FASTA file to use as reference for `samtools calmd` prior to PMD filtering.\nSetting the SNPs that are part of the used capture set as `N` can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition.", + "description": "Specify a masked FASTA file with positions to be used with PMDtools.", "pattern": "^\\S+\\.fa?(\\sta)$", "format": "file-path" }, "damage_manipulation_pmdtools_reference_mask": { "type": "string", "fa_icon": "fas fa-mask", - "help_text": "Supplying a bedfile to this parameter activates masking of the reference fasta at the contained sites prior to running PMDtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. \nThis can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a transition SNP to be counted as damage. 
Masking of the reference is done using `bedtools maskfasta`.", - "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", + "help_text": "Specify a BED file to activate masking of the reference FASTA at the contained sites prior to running PMDtools. Positions that are in the provided BED file will be replaced by Ns in the reference genome.\nThis can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a transition SNP to be counted as damage. Masking of the reference is done using `bedtools maskfasta`.", + "description": "Specify a BED file to be used to mask the reference FASTA prior to running PMDtools.", "pattern": "^\\S+\\.bed?(\\.gz)$", "format": "file-path" }, "run_trim_bam": { "type": "boolean", - "fa_icon": "fas fa-eraser", - "description": "Turn on BAM trimming. Will only affect non-UDG or half-UDG libraries.", - "help_text": "Turns on the BAM trimming method. Trims off [n] bases from reads in the deduplicated BAM file. Damage assessment in PMDtools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only affect libraries with 'damage_treatment' of 'none' or 'half'. Complete UDG treatment ('full') should have removed all damage, during library construction so trimming of 0 bp is performed. The amount of bases that will be trimmed off from each side of the molecule should be set separately for libraries with depending on their 'strandedness' and 'damage_treatment'.\n\n> Note: additional artefacts such as bar-codes or adapters should be removed prior to mapping and not in this step." + "fa_icon": "fas fa-power-off", + "description": "Specify to turn on BAM trimming for non-UDG or half-UDG libraries.", + "help_text": "Specify to turn on the BAM trimming of [n] bases from reads in the deduplicated BAM file. Damage assessment in PMDtools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only affect libraries with 'damage_treatment' of 'none' or 'half'. Complete UDG treatment ('full') should have removed all damage during library construction, so trimming of 0 bp is performed. The amount of bases that will be trimmed off from each side of the molecule should be set separately for libraries depending on their 'strandedness' and 'damage_treatment'.\n\n> Note: additional artefacts such as barcodes or adapters should be removed prior to mapping and not in this step." }, "damage_manipulation_bamutils_trim_double_stranded_none_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", - "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to'none'. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`" + "fa_icon": "fas fa-cut", + "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of reads for double-stranded non-UDG libraries.", + "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of reads for double-stranded non-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`" }, "damage_manipulation_bamutils_trim_double_stranded_none_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", - "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-cut", + "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of reads for double-stranded non-UDG libraries.", + "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of reads for double-stranded non-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_trim_double_stranded_half_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler-combined", - "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries." + "fa_icon": "fas fa-cut", + "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for double-stranded half-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", + "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for double-stranded half-UDG libraries." 
}, "damage_manipulation_bamutils_trim_double_stranded_half_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", - "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-cut", + "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for double-stranded half-UDG libraries.", + "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for double-stranded half-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the right side of reads from double-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_trim_single_stranded_none_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler-combined", - "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries." + "fa_icon": "fas fa-cut", + "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded non-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", + "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded non-UDG libraries." }, "damage_manipulation_bamutils_trim_single_stranded_none_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", - "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-cut", + "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded non-UDG libraries.", + "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded non-UDG libraries. 
By default, this is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'none'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_trim_single_stranded_half_udg_left": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler-combined", - "help_text": "Default is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries." + "fa_icon": "fas fa-cut", + "help_text": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded half-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the left side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L`", + "description": "Specify the number of bases to clip off reads from 'left' (5 prime) end of read for single-stranded half-UDG libraries." }, "damage_manipulation_bamutils_trim_single_stranded_half_udg_right": { "type": "integer", "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", - "help_text": "Default is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" + "fa_icon": "fas fa-cut", + "description": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded half-UDG libraries.", + "help_text": "Specify the number of bases to clip off reads from 'right' (3 prime) end of read for single-stranded half-UDG libraries. By default, this is set to 0, and therefore clips off no bases on the right side of reads from single-stranded libraries whose UDG treatment is set to 'half'. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-R`" }, "damage_manipulation_bamutils_softclip": { "type": "boolean", - "fa_icon": "fas fa-paint-roller", - "description": "Turn on using soft-trimming instead of hard masking.", - "help_text": "By default, nf-core/eager uses hard trimming, which sets trimmed bases to 'N' with quality '!' in the BAM output. 
Turn this on to use soft-trimming instead, which masks reads at the read ends using the CIGAR string instead.\n\n> Modifies bam trimBam parameter: `-c`" + "fa_icon": "fas fa-mask", + "description": "Specify to turn on soft-trimming instead of hard masking.", + "help_text": "Specify to turn on soft-trimming instead of hard masking of bases. By default, nf-core/eager uses hard trimming, which sets trimmed bases to 'N' with quality '!' in the BAM output. Turn this on to use soft-trimming instead, which masks reads at the read ends using the CIGAR string.\n\n> Modifies bamUtil's trimBam parameter: `-c`" } } }, @@ -896,28 +903,28 @@ "run_genotyping": { "type": "boolean", "fa_icon": "fas fa-power-off", - "description": "Turn on genotyping of BAM files.", - "help_text": "Turns on genotyping. `--genotyping_source` and `--genotyping_tool` must also be provided together with this option." + "description": "Specify to turn on genotyping of BAM files.", + "help_text": "Specify to turn on genotyping. `--genotyping_source` and `--genotyping_tool` must also be provided together with this option." }, "genotyping_source": { "type": "string", "description": "Specify which input BAM to use for genotyping.", - "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs. Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if bam filtering has been performed), or the deduplicated reads (if deduplication was performed).", + "help_text": "Specify which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs. Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", "fa_icon": "fas fa-faucet", "enum": ["raw", "pmd", "trimmed", "rescaled"] }, "genotyping_tool": { "type": "string", - "fa_icon": "fas fa-tools", + "fa_icon": "fas fa-hammer", "enum": ["ug", "hc", "freebayes", "pileupcaller", "angsd"], - "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK 3.5 it is officially deprecated by the Broad Institute (but is used here for compatibility with MultiVCFAnalyzer).", - "description": "Specify which genotyper to use between: GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller." + "help_text": "Specify which genotyper to use.
Current options are: pileupCaller, ANGSD, GATK UnifiedGenotyper (v3.5), GATK HaplotypeCaller (v4) or FreeBayes.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK v3.5 is officially deprecated by the Broad Institute (but is used here for compatibility with MultiVCFAnalyzer).", + "description": "Specify which genotyper to use." }, "skip_bcftools_stats": { "type": "boolean", - "fa_icon": "far fa-chart-bar", - "description": "Skip bcftools stats generation for VCF based variant calling statistics", - "help_text": "Disables running of `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIf ran, `bcftools stats` will automatically include the FASTA reference for INDEL-related statistics." + "fa_icon": "fas fa-forward", + "description": "Specify to skip generation of VCF-based variant calling statistics with bcftools.", + "help_text": "Specify to disable running of `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nThis will automatically include the FASTA reference for INDEL-related statistics." }, "genotyping_reference_ploidy": { "type": "integer", @@ -929,27 +936,27 @@ "genotyping_pileupcaller_min_base_quality": { "type": "integer", "default": 30, - "description": "The base mapping quality to be used for genotyping with pileupcaller.", - "help_text": "The minimum base quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller. \n\n> Modifies samtools mpileup parameter: `-Q`.", + "description": "Specify the minimum base quality to be used for genotyping with pileupCaller.", + "help_text": "Specify the minimum base quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller.\n\n> Modifies samtools mpileup parameter: `-Q`.", "fa_icon": "fas fa-filter" }, "genotyping_pileupcaller_min_map_quality": { "type": "integer", "default": 30, "fa_icon": "fas fa-filter", - "description": "The minimum mapping quality to be used for genotyping with pileupcaller.", - "help_text": "The minimum mapping quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller. \n\n> Modifies samtools mpileup parameter: `-q`." + "description": "Specify the minimum mapping quality to be used for genotyping with pileupCaller.", + "help_text": "Specify the minimum mapping quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller.\n\n> Modifies samtools mpileup parameter: `-q`." }, "genotyping_pileupcaller_bedfile": { "type": "string", "fa_icon": "fas fa-bed", - "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate a pileup for pileupCaller.", + "help_text": "Specify a SNP panel in the form of a BED file of sites at which to generate a pileup for pileupCaller.", "format": "file-path", - "description": "Specify the path to SNP panel in bed format for pileupCaller." + "description": "Specify the path to SNP panel in BED format for pileupCaller."
}, "genotyping_pileupcaller_snpfile": { "type": "string", - "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/blob/master/CONVERTF/README) format, pileupCaller will call these sites.", + "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/blob/master/CONVERTF/README) format of sites to be called with pileupCaller.", "fa_icon": "fas fa-sliders-h", "format": "file-path", "description": "Specify the path to SNP panel in EIGENSTRAT format for pileupCaller." @@ -958,15 +965,15 @@ "type": "string", "default": "randomHaploid", "fa_icon": "fas fa-toolbox", - "description": "Specify the SNP calling method to use for genotyping.", - "help_text": "Specify the SNP calling method to use for genotyping. 'randomHaploid' will randomly sample a read overlapping the SNP, and produce a homozygous genotype with the allele supported by that read (often called 'pseudohaploid' or 'pseudodiploid'). 'randomDiploid` will randomly sample two reads overlapping the SNP and produce a genotype comprised of the two alleles supported by the two reads. 'majorityCall' will produce a genotype that is homozygous for the allele that appears in the majority of reads overlapping the SNP.\n\n> Modifies pileupCaller parameters: `--randomHaploid` `--randomDiploid` `--majorityCall`", + "description": "Specify the SNP calling method to use for genotyping with pileupCaller.", + "help_text": "Specify the SNP calling method to use for genotyping. 'randomHaploid' will randomly sample a read overlapping the SNP and produce a homozygous genotype with the allele supported by that read (often called 'pseudohaploid' or 'pseudodiploid'). 'randomDiploid` will randomly sample two reads overlapping the SNP and produce a genotype comprised of the two alleles supported by the two reads. 'majorityCall' will produce a genotype that is homozygous for the allele that appears in the majority of reads overlapping the SNP.\n\n> Modifies pileupCaller parameters: `--randomHaploid` `--randomDiploid` `--majorityCall`", "enum": ["randomHaploid", "randomDiploid", "majorityCall"] }, "genotyping_pileupcaller_transitions_mode": { "type": "string", "default": "AllSites", - "description": "Specify the calling mode for transitions.", - "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. \n\n> Modifies pileupCaller parameter: `--skipTransitions` `--transitionsMissing`", + "description": "Specify the calling mode for transitions with pileupCaller.", + "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively.\n\n> Modifies pileupCaller parameter: `--skipTransitions` `--transitionsMissing`", "enum": ["AllSites", "TransitionsMissing", "SkipTransitions"], "fa_icon": "fas fa-toggle-on" }, @@ -975,13 +982,13 @@ "default": 30, "fa_icon": "fas fa-balance-scale-right", "description": "Specify GATK phred-scaled confidence threshold.", - "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" + "help_text": "Specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" }, "genotyping_gatk_dbsnp": { "type": "string", - "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. 
if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.", + "help_text": "Specify VCF file for output VCF SNP annotation, e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.", "fa_icon": "fas fa-pen-alt", - "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", + "description": "Specify VCF file for SNP annotation of output VCF files for GATK.", "pattern": "^\\S+\\.vcf$", "format": "file-path", "mimetype": "VCF" @@ -990,16 +997,16 @@ "type": "integer", "default": 250, "fa_icon": "fas fa-icicles", - "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", - "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to this many reads.\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" + "description": "Specify the maximum depth coverage allowed for genotyping with GATK before down-sampling is turned on.", + "help_text": "Specify the maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to this many reads.\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" }, "genotyping_gatk_ug_out_mode": { "type": "string", "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK output mode.", + "description": "Specify GATK UnifiedGenotyper output mode.", "enum": ["EMIT_VARIANTS_ONLY", "EMIT_ALL_CONFIDENT_SITES", "EMIT_ALL_SITES"], - "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, this defines the output mode to use when producing the output VCF (i.e. produce calls for every site or just confidence sites.)\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", - "fa_icon": "fas fa-bullhorn" + "help_text": "Specify GATK UnifiedGenotyper output mode to use when producing the output VCF (i.e. produce calls for every site or just confidence sites).\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", + "fa_icon": "fas fa-toggle-on" }, "genotyping_gatk_ug_genotype_mode": { "type": "string", @@ -1007,49 +1014,49 @@ "description": "Specify UnifiedGenotyper likelihood model.", "enum": ["SNP", "INDEL", "BOTH", "GENERALPLOIDYSNP", "GENERALPLOIDYINDEL"], "fa_icon": "fas fa-project-diagram", - "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, this sets which likelihood model to follow, i.e. whether to call only SNPs or INDELS etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" + "help_text": "Specify GATK UnifiedGenotyper likelihood model, i.e. whether to call only SNPs or INDELS etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" }, "genotyping_gatk_ug_keeprealignbam": { "type": "boolean", - "fa_icon": "far fa-save", + "fa_icon": "fas fa-save", "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", - "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, providing this parameter will output the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping in addition to the standard VCF output.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files."
+ "help_text": "Specify to output the BAMs that have realigned reads (with GATK (v3) IndelRealigner) around possible variants for improved genotyping with GATK UnifiedGenotyper in addition to the standard VCF output.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." }, "genotyping_gatk_ug_defaultbasequalities": { "type": "integer", "default": -1, - "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", - "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to `-1` which is to not set any default quality (turned off). \n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`", + "description": "Specify to supply a default base quality if a read is missing a base quality score.", + "help_text": "Specify a value to set base quality scores for genotyping with GATK UnifiedGenotyper, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to `-1` which is to not set any default quality (turned off).\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`", "fa_icon": "fas fa-redo-alt" }, "genotyping_gatk_hc_out_mode": { "type": "string", "default": "EMIT_VARIANTS_ONLY", - "fa_icon": "fas fa-bullhorn", - "description": "Specify GATK output mode.", - "help_text": "If GATK HaplotypeCaller is selected as the genotyping tool, this sets the type of sites that should be included in the output VCF (i.e. produce calls for every site or just confidence sites). \n\n> Modifies GATK HaplotypeCaller parameter: `--output_mode`", + "fa_icon": "fas fa-toggle-on", + "description": "Specify GATK HaplotypeCaller output mode.", + "help_text": "Specify the type of sites that should be included in the output VCF after genotyping with GATK HaplotypeCaller (i.e. 
produce calls for every site or just confidence sites).\n\n> Modifies GATK HaplotypeCaller parameter: `--output_mode`", "enum": ["EMIT_VARIANTS_ONLY", "EMIT_ALL_CONFIDENT_SITES", "EMIT_ALL_ACTIVE_SITES"] }, "genotyping_gatk_hc_emitrefconf": { "type": "string", "default": "GVCF", - "fa_icon": "fas fa-bullhorn", + "fa_icon": "fas fa-toggle-on", "description": "Specify HaplotypeCaller mode for emitting reference confidence calls.", - "help_text": "If GATK HaplotypeCaller is selected as the genotyping tool, this sets the mode for emitting reference confidence calls.\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`", + "help_text": "Specify GATK HaplotypeCaller mode for emitting reference confidence calls.\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`", "enum": ["NONE", "BP_RESOLUTION", "GVCF"] }, "genotyping_freebayes_min_alternate_count": { "type": "integer", "default": 1, - "description": "Specify minimum required supporting observations of an alternate allele to consider a variant.", - "help_text": "Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position.\n\n> Modifies freebayes parameter: `-C`", - "fa_icon": "fas fa-align-center" + "description": "Specify minimum required supporting observations of an alternate allele to consider a variant in FreeBayes.", + "help_text": "Specify the minimum count of observations supporting an alternate allele within a single individual in order to evaluate the position during genotyping with FreeBayes.\n\n> Modifies FreeBayes parameter: `-C`", + "fa_icon": "fas fa-filter" }, "genotyping_freebayes_skip_coverage": { "type": "integer", "default": 0, - "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified.", - "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than the specified value. Setting to 0 (the default) deactivates this behaviour.\n\n> Modifies freebayes parameter: `-g`", + "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in FreeBayes.", + "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than the specified value during genotyping with FreeBayes. 
This is set to 0 by default, which deactivates this behaviour.\n\n> Modifies FreeBayes parameter: `-g`", "fa_icon": "fab fa-think-peaks" }, "genotyping_angsd_glmodel": { @@ -1057,7 +1064,7 @@ "default": "samtools", "fa_icon": "fas fa-project-diagram", "description": "Specify which ANGSD genotyping likelihood model to use.", - "help_text": "Specify which genotype likelihood model to use.\n\n> Modifies angsd parameter: `-GL`", + "help_text": "Specify which genotype likelihood model to use in ANGSD.\n\n> Modifies ANGSD parameter: `-GL`", "enum": ["samtools", "gatk", "soapsnp", "syk"] }, "genotyping_angsd_glformat": { @@ -1065,7 +1072,7 @@ "default": "binary", "fa_icon": "fas fa-text-height", "description": "Specify the formatting of the output VCF for ANGSD genotype likelihood results.", - "help_text": "Specifies what type of genotyping likelihood file format will be output.\n\nThe options refer to the following descriptions respectively:\n\n- `binary`: binary output of all 10 log genotype likelihood\n- `beagle_binary`: beagle likelihood file\n- `binary_three`: binary 3 times likelihood\n- `text`: text output of all 10 log genotype likelihoods.\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies angsd parameter: `-doGlf`", + "help_text": "Specify what type of genotyping likelihood file format will be output by ANGSD.\n\nThe options refer to the following descriptions respectively:\n\n- `binary`: binary output of all 10 log genotype likelihood\n- `beagle_binary`: beagle likelihood file\n- `binary_three`: binary 3 times likelihood\n- `text`: text output of all 10 log genotype likelihoods.\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlf`", "enum": ["binary", "beagle_binary", "binary_three", "text"] } }, @@ -1082,15 +1089,15 @@ "properties": { "run_mtnucratio": { "type": "boolean", - "description": "Turn on mitochondrial to nuclear ratio calculation.", - "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.", - "fa_icon": "fas fa-balance-scale-left" + "description": "Specify to turn on mitochondrial to nuclear ratio calculation.", + "help_text": "Specify to turn on estimation of the ratio of mitochondrial to nuclear reads.", + "fa_icon": "fas fa-power-off" }, "mitochondrion_header": { "type": "string", "default": "MT", - "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", - "help_text": "Specify the FASTA entry in the reference file specified as --fasta, which acts as the mitochondrial 'chromosome' to base the ratio calculation on. The tool only accepts the first section of the header before the first space. The default chromosome name is based on hs37d5/GrCH37 human reference genome.", + "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome.", + "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts as the mitochondrial 'chromosome' to base the ratio calculation on. The tool only accepts the first section of the header before the first space.
The default chromosome name is based on hs37d5/GRCh37 human reference genome.", "fa_icon": "fas fa-heading" } } @@ -1098,19 +1105,19 @@ "mapping_statistics": { "title": "Mapping statistics", "type": "object", - "description": "", + "description": "Options for the calculation of mapping statistics", "default": "", "properties": { "mapstats_skip_preseq": { "type": "boolean", - "help_text": "Turns off the computation of library complexity estimation.", - "description": "Turns off the computation of library complexity estimation.", + "help_text": "Specify to turn off the computation of library complexity estimation.", + "description": "Specify to turn off the computation of library complexity estimation with preseq.", "fa_icon": "fas fa-forward" }, "mapstats_preseq_mode": { "type": "string", "default": "c_curve", - "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf):\n\nc curve is used to compute the expected complexity curve of a mapped read file with a hypergeometric formula\n\nlc extrap is used to generate the expected yield for theoretical larger experiments and bounds on the number of distinct reads in the library and the associated confidence intervals, which is computed by bootstrapping the observed duplicate counts histogram.", + "help_text": "Specify which mode of preseq to run.\n\nFrom the [preseq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf):\n\nc curve is used to compute the expected complexity curve of a mapped read file with a hypergeometric formula\n\nlc extrap is used to generate the expected yield for theoretical larger experiments and bounds on the number of distinct reads in the library and the associated confidence intervals, which is computed by bootstrapping the observed duplicate counts histogram.", "description": "Specify which mode of preseq to run.", "fa_icon": "fas fa-toggle-on", "enum": ["c_curve", "lc_extrap"] @@ -1118,57 +1125,52 @@ "mapstats_preseq_stepsize": { "type": "integer", "default": 1000, - "description": "Specify the step size (i.e., sampling regularity) of Preseq.", - "help_text": "Can be used to configure the step size of Preseq's c_curve and lc_extrap method. Can be useful when few reads and allow Preseq to be used for extrapolation of shallow sequencing results.\n\n\n> Modifies tool parameter(s)\n> - preseq: `-s`", + "description": "Specify the step size (i.e., sampling regularity) of preseq.", + "help_text": "Specify the step size of preseq's c_curve and lc_extrap methods. This can be useful when few reads are present and allows preseq to be used for extrapolation of shallow sequencing results.\n\n\n> Modifies preseq parameter:\n> `-s`", "fa_icon": "fas fa-shoe-prints" }, "mapstats_preseq_terms": { "type": "integer", "default": 100, "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the maximum number of terms that lc_extrap mode will use.\n \n> Modifies preseq lc_extrap parameter: `-x`", - "description": "Specify the maximum number of terms that lc_extrap mode will use." + "help_text": "Specify the maximum number of terms that preseq's lc_extrap mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`", + "description": "Specify the maximum number of terms that preseq's lc_extrap mode will use."
}, "mapstats_preseq_maxextrap": { "type": "integer", "default": 10000000000, "fa_icon": "fas fa-ban", - "help_text": "Specify the maximum extrapolation that lc_extrap mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`", - "description": "Specify the maximum extrapolation (lc_extrap mode only)" + "help_text": "Specify the maximum extrapolation that preseq's lc_extrap mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`", + "description": "Specify the maximum extrapolation to use for preseq's lc_extrap mode." }, "mapstats_preseq_bootstrap": { "type": "integer", "default": 100, "fa_icon": "fab fa-bootstrap", - "help_text": "Specify the number of bootstraps lc_extrap mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`", - "description": "Specify number of bootstraps to perform (lc_extrap mode only)" + "help_text": "Specify the number of bootstraps preseq's lc_extrap mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`", + "description": "Specify number of bootstraps to perform in preseq's lc_extrap mode." }, "mapstats_preseq_cval": { "type": "number", "default": 0.95, "fa_icon": "fas fa-check-circle", - "help_text": "Specify the allowed level of confidence intervals used for lc_extrap mode.\n\n> Modifies preseq lc_extrap parameter: `-c`", - "description": "Specify confidence interval level (lc_extrap mode only)" + "help_text": "Specify the allowed level of confidence intervals used for prerseq's lc_extrap mode.\n\n> Modifies preseq lc_extrap parameter: `-c`", + "description": "Specify confidence interval level for preseq's lc_extrap mode." }, "mapstats_preseq_defects_mode": { "type": "boolean", - "description": "Turns on defects mode to extrapolate without testing for defects (lc_extrap mode only).", - "help_text": "Activates defects mode of `lc_extrap`, which does the extrapolation without testing for defects.\n\n> Modifies preseq lc_extrap parameter: `-D`", - "fa_icon": "fab fa-creative-commons-sampling-plus" + "description": "Specify to turn on preseq defects mode to extrapolate without testing for defects in lc_extrap mode.", + "help_text": "Specify to activate defects mode of `preseq lc_extrap`, which runs the extrapolation without testing for defects.\n\n> Modifies preseq lc_extrap parameter: `-D`", + "fa_icon": "fas fa-power-off" }, "skip_qualimap": { - "type": "boolean" + "type": "boolean", + "description": "Specify to turn off coverage calculation with Qualimap.", + "fa_icon": "fas fa-forward" }, "snpcapture_bed": { "type": "string", - "description": "Path to snp capture in BED format. Provided file can also be gzipped." - }, - "elongation_factor": { - "type": "integer", - "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only)", - "help_text": "The number of bases to extend the reference genome with. By default this is set to 500 if not specified otherwise.", - "fa_icon": "fas fa-external-link-alt" + "description": "Specify path to SNP capture positions in BED format for coverage calculations with Qualimap." 
} }, "fa_icon": "fas fa-search" @@ -1179,91 +1181,92 @@ "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", "default": "", "fa_icon": "fas fa-chart-line", - "help_text": "More documentation can be seen in the follow links for:\n\n[DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane merging. BAM Trimming is only performed on non-UDG and half-UDG treated data.", + "help_text": "More documentation can be found at the following links for:\n\n[DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane merging. BAM Trimming is only performed on non-UDG and half-UDG treated data.", "properties": { "skip_damagecalculation": { "type": "boolean", "fa_icon": "fas fa-forward", - "help_text": "Turns off damage calculation to compute DNA damage profiles." + "help_text": "Specify to turn off computation of DNA damage profiles.", + "description": "Specify to turn off ancient DNA damage calculation." }, "damagecalculation_tool": { "type": "string", "default": "damageprofiler", "enum": ["damageprofiler", "mapdamage"], - "fa_icon": "fas fa-tools", + "fa_icon": "fas fa-hammer", "description": "Specify the tool to use for damage calculation.", "help_text": "Specify the tool to be used for damage calculation. DamageProfiler is generally faster than mapDamage2, but the latter has an option to limit the number of reads used. This can significantly speed up the processing of very large files, where the damage estimates are already accurate after processing only a fraction of the input." }, "damagecalculation_yaxis": { "type": "number", "default": 0.3, - "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot.
Set to 0 to 'autoscale'.", - "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the damage plot.\n\n> Modifies DamageProfiler parameter: -yaxis_dp_max or mapDamage2 parameter: --ymax", + "description": "Specify the maximum misincorporation frequency that should be displayed on the damage plot.", + "help_text": "Specify the maximum misincorporation frequency that should be displayed in the damage plot.\n\n> Modifies DamageProfiler parameter: `-yaxis_dp_max` or mapDamage2 parameter: `--ymax`", "fa_icon": "fas fa-ruler-combined" }, "damagecalculation_xaxis": { "type": "integer", "default": 25, "description": "Specify number of bases of each read to be considered for plotting damage estimation.", - "help_text": "Specifies the number of bases to be considered for plotting nucleotide misincorporations.\n\n> Modifies DamageProfiler parameter: -t or mapDamage2 parameter: -m\n", + "help_text": "Specify the number of bases to be considered for plotting nucleotide misincorporations.\n\n> Modifies DamageProfiler parameter: `-t` or mapDamage2 parameter: `-m`", "fa_icon": "far fa-chart-bar" }, "damagecalculation_damageprofiler_length": { "type": "integer", "default": 100, - "description": "Specifies the length filter for DamageProfiler.", - "help_text": "Number of bases which are considered for frequency computations, by default set to 100.`\n\n> Modifies DamageProfiler parameter: -l", - "fa_icon": "fas fa-sort-amount-down" + "description": "Specify the length filter for DamageProfiler.", + "help_text": "Specify the number of bases which are considered for frequency computations.\n\n> Modifies DamageProfiler parameter: `-l`", + "fa_icon": "fas fa-ruler-horizontal" }, "damagecalculation_mapdamage_downsample": { "type": "integer", "default": 0, "fa_icon": "fas fa-compress-alt", - "description": "Specify the maximum number of reads to consider for damage calculation. Defaults value is 0 (i.e. no downsampling is performed).", - "help_text": "The maximum number of reads used for damage calculation in mapDamage2. Can be used to significantly reduce the amount of time required for damage assessment. Note that a too low value can also obtain incorrect results.\n\n>Modifies mapDamage2 parameter: -n\n" + "description": "Specify the maximum number of reads to consider for damage calculation with mapDamage.", + "help_text": "Specify the maximum number of reads used for damage calculation in mapDamage2. This can be used to significantly reduce the amount of time required for damage assessment. Note that too low a value can also lead to incorrect results.\n\n> Modifies mapDamage2 parameter: `-n`" } } }, "feature_annotation_statistics": { "title": "Feature Annotation Statistics", "type": "object", - "description": "Options for getting reference annotation statistics (e.g. gene coverages)", + "description": "Options for calculating reference annotation statistics (e.g. gene coverages)", "default": "", "properties": { "run_bedtools_coverage": { "type": "boolean", - "description": "Turn on ability to calculate no.
reads, depth and breadth coverage of features in reference.", - "fa_icon": "fas fa-chart-area", - "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n\n> Modifies tool parameter(s):\n- bedtools coverage: `-mean`" + "description": "Specify to turn on calculation of number of reads, depth and breadth coverage of features in reference with bedtools.", + "fa_icon": "fas fa-power-off", + "help_text": "Specify to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n\n> Modifies bedtools coverage parameter: `-mean`" }, "mapstats_bedtools_featurefile": { "type": "string", - "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", + "description": "Specify path to GFF or BED file containing positions of features in reference file for bedtools.", "fa_icon": "fas fa-file-signature", "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)). Must be in quotes.\n" } }, "fa_icon": "fas fa-scroll", - "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" + "help_text": "If you're interested in looking at coverage stats for certain features on your reference such as genes, SNPs etc., you can use the following bedtools module for this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library BAMs that have the same type of UDG treatment." }, "host_removal": { "title": "Host Removal", "type": "object", - "description": "", + "description": "Options for removing host-mapped reads", "default": "", "properties": { "run_host_removal": { "type": "boolean", - "description": "Turn on per-lane creation of pre-adapter-removal and/or read-pair-merging FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", - "help_text": "Recreates pre-adapter-removal and/or read-pair-merging FASTQ files but without reads that mapped to reference (e.g. for public upload of privacy-sensitive non-host data)", + "description": "Specify to turn on creation of pre-adapter-removal and/or read-pair-merging FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data).", + "help_text": "Specify to recreate pre-adapter-removal and/or read-pair-merging FASTQ files but without reads that mapped to reference (e.g. for public upload of privacy-sensitive non-host data)", "fa_icon": "fas fa-power-off" }, "host_removal_mode": { "type": "string", "default": "remove", - "description": "Host-mapped read removal mode. 
Remove mapped reads completely from FASTQ (remove) or just mask the host sequence of mapped reads with N (replace).", - "help_text": "Modifies extract_map_reads.py parameter: -m", - "fa_icon": "fas fa-plane-slash", + "description": "Specify the host-mapped read removal mode.", + "help_text": "Specify the host-mapped read removal mode: remove mapped reads completely from the FASTQ (`remove`), or just mask the host sequence of mapped reads with `N` (`replace`).\n\n> Modifies extract_map_reads.py parameter: `-m`", + "fa_icon": "fas fa-toggle-on", "enum": ["remove", "replace"] } }, @@ -1272,56 +1275,56 @@ "contamination_estimation": { "title": "Contamination estimation", "type": "object", - "description": "Options for the estimation of contamination", + "description": "Options for the estimation of contamination in human data", "default": "", "fa_icon": "fas fa-radiation-alt", "properties": { "run_contamination_estimation_angsd": { "type": "boolean", - "description": "Turn on nuclear contamination estimation for genomes with ANGSD.", - "help_text": "Specify to run the optional processes for nuclear DNA contamination estimation with ANGSD.", + "description": "Specify to turn on nuclear contamination estimation for genomes with ANGSD.", + "help_text": "Specify to run nuclear DNA contamination estimation with ANGSD.", "fa_icon": "fas fa-power-off" }, "contamination_estimation_angsd_chrom_name": { "type": "string", "default": "X", - "description": "The name of the chromosome to be used for contamination estimation.", - "help_text": "The name of the chromosome as specified in your FASTA/bam header.\ne.g. 'X' for hs37d5, 'chrX' for HG19", + "description": "Specify the name of the chromosome to be used for contamination estimation with ANGSD.", + "help_text": "Specify the name of the chromosome to be used for contamination estimation with ANGSD as specified in your FASTA/BAM header, e.g.
'X' for hs37d5 or 'chrX' for hg19", "fa_icon": "fas fa-address-card" }, "contamination_estimation_angsd_range_from": { "type": "integer", "default": 5000000, - "description": "The first position on the chromosome to be used for contamination estimation with ANGSD.", - "help_text": "The beginning of the genetic range that should be utilised for nuclear contamination estimation.", + "description": "Specify the first position on the chromosome to be used for contamination estimation with ANGSD.", + "help_text": "Specify the beginning of the genetic range that should be utilised for nuclear contamination estimation with ANGSD.", "fa_icon": "fas fa-map-marker-alt" }, "contamination_estimation_angsd_range_to": { "type": "integer", "default": 154900000, - "help_text": "The end of the genetic range that should be utilised for nuclear contamination estimation.", - "description": "The last position on the chromosome to be used for contamination estimation with ANGSD.", + "help_text": "Specify the end of the genetic range that should be utilised for nuclear contamination estimation with ANGSD.", + "description": "Specify the last position on the chromosome to be used for contamination estimation with ANGSD.", "fa_icon": "fas fa-map-marker-alt" }, "contamination_estimation_angsd_mapq": { "type": "integer", "default": 30, - "help_text": "> Modifies angsd parameter: `-minMapQ`", + "help_text": "Specify the minimum mapping quality reads should have for contamination estimation with ANGSD.\n\n> Modifies ANGSD parameter: `-minMapQ`", "description": "Specify the minimum mapping quality reads should have for contamination estimation with ANGSD.", - "fa_icon": "fas fa-thermometer-full" + "fa_icon": "fas fa-filter" }, "contamination_estimation_angsd_minq": { "type": "integer", "default": 30, "description": "Specify the minimum base quality reads should have for contamination estimation with ANGSD.", - "help_text": "> Modifies angsd parameter: `-minQ`", - "fa_icon": "fas fa-ruler-vertical" + "help_text": "Specify the minimum base quality reads should have for contamination estimation with ANGSD.\n\n> Modifies ANGSD parameter: `-minQ`", + "fa_icon": "fas fa-filter" }, "contamination_estimation_angsd_hapmap": { "type": "string", - "default": "/Users/judith_ballesteros/Documents/GitHub/eager/assets/angsd_resources/HapMapChrX.gz", - "description": "Path to HapMap file of chromosome for contamination estimation..", - "help_text": "The haplotype map, or \"HapMap\", records the location of haplotype blocks and their tag SNPs.", + "default": "${projectDir}/assets/angsd_resources/HapMapChrX.gz", + "description": "Specify path to HapMap file of chromosome for contamination estimation with ANGSD.", + "help_text": "Specify a path to HapMap file of chromosome for contamination estimation with ANGSD. The haplotype map, or \"HapMap\", records the location of haplotype blocks and their tag SNPs.", "fa_icon": "fas fa-map" } } @@ -1329,20 +1332,20 @@ "human_sex_determination": { "title": "Human Sex Determination", "type": "object", - "description": "Options for the calculation of biological sex of human individuals.", + "description": "Options for the calculation of genetic sex of human individuals.", "default": "", "properties": { "run_sexdeterrmine": { "type": "boolean", - "fa_icon": "fas fa-transgender-alt", - "description": "Turn on sex determination for human reference genomes. 
This will run on single- and double-stranded variants of a library separately.", - "help_text": "Specify to run the optional process of sex determination." + "fa_icon": "fas fa-power-off", + "description": "Specify to turn on sex determination for genomes mapped to human reference genomes with Sex.DetERRmine.", + "help_text": "Specify to run genetic sex determination." }, "sexdeterrmine_bedfile": { "type": "string", "fa_icon": "fas fa-bed", - "description": "Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation).", - "help_text": "Specify an optional bedfile of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind." + "description": "Specify path to SNP panel in BED format for error bar calculation.", + "help_text": "Specify a BED file with SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240k panel in mind." } }, "fa_icon": "fas fa-transgender-alt", From 050acfd4d555c1b6f281af90cecdf3e27acb8e7d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Jul 2024 10:45:31 +0200 Subject: [PATCH 120/198] fix schema --- nextflow_schema.json | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index 06737bf47..4e443e435 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -611,6 +611,7 @@ "description": "Specify the number of bases to extend reference by (circularmapper only)", "help_text": "The number of bases to extend the reference genome with. 
By default this is set to 500 if not specified otherwise.", "fa_icon": "fas fa-external-link-alt" + } }, "fa_icon": "fas fa-layer-group" }, From 7e41a189f7aa3b7e8d6bedf390402846e05cdec0 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 5 Jul 2024 08:48:58 +0000 Subject: [PATCH 121/198] small clarity changes for metagenomics backend --- subworkflows/local/metagenomics_postprocessing.nf | 4 ++-- subworkflows/local/metagenomics_profiling.nf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index fd85050db..0e31b1d34 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -21,10 +21,10 @@ workflow METAGENOMICS_POSTPROCESSING { // We want to combine the chunks, but run MaltExtract on double and singlestranded individually ch_strandedness = ch_postprocessing_input .transpose() - .map{ meta, reads -> + .map{ meta, rma6 -> [ meta + [ 'id': "${meta.strandedness}stranded" ], - reads + rma6 ] } .groupTuple(by:0) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 22006ba8e..aa1abeab1 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -69,7 +69,7 @@ workflow METAGENOMICS_PROFILING { // since we work with channels, we need a channel that stores that information ch_tmp_groups = params.metagenomics_malt_group_size > 0 ? ch_reads.collate(params.metagenomics_malt_group_size).count() : Channel.of(1) // this is for enumerating the channel-entries in the ch_reads channel - def n = 0 + def groups_counter = 0 //replace the meta in a way that groupTuple splits the entries //by strandedness and metagenomics_malt_group_size @@ -83,7 +83,7 @@ workflow METAGENOMICS_PROFILING { [ label: label, strandedness:meta.strandedness, - id:"${meta.strandedness}stranded_${n++%n_groups}" + id:"${meta.strandedness}stranded_${groups_counter++%n_groups}" ], reads ] From 5414f06c0f6d0c8b9152337c6a23d2c58cddb675 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 5 Jul 2024 11:54:16 +0200 Subject: [PATCH 122/198] Fix CI, remove duplicated key in nextflow schema --- nextflow_schema.json | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index cea5bc0c6..6f64d87ee 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -708,12 +708,6 @@ "fa_icon": "fas fa-hammer", "help_text": "Specify to select which tool is used to generate a final set of reads for the metagenomic classifier after any necessary trimming, filtering or reformatting of the reads.\n\nThis intermediate file is not saved in the results directory unless marked with `--metagenomics_complexity_savefastq`." 
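A minimal plain-Groovy sketch of the round-robin group labelling driven by the renamed groups_counter in metagenomics_profiling.nf above (values assumed for illustration: collate().count() yielding 3 groups for 5 double-stranded read sets):

    def n_groups = 3
    def groups_counter = 0
    def ids = (1..5).collect { "doublestranded_${groups_counter++ % n_groups}" }
    assert ids.join(',') == 'doublestranded_0,doublestranded_1,doublestranded_2,doublestranded_0,doublestranded_1'

Each read set is thus assigned to one of n_groups MALT batches in turn, which is what allows groupTuple to later collect them per batch and per strandedness.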
},
-        "metagenomics_complexity_savefastq": {
-            "type": "boolean",
-            "fa_icon": "fas fa-save",
-            "description": "Save FASTQ files containing the complexity filtered reads (before metagenomic classification).",
-            "help_text": "Save the complexity-filtered fastq-files to the results directory"
-        },
         "metagenomics_complexity_entropy": {
             "type": "number",
             "fa_icon": "fas fa-sort-numeric-up",

From d7a2d9232a2d8a3c6d8edb29dd7874a1bff73a7d Mon Sep 17 00:00:00 2001
From: Thiseas Christos Lamnidis
Date: Fri, 5 Jul 2024 15:12:34 +0200
Subject: [PATCH 123/198] WIP

---
 subworkflows/local/circularmapper.nf | 118 +++++++++++----------
 subworkflows/local/map.nf | 14 +--
 .../local/reference_indexing_single.nf | 2 +-
 3 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf
index 73578bda1..b12dc5d55 100644
--- a/subworkflows/local/circularmapper.nf
+++ b/subworkflows/local/circularmapper.nf
@@ -4,76 +4,78 @@

 include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main'
 include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main'
-include { BWA_ALN as BWA_ALN_CIRCULARMAPPER } from '../../modules/nf-core/bwa/aln/main'
+include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main'
 include { BWA_INDEX as BWA_INDEX_CIRCULARMAPPER } from '../../modules/nf-core/bwa/index/main'
-include { BWA_SAMSE as BWA_SAMSE_CIRCULARMAPPER } from '../../modules/nf-core/bwa/samse/main'

 workflow CIRCULARMAPPER {

+    // TODO - Prepare input for FASTQ_ALIGN_BWAALN SWF, then use CIRCULARMAPPER_REALIGNSAMFILE and index the output SAM file to emit.
     take:
-    fasta_reference // channel (mandatory): [ val(meta), path(reference) ]
-    eval // channel (mandatory): val(elongation value)
-    fastq_reads // channel (mandatory): [ val(meta), path(reads) ]
+    ch_reference // channel (mandatory): [ val(meta), path(reference) ]
+    elongation_value // channel (mandatory): val(elongation value)
+    fastq_reads // channel (mandatory): [ val(meta), path(reads) ]

     main:
-
     ch_versions = Channel.empty()
     ch_multiqc_files = Channel.empty()
+    CIRCULARMAPPER_CIRCULARGENERATOR(ch_reference, elongation_value)
+    ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() )

-    ch_reference = fasta_reference
-    ch_eval = eval
-
-    CIRCULARMAPPER_CIRCULARGENERATOR(ch_reference, ch_eval)
-    ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() )
-
-    BWA_INDEX_CIRCULARMAPPER(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta)
-    ch_versions = ch_versions.mix( BWA_INDEX_CIRCULARMAPPER.out.versions.first() )
-
-    ch_input_bwa_aln = fastq_reads
-        .map {
-            // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
-            addNewMetaFromAttributes( it, "reference" , "reference" , false )
-        }
-        .groupTuple(by:0)
-        .combine( BWA_INDEX_CIRCULARMAPPER.out.index, by: 0 ) // [ [meta], fastq, bai ]
-        .multiMap {
-            combo_meta, metas, fastq, ref_bai, bai ->
-            def ids = metas.collect { meta -> meta.id }
-            fastqs: [ combo_meta + [id: ids], fastq ]
-            bai: [ 
ref_bai, bai ] - } - - BWA_SAMSE_CIRCULARMAPPER(ch_input_bwa_samse) - ch_versions = ch_versions.mix( BWA_SAMSE_CIRCULARMAPPER.out.versions.first() ) - - ch_input_realignsamfile = BWA_SAMSE_CIRCULARMAPPER.out.bam - .combine(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta, by: 0) - .combine(ch_eval) - .multiMap { - ref_bam, bam, ref_fasta, fasta, ch_eval -> - bam: [ ref_bam, bam ] - fasta: [ ref_fasta, fasta ] - eval: [ ch_eval ] - } - - CIRCULARMAPPER_REALIGNSAMFILE(ch_input_realignsamfile) - ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) + BWA_INDEX_CIRCULARMAPPER(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta) + ch_versions = ch_versions.mix( BWA_INDEX_CIRCULARMAPPER.out.versions.first() ) - emit: + ch_reference_for_bwa = BWA_INDEX_CIRCULARMAPPER.out.index + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + addNewMetaFromAttributes( it, "id" , "reference" , false ) + } - bam = CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] - versions = ch_versions // channel: [ path(versions.yml) ] + ch_input_bwa_aln = fastq_reads + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .groupTuple(by:0) + .combine( ch_reference_for_bwa, by: 0 ) + .dump(tag:"ch_input_bwa_aln") + // .multiMap { + // combo_meta, meta, fastq, ref_meta, ref_index -> + // def ids = metas.collect { meta -> meta.id } + // reads: [ combo_meta + [id: ids], fastq ] + // index: [ ref_bai, bai ] + // } + + // BWA_ALN_CIRCULARMAPPER(ch_input_bwa_aln) + // ch_versions = ch_versions.mix( BWA_ALN_CIRCULARMAPPER.out.versions.first() ) + + // ch_input_bwa_samse = ch_input_bwa_aln + // .combine( BWA_ALN_CIRCULARMAPPER.out.sai, by: 0 ) // [ [meta], fastq, bai, sai ] + // .multiMap { + // metas, fastq, ref_bai, bai, ref_sai, sai -> + // fastqs: [ metas, fastq, sai ] + // bai: [ ref_bai, bai ] + // } + + // BWA_SAMSE_CIRCULARMAPPER(ch_input_bwa_samse) + // ch_versions = ch_versions.mix( BWA_SAMSE_CIRCULARMAPPER.out.versions.first() ) + + // ch_input_realignsamfile = BWA_SAMSE_CIRCULARMAPPER.out.bam + // .combine(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta, by: 0) + // .combine(ch_eval) + // .multiMap { + // ref_bam, bam, ref_fasta, fasta, ch_eval -> + // bam: [ ref_bam, bam ] + // fasta: [ ref_fasta, fasta ] + // eval: [ ch_eval ] + // } + + // CIRCULARMAPPER_REALIGNSAMFILE(ch_input_realignsamfile) + // ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) + + emit: + + // bam = CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] + versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index f738161c5..7c5addd81 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -120,17 +120,17 @@ workflow MAP { ch_input_for_circularmapper = reads .combine(index.map{ meta, index, fasta -> [ meta, fasta ] }) - .combine(ch_eval) + .dump(tag:"CM Inputs", pretty:true) .multiMap { meta, reads, meta2, fasta, eval -> reads: [ meta, reads ] - index: [ meta2, fasta ] - elon: [ eval ] + reference: [ meta2, fasta ] } - CIRCULARMAPPER(ch_input_for_circularmapper) - ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions.first() ) - ch_mapped_bam = CIRCULARMAPPER.out.bam - ch_mapped_bai = Channel.empty() // Circularmapper doesn't give a bai + CIRCULARMAPPER( 
ch_input_for_circularmapper.reads, params.elongation_factor, ch_input_for_circularmapper.reference ) + ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) + // TODO - Update SWF outputs + ch_mapped_lane_bam = CIRCULARMAPPER.out.bam + ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai } diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index a21b4727d..778352680 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -52,7 +52,7 @@ workflow REFERENCE_INDEXING_SINGLE { } // Generate mapper indicies if not supplied, and if supplied generate meta - if ( params.mapping_tool == 'bwaaln' || params.mapping_tool == 'bwamem' ){ + if ( params.mapping_tool == 'bwaaln' || params.mapping_tool == 'bwamem' || params.mapping_tool == 'circularmapper' ){ if ( !fasta_mapperindexdir ) { ch_fasta_mapperindexdir = BWA_INDEX ( ch_ungz_ref ).index From 6caacef3886670c3d262f283345eb31c26286a2f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Jul 2024 15:22:14 +0200 Subject: [PATCH 124/198] fix swf emissions --- subworkflows/local/circularmapper.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index b12dc5d55..1498b6a3d 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -75,7 +75,7 @@ workflow CIRCULARMAPPER { emit: - // bam = CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] + bam = channel.empty() //CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] versions = ch_versions // channel: [ path(versions.yml) ] } From 670771d05db0465288b8da3723a692942f41b8dd Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 12 Jul 2024 11:37:42 +0200 Subject: [PATCH 125/198] add elongate_reference swf --- subworkflows/local/elongate_reference.nf | 74 ++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 subworkflows/local/elongate_reference.nf diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf new file mode 100644 index 000000000..8c949ba4d --- /dev/null +++ b/subworkflows/local/elongate_reference.nf @@ -0,0 +1,74 @@ +// +// Elongate a reference genome by circularising the target sequence by a given elongation factor. +// + +include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' +include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/index/main' + +workflow ELONGATE_REFERENCE { + take: + ch_reference // [ meta, fasta, fai ] + ch_elongated_reference // [ meta, elongated_fasta, elongated_fai ] + elongation_factor // [ int ] + // TODO CIRCULARMAPPER_CIRCULARGENERATOR module needs updating. `-s` option is the circular target and not the output file >.< + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + /* + Check what fasta files we have: + There are four options: + 1. Elongated reference with index (ignore circular target) + 2. Elongated reference without index (ignore circular target) + 3. No elongated reference, but circular target + 4. 
None of the above -> Throw error (should go in parameter validation)
+    */
+    ch_circulargenerator_input = ch_elongated_reference
+        .branch{
+            meta, elongated_fasta_index, elongated_fasta, circular_target ->
+            ready: elongated_fasta != "" && elongated_fasta_index != ""
+            needs_index: elongated_fasta != "" && elongated_fasta_index == ""
+            needs_elongation: elongated_fasta == "" && circular_target != ""
+        }
+
+    // Elongate references that need it
+    // Join the original references to the branch of needs_elongation, to get the original fasta files, and elongate them.
+    ch_references_to_elongate = ch_circulargenerator_input.needs_elongation
+        .join( ch_reference )
+        .map {
+            meta, elongated_fasta_index, elongated_fasta, circular_target, meta2, index, fasta ->
+            [ meta, fasta ]
+        }
+
+    CIRCULARMAPPER_CIRCULARGENERATOR(ch_references_to_elongate, elongation_factor)
+    ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() )
+
+    // Collect newly generated circular references and provided ones without an index, and index them.
+    ch_input_for_circular_indexing = ch_circulargenerator_input.needs_index
+        .map {
+            meta, elongated_fasta_index, elongated_fasta, circular_target ->
+            [ meta, elongated_fasta ]
+        }
+        .mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta )
+
+    BWA_INDEX_CIRCULARISED(ch_input_for_circular_indexing)
+    ch_versions = ch_versions.mix( BWA_INDEX_CIRCULARISED.out.versions.first() )
+
+    ch_indexed_references = ch_input_for_circular_indexing
+        .join( BWA_INDEX_CIRCULARISED.out.index )
+
+    // Then put all the indexed elongated references together and emit them
+    ch_circular_reference = ch_circulargenerator_input.ready
+        .map {
+            meta, elongated_fasta_index, elongated_fasta, circular_target ->
+            [ meta, elongated_fasta, elongated_fasta_index ]
+        }
+        .mix( ch_indexed_references )
+
+    emit:
+    circular_reference = ch_circular_reference // [ meta, fasta, fai ]
+    versions = ch_versions
+    mqc = ch_multiqc_files
+
+}

From 21093ac94b89a6afa31116fc8df516ba684f0bd3 Mon Sep 17 00:00:00 2001
From: Thiseas Christos Lamnidis
Date: Fri, 12 Jul 2024 11:57:13 +0200
Subject: [PATCH 126/198] WIP on CM mapping

---
 subworkflows/local/map.nf | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf
index 7c5addd81..1ffacf706 100644
--- a/subworkflows/local/map.nf
+++ b/subworkflows/local/map.nf
@@ -16,8 +16,9 @@ include { CIRCULARMAPPER } from '../../subworkflo
 workflow MAP {
     take:
-    reads // [ [meta], [read1, reads2] ] or [ [meta], [read1] ]
-    index // [ [meta], [ index ], [ fasta ] ]
+    reads          // [ [meta], [read1, reads2] ] or [ [meta], [read1] ]
+    index          // [ [meta], [ index ], [ fasta ] ]
+    elogated_index // [ [meta], [ index ], [ fasta ], [ circular_target ] ]

     main:
     ch_versions = Channel.empty()
@@ -120,17 +120,17 @@ workflow MAP {
             ch_mapped_lane_bai = params.fasta_largeref ? 
SAMTOOLS_INDEX_BT2.out.csi : SAMTOOLS_INDEX_BT2.out.bai } else if ( params.mapping_tool == 'circularmapper' ) { - ch_eval = params.elongation_factor - - ch_input_for_circularmapper = reads - .combine(index.map{ meta, index, fasta -> [ meta, fasta ] }) - .dump(tag:"CM Inputs", pretty:true) - .multiMap { - meta, reads, meta2, fasta, eval -> - reads: [ meta, reads ] - reference: [ meta2, fasta ] - } - CIRCULARMAPPER( ch_input_for_circularmapper.reads, params.elongation_factor, ch_input_for_circularmapper.reference ) - ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) - // TODO - Update SWF outputs - ch_mapped_lane_bam = CIRCULARMAPPER.out.bam - ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai + // Reference elongation and indexing takes place in the reference_indexing swf. + // Circularmapper takes non-elongated AND elongated references and reads as input (i think. wait for Alex's reply). + + // ch_input_for_circularmapper = reads + // .combine(index.map{ meta, index, fasta -> [ meta, fasta ] }) + // .dump(tag:"CM Inputs", pretty:true) + // .multiMap { + // meta, reads, meta2, fasta -> + // reads: [ meta, reads ] + // reference: [ meta2, fasta ] + // } + // CIRCULARMAPPER( ch_input_for_circularmapper.reads, params.elongation_factor, ch_input_for_circularmapper.reference ) + // ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) + // // TODO - Update SWF outputs + // ch_mapped_lane_bam = CIRCULARMAPPER.out.bam + // ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai } From a6a145aa4502214318e14fa8c4d613223c3f8791 Mon Sep 17 00:00:00 2001 From: scarlhoff Date: Fri, 12 Jul 2024 11:59:12 +0200 Subject: [PATCH 127/198] add elongated fasta and fai input --- assets/schema_fasta.json | 14 ++++ nextflow.config | 34 ++++---- nextflow_schema.json | 16 +++- subworkflows/local/map.nf | 4 +- subworkflows/local/reference_indexing.nf | 47 ++++++++--- .../local/reference_indexing_multi.nf | 80 ++++++++++--------- .../local/reference_indexing_single.nf | 12 ++- 7 files changed, 133 insertions(+), 74 deletions(-) diff --git a/assets/schema_fasta.json b/assets/schema_fasta.json index d53bca776..d89310422 100644 --- a/assets/schema_fasta.json +++ b/assets/schema_fasta.json @@ -48,6 +48,20 @@ "pattern": "^\\S+$", "errorMessage": "The headers of the chromosome to be extended by circularmapper must not contain any spaces and no leading '>'." }, + "circularmapper_elongated_fasta": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.f(na|asta|a|as)(\\.gz)?$", + "exists": true, + "errorMessage": "The elongated Fasta files for the mapping reference must be provided with file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz', '.fna.gz' and cannot contain any spaces." + }, + "circularmapper_elongated_fai": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.fai$", + "exists": true, + "errorMessage": "Elongated fasta index files for the mapping reference cannot have any spaces and must have file extension '.fai'." + }, "mitochondrion_header": { "type": "string", "pattern": "^\\S+$", diff --git a/nextflow.config b/nextflow.config index 63663b004..467aeba3f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -108,20 +108,23 @@ params { preprocessing_adapterremoval_qualitymax = 41 // Mapping - mapping_tool = 'bwaaln' - mapping_bwaaln_n = 0.01 // From Oliva et al. 2021 (10.1093/bib/bbab076) - mapping_bwaaln_k = 2 - mapping_bwaaln_l = 1024 // From Oliva et al. 
2021 (10.1093/bib/bbab076) - mapping_bwaaln_o = 2 // From Oliva et al. 2021 (10.1093/bib/bbab076) - mapping_bwamem_k = 19 - mapping_bwamem_r = 1.5 - mapping_bowtie2_alignmode = 'local' - mapping_bowtie2_sensitivity = 'sensitive' - mapping_bowtie2_n = 0 - mapping_bowtie2_l = 20 - mapping_bowtie2_trim5 = 0 - mapping_bowtie2_trim3 = 0 - mapping_bowtie2_maxins = 500 + mapping_tool = 'bwaaln' + mapping_bwaaln_n = 0.01 // From Oliva et al. 2021 (10.1093/bib/bbab076) + mapping_bwaaln_k = 2 + mapping_bwaaln_l = 1024 // From Oliva et al. 2021 (10.1093/bib/bbab076) + mapping_bwaaln_o = 2 // From Oliva et al. 2021 (10.1093/bib/bbab076) + mapping_bwamem_k = 19 + mapping_bwamem_r = 1.5 + mapping_bowtie2_alignmode = 'local' + mapping_bowtie2_sensitivity = 'sensitive' + mapping_bowtie2_n = 0 + mapping_bowtie2_l = 20 + mapping_bowtie2_trim5 = 0 + mapping_bowtie2_trim3 = 0 + mapping_bowtie2_maxins = 500 + mapping_circularmapper_elongation_factor = 500 + mapping_circularmapper_elongated_fasta = null + mapping_circularmapper_elongated_fai = null // BAM Filtering run_bamfiltering = false @@ -179,9 +182,6 @@ params { mapstats_preseq_cval = 0.95 mapstats_preseq_defects_mode = false - //Circular Mapper - elongation_factor = 500 - // Damage Calculation options skip_damagecalculation = false damagecalculation_tool = 'damageprofiler' diff --git a/nextflow_schema.json b/nextflow_schema.json index 4e443e435..f87749bd8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -605,12 +605,24 @@ "help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n> Modifies Bowtie2 parameter: `--maxins`", "fa_icon": "fas fa-exchange-alt" }, - "elongation_factor": { + "mapping_circularmapper_elongation_factor": { "type": "integer", "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only)", + "description": "Specify the number of bases to extend reference by (circularmapper only).", "help_text": "The number of bases to extend the reference genome with. By default this is set to 500 if not specified otherwise.", "fa_icon": "fas fa-external-link-alt" + }, + "mapping_circularmapper_elongated_fasta": { + "type": "string", + "description": "Specify an elongated reference FASTA to be used for circularmapper.", + "help_text": "Specify an already elongated FASTA file for circularmapper to avoid reprocessing.", + "fa_icon": "fas fa-address-book" + }, + "mapping_circularmapper_elongated_fai": { + "type": "string", + "description": "Specify a samtools index for the elongated FASTA file.", + "help_text": "Specify the index for an already elongated FASTA file to avoid reprocessing.", + "fa_icon": "fas fa-address-book" } }, "fa_icon": "fas fa-layer-group" diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 7c5addd81..88c9d58a2 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -116,7 +116,7 @@ workflow MAP { ch_mapped_lane_bai = params.fasta_largeref ? 
SAMTOOLS_INDEX_BT2.out.csi : SAMTOOLS_INDEX_BT2.out.bai } else if ( params.mapping_tool == 'circularmapper' ) { - ch_eval = params.elongation_factor + ch_eval = params.mapping_circularmapper_elongation_factor ch_input_for_circularmapper = reads .combine(index.map{ meta, index, fasta -> [ meta, fasta ] }) @@ -126,7 +126,7 @@ workflow MAP { reads: [ meta, reads ] reference: [ meta2, fasta ] } - CIRCULARMAPPER( ch_input_for_circularmapper.reads, params.elongation_factor, ch_input_for_circularmapper.reference ) + CIRCULARMAPPER( ch_input_for_circularmapper.reads, params.mapping_circularmapper_elongation_factor, ch_input_for_circularmapper.reference ) ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) // TODO - Update SWF outputs ch_mapped_lane_bam = CIRCULARMAPPER.out.bam diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index cdf8d0e90..02a2b2621 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -7,6 +7,7 @@ include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/reference_i include { GUNZIP as GUNZIP_PMDBED } from '../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_PMDFASTA } from '../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' +include { GUNZIP as GUNZIP_CM_FASTA } from '../../modules/nf-core/gunzip/main.nf' workflow REFERENCE_INDEXING { take: @@ -20,12 +21,13 @@ workflow REFERENCE_INDEXING { // Warn user if they've given a reference sheet that already includes fai/dict/mapper index etc. if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( fasta_fai || fasta_dict || fasta_mapperindexdir )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as e.g. `--fasta_fai`. --fasta_sheet CSV/TSV takes priority and --fasta_* parameters will be ignored.") - if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( params.mitochondrion_header || params.contamination_estimation_angsd_hapmap || params.damage_manipulation_pmdtools_reference_mask || params.damage_manipulation_pmdtools_reference_mask || params.snpcapture_bed || params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile || params.sexdeterrmine_bedfile || params.mapstats_bedtools_featurefile || params.genotyping_reference_ploidy || params.genotyping_gatk_dbsnp )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as individual reference-specific input files, e.g. `--contamination_estimation_angsd_hapmap`. Input files specified in the --fasta_sheet CSV/TSV take priority and other input parameters will be ignored.") + if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( params.mitochondrion_header || params.contamination_estimation_angsd_hapmap || params.damage_manipulation_pmdtools_reference_mask || params.damage_manipulation_pmdtools_reference_mask || params.snpcapture_bed || params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile || params.sexdeterrmine_bedfile || params.mapstats_bedtools_featurefile || params.genotyping_reference_ploidy || params.genotyping_gatk_dbsnp, params.fasta_circular_target, params.circularmapper_elongated_fasta, params.circularmapper_elongated_fai )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as individual reference-specific input files, e.g. `--contamination_estimation_angsd_hapmap`. 
Input files specified in the --fasta_sheet CSV/TSV take priority and other input parameters will be ignored.") if ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) { // If input (multi-)reference sheet supplied REFERENCE_INDEXING_MULTI ( fasta ) ch_reference_for_mapping = REFERENCE_INDEXING_MULTI.out.reference + ch_circularmapper = REFERENCE_INDEXING_MULTI.out.circularmapper ch_mitochondrion_header = REFERENCE_INDEXING_MULTI.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_MULTI.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_MULTI.out.pmd_masked_fasta @@ -39,6 +41,7 @@ workflow REFERENCE_INDEXING { } else { // If input FASTA and/or indicies supplied REFERENCE_INDEXING_SINGLE ( fasta, fasta_fai, fasta_dict, fasta_mapperindexdir ) + ch_circularmapper = REFERENCE_INDEXING_SINGLE.out.circularmapper ch_mitochondrion_header = REFERENCE_INDEXING_SINGLE.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_SINGLE.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_SINGLE.out.pmd_masked_fasta @@ -125,17 +128,39 @@ workflow REFERENCE_INDEXING { ch_dbsnp = ch_dbsnp .filter { it[1] != "" } + ch_circularmapper_for_gunzip = ch_circularmapper + .filter{ it[1] != "" || it[2] != "" || it[3] != "" } + .branch{ + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + forgunzip: circularmapper_elongated_fasta.extension == "gz" + skip: true + } + + ch_circularmapper_input = ch_circularmapper_for_gunzip.gunzip + .multiMap{ + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + gunzip: [ meta, circularmapper_elongated_fasta ] + remainder: [ meta, circular_target, circularmapper_elongated_fai ] + } + + GUNZIP_CM_FASTA( ch_circularmapper_input ) + ch_version = ch_versions.mix( GUNZIP_CM_FASTA.out.versions.first() ) + + ch_gunzipped_elongated = GUNZIP_CM_FASTA.out.gunzip.join( ch_circularmapper_input.remainder, failOnMismatch: true ) + ch_circularmapper_gunzipped = ch_circularmapper_for_gunzip.skip.mix( ch_gunzipped_elongated ) + emit: - reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] - mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] - hapmap = ch_hapmap // [ meta, hapmap ] - pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] - pmd_bed_for_masking = ch_pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] - snp_capture_bed = ch_capture_bed // [ meta, capture_bed ] - pileupcaller_bed_snp = ch_pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] - sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] - bedtools_feature = ch_bedtools_feature // [ meta, bedtools_feature ] - dbsnp = ch_dbsnp // [ meta, dbsnp ] + reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] + circularmapper = ch_circularmapper_gunzipped // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] + hapmap = ch_hapmap // [ meta, hapmap ] + pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] + pmd_bed_for_masking = ch_pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] + snp_capture_bed = ch_capture_bed // [ meta, capture_bed ] + pileupcaller_bed_snp = ch_pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] + bedtools_feature = ch_bedtools_feature // [ meta, 
bedtools_feature ] + dbsnp = ch_dbsnp // [ meta, dbsnp ] versions = ch_versions } diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 6a42d9208..f71280bc6 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -20,23 +20,25 @@ workflow REFERENCE_INDEXING_MULTI { // Import reference sheet and change empty arrays to empty strings for compatibility with single reference input ch_splitreferencesheet_for_branch = Channel.fromSamplesheet("fasta_sheet") .map{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> - meta.ploidy = meta.genotyping_ploidy != null ? meta.genotyping_ploidy : params.genotyping_reference_ploidy - fai = fai != [] ? fai : "" - dict = dict != [] ? dict : "" - mapper_index = mapper_index != [] ? mapper_index : "" - circular_target = circular_target != [] ? circular_target : "" - mitochondrion = mitochondrion != [] ? mitochondrion : "" - capture_bed = capture_bed != [] ? capture_bed : "" - pileupcaller_bed = pileupcaller_bed != [] ? pileupcaller_bed : "" - pileupcaller_snp = pileupcaller_snp != [] ? pileupcaller_snp : "" - hapmap = hapmap != [] ? hapmap : "" - pmd_masked_fasta = pmd_masked_fasta != [] ? pmd_masked_fasta : "" - pmd_bed_for_masking = pmd_bed_for_masking != [] ? pmd_bed_for_masking : "" - sexdet_bed = sexdet_bed != [] ? sexdet_bed : "" - bedtools_feature = bedtools_feature != [] ? bedtools_feature : "" - genotyping_gatk_dbsnp = genotyping_gatk_dbsnp != [] ? genotyping_gatk_dbsnp : "" - [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] + meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> + meta.ploidy = meta.genotyping_ploidy != null ? meta.genotyping_ploidy : params.genotyping_reference_ploidy + fai = fai != [] ? fai : "" + dict = dict != [] ? dict : "" + mapper_index = mapper_index != [] ? mapper_index : "" + circular_target = circular_target != [] ? circular_target : "" + circularmapper_elongated_fasta = circularmapper_elongated_fasta != [] ? circularmapper_elongated_fasta : "" + circularmapper_elongated_fai = circularmapper_elongated_fai != [] ? circularmapper_elongated_fai : "" + mitochondrion = mitochondrion != [] ? mitochondrion : "" + capture_bed = capture_bed != [] ? capture_bed : "" + pileupcaller_bed = pileupcaller_bed != [] ? pileupcaller_bed : "" + pileupcaller_snp = pileupcaller_snp != [] ? pileupcaller_snp : "" + hapmap = hapmap != [] ? hapmap : "" + pmd_masked_fasta = pmd_masked_fasta != [] ? pmd_masked_fasta : "" + pmd_bed_for_masking = pmd_bed_for_masking != [] ? pmd_bed_for_masking : "" + sexdet_bed = sexdet_bed != [] ? sexdet_bed : "" + bedtools_feature = bedtools_feature != [] ? bedtools_feature : "" + genotyping_gatk_dbsnp = genotyping_gatk_dbsnp != [] ? 
genotyping_gatk_dbsnp : "" + [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] } // GENERAL DESCRIPTION FOR NEXT SECTIONS @@ -52,8 +54,9 @@ workflow REFERENCE_INDEXING_MULTI { ch_input_from_referencesheet = ch_splitreferencesheet_for_branch .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> - generated: [ meta, fasta, fai, dict, mapper_index, circular_target ] + meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> + generated: [ meta, fasta, fai, dict, mapper_index ] + circularmapper: [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] mitochondrion_header: [ meta, mitochondrion ] angsd_hapmap: [ meta, hapmap ] pmd_masked_fasta: [ meta, pmd_masked_fasta ] @@ -68,7 +71,7 @@ workflow REFERENCE_INDEXING_MULTI { // Detect if fasta is gzipped or not ch_fasta_for_gunzip = ch_input_from_referencesheet.generated .branch { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> forgunzip: fasta.extension == "gz" skip: true } @@ -76,9 +79,9 @@ workflow REFERENCE_INDEXING_MULTI { // Pull out name/file to match cardinality for GUNZIP module ch_gunzip_input = ch_fasta_for_gunzip.forgunzip .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> gunzip: [ meta, fasta ] - remainder: [ meta, fai, dict, mapper_index, circular_target ] + remainder: [ meta, fai, dict, mapper_index ] } @@ -96,7 +99,7 @@ workflow REFERENCE_INDEXING_MULTI { // Separate out non-faidxed references ch_fasta_for_faidx = ch_fasta_for_faiindexing .branch { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> forfaidx: fai == "" skip: true } @@ -105,9 +108,9 @@ workflow REFERENCE_INDEXING_MULTI { ch_faidx_input = ch_fasta_for_faidx .forfaidx .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> faidx: [ meta, fasta ] - remainder: [ meta, fasta, dict, mapper_index, circular_target ] // we drop fai here as we are going to make it + remainder: [ meta, fasta, dict, mapper_index ] // we drop fai here as we are going to make it } SAMTOOLS_FAIDX ( ch_faidx_input.faidx, [ [], [] ] ) @@ -117,9 +120,9 @@ workflow REFERENCE_INDEXING_MULTI { ch_faidxed_formix = SAMTOOLS_FAIDX.out.fai .join( ch_faidx_input.remainder, failOnMismatch: true ) .map { - meta, fai, fasta, dict, mapper_index, circular_target -> + meta, fai, fasta, dict, mapper_index -> - [ meta, fasta, fai, dict, mapper_index, circular_target ] + [ meta, fasta, fai, dict, mapper_index ] } // Mix back newly faidx'd references with the pre-indexed ones @@ -131,7 +134,7 @@ workflow REFERENCE_INDEXING_MULTI { ch_fasta_for_dict = ch_fasta_for_dictindexing .branch { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> fordict: dict == "" 
skip: true } @@ -139,9 +142,9 @@ workflow REFERENCE_INDEXING_MULTI { ch_dict_input = ch_fasta_for_dict .fordict .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> dict: [ meta, fasta ] - remainder: [ meta, fasta, fai, mapper_index, circular_target ] + remainder: [ meta, fasta, fai, mapper_index ] } PICARD_CREATESEQUENCEDICTIONARY ( ch_dict_input.dict ) @@ -150,9 +153,9 @@ workflow REFERENCE_INDEXING_MULTI { ch_dicted_formix = PICARD_CREATESEQUENCEDICTIONARY.out.reference_dict .join( ch_dict_input.remainder, failOnMismatch: true ) .map { - meta, dict, fasta, fai, mapper_index, circular_target -> + meta, dict, fasta, fai, mapper_index -> - [ meta, fasta, fai, dict, mapper_index, circular_target ] + [ meta, fasta, fai, dict, mapper_index ] } ch_dict_formapperindexing = ch_fasta_for_dict.skip.mix(ch_dicted_formix) @@ -165,7 +168,7 @@ workflow REFERENCE_INDEXING_MULTI { ch_fasta_for_mapperindex = ch_dict_formapperindexing .branch { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> forindex: mapper_index == "" skip: true } @@ -173,9 +176,9 @@ workflow REFERENCE_INDEXING_MULTI { ch_mapindex_input = ch_fasta_for_mapperindex .forindex .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target -> + meta, fasta, fai, dict, mapper_index -> index: [ meta, fasta ] - remainder: [ meta, fasta, fai, dict, circular_target ] + remainder: [ meta, fasta, fai, dict ] } if ( params.mapping_tool == "bwaaln" || params.mapping_tool == "bwamem" ) { @@ -193,15 +196,16 @@ workflow REFERENCE_INDEXING_MULTI { ch_indexed_formix = ch_indexed_forremap .join( ch_mapindex_input.remainder, failOnMismatch: true ) .map { - meta, mapper_index, fasta, fai, dict, circular_target -> + meta, mapper_index, fasta, fai, dict -> - [ meta, fasta, fai, dict, mapper_index, circular_target ] + [ meta, fasta, fai, dict, mapper_index ] } ch_indexmapper_for_reference = ch_fasta_for_mapperindex.skip.mix(ch_indexed_formix) emit: - reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex, circular_target ] + reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex ] + circularmapper = ch_input_from_referencesheet.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] pmd_masked_fasta = ch_input_from_referencesheet.pmd_masked_fasta // [ meta, pmd_masked_fasta ] diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 778352680..cddb45a63 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -90,13 +90,16 @@ workflow REFERENCE_INDEXING_SINGLE { def bedtools_feature = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" def genotyping_reference_ploidy = params.genotyping_reference_ploidy def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? 
file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" - [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] + def circularmapper_elongated_fasta = params.mapping_circularmapper_elongated_fasta != null ? file( params.mapping_circularmapper_elongated_fasta, checkIfExists: true ) : "" + def circularmapper_elongated_fai = params.mapping_circularmapper_elongated_fai != null ? file( params.mapping_circularmapper_elongated_fai, checkIfExists: true ) : "" + [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_fai ] } ch_ref_index_single = ch_reference_for_mapping .multiMap{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> - reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + reference: [ meta, fasta, fai, dict, mapper_index ] + circularmapper: [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] mito_header: [ meta, mitochondrion_header ] hapmap: [ meta, contamination_estimation_angsd_hapmap ] pmd_masked_fasta: [ meta, pmd_masked_fasta ] @@ -109,7 +112,8 @@ workflow REFERENCE_INDEXING_SINGLE { } emit: - reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex, circular_target ] + reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex ] + circularmapper = ch_ref_index_single.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] mitochondrion_header = ch_ref_index_single.mito_header // [ meta, mito_header ] hapmap = ch_ref_index_single.hapmap // [ meta, hapmap ] pmd_masked_fasta = ch_ref_index_single.pmd_masked_fasta // [ meta, pmd_masked_fasta ] From cd867490a750afc68529a80849eae2f787961017 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 12 Jul 2024 15:44:01 +0200 Subject: [PATCH 128/198] attempt to add CM SWF --- subworkflows/local/circularmapper.nf | 112 +++++++++++---------------- subworkflows/local/map.nf | 29 +++++++ 2 files changed, 75 insertions(+), 66 deletions(-) diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index 1498b6a3d..0ca399110 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -2,80 +2,60 @@ // Run circularmapper // -include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' -include { CIRCULARMAPPER_REALIGNSAMFILE } from 
'../../modules/nf-core/circularmapper/realignsamfile/main' -include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' -include { BWA_INDEX as BWA_INDEX_CIRCULARMAPPER } from '../../modules/nf-core/bwa/index/main' +include { FASTQ_ALIGN_BWAALN_ELONGATED } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' +include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main' workflow CIRCULARMAPPER { - - // TODO - PRepare input for FASTQ_ALIGN_BWAALN SWF, then use CIRCULARMAPPER_REALIGNSAMFILE file anf index output SAM file to emit. take: - ch_reference // channel (mandatory): [ val(meta), path(reference) ] - elongation_value // channel (mandatory): val(elongation value) - fastq_reads // channel (mandatory): [ val(meta), path(reads) ] + ch_reference // channel (mandatory): [ val(meta), path(index), path(reference) ] + ch_elongated_index // channel (mandatory): [ val(meta), path(elongated_index) ] + ch_fastq_reads // channel (mandatory): [ val(meta), path(reads) ]. subworkImportant: meta REQUIRES single_end` entry! + val_elongation_factor // int (mandatory): Elongation factor used for chromosome circularisation main: ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() + ch_realigned_bams = Channel.empty() + + // While mapping with BWA will need the elongated reference index, RealignSAMFile apparently does NOT need the elongated reference to be present, only the elongation factor. + FASTQ_ALIGN_BWAALN_ELONGATED( ch_fastq_reads, ch_elongated_index ) + ch_versions = ch_versions.mix( FASTQ_ALIGN_BWAALN_ELONGATED.out.versions.first() ) + + ch_ref_for_realignsamfile = ch_reference + .map { + meta, index, reference -> + [ meta, reference ] + } + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + addNewMetaFromAttributes( it, "id" , "reference" , false ) + } + + ch_input_for_realignsamfile = FASTQ_ALIGN_BWAALN_ELONGATED.out.bam + .map{ + // create meta consistent with rest of workflow + meta, bam -> + new_meta = meta + [ reference: meta.id_index ] + [ new_meta, bam ] + } + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .combine( ch_ref_for_realignsamfile, by: 0 ) + .multiMap { + ignore_me, meta, bam, ref_meta, ref_index, ref_fasta -> + bam: [ metas, bam ] + fasta: [ ref_meta, ref_fasta ] + } + + CIRCULARMAPPER_REALIGNSAMFILE( ch_input_for_realignsamfile.bam, ch_input_for_realignsamfile.fasta, val_elongation_factor ) + ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) + ch_realigned_bams = ch_realigned_bams.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.bam ) - CIRCULARMAPPER_CIRCULARGENERATOR(ch_reference, elongation_value) - ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() ) - - BWA_INDEX_CIRCULARMAPPER(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta) - ch_versions = ch_versions.mix( BWA_INDEX_CIRCULARMAPPER.out.versions.first() ) - - ch_reference_for_bwa = BWA_INDEX_CIRCULARMAPPER.out.index - .map { - // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - addNewMetaFromAttributes( it, "id" , "reference" , false ) - } - - ch_input_bwa_aln = fastq_reads - .map { - // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - addNewMetaFromAttributes( it, "reference" , "reference" , false ) - 
} - .groupTuple(by:0) - .combine( ch_reference_for_bwa, by: 0 ) - .dump(tag:"ch_input_bwa_aln") - // .multiMap { - // combo_meta, meta, fastq, ref_meta, ref_index -> - // def ids = metas.collect { meta -> meta.id } - // reads: [ combo_meta + [id: ids], fastq ] - // index: [ ref_bai, bai ] - // } - - // BWA_ALN_CIRCULARMAPPER(ch_input_bwa_aln) - // ch_versions = ch_versions.mix( BWA_ALN_CIRCULARMAPPER.out.versions.first() ) - - // ch_input_bwa_samse = ch_input_bwa_aln - // .combine( BWA_ALN_CIRCULARMAPPER.out.sai, by: 0 ) // [ [meta], fastq, bai, sai ] - // .multiMap { - // metas, fastq, ref_bai, bai, ref_sai, sai -> - // fastqs: [ metas, fastq, sai ] - // bai: [ ref_bai, bai ] - // } - - // BWA_SAMSE_CIRCULARMAPPER(ch_input_bwa_samse) - // ch_versions = ch_versions.mix( BWA_SAMSE_CIRCULARMAPPER.out.versions.first() ) - - // ch_input_realignsamfile = BWA_SAMSE_CIRCULARMAPPER.out.bam - // .combine(CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta, by: 0) - // .combine(ch_eval) - // .multiMap { - // ref_bam, bam, ref_fasta, fasta, ch_eval -> - // bam: [ ref_bam, bam ] - // fasta: [ ref_fasta, fasta ] - // eval: [ ch_eval ] - // } - - // CIRCULARMAPPER_REALIGNSAMFILE(ch_input_realignsamfile) - // ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) emit: - - bam = channel.empty() //CIRCULARMAPPER_REALIGNSAMFILE.out.bam // channel: [ val(meta), path(bam) ] - versions = ch_versions // channel: [ path(versions.yml) ] - + bam = ch_realigned_bams // channel: [ val(meta), path(bam) ] + versions = ch_versions + mqc = ch_multiqc_files } diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 1ffacf706..32291ffaa 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -117,6 +117,35 @@ workflow MAP { ch_mapped_lane_bai = params.fasta_largeref ? SAMTOOLS_INDEX_BT2.out.csi : SAMTOOLS_INDEX_BT2.out.bai } else if ( params.mapping_tool == 'circularmapper' ) { + ch_index_for_mapping = index.map{ meta, index, fasta -> [ meta, index ] } + ch_elongated_reference_for_mapping = elogated_index.map{ meta, index, fasta, circular_target -> [ meta, index ] } + ch_reads_for_mapping = reads + + CIRCULARMAPPER( ch_index_for_mapping, ch_elongated_reference_for_mapping, ch_reads_for_mapping ) + + // // Join the original and elongated references, then combine with the reads, and multiMap to ensure correct ordering of channel contents. + // ch_reads_for_circularmapper = reads.map { + // // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + // addNewMetaFromAttributes( it, "reference" , "reference" , false ) + // } + + // ch_input_for_circularmapper = index.join( elogated_index ) + // .map { + // meta, index, fasta, elongated_index, elongated_fasta, circular_target -> + // [ meta, index, fasta , elongated_index, elongated_fasta ] + // } + // .map { + // // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + // addNewMetaFromAttributes( it, "id" , "reference" , false ) + // } + // .combine( ch_reads_for_circularmapper, by: 0) + // .multiMap { + // ignore_me, meta, index, fasta, elongated_index, elongated_fasta, circular_target, meta2, fasta, reads -> + // reads: [ meta, reads ] + // reference: [ meta, index, fasta ] + // elongated_reference: [meta, elongated_index , elongated_index] + // } + // Reference elongation and indexing takes place in the reference_indexing swf. // Circularmapper takes non-elongated AND elongated references and reads as input (i think. 
wait for Alex's reply). From e05cc42459b2fb91d3e1809b9fbc0d2b91b98d03 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 15 Jul 2024 14:10:41 +0200 Subject: [PATCH 129/198] index output bams --- subworkflows/local/circularmapper.nf | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index 0ca399110..dde31729f 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -2,8 +2,9 @@ // Run circularmapper // -include { FASTQ_ALIGN_BWAALN_ELONGATED } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' -include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main' +include { FASTQ_ALIGN_BWAALN as FASTQ_ALIGN_BWAALN_ELONGATED } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' +include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_REALIGNED } from '../../modules/nf-core/samtools/index/main' workflow CIRCULARMAPPER { take: @@ -16,6 +17,8 @@ workflow CIRCULARMAPPER { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() ch_realigned_bams = Channel.empty() + ch_realigned_bais = Channel.empty() + ch_realigned_csis = Channel.empty() // While mapping with BWA will need the elongated reference index, RealignSAMFile apparently does NOT need the elongated reference to be present, only the elongation factor. FASTQ_ALIGN_BWAALN_ELONGATED( ch_fastq_reads, ch_elongated_index ) @@ -33,10 +36,11 @@ workflow CIRCULARMAPPER { ch_input_for_realignsamfile = FASTQ_ALIGN_BWAALN_ELONGATED.out.bam .map{ - // create meta consistent with rest of workflow + // create meta consistent with rest of MAP workflow + // TODO: Check that the id_index is correctly set and remove the elongation factor suffix if necessary. 
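+            // Illustrative sketch (values assumed) of the meta munging below: given
+            //   meta     = [ id:'sample1', single_end:true, id_index:'Mammoth_MT' ]
+            // the step yields
+            //   new_meta = [ id:'sample1', single_end:true, id_index:'Mammoth_MT', reference:'Mammoth_MT' ]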
meta, bam -> - new_meta = meta + [ reference: meta.id_index ] - [ new_meta, bam ] + new_meta = meta + [ reference: meta.id_index ] + [ new_meta, bam ] } .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute @@ -44,7 +48,7 @@ workflow CIRCULARMAPPER { } .combine( ch_ref_for_realignsamfile, by: 0 ) .multiMap { - ignore_me, meta, bam, ref_meta, ref_index, ref_fasta -> + ignore_me, meta, bam, ref_meta, ref_fasta -> bam: [ metas, bam ] fasta: [ ref_meta, ref_fasta ] } @@ -53,9 +57,15 @@ workflow CIRCULARMAPPER { ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) ch_realigned_bams = ch_realigned_bams.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.bam ) + SAMTOOLS_INDEX_REALIGNED( ch_realigned_bams ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX_REALIGNED.out.versions.first() ) + ch_realigned_bais = ch_realigned_bais.mix( SAMTOOLS_INDEX_REALIGNED.out.bai ) + ch_realigned_csis = ch_realigned_csis.mix( SAMTOOLS_INDEX_REALIGNED.out.csi ) emit: - bam = ch_realigned_bams // channel: [ val(meta), path(bam) ] + bam = ch_realigned_bams // [ val(meta), path(bam) ] + bai = ch_realigned_bais // [ val(meta), path(bai) ] + csi = ch_realigned_csis // [ val(meta), path(csi) ] versions = ch_versions mqc = ch_multiqc_files } From 1764505d5e4f42b0d79f629645c5d3866a9117c3 Mon Sep 17 00:00:00 2001 From: scarlhoff Date: Tue, 16 Jul 2024 13:57:48 +0200 Subject: [PATCH 130/198] add validation and address comments --- subworkflows/local/reference_indexing.nf | 34 ++++++++++--------- .../local/reference_indexing_multi.nf | 2 +- .../local/reference_indexing_single.nf | 2 +- .../local/utils_nfcore_eager_pipeline/main.nf | 2 +- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 02a2b2621..ad9e18b05 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -2,12 +2,12 @@ // Prepare reference indexing for downstream // -include { REFERENCE_INDEXING_SINGLE } from '../../subworkflows/local/reference_indexing_single.nf' -include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/reference_indexing_multi.nf' -include { GUNZIP as GUNZIP_PMDBED } from '../../modules/nf-core/gunzip/main.nf' -include { GUNZIP as GUNZIP_PMDFASTA } from '../../modules/nf-core/gunzip/main.nf' -include { GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' -include { GUNZIP as GUNZIP_CM_FASTA } from '../../modules/nf-core/gunzip/main.nf' +include { REFERENCE_INDEXING_SINGLE } from '../../subworkflows/local/reference_indexing_single.nf' +include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/reference_indexing_multi.nf' +include { GUNZIP as GUNZIP_PMDBED } from '../../modules/nf-core/gunzip/main.nf' +include { GUNZIP as GUNZIP_PMDFASTA } from '../../modules/nf-core/gunzip/main.nf' +include { GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' +include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main.nf' workflow REFERENCE_INDEXING { take: @@ -27,7 +27,7 @@ workflow REFERENCE_INDEXING { // If input (multi-)reference sheet supplied REFERENCE_INDEXING_MULTI ( fasta ) ch_reference_for_mapping = REFERENCE_INDEXING_MULTI.out.reference - ch_circularmapper = REFERENCE_INDEXING_MULTI.out.circularmapper + ch_elongated_reference = REFERENCE_INDEXING_MULTI.out.elongated_reference ch_mitochondrion_header = REFERENCE_INDEXING_MULTI.out.mitochondrion_header 
ch_hapmap = REFERENCE_INDEXING_MULTI.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_MULTI.out.pmd_masked_fasta @@ -41,7 +41,7 @@ workflow REFERENCE_INDEXING { } else { // If input FASTA and/or indicies supplied REFERENCE_INDEXING_SINGLE ( fasta, fasta_fai, fasta_dict, fasta_mapperindexdir ) - ch_circularmapper = REFERENCE_INDEXING_SINGLE.out.circularmapper + ch_elongated_reference = REFERENCE_INDEXING_SINGLE.out.elongated_reference ch_mitochondrion_header = REFERENCE_INDEXING_SINGLE.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_SINGLE.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_SINGLE.out.pmd_masked_fasta @@ -128,30 +128,32 @@ workflow REFERENCE_INDEXING { ch_dbsnp = ch_dbsnp .filter { it[1] != "" } - ch_circularmapper_for_gunzip = ch_circularmapper - .filter{ it[1] != "" || it[2] != "" || it[3] != "" } + ch_elongated_for_gunzip = ch_elongated_reference + .filter{ it[1] != "" && it[2] != "" } + .ifEmpty{ if(params.mapping_tool == "circularmapper" ) { error "[nf-core/eager]: ERROR: Mapping with circularmapper requires either a circular target or elongated reference file." } } + .filter( it != null ) .branch{ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> forgunzip: circularmapper_elongated_fasta.extension == "gz" skip: true } - ch_circularmapper_input = ch_circularmapper_for_gunzip.gunzip + ch_elongated_input = ch_elongated_for_gunzip.gunzip .multiMap{ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> gunzip: [ meta, circularmapper_elongated_fasta ] remainder: [ meta, circular_target, circularmapper_elongated_fai ] } - GUNZIP_CM_FASTA( ch_circularmapper_input ) - ch_version = ch_versions.mix( GUNZIP_CM_FASTA.out.versions.first() ) + GUNZIP_ELONGATED_FASTA( ch_elongated_input.gunzip ) + ch_version = ch_versions.mix( GUNZIP_ELONGATED_FASTA.out.versions.first() ) - ch_gunzipped_elongated = GUNZIP_CM_FASTA.out.gunzip.join( ch_circularmapper_input.remainder, failOnMismatch: true ) - ch_circularmapper_gunzipped = ch_circularmapper_for_gunzip.skip.mix( ch_gunzipped_elongated ) + ch_elongated_gunzipped = GUNZIP_ELONGATED_FASTA.out.gunzip.join( ch_elongated_input.remainder, failOnMismatch: true ) + ch_elongated_after_gunzip = ch_elongated_for_gunzip.skip.mix( ch_elongated_gunzipped ) emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] - circularmapper = ch_circularmapper_gunzipped // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + elongated_reference = ch_elongated_after_gunzip // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] hapmap = ch_hapmap // [ meta, hapmap ] pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index f71280bc6..628d58a90 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -205,7 +205,7 @@ workflow REFERENCE_INDEXING_MULTI { emit: reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex ] - circularmapper = ch_input_from_referencesheet.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + elongated_reference = ch_input_from_referencesheet.circularmapper // [ meta, circular_target, 
circularmapper_elongated_fasta, circularmapper_elongated_fai ] mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] pmd_masked_fasta = ch_input_from_referencesheet.pmd_masked_fasta // [ meta, pmd_masked_fasta ] diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index cddb45a63..f4c9e42cd 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -113,7 +113,7 @@ workflow REFERENCE_INDEXING_SINGLE { emit: reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex ] - circularmapper = ch_ref_index_single.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + elongated_reference = ch_ref_index_single.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] mitochondrion_header = ch_ref_index_single.mito_header // [ meta, mito_header ] hapmap = ch_ref_index_single.hapmap // [ meta, hapmap ] pmd_masked_fasta = ch_ref_index_single.pmd_masked_fasta // [ meta, pmd_masked_fasta ] diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 8809f7597..a7d75dd28 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -231,7 +231,7 @@ def validateInputParameters() { if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } - + if ( params.fasta && params.mapping_tool == "circularmapper" && ! params.fasta_circular_target && ! params.mapping_circularmapper_elongated_fasta ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires either --fasta_circular_target or --mapping_circularmapper_elongated_fasta. 
") } } // From dff6208faa892b02baaf6050d4cebedb65b84578 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 11:18:34 +0200 Subject: [PATCH 131/198] update CM modules --- modules.json | 4 ++-- .../circularmapper/circulargenerator/main.nf | 19 ++++++++++----- .../circularmapper/circulargenerator/meta.yml | 23 ++++++++++++++++--- .../circularmapper/realignsamfile/main.nf | 2 +- .../circularmapper/realignsamfile/meta.yml | 5 ++++ 5 files changed, 41 insertions(+), 12 deletions(-) diff --git a/modules.json b/modules.json index b447553e5..ed96492a2 100644 --- a/modules.json +++ b/modules.json @@ -97,12 +97,12 @@ }, "circularmapper/circulargenerator": { "branch": "master", - "git_sha": "5890d9e73aaa803fc6be94b1822539b4204d8cff", + "git_sha": "0148d00e72e35cd08b3d829d7de3430bc0c92a5a", "installed_by": ["modules"] }, "circularmapper/realignsamfile": { "branch": "master", - "git_sha": "5890d9e73aaa803fc6be94b1822539b4204d8cff", + "git_sha": "579d2d5f15e126a2190a7b709dfc77696c83688d", "installed_by": ["modules"] }, "damageprofiler": { diff --git a/modules/nf-core/circularmapper/circulargenerator/main.nf b/modules/nf-core/circularmapper/circulargenerator/main.nf index b1664d032..07b722c39 100644 --- a/modules/nf-core/circularmapper/circulargenerator/main.nf +++ b/modules/nf-core/circularmapper/circulargenerator/main.nf @@ -1,5 +1,5 @@ // This module does the following: -//creating a modified reference genome, with an elongation of the an specified amount of bases +//creating a modified reference genome, with an elongation_factoration of the an specified amount of bases process CIRCULARMAPPER_CIRCULARGENERATOR { tag "$meta.id" @@ -12,10 +12,11 @@ process CIRCULARMAPPER_CIRCULARGENERATOR { input: tuple val(meta), path(reference) - val(elong) + tuple val(meta2), val(elongation_factor) + tuple val(meta3), val(target) output: - tuple val(meta), path("*_${elong}.fasta"), emit: fasta + tuple val(meta), path("*_${elongation_factor}.fasta"), emit: fasta path "versions.yml" , emit: versions when: @@ -25,11 +26,17 @@ process CIRCULARMAPPER_CIRCULARGENERATOR { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - circulargenerator -e ${elong} \ + circulargenerator \ + -e ${elongation_factor} \ -i ${reference} \ - -s ${prefix} \ + -s ${target} \ $args + ## circulargenerator has a hardcoded output name. Rename if necessary to use prefix. + if [[ "${reference.getBaseName()}_${elongation_factor}.fasta" != "${prefix}_${elongation_factor}.fasta" ]]; then + mv ${reference.getBaseName()}_${elongation_factor}.fasta ${prefix}_${elongation_factor}.fasta + fi + cat <<-END_VERSIONS > versions.yml "${task.process}": circulargenerator: \$(circulargenerator -h | grep 'usage' | sed 's/usage: CircularGenerator//') @@ -40,7 +47,7 @@ process CIRCULARMAPPER_CIRCULARGENERATOR { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}_${elong}.fasta + touch ${prefix}_${elongation_factor}.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/circularmapper/circulargenerator/meta.yml b/modules/nf-core/circularmapper/circulargenerator/meta.yml index 3e6a51ada..2704fbc36 100644 --- a/modules/nf-core/circularmapper/circulargenerator/meta.yml +++ b/modules/nf-core/circularmapper/circulargenerator/meta.yml @@ -19,18 +19,34 @@ input: - meta: type: map description: | - Groovy Map containing sample information - e.g. `[ id:'sample1', single_end:false ]` + Groovy Map containing reference information + e.g. 
`[ id:'sample1' ]` - reference: type: file description: Genome fasta file pattern: "*.fasta" - - elong: + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'sample1' ]` + + - elongation_factor: type: integer description: The number of bases that the ends of the target chromosome in the reference genome should be elongated by + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'sample1' ]` + + - target: + type: string + description: The name of the chromosome in the reference genome that should be elongated + output: #Only when we have meta - meta: @@ -51,5 +67,6 @@ output: authors: - "@apalleja" + - "@TCLamnidis" maintainers: - "" diff --git a/modules/nf-core/circularmapper/realignsamfile/main.nf b/modules/nf-core/circularmapper/realignsamfile/main.nf index 579815df0..9d74f7b91 100644 --- a/modules/nf-core/circularmapper/realignsamfile/main.nf +++ b/modules/nf-core/circularmapper/realignsamfile/main.nf @@ -10,7 +10,7 @@ process CIRCULARMAPPER_REALIGNSAMFILE { input: tuple val(meta), path(bam) tuple val(meta2), path(fasta) - val(elongation_factor) + tuple val(meta3), val(elongation_factor) output: tuple val(meta), path("*_realigned.bam") , emit: bam diff --git a/modules/nf-core/circularmapper/realignsamfile/meta.yml b/modules/nf-core/circularmapper/realignsamfile/meta.yml index bc4173754..fbb62d76d 100644 --- a/modules/nf-core/circularmapper/realignsamfile/meta.yml +++ b/modules/nf-core/circularmapper/realignsamfile/meta.yml @@ -35,6 +35,11 @@ input: - fasta: type: file description: Input elongated genome fasta + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` - elongation_factor: type: integer description: The elongation factor used when running circulargenerator, i.e. the number of bases that the ends of the target chromosome in the reference genome was elongated by From c1400620286a2d5a6346d09007a4f67e901ed8ba Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 11:27:15 +0200 Subject: [PATCH 132/198] updates to reference elongation SWF --- subworkflows/local/elongate_reference.nf | 55 ++++++++++++++---------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 8c949ba4d..b0936f3a2 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -7,48 +7,57 @@ include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/ workflow ELONGATE_REFERENCE { take: - ch_reference // [ meta, fasta, fai ] - ch_elongated_reference // [ meta, elongated_fasta, elongated_fai ] - elongation_factor // [ int ] - // TODO CIRCULARMAPPER_CIRCULARGENERATOR module needs updating. `-s` option is the circular target and not the output file >.< + ch_reference // [ meta, fasta, fai, dict, mapindex ] + ch_elongated_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_circular_reference = Channel.empty() /* - Check what fasta files we have: + Check what fasta files we have. There are four options: - 1. Elongated reference with index (ignore circular target) - 2. Elongated reference without index (ignore circular target) - 3. No elongated reference, but circular target - 4. 
None of the above -> Throw error (should go in parameter validation) + 1. Elongated reference with index (ignore circular target) -> Pass through + 2. Elongated reference without index (ignore circular target) -> Index and emit + 3. No elongated reference, but circular target -> Elongate, index and emit. + 4. None of the above -> Throw error and stop execution during parameter validation */ + ch_circulargenerator_input = ch_elongated_reference .branch{ - meta, elongated_fasta_index, elongated_fasta, circular_target -> - ready: elongated_fasta != "" && elongated_fasta_index != "" - needs_index: elongated_fasta != "" && elongated_fasta_index == "" - needs_elongation: elongated_fasta == "" && circular_target != "" + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + ready: circularmapper_elongated_fasta != "" && circularmapper_elongated_fai != "" + needs_index: circularmapper_elongated_fasta != "" && circularmapper_elongated_fai == "" + needs_elongation: circularmapper_elongated_fasta == "" && circular_target != "" } // Elongate references that need it // Join the original references to the branch of needs_elongation, to get the original fasta files, and elongate them. ch_references_to_elongate = ch_circulargenerator_input.needs_elongation .join( ch_reference ) - .map { - meta, elongated_fasta_index, elongated_fasta, circular_target, meta2, index, fasta -> - [ meta, fasta ] + .multiMap { + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, fasta, fai, dict, mapindex -> + + def elongation_factor = params.mapping_circularmapper_elongation_factor + + fasta: [ meta, fasta ] + elongation_factor : [ meta, elongation_factor ] + target: [ meta, circular_target ] } - CIRCULARMAPPER_CIRCULARGENERATOR(ch_circulargenerator_input.needs_elongation, elongation_value) + CIRCULARMAPPER_CIRCULARGENERATOR( + ch_circulargenerator_input.needs_elongation.fasta, + ch_circulargenerator_input.needs_elongation.elongation_factor, + ch_circulargenerator_input.needs_elongation.target + ) ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() ) // Collect newly generated circular references and provided ones without an index, and index them. 
ch_input_for_circular_indexing = ch_circulargenerator_input.needs_index
        .map {
-            meta, elongated_fasta_index, elongated_fasta, circular_target ->
-            [ meta, elongated_fasta ]
+            meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ->
+            [ meta, circularmapper_elongated_fasta ]
        }
        .mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta )
@@ -61,8 +70,8 @@ workflow ELONGATE_REFERENCE {
    // Then put all the indexed elongated references together and emit them
    ch_circular_reference = ch_circulargenerator_input.ready
        .map {
-            meta, elongated_fasta_index, elongated_fasta, circular_target ->
-            [ meta, elongated_fasta, elongated_fasta_index ]
+            meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ->
+            [ meta, circularmapper_elongated_fasta, circularmapper_elongated_fai ]
        }
        .mix( ch_indexed_references )

From fad1e13c8a42a9a6059a26ee909689d3e1e3e340 Mon Sep 17 00:00:00 2001
From: Ian Light
Date: Fri, 19 Jul 2024 09:28:57 +0000
Subject: [PATCH 133/198] applied style and naming changes from PR

---
 CITATIONS.md                                 |  4 ++--
 conf/modules.config                          |  2 +-
 docs/development/manual_tests.md             |  8 +++----
 docs/output.md                               |  6 ++---
 nextflow.config                              |  6 ++---
 nextflow_schema.json                         | 24 ++++++++++----------
 subworkflows/local/metagenomics_profiling.nf |  8 +++----
 workflows/eager.nf                           |  2 --
 8 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/CITATIONS.md b/CITATIONS.md
index 0ce614b0a..a3d2e6b2f 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -134,9 +134,9 @@

 > Hübler, R., Key, F.M., Warinner, C. et al. (2019). HOPS: automated detection and authentication of pathogen DNA in archaeological remains. Genome Biol 20, 280. doi: [10.1186/s13059-019-1903-0](https://doi.org/10.1186/s13059-019-1903-0)

-- [MEGAN](https://doi.org/10.1101/gr.5969107)
-
-  > Daniel H. Huson, Alexander F. Auch, Ji Qi, and Stephan C. Schuster (2007). MEGAN analysis of metagenomic data. Genome Res. 17000, Published in Advance January 25, 2007, doi: [10.1101/gr.5969107](https://doi.org/10.1101/gr.5969107)
+- [MEGAN](https://doi.org/10.1371/journal.pcbi.1004957)
+
+  > Huson DH, Beier S, Flade I, Górska A, El-Hadidi M, Mitra S, et al. (2016) MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data. PLoS Comput Biol 12(6): e1004957. doi: [10.1371/journal.pcbi.1004957](https://doi.org/10.1371/journal.pcbi.1004957)

 - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)

diff --git a/conf/modules.config b/conf/modules.config
index ae618f0bf..53b43a706 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -889,7 +889,7 @@ process {
                "-id ${params.metagenomics_malt_minpercentidentity}",
                "-mq ${params.metagenomics_malt_maxqueries}",
                "--memoryMode ${params.metagenomics_malt_memorymode}",
-                params.metagenomics_malt_minsupportmode == "percent" ? "-supp ${params.metagenomics_malt_minsupportpercent}" : "-sup ${params.metagenomics_minsupportreads}",
+                params.metagenomics_malt_minsupportmode == "percent" ? "-supp ${params.metagenomics_malt_minsupportpercent}" : "-sup ${params.metagenomics_malt_minsupportreads}",
                params.metagenomics_malt_savereads ?
"--alignments ./" : "" ].join(' ').trim() publishDir = [ diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index c8d94ad5a..a2902ace7 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -630,8 +630,8 @@ nextflow run main.nf -profile test,docker \ --run_metagenomics \ --metagenomics_profiling_tool krakenuniq \ --metagenomics_profiling_database CUSTOM_KRAKEN_DB \ - --metagenomics_kraken_savereads \ - --metagenomics_kraken_savereadclassifications + --metagenomics_kraken2_savereads \ + --metagenomics_kraken2_savereadclassifications ``` ##### kraken2 @@ -662,8 +662,8 @@ nextflow run main.nf -profile test,docker \ --run_metagenomics \ --metagenomics_profiling_tool kraken2 \ --metagenomics_profiling_database CUSTOM_KRAKEN2_DB \ - --metagenomics_kraken_savereads \ - --metagenomics_kraken_savereadclassifications + --metagenomics_kraken2_savereads \ + --metagenomics_kraken2_savereadclassifications ``` ##### malt diff --git a/docs/output.md b/docs/output.md index f66ca8f17..b0da66971 100644 --- a/docs/output.md +++ b/docs/output.md @@ -358,7 +358,7 @@ The saved files are the _good_ files, passing the `dust` or `entropy` filter tre
-MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produces direct alignments of sequencing reads in a reference genome. It is often used for metagenomic profiling or pathogen screening, and specifically in nf-core/eager, of off-target reads from genome mapping.
+MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produces direct alignments of sequencing reads in a reference genome. It is often used for metagenomic profiling or pathogen screening, and specifically in nf-core/eager, of off-target reads from genome mapping. It is popular with palaeogenomicists, as the alignment information can be used for damage-pattern and other authentication analyses.

You will receive output for each library. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value.

@@ -399,7 +399,7 @@ The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. Th

The main taxonomic classification file from Kraken2 is the `_combined_reports.txt` or `*report.txt` file. The former provides you the broadest overview of the taxonomic classification results across all samples against a single database, where you get two columns for each sample e.g. `2_all` and `2_lvl`, as well as a summarised column summing up across all samples `tot_all` and `tot_lvl`. The latter gives you the most information for a single sample. The report file is also used for the taxpasta step.

You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--metagenomics_kraken2_savereads` and/or `--metagenomics_kraken2_savereadclassifications` parameters to the pipeline.

#### KrakenUniq

@@ -418,7 +418,7 @@

The main taxonomic classification file from KrakenUniq is the `*report.txt` file. This is an extension of the Kraken2 report with the additional k-mer coverage information that provides more information about the accuracy of hits.

You will only receive the `*.fastq.gz` and `*.classifiedreads.txt` file if you supply `--metagenomics_kraken2_savereads` and/or `--metagenomics_kraken2_savereadclassifications` parameters to the pipeline.

:::info
The output system of KrakenUniq can result in other `stdout` or `stderr` logging information being saved in the report file, therefore you must check your report files before downstream use!
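For reference, a minimal usage sketch of the renamed saving parameters in a user config, using the parameter names introduced by this patch (the values shown are illustrative only):

params {
    metagenomics_kraken2_savereads               = true
    metagenomics_kraken2_savereadclassifications = true
}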
diff --git a/nextflow.config b/nextflow.config index 1cc25b045..f68d91c57 100644 --- a/nextflow.config +++ b/nextflow.config @@ -145,15 +145,15 @@ params { metagenomics_profiling_tool = null metagenomics_profiling_database = null metagenomics_krakenuniq_ramchunksize = "16G" - metagenomics_kraken_savereads = false - metagenomics_kraken_savereadclassifications = false + metagenomics_kraken2_savereads = false + metagenomics_kraken2_savereadclassifications = false metagenomics_kraken2_saveminimizers = false metagenomics_malt_mode = 'BlastN' metagenomics_malt_alignmentmode = 'SemiGlobal' metagenomics_malt_savereads = false metagenomics_malt_minsupportmode = 'percent' metagenomics_malt_minsupportpercent = 0.01 - metagenomics_minsupportreads = 1 + metagenomics_malt_minsupportreads = 1 metagenomics_malt_minpercentidentity = 85 metagenomics_malt_toppercent = 1 metagenomics_malt_maxqueries = 100 diff --git a/nextflow_schema.json b/nextflow_schema.json index 6f64d87ee..fa52ef8aa 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -744,13 +744,13 @@ "fa_icon": "fas fa-database", "help_text": "Specify a metagenomics profiling database to use with the designated metagenomics_profiling_tool on the selected metagenomics_input. Databases can be provided both as a directory, or a tar.gz of a directory. Metagenomic databases are NOT compatible across different tools (ie a MALT database is different from a kraken2 database).\n\nAll databases need to be pre-built/downloaded for use in nf-core/eager. Database construction is often a balancing act between breadth of sequence diversity and size.\n\nModifies tool parameter(s):\n> - krakenuniq: `--db`\n> - kraken2: `--db`\n> - MetaPhlAn: `--bowtie2db` and `--index`\n> - MALT: '-index'" }, - "metagenomics_kraken_savereads": { + "metagenomics_kraken2_savereads": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving reads assigned by KrakenUniq or Kraken2", "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`" }, - "metagenomics_kraken_savereadclassifications": { + "metagenomics_kraken2_savereadclassifications": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Turn on saving of KrakenUniq or Kraken2 per-read taxonomic assignment file", @@ -774,7 +774,7 @@ "default": "BlastN", "description": "Specify which alignment mode to use for MALT.", "fa_icon": "fas fa-align-left", - "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-m`\n", + "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references respectively. Ensure your database matches the mode. 
Check the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf) for more details.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-m`\n", "enum": ["BlastN", "BlastP", "BlastX"] }, "metagenomics_malt_alignmentmode": { @@ -790,7 +790,7 @@ "default": 85, "description": "Percent identity value threshold for MALT.", "fa_icon": "fas fa-id-card", - "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT:`-id`" + "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained.\n\nOnly used when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT:`-id`" }, "metagenomics_malt_toppercent": { "type": "integer", @@ -812,9 +812,9 @@ "default": 0.01, "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", "fa_icon": "fas fa-percentage", - "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-supp`" + "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-supp`" }, - "metagenomics_minsupportreads": { + "metagenomics_malt_minsupportreads": { "type": "integer", "default": 1, "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt or kraken. Not compatible with --malt_min_support_mode 'percent'.", @@ -833,7 +833,7 @@ "default": "load", "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow.", "fa_icon": "fas fa-memory", - "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `--memoryMode`", + "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. 
Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `--memoryMode`", "enum": ["load", "page", "map"] }, "metagenomics_malt_savereads": { @@ -845,7 +845,7 @@ "metagenomics_malt_group_size": { "type": "integer", "default": 0, - "description": "Define how many fastq files should be submitted in the same malt run. Default value of 0 sends all files at once.", + "description": "Define how many fastq files should be submitted in the same malt run. Default value of 0 runs all files at once.", "fa_icon": "fas fa-barcode", "help_text": "Very many (large) fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default (0) will spawn at minimum N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster since the overhead of loading a database is high." }, @@ -881,7 +881,7 @@ "type": "number", "default": 0.01, "description": "Specify percent of top alignments to use.", - "help_text": "Specify frequency of top alignments for each read to be considered for each node.\\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\\n\\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\\n\\nDefault: `0.01`.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `-a`", + "help_text": "Specify frequency of top alignments for each read to be considered for each node.\\n Note, value should be given in the format of a proportion (where 1 would correspond to 100%, and 0.1 would correspond to 10%).\\n\\n> :warning: this parameter follows the same concept as `--malt_top_percent` but uses a different notation i.e. integer (MALT) versus float (MALTExtract)\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `-a`", "fa_icon": "fas fa-percent" }, "metagenomics_maltextract_destackingoff": { @@ -905,13 +905,13 @@ "metagenomics_maltextract_matches": { "type": "boolean", "description": "Turn on exporting alignments of hits in BLAST format.", - "help_text": "Export alignments of hits for each node in BLAST format. By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `--matches`", + "help_text": "Export alignments of hits for each node in BLAST format.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `--matches`", "fa_icon": "fas fa-equals" }, "metagenomics_maltextract_megansummary": { "type": "boolean", "description": "Turn on export of MEGAN summary files.", - "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--meganSummary`" + "help_text": "Export 'minimal' summary files (i.e. 
without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957).\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--meganSummary`" }, "metagenomics_maltextract_minpercentidentity": { "type": "number", @@ -922,7 +922,7 @@ "metagenomics_maltextract_usetopalignment": { "type": "boolean", "description": "Turn on using top alignments per read after filtering.", - "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--useTopAlignment`", + "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. \\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Sets tool parameter(s):\n> - MaltExtract: `--useTopAlignment`", "fa_icon": "fas fa-bahai" } }, diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index aa1abeab1..053d56297 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -153,9 +153,9 @@ workflow METAGENOMICS_PROFILING { ch_krakenuniq_input, ch_database, params.metagenomics_krakenuniq_ramchunksize, - params.metagenomics_kraken_savereads, + params.metagenomics_kraken2_savereads, true, // save read assignments - params.metagenomics_kraken_savereadclassifications + params.metagenomics_kraken2_savereadclassifications ) ch_versions = KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions @@ -178,8 +178,8 @@ workflow METAGENOMICS_PROFILING { KRAKEN2_KRAKEN2 ( ch_kraken2_input.reads, ch_kraken2_input.database, - params.metagenomics_kraken_savereads, - params.metagenomics_kraken_savereadclassifications + params.metagenomics_kraken2_savereads, + params.metagenomics_kraken2_savereadclassifications ) ch_multiqc_files = KRAKEN2_KRAKEN2.out.report diff --git a/workflows/eager.nf b/workflows/eager.nf index 7e965f346..603ed5e49 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -347,7 +347,6 @@ workflow EAGER { ch_versions = ch_versions.mix( HOST_REMOVAL.out.versions ) } - // // Section: Metagenomics // @@ -493,7 +492,6 @@ workflow EAGER { ch_genome_for_bedtools = SAMTOOLS_VIEW_GENOME.out.genome - BEDTOOLS_COVERAGE_BREADTH(ch_bedtools_input.withfeature, ch_genome_for_bedtools) BEDTOOLS_COVERAGE_DEPTH(ch_bedtools_input.withfeature, ch_genome_for_bedtools) ch_versions = ch_versions.mix( SAMTOOLS_VIEW_GENOME.out.versions ) From 4cd7a190be8bfdb29125b708ae3428479f348d09 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 12:56:13 +0200 Subject: [PATCH 134/198] Add unzipping in ELONGATE_REFERENCE SWF --- subworkflows/local/elongate_reference.nf | 34 +++++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index b0936f3a2..a62691140 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -2,12 +2,12 @@ // Elongate a reference genome by circularising the target sequence by a given elongation factor. 
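 // For example (illustrative numbers): with an elongation factor of 500, the first 500 bases of the
 // circular target sequence are appended to its end, so that reads spanning the circular origin can
 // align contiguously instead of being split or clipped at the linear start and end of the chromosome.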
// -include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' +include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main' +include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/index/main' workflow ELONGATE_REFERENCE { take: - ch_reference // [ meta, fasta, fai, dict, mapindex ] ch_elongated_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] main: @@ -15,6 +15,32 @@ workflow ELONGATE_REFERENCE { ch_multiqc_files = Channel.empty() ch_circular_reference = Channel.empty() + // Check if the elongated reference is gzipped, and if so, unzip it. + ch_elongated_branches = ch_elongated_reference + .branch { + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + + for_gunzip: circularmapper_elongated_fasta.extension == "gz" + skip_gunzip: true + } + + ch_elongated_for_gunzip = ch_elongated_branches.for_gunzip + .map { + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + [ meta, circularmapper_elongated_fasta ] + } + + GUNZIP_ELONGATED_FASTA( ch_elongated_for_gunzip.for_gunzip ) + ch_versions = ch_versions.mix( GUNZIP_ELONGATED_FASTA.out.versions.first() ) + + ch_elongated_unzipped_reference = ch_elongated_branches.for_gunzip + .join( GUNZIP_ELONGATED_FASTA.out.gunzip ) + .map { + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, unzipped_fasta -> + [ meta, circular_target, unzipped_fasta, circularmapper_elongated_fai ] + } + .mix( ch_elongated_branches.skip_gunzip ) + /* Check what fasta files we have. There are four options: @@ -24,7 +50,7 @@ workflow ELONGATE_REFERENCE { 4. 
None of the above -> Throw error and stop execution during parameter validation */ - ch_circulargenerator_input = ch_elongated_reference + ch_circulargenerator_input = ch_elongated_unzipped_reference .branch{ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> ready: circularmapper_elongated_fasta != "" && circularmapper_elongated_fai != "" @@ -76,7 +102,7 @@ workflow ELONGATE_REFERENCE { .mix( ch_indexed_references ) emit: - circular_reference = ch_circular_reference // [ meta, fasta, fai ] + circular_reference = ch_circular_reference // [ meta, circular_target, fasta, fai ] versions = ch_versions mqc = ch_multiqc_files From de747de76aaf29b08110f66c6d333c191d08153a Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 12:59:15 +0200 Subject: [PATCH 135/198] unzip, elongate and index reference when CM is used --- subworkflows/local/reference_indexing.nf | 27 ++++++++---------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index ad9e18b05..23523f025 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -8,6 +8,8 @@ include { GUNZIP as GUNZIP_PMDBED } from '../../modules/nf-core/gunzip/ include { GUNZIP as GUNZIP_PMDFASTA } from '../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main.nf' +include { ELONGATE_REFERENCE } from '../../subworkflows/local/elongate_reference' + workflow REFERENCE_INDEXING { take: @@ -132,28 +134,17 @@ workflow REFERENCE_INDEXING { .filter{ it[1] != "" && it[2] != "" } .ifEmpty{ if(params.mapping_tool == "circularmapper" ) { error "[nf-core/eager]: ERROR: Mapping with circularmapper requires either a circular target or elongated reference file." } } .filter( it != null ) - .branch{ - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> - forgunzip: circularmapper_elongated_fasta.extension == "gz" - skip: true - } - - ch_elongated_input = ch_elongated_for_gunzip.gunzip - .multiMap{ - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> - gunzip: [ meta, circularmapper_elongated_fasta ] - remainder: [ meta, circular_target, circularmapper_elongated_fai ] - } - GUNZIP_ELONGATED_FASTA( ch_elongated_input.gunzip ) - ch_version = ch_versions.mix( GUNZIP_ELONGATED_FASTA.out.versions.first() ) - - ch_elongated_gunzipped = GUNZIP_ELONGATED_FASTA.out.gunzip.join( ch_elongated_input.remainder, failOnMismatch: true ) - ch_elongated_after_gunzip = ch_elongated_for_gunzip.skip.mix( ch_elongated_gunzipped ) + if ( params.mapping_tool == "circularmapper" ) { + // This ELONGATE_REFERENCE subworkflow also checks if the provided reference is gzipped, and unzips it if necessary. 
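+            // Expected channel shapes, as assumed from the channel comments in this patch series (not verified here):
+            //   in:  [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ]
+            //   out: ELONGATE_REFERENCE.out.circular_reference -> [ meta, circularmapper_elongated_fasta, circularmapper_elongated_fai ]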
+ ELONGATE_REFERENCE( ch_input_from_referencesheet.circularmapper ) + ch_version = ch_versions.mix( ELONGATE_REFERENCE.out.versions ) + ch_elongated_reference = ELONGATE_REFERENCE.out.circular_reference + } emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] - elongated_reference = ch_elongated_after_gunzip // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + elongated_reference = ch_elongated_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] hapmap = ch_hapmap // [ meta, hapmap ] pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] From ff2ad9ada9e2d21aca1e3d84063310c4640b2bf4 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 13:31:45 +0200 Subject: [PATCH 136/198] add original reference input --- subworkflows/local/elongate_reference.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index a62691140..1d29c2241 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -8,6 +8,7 @@ include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/ workflow ELONGATE_REFERENCE { take: + ch_reference // [ meta, fasta, fai, dict, mapindex ] ch_elongated_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] main: @@ -105,5 +106,4 @@ workflow ELONGATE_REFERENCE { circular_reference = ch_circular_reference // [ meta, circular_target, fasta, fai ] versions = ch_versions mqc = ch_multiqc_files - } From b48e80f17abbe153bb9d754db432032732038854 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 13:34:16 +0200 Subject: [PATCH 137/198] i broke something Q_Q --- subworkflows/local/reference_indexing.nf | 44 +++++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 23523f025..0e84a3858 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -7,9 +7,7 @@ include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/refe include { GUNZIP as GUNZIP_PMDBED } from '../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_PMDFASTA } from '../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' -include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main.nf' -include { ELONGATE_REFERENCE } from '../../subworkflows/local/elongate_reference' - +include { ELONGATE_REFERENCE } from '../../subworkflows/local/elongate_reference.nf' workflow REFERENCE_INDEXING { take: @@ -130,30 +128,34 @@ workflow REFERENCE_INDEXING { ch_dbsnp = ch_dbsnp .filter { it[1] != "" } - ch_elongated_for_gunzip = ch_elongated_reference - .filter{ it[1] != "" && it[2] != "" } - .ifEmpty{ if(params.mapping_tool == "circularmapper" ) { error "[nf-core/eager]: ERROR: Mapping with circularmapper requires either a circular target or elongated reference file." 
} } - .filter( it != null ) - + // Elongate reference for circularmapper if requested if ( params.mapping_tool == "circularmapper" ) { + // Throw errors if required parameters are missing + ch_elongated_for_gunzip = ch_elongated_reference + .filter{ it[1] != "" && it[2] != "" } + .ifEmpty{ error "[nf-core/eager]: ERROR: Mapping with circularmapper requires either a circular target or elongated reference file for at least one reference." } + .filter( it != null ) // Remove null channel which arises if empty cause error returns null. + // This ELONGATE_REFERENCE subworkflow also checks if the provided reference is gzipped, and unzips it if necessary. - ELONGATE_REFERENCE( ch_input_from_referencesheet.circularmapper ) + ELONGATE_REFERENCE( ch_input_from_referencesheet.circularmapper, ch_reference_for_mapping ) ch_version = ch_versions.mix( ELONGATE_REFERENCE.out.versions ) - ch_elongated_reference = ELONGATE_REFERENCE.out.circular_reference + ch_elongated_indexed_reference = ELONGATE_REFERENCE.out.circular_reference + } else { + ch_elongated_indexed_reference = ch_elongated_reference } emit: - reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] - elongated_reference = ch_elongated_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] - mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] - hapmap = ch_hapmap // [ meta, hapmap ] - pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] - pmd_bed_for_masking = ch_pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] - snp_capture_bed = ch_capture_bed // [ meta, capture_bed ] - pileupcaller_bed_snp = ch_pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] - sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] - bedtools_feature = ch_bedtools_feature // [ meta, bedtools_feature ] - dbsnp = ch_dbsnp // [ meta, dbsnp ] + reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex ] + elongated_reference = ch_elongated_indexed_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] + hapmap = ch_hapmap // [ meta, hapmap ] + pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] + pmd_bed_for_masking = ch_pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] + snp_capture_bed = ch_capture_bed // [ meta, capture_bed ] + pileupcaller_bed_snp = ch_pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] + bedtools_feature = ch_bedtools_feature // [ meta, bedtools_feature ] + dbsnp = ch_dbsnp // [ meta, dbsnp ] versions = ch_versions } From 7e311f4c887e73ee668224b30756e83b51522f37 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 14:08:42 +0200 Subject: [PATCH 138/198] bracket swap --- subworkflows/local/reference_indexing.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 0e84a3858..577b5943e 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -134,7 +134,7 @@ workflow REFERENCE_INDEXING { ch_elongated_for_gunzip = ch_elongated_reference .filter{ it[1] != "" && it[2] != "" } .ifEmpty{ error "[nf-core/eager]: ERROR: Mapping with circularmapper requires either a 
circular target or elongated reference file for at least one reference." } - .filter( it != null ) // Remove null channel which arises if empty cause error returns null. + .filter{ it != null } // Remove null channel which arises if empty cause error returns null. // This ELONGATE_REFERENCE subworkflow also checks if the provided reference is gzipped, and unzips it if necessary. ELONGATE_REFERENCE( ch_input_from_referencesheet.circularmapper, ch_reference_for_mapping ) From 159a48a5c8af1284da53c02f9e7f84e76750abea Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 15:43:08 +0200 Subject: [PATCH 139/198] fix syntax errors. --- subworkflows/local/reference_indexing.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 577b5943e..7cd9512ab 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -21,7 +21,7 @@ workflow REFERENCE_INDEXING { // Warn user if they've given a reference sheet that already includes fai/dict/mapper index etc. if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( fasta_fai || fasta_dict || fasta_mapperindexdir )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as e.g. `--fasta_fai`. --fasta_sheet CSV/TSV takes priority and --fasta_* parameters will be ignored.") - if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( params.mitochondrion_header || params.contamination_estimation_angsd_hapmap || params.damage_manipulation_pmdtools_reference_mask || params.damage_manipulation_pmdtools_reference_mask || params.snpcapture_bed || params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile || params.sexdeterrmine_bedfile || params.mapstats_bedtools_featurefile || params.genotyping_reference_ploidy || params.genotyping_gatk_dbsnp, params.fasta_circular_target, params.circularmapper_elongated_fasta, params.circularmapper_elongated_fai )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as individual reference-specific input files, e.g. `--contamination_estimation_angsd_hapmap`. Input files specified in the --fasta_sheet CSV/TSV take priority and other input parameters will be ignored.") + if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( params.mitochondrion_header || params.contamination_estimation_angsd_hapmap || params.damage_manipulation_pmdtools_reference_mask || params.damage_manipulation_pmdtools_reference_mask || params.snpcapture_bed || params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile || params.sexdeterrmine_bedfile || params.mapstats_bedtools_featurefile || params.genotyping_reference_ploidy || params.genotyping_gatk_dbsnp || params.fasta_circular_target || params.circularmapper_elongated_fasta || params.circularmapper_elongated_fai )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as individual reference-specific input files, e.g. `--contamination_estimation_angsd_hapmap`. 
Input files specified in the --fasta_sheet CSV/TSV take priority and other input parameters will be ignored.") if ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) { // If input (multi-)reference sheet supplied @@ -132,12 +132,11 @@ workflow REFERENCE_INDEXING { if ( params.mapping_tool == "circularmapper" ) { // Throw errors if required parameters are missing ch_elongated_for_gunzip = ch_elongated_reference - .filter{ it[1] != "" && it[2] != "" } + .filter{ it[1] != "" || it[2] != "" } .ifEmpty{ error "[nf-core/eager]: ERROR: Mapping with circularmapper requires either a circular target or elongated reference file for at least one reference." } - .filter{ it != null } // Remove null channel which arises if empty cause error returns null. // This ELONGATE_REFERENCE subworkflow also checks if the provided reference is gzipped, and unzips it if necessary. - ELONGATE_REFERENCE( ch_input_from_referencesheet.circularmapper, ch_reference_for_mapping ) + ELONGATE_REFERENCE( ch_reference_for_mapping, ch_elongated_reference ) ch_version = ch_versions.mix( ELONGATE_REFERENCE.out.versions ) ch_elongated_indexed_reference = ELONGATE_REFERENCE.out.circular_reference } else { From 61c5a884006303e143794fdd632a00f9b55bf2f4 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 16:41:35 +0200 Subject: [PATCH 140/198] fix cardinality in reference maps --- workflows/eager.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index c8964cec8..2291ad131 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -181,11 +181,11 @@ workflow EAGER { // ch_reference_for_mapping = REFERENCE_INDEXING.out.reference .map{ - meta, fasta, fai, dict, index, circular_target -> + meta, fasta, fai, dict, index -> [ meta, index, fasta ] } - MAP ( ch_reads_for_mapping, ch_reference_for_mapping ) + MAP ( ch_reads_for_mapping, ch_reference_for_mapping, REFERENCE_INDEXING.out.elongated_reference ) ch_versions = ch_versions.mix( MAP.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( MAP.out.mqc.collect{it[1]}.ifEmpty([]) ) @@ -250,7 +250,7 @@ workflow EAGER { ch_fasta_for_deduplication = REFERENCE_INDEXING.out.reference .multiMap{ - meta, fasta, fai, dict, index, circular_target -> + meta, fasta, fai, dict, index -> fasta: [ meta, fasta ] fasta_fai: [ meta, fai ] } @@ -503,7 +503,7 @@ workflow EAGER { ch_fasta_for_damagecalculation = REFERENCE_INDEXING.out.reference .multiMap{ - meta, fasta, fai, dict, index, circular_target -> + meta, fasta, fai, dict, index -> fasta: [ meta, fasta ] fasta_fai: [ meta, fai ] } @@ -570,7 +570,7 @@ workflow EAGER { ch_reference_for_genotyping = REFERENCE_INDEXING.out.reference // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel. .map { - meta, fasta, fai, dict, mapindex, circular_target -> + meta, fasta, fai, dict, mapindex -> [ meta, fasta, fai, dict ] } GENOTYPE( From 9054dd7db70faf8d79cb1fbd0662229bdacfcc9e Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 16:42:05 +0200 Subject: [PATCH 141/198] add third input channel, and dummy output for CM for testing. 
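Emitting `Channel.empty()` placeholders keeps the MAP subworkflow's output contract intact while the
CIRCULARMAPPER call is commented out, so downstream channel operations still receive the expected
(for now empty) outputs:

    ch_mapped_lane_bam = Channel.empty() //CIRCULARMAPPER.out.bam
    ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai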
--- subworkflows/local/map.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 32291ffaa..c1fe076ab 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -18,7 +18,7 @@ workflow MAP { take: reads // [ [meta], [read1, reads2] ] or [ [meta], [read1] ] index // [ [meta], [ index ], [ fasta ] ] - elogated_index // [ [meta], [ index ], [ fasta ], [ circular_target ] ] + elogated_index // [ [meta], circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] main: ch_versions = Channel.empty() @@ -117,11 +117,11 @@ workflow MAP { ch_mapped_lane_bai = params.fasta_largeref ? SAMTOOLS_INDEX_BT2.out.csi : SAMTOOLS_INDEX_BT2.out.bai } else if ( params.mapping_tool == 'circularmapper' ) { - ch_index_for_mapping = index.map{ meta, index, fasta -> [ meta, index ] } - ch_elongated_reference_for_mapping = elogated_index.map{ meta, index, fasta, circular_target -> [ meta, index ] } - ch_reads_for_mapping = reads + // ch_index_for_mapping = index.map{ meta, index, fasta -> [ meta, index ] } + // ch_elongated_reference_for_mapping = elogated_index.map{ meta, circular_target, elongated_fasta, elongated_index -> [ meta, elongated_index ] } + // ch_reads_for_mapping = reads - CIRCULARMAPPER( ch_index_for_mapping, ch_elongated_reference_for_mapping, ch_reads_for_mapping ) + // CIRCULARMAPPER( ch_index_for_mapping, ch_elongated_reference_for_mapping, ch_reads_for_mapping ) // // Join the original and elongated references, then combine with the reads, and multiMap to ensure correct ordering of channel contents. // ch_reads_for_circularmapper = reads.map { @@ -160,8 +160,8 @@ workflow MAP { // CIRCULARMAPPER( ch_input_for_circularmapper.reads, params.elongation_factor, ch_input_for_circularmapper.reference ) // ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) // // TODO - Update SWF outputs - // ch_mapped_lane_bam = CIRCULARMAPPER.out.bam - // ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai + ch_mapped_lane_bam = Channel.empty() //CIRCULARMAPPER.out.bam + ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai } From d656966327cc7e52209d1b3d4ed6d3829016e630 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 19 Jul 2024 16:42:58 +0200 Subject: [PATCH 142/198] WIP --- subworkflows/local/elongate_reference.nf | 30 ++++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 1d29c2241..86902f508 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -5,6 +5,8 @@ include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main' include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/index/main' +// TODO Check that the unzipping correctly overwrites the zipped fasta file, and that the emitted channel is constructed correctly. +// TODO Currently, nothing seems to get dumped in the emission channel, so some join must be off. 
workflow ELONGATE_REFERENCE { take: @@ -15,13 +17,13 @@ workflow ELONGATE_REFERENCE { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() ch_circular_reference = Channel.empty() + ch_elongated_unzipped = Channel.empty() - // Check if the elongated reference is gzipped, and if so, unzip it. + // Check if the provided elongated reference is gzipped, and if so, unzip it. ch_elongated_branches = ch_elongated_reference .branch { meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> - - for_gunzip: circularmapper_elongated_fasta.extension == "gz" + for_gunzip: circularmapper_elongated_fasta != '' && circularmapper_elongated_fasta.extension == "gz" skip_gunzip: true } @@ -31,16 +33,17 @@ workflow ELONGATE_REFERENCE { [ meta, circularmapper_elongated_fasta ] } - GUNZIP_ELONGATED_FASTA( ch_elongated_for_gunzip.for_gunzip ) + GUNZIP_ELONGATED_FASTA( ch_elongated_for_gunzip ) ch_versions = ch_versions.mix( GUNZIP_ELONGATED_FASTA.out.versions.first() ) - ch_elongated_unzipped_reference = ch_elongated_branches.for_gunzip + ch_elongated_unzipped = ch_elongated_reference .join( GUNZIP_ELONGATED_FASTA.out.gunzip ) + .dump(tag: 'unzipped_fasta') .map { meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, unzipped_fasta -> + def final_fasta = unzipped_fasta ?: circularmapper_elongated_fasta [ meta, circular_target, unzipped_fasta, circularmapper_elongated_fai ] } - .mix( ch_elongated_branches.skip_gunzip ) /* Check what fasta files we have. @@ -51,7 +54,7 @@ workflow ELONGATE_REFERENCE { 4. None of the above -> Throw error and stop execution during parameter validation */ - ch_circulargenerator_input = ch_elongated_unzipped_reference + ch_circulargenerator_input = ch_elongated_unzipped .branch{ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> ready: circularmapper_elongated_fasta != "" && circularmapper_elongated_fai != "" @@ -74,9 +77,9 @@ workflow ELONGATE_REFERENCE { } CIRCULARMAPPER_CIRCULARGENERATOR( - ch_circulargenerator_input.needs_elongation.fasta, - ch_circulargenerator_input.needs_elongation.elongation_factor, - ch_circulargenerator_input.needs_elongation.target + ch_references_to_elongate.fasta, + ch_references_to_elongate.elongation_factor, + ch_references_to_elongate.target ) ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() ) @@ -94,13 +97,14 @@ workflow ELONGATE_REFERENCE { ch_indexed_references = ch_input_for_circular_indexing .join( BWA_INDEX_CIRCULARISED.out.index ) - // Then put all the indexed elongated references together and emit them - ch_circular_reference = ch_circulargenerator_input.ready + // Then put all the indexed elongated references together, replace any zipped ones with the unzipped version, and emit them + ch_circular_reference = ch_circulargenerator_input.ready.dump(tag:"ready", pretty:true) .map { meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> [ meta, circularmapper_elongated_fasta, circularmapper_elongated_fai ] } - .mix( ch_indexed_references ) + .mix( ch_indexed_references.dump(tag:"indexed", pretty: true) ) + .dump(tag: 'circular_reference') emit: circular_reference = ch_circular_reference // [ meta, circular_target, fasta, fai ] From 073b8fd1511a61c613ea6c78bc1a571156281523 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 22 Jul 2024 17:04:41 +0200 Subject: [PATCH 143/198] module updates on elongate_reference modules --- conf/modules.config | 38 
++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c4f01939b..2d520b2c4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -278,6 +278,34 @@ process { ] } + // Reference elongation and indexing for circular mapping + withName: GUNZIP_ELONGATED_FASTA { + publishDir = [ + path: { "${params.outdir}/reference/${meta.id}_${params.mapping_circularmapper_elongation_factor}/" }, + mode: params.publish_dir_mode, + pattern: '*[0-9].f*', + enabled: params.save_reference + ] + } + + withName: CIRCULARMAPPER_CIRCULARGENERATOR { + tag = { "${meta.id}_${params.mapping_circularmapper_elongation_factor}" } + publishDir = [ + path: { "${params.outdir}/mapping/circularmapper/" }, + mode: params.publish_dir_mode, + pattern: '*[0-9].fasta' + ] + } + + withName: BWA_INDEX_CIRCULARISED { + publishDir = [ + path: { "${params.outdir}/reference/${meta.id}_${params.mapping_circularmapper_elongation_factor}/" }, + mode: params.publish_dir_mode, + pattern: 'bwa', + enabled: params.save_reference + ] + } + // // BAM INPUT // @@ -529,16 +557,6 @@ process { ] } - withName: CIRCULARMAPPER_CIRCULARGENERATOR { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } - publishDir = [ - path: { "${params.outdir}/mapping/circularmapper/" }, - mode: params.publish_dir_mode, - pattern: '*[0-9].fasta' - ] - } - withName: CIRCULARMAPPER_REALIGNSAMFILE { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } From bce305a1e685d82a8eab937fe53442455fc2cf71 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 22 Jul 2024 17:04:59 +0200 Subject: [PATCH 144/198] actually run when the reference is not zipped --- subworkflows/local/elongate_reference.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 86902f508..23f71b33d 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -38,12 +38,13 @@ workflow ELONGATE_REFERENCE { ch_elongated_unzipped = ch_elongated_reference .join( GUNZIP_ELONGATED_FASTA.out.gunzip ) - .dump(tag: 'unzipped_fasta') + .dump(tag: 'unzipped_fasta', pretty: true) .map { meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, unzipped_fasta -> def final_fasta = unzipped_fasta ?: circularmapper_elongated_fasta [ meta, circular_target, unzipped_fasta, circularmapper_elongated_fai ] } + .mix( ch_elongated_branches.skip_gunzip ) /* Check what fasta files we have. 
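PATCH 142 and PATCH 144 together converge on a common Nextflow idiom for optional decompression: `branch` the channel on whether the FASTA is gzipped, run gunzip only on that subset, `join` the result back by `meta`, and `mix` in the untouched remainder. A minimal, self-contained sketch of the idiom follows — the include path, workflow name, and channel contents are illustrative assumptions, not part of these patches:

```nextflow
include { GUNZIP } from './modules/nf-core/gunzip/main' // hypothetical path to the nf-core gunzip module

workflow CONDITIONAL_GUNZIP {
    take:
    ch_fasta // [ meta, fasta ], where fasta may be gzipped or an empty string

    main:
    // Split on whether decompression is needed; 'true' acts as the catch-all branch.
    ch_branched = ch_fasta.branch { meta, fasta ->
        for_gunzip:  fasta != '' && fasta.extension == 'gz'
        skip_gunzip: true
    }

    GUNZIP( ch_branched.for_gunzip )

    // Re-attach the unzipped file by meta, drop the zipped original,
    // then recombine with the references that never needed unzipping.
    ch_unzipped = ch_branched.for_gunzip
        .join( GUNZIP.out.gunzip )
        .map { meta, zipped_fasta, unzipped_fasta -> [ meta, unzipped_fasta ] }
        .mix( ch_branched.skip_gunzip )

    emit:
    fasta = ch_unzipped // [ meta, uncompressed fasta ]
}
```

The `skip_gunzip: true` catch-all is precisely what PATCH 144 reintroduces via `.mix( ch_elongated_branches.skip_gunzip )`, so that already-uncompressed references are not silently dropped from the emission channel.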
From 20f8ccfd57ac6e1433b6c9c811f00bb38a0d3d07 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 25 Jul 2024 12:44:39 +0200 Subject: [PATCH 145/198] correct cardinality of circular reference channel --- subworkflows/local/elongate_reference.nf | 33 +++++++++++------------- subworkflows/local/map.nf | 16 +++++++----- subworkflows/local/reference_indexing.nf | 2 +- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 23f71b33d..d0f37cd19 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -6,12 +6,11 @@ include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunz include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/index/main' // TODO Check that the unzipping correctly overwrites the zipped fasta file, and that the emitted channel is constructed correctly. -// TODO Currently, nothing seems to get dumped in the emission channel, so some join must be off. workflow ELONGATE_REFERENCE { take: ch_reference // [ meta, fasta, fai, dict, mapindex ] - ch_elongated_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + ch_elongated_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index ] main: ch_versions = Channel.empty() @@ -22,14 +21,14 @@ workflow ELONGATE_REFERENCE { // Check if the provided elongated reference is gzipped, and if so, unzip it. ch_elongated_branches = ch_elongated_reference .branch { - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index -> for_gunzip: circularmapper_elongated_fasta != '' && circularmapper_elongated_fasta.extension == "gz" skip_gunzip: true } ch_elongated_for_gunzip = ch_elongated_branches.for_gunzip .map { - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index -> [ meta, circularmapper_elongated_fasta ] } @@ -38,11 +37,10 @@ workflow ELONGATE_REFERENCE { ch_elongated_unzipped = ch_elongated_reference .join( GUNZIP_ELONGATED_FASTA.out.gunzip ) - .dump(tag: 'unzipped_fasta', pretty: true) .map { - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, unzipped_fasta -> + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index, unzipped_fasta -> def final_fasta = unzipped_fasta ?: circularmapper_elongated_fasta - [ meta, circular_target, unzipped_fasta, circularmapper_elongated_fai ] + [ meta, circular_target, unzipped_fasta, circularmapper_elongated_index ] } .mix( ch_elongated_branches.skip_gunzip ) @@ -57,9 +55,9 @@ workflow ELONGATE_REFERENCE { ch_circulargenerator_input = ch_elongated_unzipped .branch{ - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> - ready: circularmapper_elongated_fasta != "" && circularmapper_elongated_fai != "" - needs_index: circularmapper_elongated_fasta != "" && circularmapper_elongated_fai == "" + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index -> + ready: circularmapper_elongated_fasta != "" && circularmapper_elongated_index != "" + needs_index: 
circularmapper_elongated_fasta != "" && circularmapper_elongated_index == "" needs_elongation: circularmapper_elongated_fasta == "" && circular_target != "" } @@ -68,7 +66,7 @@ workflow ELONGATE_REFERENCE { ch_references_to_elongate = ch_circulargenerator_input.needs_elongation .join( ch_reference ) .multiMap { - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, fasta, fai, dict, mapindex -> + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index, fasta, fai, dict, mapindex -> def elongation_factor = params.mapping_circularmapper_elongation_factor @@ -87,7 +85,7 @@ workflow ELONGATE_REFERENCE { // Collect newly generated circular references and provided ones without an index, and index them. ch_input_for_circular_indexing = ch_circulargenerator_input.needs_index .map { - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index -> [ meta, circularmapper_elongated_fasta ] } .mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.fasta ) @@ -99,16 +97,15 @@ workflow ELONGATE_REFERENCE { .join( BWA_INDEX_CIRCULARISED.out.index ) // Then put all the indexed elongated references together, replace any zipped ones with the unzipped version, and emit them - ch_circular_reference = ch_circulargenerator_input.ready.dump(tag:"ready", pretty:true) + ch_circular_reference = ch_circulargenerator_input.ready .map { - meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai -> - [ meta, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index -> + [ meta, circularmapper_elongated_fasta, circularmapper_elongated_index ] } - .mix( ch_indexed_references.dump(tag:"indexed", pretty: true) ) - .dump(tag: 'circular_reference') + .mix( ch_indexed_references ) emit: - circular_reference = ch_circular_reference // [ meta, circular_target, fasta, fai ] + circular_reference = ch_circular_reference // [ meta, fasta, index ] versions = ch_versions mqc = ch_multiqc_files } diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index c1fe076ab..3ba52ef86 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -18,7 +18,7 @@ workflow MAP { take: reads // [ [meta], [read1, reads2] ] or [ [meta], [read1] ] index // [ [meta], [ index ], [ fasta ] ] - elogated_index // [ [meta], circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + elogated_index // [ [meta], circularmapper_elongated_fasta, circularmapper_elongated_index ] main: ch_versions = Channel.empty() @@ -117,11 +117,16 @@ workflow MAP { ch_mapped_lane_bai = params.fasta_largeref ? 
SAMTOOLS_INDEX_BT2.out.csi : SAMTOOLS_INDEX_BT2.out.bai } else if ( params.mapping_tool == 'circularmapper' ) { - // ch_index_for_mapping = index.map{ meta, index, fasta -> [ meta, index ] } - // ch_elongated_reference_for_mapping = elogated_index.map{ meta, circular_target, elongated_fasta, elongated_index -> [ meta, elongated_index ] } - // ch_reads_for_mapping = reads + ch_index_for_mapping = index + ch_elongated_reference_for_mapping = elogated_index.map{ meta, elongated_fasta, elongated_index -> [ meta, elongated_index ] } + ch_reads_for_mapping = reads - // CIRCULARMAPPER( ch_index_for_mapping, ch_elongated_reference_for_mapping, ch_reads_for_mapping ) + CIRCULARMAPPER( + ch_index_for_mapping, + ch_elongated_reference_for_mapping, + ch_reads_for_mapping, + params.mapping_circularmapper_elongation_factor + ) // // Join the original and elongated references, then combine with the reads, and multiMap to ensure correct ordering of channel contents. // ch_reads_for_circularmapper = reads.map { @@ -163,7 +168,6 @@ workflow MAP { ch_mapped_lane_bam = Channel.empty() //CIRCULARMAPPER.out.bam ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai - } diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 7cd9512ab..2fa931938 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -145,7 +145,7 @@ workflow REFERENCE_INDEXING { emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex ] - elongated_reference = ch_elongated_indexed_reference // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + elongated_reference = ch_elongated_indexed_reference // [ meta, circularmapper_elongated_fasta, circularmapper_elongated_index ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] hapmap = ch_hapmap // [ meta, hapmap ] pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] From c73e14fd04a4124cee90c2b1a769e503ea277b85 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 25 Jul 2024 13:07:52 +0200 Subject: [PATCH 146/198] output elongated reference in reference dirs when requested --- conf/modules.config | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2d520b2c4..6b131cfbd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -291,9 +291,10 @@ process { withName: CIRCULARMAPPER_CIRCULARGENERATOR { tag = { "${meta.id}_${params.mapping_circularmapper_elongation_factor}" } publishDir = [ - path: { "${params.outdir}/mapping/circularmapper/" }, + path: { "${params.outdir}/reference/${meta.id}_${params.mapping_circularmapper_elongation_factor}/" }, mode: params.publish_dir_mode, - pattern: '*[0-9].fasta' + pattern: '*[0-9].fasta', + enabled: params.save_reference ] } @@ -557,6 +558,8 @@ process { ] } + // Circular mapping + // Configuration for BWA_ALN and BWA_SAMSE/SAMPE is the same as for the non-circular mapping withName: CIRCULARMAPPER_REALIGNSAMFILE { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } @@ -567,6 +570,15 @@ process { ] } + withName: ".*MAP:FASTQ_ALIGN_BWAALN_ELONGATED:SAMTOOLS_INDEX" { + tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } + ext.args = { params.fasta_largeref ? 
"-c" : "" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" } + publishDir = [ + enabled: false + ] + } + // // DEDUPLICATION // From 1121cbf6ba6ea8169c2f5361bf14a3174d376e94 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 25 Jul 2024 13:37:34 +0200 Subject: [PATCH 147/198] no publishing of lane bams in CM --- conf/modules.config | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6b131cfbd..b912a8ba2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -564,9 +564,7 @@ process { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/mapping/circularmapper/" }, - mode: params.publish_dir_mode, - pattern: '*_realigned.bam' + enabled: false ] } From cb20652f3a82e25a9ef72c298c8014cbe3392c93 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 25 Jul 2024 13:37:59 +0200 Subject: [PATCH 148/198] this works now --- subworkflows/local/circularmapper.nf | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index dde31729f..da3de830f 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -2,16 +2,17 @@ // Run circularmapper // -include { FASTQ_ALIGN_BWAALN as FASTQ_ALIGN_BWAALN_ELONGATED } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' -include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_REALIGNED } from '../../modules/nf-core/samtools/index/main' +include { FASTQ_ALIGN_BWAALN as FASTQ_ALIGN_BWAALN_ELONGATED } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' +include { CIRCULARMAPPER_REALIGNSAMFILE } from '../../modules/nf-core/circularmapper/realignsamfile/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_REALIGNED } from '../../modules/nf-core/samtools/index/main' +include { addNewMetaFromAttributes } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main' workflow CIRCULARMAPPER { take: ch_reference // channel (mandatory): [ val(meta), path(index), path(reference) ] - ch_elongated_index // channel (mandatory): [ val(meta), path(elongated_index) ] + ch_elongated_reference // channel (mandatory): [ val(meta), path(elongated_index) ] ch_fastq_reads // channel (mandatory): [ val(meta), path(reads) ]. subworkImportant: meta REQUIRES single_end` entry! - val_elongation_factor // int (mandatory): Elongation factor used for chromosome circularisation + val_elongation_factor // int (mandatory): Elongation factor used for chromosome circularisation main: ch_versions = Channel.empty() @@ -21,7 +22,7 @@ workflow CIRCULARMAPPER { ch_realigned_csis = Channel.empty() // While mapping with BWA will need the elongated reference index, RealignSAMFile apparently does NOT need the elongated reference to be present, only the elongation factor. 
- FASTQ_ALIGN_BWAALN_ELONGATED( ch_fastq_reads, ch_elongated_index ) + FASTQ_ALIGN_BWAALN_ELONGATED( ch_fastq_reads, ch_elongated_reference ) ch_versions = ch_versions.mix( FASTQ_ALIGN_BWAALN_ELONGATED.out.versions.first() ) ch_ref_for_realignsamfile = ch_reference @@ -37,7 +38,6 @@ workflow CIRCULARMAPPER { ch_input_for_realignsamfile = FASTQ_ALIGN_BWAALN_ELONGATED.out.bam .map{ // create meta consistent with rest of MAP workflow - // TODO: Check that the id_index is correctly set and remove the elongation factor suffix if necessary. meta, bam -> new_meta = meta + [ reference: meta.id_index ] [ new_meta, bam ] @@ -49,23 +49,21 @@ workflow CIRCULARMAPPER { .combine( ch_ref_for_realignsamfile, by: 0 ) .multiMap { ignore_me, meta, bam, ref_meta, ref_fasta -> - bam: [ metas, bam ] + bam: [ meta, bam ] fasta: [ ref_meta, ref_fasta ] } - CIRCULARMAPPER_REALIGNSAMFILE( ch_input_for_realignsamfile.bam, ch_input_for_realignsamfile.fasta, val_elongation_factor ) + CIRCULARMAPPER_REALIGNSAMFILE( ch_input_for_realignsamfile.bam, ch_input_for_realignsamfile.fasta, [ [], val_elongation_factor ] ) ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) - ch_realigned_bams = ch_realigned_bams.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.bam ) + ch_realigned_bams = CIRCULARMAPPER_REALIGNSAMFILE.out.bam SAMTOOLS_INDEX_REALIGNED( ch_realigned_bams ) ch_versions = ch_versions.mix( SAMTOOLS_INDEX_REALIGNED.out.versions.first() ) - ch_realigned_bais = ch_realigned_bais.mix( SAMTOOLS_INDEX_REALIGNED.out.bai ) - ch_realigned_csis = ch_realigned_csis.mix( SAMTOOLS_INDEX_REALIGNED.out.csi ) + ch_realigned_bais = params.fasta_largeref ? SAMTOOLS_INDEX_REALIGNED.out.csi : SAMTOOLS_INDEX_REALIGNED.out.bai emit: - bam = ch_realigned_bams // [ val(meta), path(bam) ] - bai = ch_realigned_bais // [ val(meta), path(bai) ] - csi = ch_realigned_csis // [ val(meta), path(csi) ] + bam = ch_realigned_bams // [ meta, bam ] + bai = ch_realigned_bais // [ meta, bai/csi ] versions = ch_versions mqc = ch_multiqc_files } From 224994ffa85c28c281a1e772021669fc4a9f6b45 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 25 Jul 2024 13:39:05 +0200 Subject: [PATCH 149/198] add circularmapper. fix issue with channel join before flagstat for large references --- subworkflows/local/map.nf | 62 +++++++-------------------------------- 1 file changed, 11 insertions(+), 51 deletions(-) diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 3ba52ef86..0854688f1 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -117,56 +117,16 @@ workflow MAP { ch_mapped_lane_bai = params.fasta_largeref ? SAMTOOLS_INDEX_BT2.out.csi : SAMTOOLS_INDEX_BT2.out.bai } else if ( params.mapping_tool == 'circularmapper' ) { - ch_index_for_mapping = index - ch_elongated_reference_for_mapping = elogated_index.map{ meta, elongated_fasta, elongated_index -> [ meta, elongated_index ] } - ch_reads_for_mapping = reads + ch_elongated_reference_for_mapping = elogated_index + .map { + meta, elongated_fasta, elongated_index -> + [ meta, elongated_index ] + } - CIRCULARMAPPER( - ch_index_for_mapping, - ch_elongated_reference_for_mapping, - ch_reads_for_mapping, - params.mapping_circularmapper_elongation_factor - ) - - // // Join the original and elongated references, then combine with the reads, and multiMap to ensure correct ordering of channel contents. 
- // ch_reads_for_circularmapper = reads.map { - // // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - // addNewMetaFromAttributes( it, "reference" , "reference" , false ) - // } - - // ch_input_for_circularmapper = index.join( elogated_index ) - // .map { - // meta, index, fasta, elongated_index, elongated_fasta, circular_target -> - // [ meta, index, fasta , elongated_index, elongated_fasta ] - // } - // .map { - // // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - // addNewMetaFromAttributes( it, "id" , "reference" , false ) - // } - // .combine( ch_reads_for_circularmapper, by: 0) - // .multiMap { - // ignore_me, meta, index, fasta, elongated_index, elongated_fasta, circular_target, meta2, fasta, reads -> - // reads: [ meta, reads ] - // reference: [ meta, index, fasta ] - // elongated_reference: [meta, elongated_index , elongated_index] - // } - - // Reference elongation and indexing takes place in the reference_indexing swf. - // Circularmapper takes non-elongated AND elongated references and reads as input (i think. wait for Alex's reply). - - // ch_input_for_circularmapper = reads - // .combine(index.map{ meta, index, fasta -> [ meta, fasta ] }) - // .dump(tag:"CM Inputs", pretty:true) - // .multiMap { - // meta, reads, meta2, fasta -> - // reads: [ meta, reads ] - // reference: [ meta2, fasta ] - // } - // CIRCULARMAPPER( ch_input_for_circularmapper.reads, params.elongation_factor, ch_input_for_circularmapper.reference ) - // ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) - // // TODO - Update SWF outputs - ch_mapped_lane_bam = Channel.empty() //CIRCULARMAPPER.out.bam - ch_mapped_lane_bai = Channel.empty() // Circularmapper doesn't give a bai + CIRCULARMAPPER( index, ch_elongated_reference_for_mapping, reads, params.mapping_circularmapper_elongation_factor ) + ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) + ch_mapped_lane_bam = CIRCULARMAPPER.out.bam + ch_mapped_lane_bai = CIRCULARMAPPER.out.bai } @@ -200,7 +160,7 @@ workflow MAP { ch_mapped_bai = params.fasta_largeref ? 
SAMTOOLS_INDEX_MERGED_LANES.out.csi : SAMTOOLS_INDEX_MERGED_LANES.out.bai ch_versions.mix( SAMTOOLS_INDEX_MERGED_LANES.out.versions ) - ch_input_for_flagstat = SAMTOOLS_SORT_MERGED_LANES.out.bam.join( SAMTOOLS_INDEX_MERGED_LANES.out.bai, failOnMismatch: true ) + ch_input_for_flagstat = ch_mapped_bam.join( ch_mapped_bai, failOnMismatch: true ) SAMTOOLS_FLAGSTAT_MAPPED ( ch_input_for_flagstat ) ch_versions.mix( SAMTOOLS_FLAGSTAT_MAPPED.out.versions.first() ) @@ -208,7 +168,7 @@ workflow MAP { emit: bam = ch_mapped_bam // [ [ meta ], bam ] - bai = ch_mapped_bai // [ [ meta ], bai ] + bai = ch_mapped_bai // [ [ meta ], bai/csi ] flagstat = SAMTOOLS_FLAGSTAT_MAPPED.out.flagstat // [ [ meta ], stats ] mqc = ch_multiqc_files versions = ch_versions From 63d08883a92e6d8fe844742d5b78b55675fb456d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 25 Jul 2024 14:46:03 +0200 Subject: [PATCH 150/198] remove completed TODOs --- subworkflows/local/elongate_reference.nf | 1 - subworkflows/local/reference_indexing_multi.nf | 1 - subworkflows/local/reference_indexing_single.nf | 1 - 3 files changed, 3 deletions(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index d0f37cd19..9040ea23f 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -5,7 +5,6 @@ include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main' include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/index/main' -// TODO Check that the unzipping correctly overwrites the zipped fasta file, and that the emitted channel is constructed correctly. workflow ELONGATE_REFERENCE { take: diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 628d58a90..652380331 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -7,7 +7,6 @@ include { BWA_INDEX } from '../../modules/nf-core/bwa/inde include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build/main' include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' include { PICARD_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/picard/createsequencedictionary/main' -// TODO missing: circulargeneraotr? workflow REFERENCE_INDEXING_MULTI { diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index f4c9e42cd..10164f31e 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -8,7 +8,6 @@ include { BWA_INDEX } from '../../modules/nf-core/bwa/inde include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build/main' include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' include { PICARD_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/picard/createsequencedictionary/main' -// TODO missing: circulargeneraotr? 
workflow REFERENCE_INDEXING_SINGLE { From 36fc7c6386a0e5944610c78d4dd95f386670d324 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 25 Jul 2024 16:35:39 +0200 Subject: [PATCH 151/198] add CM to CI --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8c7a9c2a5..45c6b1106 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: - "-profile test,docker --preprocessing_tool adapterremoval --preprocessing_adapterlist 'https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/adapterremoval/adapterremoval_adapterlist.txt' --sequencing_qc_tool falco --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw'" - "-profile test,docker --mapping_tool bwamem --run_mapdamage_rescaling --run_pmd_filtering --run_trim_bam --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed'" - "-profile test,docker --mapping_tool bowtie2 --damagecalculation_tool mapdamage --damagecalculation_mapdamage_downsample 100 --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw'" - - "-profile test,docker --skip_preprocessing --convert_inputbam" + - "-profile test,docker --mapping_tool circularmapper --skip_preprocessing --convert_inputbam --fasta_circular_target 'NC_007596.2' --mapping_circularmapper_elongation_factor 500" - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw'" - "-profile test_humanbam,docker --run_sexdeterrmine --run_genotyping --genotyping_tool 'angsd' --genotyping_source 'raw'" - "-profile test_multiref,docker" ## TODO add damage manipulation here instead once it goes multiref From 2f78995d1f147429be836003bf7c789fa4f72e91 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jul 2024 11:14:38 +0200 Subject: [PATCH 152/198] minor tweak to avoid `null` in file names --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 529367088..11c256adf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -433,7 +433,7 @@ process { withName: BWA_ALN { tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { "-n ${params.mapping_bwaaln_n} -k ${params.mapping_bwaaln_k} -l ${params.mapping_bwaaln_l} -o ${params.mapping_bwaaln_o}" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.index_id}" } publishDir = [ enabled: false ] @@ -446,7 +446,7 @@ process { [ "-r '@RG\\tID:ILLUMINA-${meta.sample_id}_${meta.library_id}\\tSM:${meta.sample_id}\\tLB:${meta.library_id}\\tPL:illumina\\tPU:ILLUMINA-${meta.library_id}-${meta.strandedness}_stranded-${se_pe_string}'" ].join(' ').trim() } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.id_index}" } publishDir = [ enabled: false ] From edccda3c63de381b106920fc957703ccc26c1d5d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jul 2024 11:43:19 +0200 Subject: [PATCH 153/198] typo --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/conf/modules.config b/conf/modules.config index 11c256adf..a8a40dc63 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -433,7 +433,7 @@ process { withName: BWA_ALN { tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { "-n ${params.mapping_bwaaln_n} -k ${params.mapping_bwaaln_k} -l ${params.mapping_bwaaln_l} -o ${params.mapping_bwaaln_o}" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.index_id}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.id_index}" } publishDir = [ enabled: false ] From 1979181e3830a1037b2d44414193b65c67cedb22 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jul 2024 11:58:27 +0200 Subject: [PATCH 154/198] add manual tests --- docs/development/manual_tests.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index eea6cab14..02b6c6022 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -281,6 +281,23 @@ nextflow run ../main.nf -profile test,singularity --outdir ./results -resume -du ``` +### CircularMapper + +```bash +## CircularMapper with reference elongation +## Expect: Reference elongation is ran, and circularmapper SWF is ran. +## Check: Expect the elongated reference and BWA index directory within the `reference` directory. Also 2 bam files together with their BAIs and Flagstats in the `mapping/circularmapper` directory. +nextflow run ../main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --save_reference +``` + +```bash +## CircularMapper with an already elongated reference. Big reference flag. Also check that bwa_aln flags also propagate when using circularmapper. +## Expect: Reference elongation is NOT ran, and circularmapper SWF is ran. +## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. +## Also check the BAM headers for the -k and -n flags during BWA ALN. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --mapping_circularmapper_elongated_fasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --mapping_circularmapper_elongated_fai data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 +``` + ## Host Removal All possible parameters From 585de6b42b3833fedb67ef8519a6594dcc0f7c1c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Aug 2024 11:26:51 +0200 Subject: [PATCH 155/198] incorporate review suggestions. 
put mapping output within tool subdirectory --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a8a40dc63..6b30d4153 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -293,7 +293,7 @@ process { publishDir = [ path: { "${params.outdir}/reference/${meta.id}_${params.mapping_circularmapper_elongation_factor}/" }, mode: params.publish_dir_mode, - pattern: '*[0-9].fasta', + pattern: '*_*[0-9].fasta', enabled: params.save_reference ] } @@ -531,7 +531,7 @@ process { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_sorted" } publishDir = [ - path: { "${params.outdir}/mapping/" }, + path: { "${params.outdir}/mapping/${params.mapping_tool}" }, mode: params.publish_dir_mode, pattern: '*.{bam}' ] From ab2d40ed6a692ff0517e8c9f4d348c79481e5e14 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Aug 2024 11:33:51 +0200 Subject: [PATCH 156/198] update output.md --- docs/output.md | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/docs/output.md b/docs/output.md index b29959f56..ab4f0eb59 100644 --- a/docs/output.md +++ b/docs/output.md @@ -95,6 +95,25 @@ Depending on what is supplied by the user, and if `--save_reference` is supplied It is highly recommend to move these files to a central location or cache directory on your machine to facilitate resume of the indices across different pipeline runs. In many cases indexing the reference genome for alignment can be the longest step of a pipeline run, therefore re-using indices in future runs (supplied to the pipeline with flags such as `--fasta_fai`, `--fasta_dict`, etc. or added to the reference sheet provided to `--fasta`) can greatly speed up analyses on other samples. +#### Reference Elongation + +
+<details markdown="1">
+<summary>Output files</summary>
+
+- `reference/`
+  - `<reference_id>_<elongation_factor>/`
+    - `*.{fasta,fna,fa,fas}`: Uncompressed input FASTA file (if supplied to pipeline gzipped).
+    - `bwa/`:
+      - `*.fasta.{amb,ann,bwt,pac,sa}`: BWA aligner(s) reference index files from `bwa index`.
+
+</details>
+
+Mapping with `circularmapper` requires an elongated reference built by [CircularMapper/CircularGenerator](https://github.com/apeltzer/CircularMapper). CircularGenerator elongates the `--fasta_circular_target` of a supplied reference genome fasta by the number of base pairs specified in `--fasta_circularmapper_elongationfactor`.
+
+Depending on what is supplied by the user, and if `--save_reference` is supplied, this directory will contain the elongated reference fasta, as well as its corresponding bwa reference index files.
+
+It is highly recommended to move these files to a central location or cache directory on your machine to facilitate resume of the indices across different pipeline runs. In many cases indexing the reference genome for alignment can be the longest step of a pipeline run, therefore re-using indices in future runs (supplied to the pipeline with flags such as `--fasta_circularmapper_elongatedfasta`, `--fasta_circularmapper_elongatedindex`, etc. or added to the reference sheet provided to `--fasta`) can greatly speed up analyses on other samples.
+
 ### Preprocessing

 #### Falco
@@ -186,6 +205,21 @@ The resulting FASTQ files will only be present in your results directory if you

 [Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s of characters to relatively long (e.g. mammalian) genomes. Bowtie 2 indexes the genome with an FM Index (based on the Burrows-Wheeler Transform or BWT) to keep its memory footprint small and supports gapped, local, and paired-end alignment modes.

+#### CircularMapper
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `mapping/circularmapper/`
+
+  - `*.bam`: Sorted reads aligned against an elongated reference genome in BAM format with no additional filtering.
+  - `*.{bai,csi}`: Index file corresponding to a BAM file which is for faster downstream steps (e.g. SAMtools).
+  - `*.flagstat`: Statistics of aligned reads from SAMtools `flagstat`.
+
+</details>
+
+[CircularMapper RealignSAMFile](https://github.com/apeltzer/CircularMapper/tree/master) is an extension to `bwa aln` for realigning reads mapped to circularised contigs. First, an elongated/circularised reference is built using CircularGenerator, then reads are mapped to this reference using BWA ALN. The resulting BAM file is then realigned using CircularMapper RealignSAMFile, so that the reference coordinates of the BAM file are adjusted back to those of the original reference genome (prior to elongation).
+
 ### Host Removal

@@ -635,13 +669,3 @@ When using pileupCaller for genotyping, single-stranded and double-stranded libr

 [ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) is a software for analyzing next generation sequencing data. It can estimate genotype likelihoods and allele frequencies from next-generation sequencing data. The output provided is a bgzipped genotype likelihood file, containing likelihoods across all samples per reference. Users can specify the model used for genotype likelihood estimation, as well as the output format. For more information on the available options, see the [ANGSD](https://www.popgen.dk/angsd/index.php/Genotype_Likelihoods).
-
-#### CircularMapper
-
-<details markdown="1">
-<summary>Output files</summary>
-
-- `mapping/circularmapper`
-
-  - `*realigned.bam`: BAM file realigned to the extended reference
-
-</details>
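The documentation updated in PATCH 156 mirrors the subworkflow interface settled in PATCH 148: an original reference with its bwa index, an index for the elongated reference, reads whose `meta` carries a `single_end` entry, and the elongation factor. A minimal harness for exercising that interface might look as follows — all IDs and paths are dummy values for illustration, and the meta maps are deliberately simplified (the real pipeline also carries fields such as `sample_id` and `library_id`):

```nextflow
include { CIRCULARMAPPER } from './subworkflows/local/circularmapper'

workflow TEST_CIRCULARMAPPER {
    // [ meta, bwa index directory, fasta ] for the original (non-elongated) reference
    ch_reference = Channel.of(
        [ [ id:'Mammoth_MT_Krause' ], file('ref/bwa'), file('ref/Mammoth_MT_Krause.fasta') ]
    )

    // [ meta, bwa index directory ] for the reference elongated by ELONGATE_REFERENCE
    ch_elongated_reference = Channel.of(
        [ [ id:'Mammoth_MT_Krause' ], file('ref/Mammoth_MT_Krause_500/bwa') ]
    )

    // [ meta, reads ]; the meta REQUIRES a single_end entry (see the take: block in PATCH 148)
    ch_fastq_reads = Channel.of(
        [ [ id:'JK2782', single_end:true ], file('fastq/JK2782.fastq.gz') ]
    )

    // The elongation factor must match the one the elongated reference was built with
    CIRCULARMAPPER( ch_reference, ch_elongated_reference, ch_fastq_reads, 500 )
}
```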
From c028b6a5fa491429bde05340b7d91af7ef0c70d7 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Aug 2024 11:35:40 +0200 Subject: [PATCH 157/198] add mapping tool subdirectory within mapping --- docs/output.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index ab4f0eb59..c5a87feb1 100644 --- a/docs/output.md +++ b/docs/output.md @@ -180,7 +180,7 @@ The resulting FASTQ files will only be present in your results directory if you
 <summary>Output files</summary>

-- `mapping/`
+- `mapping/bwa{aln,mem}/`

   - `*.bam`: Sorted reads aligned against a reference genome in BAM format with no additional filtering.
   - `*.{bai,csi}`: Index file corresponding to a BAM file which is for faster downstream steps (e.g. SAMtools).
@@ -195,7 +195,7 @@ The resulting FASTQ files will only be present in your results directory if you

 <details markdown="1">
 <summary>Output files</summary>

-- `mapping/`
+- `mapping/bowtie2/`

   - `*.bam`: Sorted reads aligned against a reference genome in BAM format with no additional filtering.
   - `*.{bai,csi}`: Index file corresponding to a BAM file which is for faster downstream steps (e.g. SAMtools).

From bdf0d8006c465ac58870ad0bccb144f27715ce5 Mon Sep 17 00:00:00 2001
From: Thiseas Christos Lamnidis
Date: Fri, 2 Aug 2024 11:59:14 +0200
Subject: [PATCH 158/198] mapped bams/bais/csi/flagstat all in tool subdir

---
 conf/modules.config | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conf/modules.config
b/conf/modules.config index 03a1811f9..05c999e42 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -548,9 +548,9 @@ process { ] } - withName: SAMTOOLS_FLAGSTAT_MAPPED { + withName: SAMTOOLS_FLAGSTAT_MERGED_LANES { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_sorted" } publishDir = [ path: { "${params.outdir}/mapping/${params.mapping_tool}/" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 0854688f1..d66009787 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -2,17 +2,17 @@ // Prepare reference indexing for downstream // -include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main' -include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' -include { BWA_MEM } from '../../modules/nf-core/bwa/mem/main' -include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' -include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT as SAMTOOLS_SORT_MERGED_LANES } from '../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MEM } from '../../modules/nf-core/samtools/index/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BT2 } from '../../modules/nf-core/samtools/index/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LANES } from '../../modules/nf-core/samtools/index/main' -include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MAPPED } from '../../modules/nf-core/samtools/flagstat/main' -include { CIRCULARMAPPER } from '../../subworkflows/local/circularmapper' +include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main' +include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' +include { BWA_MEM } from '../../modules/nf-core/bwa/mem/main' +include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' +include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_MERGED_LANES } from '../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MEM } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BT2 } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LANES } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MERGED_LANES } from '../../modules/nf-core/samtools/flagstat/main' +include { CIRCULARMAPPER } from '../../subworkflows/local/circularmapper' workflow MAP { take: @@ -162,14 +162,14 @@ workflow MAP { ch_input_for_flagstat = ch_mapped_bam.join( ch_mapped_bai, failOnMismatch: true ) - SAMTOOLS_FLAGSTAT_MAPPED ( ch_input_for_flagstat ) - ch_versions.mix( SAMTOOLS_FLAGSTAT_MAPPED.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_MAPPED.out.flagstat ) + SAMTOOLS_FLAGSTAT_MERGED_LANES ( ch_input_for_flagstat ) + ch_versions.mix( SAMTOOLS_FLAGSTAT_MERGED_LANES .out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_MERGED_LANES .out.flagstat ) emit: bam = ch_mapped_bam // [ [ meta ], bam ] bai = ch_mapped_bai // [ [ meta ], bai/csi ] - flagstat = SAMTOOLS_FLAGSTAT_MAPPED.out.flagstat // [ [ meta ], stats ] + flagstat = 
SAMTOOLS_FLAGSTAT_MERGED_LANES .out.flagstat // [ [ meta ], stats ] mqc = ch_multiqc_files versions = ch_versions From e7388e818a3ad62cccb8a9e16d5268e5e513f601 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Aug 2024 15:36:33 +0200 Subject: [PATCH 161/198] Rename CM parameters --- assets/schema_fasta.json | 10 ++-- conf/modules.config | 10 ++-- docs/development/manual_tests.md | 13 +++-- nextflow.config | 49 ++++++++--------- nextflow_schema.json | 54 ++++++++++--------- subworkflows/local/elongate_reference.nf | 2 +- subworkflows/local/map.nf | 2 +- .../local/reference_indexing_multi.nf | 14 ++--- .../local/reference_indexing_single.nf | 12 ++--- .../local/utils_nfcore_eager_pipeline/main.nf | 2 +- 10 files changed, 91 insertions(+), 77 deletions(-) diff --git a/assets/schema_fasta.json b/assets/schema_fasta.json index d89310422..4256a542e 100644 --- a/assets/schema_fasta.json +++ b/assets/schema_fasta.json @@ -48,19 +48,19 @@ "pattern": "^\\S+$", "errorMessage": "The headers of the chromosome to be extended by circularmapper must not contain any spaces and no leading '>'." }, - "circularmapper_elongated_fasta": { + "circularmapper_elongatedfasta": { "type": "string", "format": "file-path", "pattern": "^\\S+\\.f(na|asta|a|as)(\\.gz)?$", "exists": true, "errorMessage": "The elongated Fasta files for the mapping reference must be provided with file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz', '.fna.gz' and cannot contain any spaces." }, - "circularmapper_elongated_fai": { + "circularmapper_elongatedindex": { "type": "string", - "format": "file-path", - "pattern": "^\\S+\\.fai$", + "format": "directory-path", + "pattern": "^\\S+$", "exists": true, - "errorMessage": "Elongated fasta index files for the mapping reference cannot have any spaces and must have file extension '.fai'." + "errorMessage": "The directories of the index files for the elongated mapping reference for circularmapper must not contain any spaces and have file extensions ''." 
}, "mitochondrion_header": { "type": "string", diff --git a/conf/modules.config b/conf/modules.config index 05c999e42..0f9a9ed54 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -281,17 +281,17 @@ process { // Reference elongation and indexing for circular mapping withName: GUNZIP_ELONGATED_FASTA { publishDir = [ - path: { "${params.outdir}/reference/${meta.id}_${params.mapping_circularmapper_elongation_factor}/" }, + path: { "${params.outdir}/reference/${meta.id}_${params.fasta_circularmapper_elongationfactor}/" }, mode: params.publish_dir_mode, - pattern: '*[0-9].f*', + pattern: '*_*[0-9].f*', enabled: params.save_reference ] } withName: CIRCULARMAPPER_CIRCULARGENERATOR { - tag = { "${meta.id}_${params.mapping_circularmapper_elongation_factor}" } + tag = { "${meta.id}_${params.fasta_circularmapper_elongationfactor}" } publishDir = [ - path: { "${params.outdir}/reference/${meta.id}_${params.mapping_circularmapper_elongation_factor}/" }, + path: { "${params.outdir}/reference/${meta.id}_${params.fasta_circularmapper_elongationfactor}/" }, mode: params.publish_dir_mode, pattern: '*_*[0-9].fasta', enabled: params.save_reference @@ -300,7 +300,7 @@ process { withName: BWA_INDEX_CIRCULARISED { publishDir = [ - path: { "${params.outdir}/reference/${meta.id}_${params.mapping_circularmapper_elongation_factor}/" }, + path: { "${params.outdir}/reference/${meta.id}_${params.fasta_circularmapper_elongationfactor}/" }, mode: params.publish_dir_mode, pattern: 'bwa', enabled: params.save_reference diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 02b6c6022..50334e5b4 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -287,15 +287,22 @@ nextflow run ../main.nf -profile test,singularity --outdir ./results -resume -du ## CircularMapper with reference elongation ## Expect: Reference elongation is ran, and circularmapper SWF is ran. ## Check: Expect the elongated reference and BWA index directory within the `reference` directory. Also 2 bam files together with their BAIs and Flagstats in the `mapping/circularmapper` directory. -nextflow run ../main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --save_reference +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --save_reference ``` ```bash ## CircularMapper with an already elongated reference. Big reference flag. Also check that bwa_aln flags also propagate when using circularmapper. ## Expect: Reference elongation is NOT ran, and circularmapper SWF is ran. ## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. -## Also check the BAM headers for the -k and -n flags during BWA ALN. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --mapping_circularmapper_elongated_fasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --mapping_circularmapper_elongated_fai data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 +## Also check the .command.sh for the -k and -n flags during BWA ALN. 
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --fasta_circularmapper_elongatedfasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --fasta_circularmapper_elongatedindex data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 +``` + +```bash +## Multiref with circularmapper. reference_sheet_multiref.csv edited to include elongated reference and index from first CM manual test for Mammoth_MT, and remove the human reference (save on runtime). Will still evaluate through reference_indexing_multi. +## Expect: No elongation for Mammoth MT. +## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. (6 files total) +nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_sheet /Users/lamnidis/Software/github/jbv2/eager/data/reference/reference_sheet_multiref.csv --mapping_tool circularmapper --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 ``` ## Host Removal diff --git a/nextflow.config b/nextflow.config index 467aeba3f..0ad84d596 100644 --- a/nextflow.config +++ b/nextflow.config @@ -29,13 +29,16 @@ params { multiqc_methods_description = null // Main references - fasta = null - fasta_fai = null - fasta_dict = null - fasta_mapperindexdir = null - fasta_circular_target = null - fasta_largeref = false - fasta_sheet = null + fasta = null + fasta_fai = null + fasta_dict = null + fasta_mapperindexdir = null + fasta_circular_target = null + fasta_largeref = false + fasta_sheet = null + fasta_circularmapper_elongationfactor = 500 + fasta_circularmapper_elongatedfasta = null + fasta_circularmapper_elongatedindex = null // Shard Fastq options run_fastq_sharding = false @@ -108,23 +111,21 @@ params { preprocessing_adapterremoval_qualitymax = 41 // Mapping - mapping_tool = 'bwaaln' - mapping_bwaaln_n = 0.01 // From Oliva et al. 2021 (10.1093/bib/bbab076) - mapping_bwaaln_k = 2 - mapping_bwaaln_l = 1024 // From Oliva et al. 2021 (10.1093/bib/bbab076) - mapping_bwaaln_o = 2 // From Oliva et al. 2021 (10.1093/bib/bbab076) - mapping_bwamem_k = 19 - mapping_bwamem_r = 1.5 - mapping_bowtie2_alignmode = 'local' - mapping_bowtie2_sensitivity = 'sensitive' - mapping_bowtie2_n = 0 - mapping_bowtie2_l = 20 - mapping_bowtie2_trim5 = 0 - mapping_bowtie2_trim3 = 0 - mapping_bowtie2_maxins = 500 - mapping_circularmapper_elongation_factor = 500 - mapping_circularmapper_elongated_fasta = null - mapping_circularmapper_elongated_fai = null + mapping_tool = 'bwaaln' + mapping_bwaaln_n = 0.01 // From Oliva et al. 2021 (10.1093/bib/bbab076) + mapping_bwaaln_k = 2 + mapping_bwaaln_l = 1024 // From Oliva et al. 2021 (10.1093/bib/bbab076) + mapping_bwaaln_o = 2 // From Oliva et al. 
2021 (10.1093/bib/bbab076) + mapping_bwamem_k = 19 + mapping_bwamem_r = 1.5 + mapping_bowtie2_alignmode = 'local' + mapping_bowtie2_sensitivity = 'sensitive' + mapping_bowtie2_n = 0 + mapping_bowtie2_l = 20 + mapping_bowtie2_trim5 = 0 + mapping_bowtie2_trim3 = 0 + mapping_bowtie2_maxins = 500 + mapping_circularmapper_circularfilter = false // BAM Filtering run_bamfiltering = false diff --git a/nextflow_schema.json b/nextflow_schema.json index f87749bd8..d2cc0d72a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -84,6 +84,12 @@ "help_text": "For most people this will likely be the same directory that contains the file you provided to `--fasta`.\n\nIf you want to use pre-existing `bwa index` indices, the directory should contain files ending in '.amb' '.ann' '.bwt'. If you want to use pre-existing `bowtie2 build` indices, the directory should contain files ending in'.1.bt2', '.2.bt2', '.rev.1.bt2'.\n\nIn any case do not include the files themselves in the path. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa index`/`bowtie2 build` file suffixes. If not supplied, the indices will be generated for you.\n\n", "fa_icon": "fas fa-folder-open" }, + "fasta_largeref": { + "type": "boolean", + "description": "Specify to generate '.csi' BAM indices instead of '.bai' for larger reference genomes.", + "help_text": "This parameter is required to be set for large reference genomes. If your reference genome is larger than 3.5GB, the `samtools index` calls in the pipeline need to generate `.csi` indices instead of `.bai` indices to compensate for the size of the reference genome (with samtools: `-c`). This parameter is not required for smaller references (including the human reference genomes hg19 or grch37/grch38).", + "fa_icon": "fas fa-address-book" + }, "save_reference": { "type": "boolean", "description": "Specify to save any pipeline-generated reference genome indices in the results directory.", @@ -129,6 +135,25 @@ "description": "Specify the FASTA header of the target chromosome to extend when using `circularmapper`.", "help_text": "The entry (chromosome, contig, etc.) in your FASTA reference that you'd like to be treated as circular.\n\nApplies only when providing a single FASTA file via `--fasta` (NOT multi-reference input - see reference TSV/CSV input).\n\n> Modifies tool parameter(s):\n> - circulargenerator `-s`", "fa_icon": "fas fa-bullseye" + }, + "fasta_circularmapper_elongationfactor": { + "type": "integer", + "default": 500, + "description": "Specify the number of bases to extend reference by (circularmapper only).", + "help_text": "The number of bases to extend the reference genome with. By default this is set to 500 if not specified otherwise.", + "fa_icon": "fas fa-external-link-alt" + }, + "fasta_circularmapper_elongatedfasta": { + "type": "string", + "description": "Specify an elongated reference FASTA to be used for circularmapper.", + "help_text": "Specify an already elongated FASTA file for circularmapper to avoid reprocessing.", + "fa_icon": "fas fa-address-book" + }, + "fasta_circularmapper_elongatedindex": { + "type": "string", + "description": "Specify a samtools index for the elongated FASTA file.", + "help_text": "Specify the index for an already elongated FASTA file to avoid reprocessing.", + "fa_icon": "fas fa-address-book" } } }, @@ -506,12 +531,6 @@ "help_text": "Specify which mapping tool to use. 
Options are BWA aln ('`bwaaln`'), BWA mem ('`bwamem`'), circularmapper ('`circularmapper`'), or Bowtie 2 ('`bowtie2`'). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing an extend-remap procedure (see [Peltzer et al 2016](https://doi.org/10.1186/s13059-016-0918-z) for details). Bowtie 2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming).\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)", "fa_icon": "fas fa-hammer" }, - "fasta_largeref": { - "type": "boolean", - "description": "Specify to generate '.csi' BAM indices instead of '.bai' for larger reference genomes.", - "help_text": "This parameter is required to be set for large reference genomes. If your reference genome is larger than 3.5GB, the `samtools index` calls in the pipeline need to generate `.csi` indices instead of `.bai` indices to compensate for the size of the reference genome (with samtools: `-c`). This parameter is not required for smaller references (including the human reference genomes hg19 or grch37/grch38).", - "fa_icon": "fas fa-address-book" - }, "mapping_bwaaln_n": { "type": "number", "default": 0.01, @@ -605,24 +624,11 @@ "help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n> Modifies Bowtie2 parameter: `--maxins`", "fa_icon": "fas fa-exchange-alt" }, - "mapping_circularmapper_elongation_factor": { - "type": "integer", - "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only).", - "help_text": "The number of bases to extend the reference genome with. 
By default this is set to 500 if not specified otherwise.", - "fa_icon": "fas fa-external-link-alt" - }, - "mapping_circularmapper_elongated_fasta": { - "type": "string", - "description": "Specify an elongated reference FASTA to be used for circularmapper.", - "help_text": "Specify an already elongated FASTA file for circularmapper to avoid reprocessing.", - "fa_icon": "fas fa-address-book" - }, - "mapping_circularmapper_elongated_fai": { - "type": "string", - "description": "Specify a samtools index for the elongated FASTA file.", - "help_text": "Specify the index for an already elongated FASTA file to avoid reprocessing.", - "fa_icon": "fas fa-address-book" + "mapping_circularmapper_circularfilter": { + "type": "boolean", + "fa_icon": "fas fa-filter", + "description": "Turn on to remove reads that did not map to the circularised genome.", + "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on.\n\n> Modifies `-f` and `-x` parameters of CircularMapper's RealignSAMFile" } }, "fa_icon": "fas fa-layer-group" diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 9040ea23f..5c07a3e25 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -67,7 +67,7 @@ workflow ELONGATE_REFERENCE { .multiMap { meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index, fasta, fai, dict, mapindex -> - def elongation_factor = params.mapping_circularmapper_elongation_factor + def elongation_factor = params.fasta_circularmapper_elongationfactor fasta: [ meta, fasta ] elongation_factor : [ meta, elongation_factor ] diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index d66009787..55b222372 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -123,7 +123,7 @@ workflow MAP { [ meta, elongated_index ] } - CIRCULARMAPPER( index, ch_elongated_reference_for_mapping, reads, params.mapping_circularmapper_elongation_factor ) + CIRCULARMAPPER( index, ch_elongated_reference_for_mapping, reads, params.fasta_circularmapper_elongationfactor ) ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) ch_mapped_lane_bam = CIRCULARMAPPER.out.bam ch_mapped_lane_bai = CIRCULARMAPPER.out.bai diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 313f3b59e..00276a5b9 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -19,14 +19,14 @@ workflow REFERENCE_INDEXING_MULTI { // Import reference sheet and change empty arrays to empty strings for compatibility with single reference input ch_splitreferencesheet_for_branch = Channel.fromSamplesheet("fasta_sheet") .map{ - meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> + meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> meta.ploidy = meta.genotyping_ploidy != null ? 
meta.genotyping_ploidy : params.genotyping_reference_ploidy fai = fai != [] ? fai : "" dict = dict != [] ? dict : "" mapper_index = mapper_index != [] ? mapper_index : "" circular_target = circular_target != [] ? circular_target : "" - circularmapper_elongated_fasta = circularmapper_elongated_fasta != [] ? circularmapper_elongated_fasta : "" - circularmapper_elongated_fai = circularmapper_elongated_fai != [] ? circularmapper_elongated_fai : "" + circularmapper_elongatedfasta = circularmapper_elongatedfasta != [] ? circularmapper_elongatedfasta : "" + circularmapper_elongatedindex = circularmapper_elongatedindex != [] ? circularmapper_elongatedindex : "" mitochondrion = mitochondrion != [] ? mitochondrion : "" capture_bed = capture_bed != [] ? capture_bed : "" pileupcaller_bed = pileupcaller_bed != [] ? pileupcaller_bed : "" @@ -37,7 +37,7 @@ workflow REFERENCE_INDEXING_MULTI { sexdet_bed = sexdet_bed != [] ? sexdet_bed : "" bedtools_feature = bedtools_feature != [] ? bedtools_feature : "" genotyping_gatk_dbsnp = genotyping_gatk_dbsnp != [] ? genotyping_gatk_dbsnp : "" - [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] + [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] } // GENERAL DESCRIPTION FOR NEXT SECTIONS @@ -53,9 +53,9 @@ workflow REFERENCE_INDEXING_MULTI { ch_input_from_referencesheet = ch_splitreferencesheet_for_branch .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> + meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> generated: [ meta, fasta, fai, dict, mapper_index ] - circularmapper: [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + circularmapper: [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ] mitochondrion_header: [ meta, mitochondrion ] angsd_hapmap: [ meta, hapmap ] pmd_masked_fasta: [ meta, pmd_masked_fasta ] @@ -202,7 +202,7 @@ workflow REFERENCE_INDEXING_MULTI { emit: reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex ] - elongated_reference = ch_input_from_referencesheet.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + elongated_reference = ch_input_from_referencesheet.circularmapper // [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ] mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] pmd_masked_fasta = ch_input_from_referencesheet.pmd_masked_fasta 
// [ meta, pmd_masked_fasta ] diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 10164f31e..41feced1e 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -89,16 +89,16 @@ workflow REFERENCE_INDEXING_SINGLE { def bedtools_feature = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" def genotyping_reference_ploidy = params.genotyping_reference_ploidy def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" - def circularmapper_elongated_fasta = params.mapping_circularmapper_elongated_fasta != null ? file( params.mapping_circularmapper_elongated_fasta, checkIfExists: true ) : "" - def circularmapper_elongated_fai = params.mapping_circularmapper_elongated_fai != null ? file( params.mapping_circularmapper_elongated_fai, checkIfExists: true ) : "" - [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + def circularmapper_elongated_fasta = params.fasta_circularmapper_elongatedfasta != null ? file( params.fasta_circularmapper_elongatedfasta, checkIfExists: true ) : "" + def circularmapper_elongated_index = params.fasta_circularmapper_elongatedindex != null ? file( params.fasta_circularmapper_elongatedindex, checkIfExists: true ) : "" + [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index ] } ch_ref_index_single = ch_reference_for_mapping .multiMap{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_fai -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index -> reference: [ meta, fasta, fai, dict, mapper_index ] - circularmapper: [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + circularmapper: [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index ] mito_header: [ meta, mitochondrion_header ] hapmap: [ meta, contamination_estimation_angsd_hapmap ] pmd_masked_fasta: [ meta, pmd_masked_fasta ] @@ -112,7 +112,7 @@ workflow REFERENCE_INDEXING_SINGLE { emit: reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex ] - elongated_reference = ch_ref_index_single.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_fai ] + 
elongated_reference = ch_ref_index_single.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index ] mitochondrion_header = ch_ref_index_single.mito_header // [ meta, mito_header ] hapmap = ch_ref_index_single.hapmap // [ meta, hapmap ] pmd_masked_fasta = ch_ref_index_single.pmd_masked_fasta // [ meta, pmd_masked_fasta ] diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index a7d75dd28..ba460c49e 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -231,7 +231,7 @@ def validateInputParameters() { if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } - if ( params.fasta && params.mapping_tool == "circularmapper" && ! params.fasta_circular_target && ! params.mapping_circularmapper_elongated_fasta ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires either --fasta_circular_target or --mapping_circularmapper_elongated_fasta. ") } + if ( params.fasta && params.mapping_tool == "circularmapper" && ! params.fasta_circular_target && ! params.fasta_circularmapper_elongatedfasta ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires either --fasta_circular_target or --fasta_circularmapper_elongatedfasta. ") } } // From 3b3d1d0c105ba20404d5d2e85a87809d42703ed5 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Aug 2024 15:41:03 +0200 Subject: [PATCH 162/198] Add suggestions from review --- nextflow_schema.json | 6 +++--- subworkflows/local/circularmapper.nf | 2 +- subworkflows/local/reference_indexing.nf | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index d2cc0d72a..e64216c1c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -140,19 +140,19 @@ "type": "integer", "default": 500, "description": "Specify the number of bases to extend reference by (circularmapper only).", - "help_text": "The number of bases to extend the reference genome with. 
By default this is set to 500 if not specified otherwise.", + "help_text": "The number of bases to extend the beginning and end of each reference genome with.", "fa_icon": "fas fa-external-link-alt" }, "fasta_circularmapper_elongatedfasta": { "type": "string", "description": "Specify an elongated reference FASTA to be used for circularmapper.", - "help_text": "Specify an already elongated FASTA file for circularmapper to avoid reprocessing.", + "help_text": "Specify an already elongated FASTA file for circularmapper to avoid regeneration.", "fa_icon": "fas fa-address-book" }, "fasta_circularmapper_elongatedindex": { "type": "string", "description": "Specify a samtools index for the elongated FASTA file.", - "help_text": "Specify the index for an already elongated FASTA file to avoid reprocessing.", + "help_text": "Specify the index for an already elongated FASTA file to avoid regeneration.", "fa_icon": "fas fa-address-book" } } diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index da3de830f..45726ed0e 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -21,7 +21,7 @@ workflow CIRCULARMAPPER { ch_realigned_bais = Channel.empty() ch_realigned_csis = Channel.empty() - // While mapping with BWA will need the elongated reference index, RealignSAMFile apparently does NOT need the elongated reference to be present, only the elongation factor. + // Although mapping with BWA will need the elongated reference index, RealignSAMFile apparently does NOT need the elongated reference to be present, only the elongation factor. FASTQ_ALIGN_BWAALN_ELONGATED( ch_fastq_reads, ch_elongated_reference ) ch_versions = ch_versions.mix( FASTQ_ALIGN_BWAALN_ELONGATED.out.versions.first() ) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 2fa931938..7f1d54991 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -133,7 +133,7 @@ workflow REFERENCE_INDEXING { // Throw errors if required parameters are missing ch_elongated_for_gunzip = ch_elongated_reference .filter{ it[1] != "" || it[2] != "" } - .ifEmpty{ error "[nf-core/eager]: ERROR: Mapping with circularmapper requires either a circular target or elongated reference file for at least one reference." } + .ifEmpty{ error "[nf-core/eager] ERROR: Mapping with circularmapper requires either a circular target or elongated reference file for at least one reference." } // This ELONGATE_REFERENCE subworkflow also checks if the provided reference is gzipped, and unzips it if necessary. ELONGATE_REFERENCE( ch_reference_for_mapping, ch_elongated_reference ) From d03b7c7a76a926b3b4936c003547862bb73a4516 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Aug 2024 14:00:18 +0200 Subject: [PATCH 163/198] linting --- docs/output.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index c5a87feb1..e22b29149 100644 --- a/docs/output.md +++ b/docs/output.md @@ -668,4 +668,3 @@ When using pileupCaller for genotyping, single-stranded and double-stranded libr
[ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) is software for analyzing next-generation sequencing data. It can estimate genotype likelihoods and allele frequencies from such data. The output provided is a bgzipped genotype likelihood file, containing likelihoods across all samples per reference. Users can specify the model used for genotype likelihood estimation, as well as the output format. For more information on the available options, see the [ANGSD documentation](https://www.popgen.dk/angsd/index.php/Genotype_Likelihoods). - From f72531bff404caa632edaabfaf3d33b8f473c36d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Aug 2024 15:45:44 +0200 Subject: [PATCH 164/198] rename elongated ref channel for clarity with its purpose --- subworkflows/local/reference_indexing.nf | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 7f1d54991..5ac5e8866 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -27,7 +27,7 @@ workflow REFERENCE_INDEXING { // If input (multi-)reference sheet supplied REFERENCE_INDEXING_MULTI ( fasta ) ch_reference_for_mapping = REFERENCE_INDEXING_MULTI.out.reference - ch_elongated_reference = REFERENCE_INDEXING_MULTI.out.elongated_reference + ch_reference_to_elongate = REFERENCE_INDEXING_MULTI.out.elongated_reference ch_mitochondrion_header = REFERENCE_INDEXING_MULTI.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_MULTI.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_MULTI.out.pmd_masked_fasta @@ -41,7 +41,7 @@ workflow REFERENCE_INDEXING { } else { // If input FASTA and/or indices supplied REFERENCE_INDEXING_SINGLE ( fasta, fasta_fai, fasta_dict, fasta_mapperindexdir ) - ch_elongated_reference = REFERENCE_INDEXING_SINGLE.out.elongated_reference + ch_reference_to_elongate = REFERENCE_INDEXING_SINGLE.out.elongated_reference ch_mitochondrion_header = REFERENCE_INDEXING_SINGLE.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_SINGLE.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_SINGLE.out.pmd_masked_fasta @@ -131,16 +131,19 @@ workflow REFERENCE_INDEXING { // Elongate reference for circularmapper if requested if ( params.mapping_tool == "circularmapper" ) { // Throw errors if required parameters are missing - ch_elongated_for_gunzip = ch_reference_to_elongate - .filter{ it[1] != "" || it[2] != "" } + ch_elongated_for_gunzip = ch_reference_to_elongate + .filter{ + meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex -> + circular_target != "" || circularmapper_elongatedfasta != "" + } .ifEmpty{ error "[nf-core/eager] ERROR: Mapping with circularmapper requires either a circular target or elongated reference file for at least one reference." } // This ELONGATE_REFERENCE subworkflow also checks if the provided reference is gzipped, and unzips it if necessary.
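// Illustrative shape of ch_elongated_for_gunzip at this point (hypothetical meta, modelled on the test profile):
//   [ [id:'Mammoth_MT'], 'NC_007596.2', '', '' ]
// i.e. [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ]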
- ELONGATE_REFERENCE( ch_reference_for_mapping, ch_elongated_reference ) + ELONGATE_REFERENCE( ch_reference_for_mapping, ch_reference_to_elongate ) ch_version = ch_versions.mix( ELONGATE_REFERENCE.out.versions ) ch_elongated_indexed_reference = ELONGATE_REFERENCE.out.circular_reference } else { - ch_elongated_indexed_reference = ch_elongated_reference + ch_elongated_indexed_reference = ch_reference_to_elongate } emit: From 8fe564bd314d92b0f3d1dd831385f0f2771ffb3b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Aug 2024 15:45:58 +0200 Subject: [PATCH 165/198] add mapping_circularmapper_circularfilter option --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index 0f9a9ed54..d944ed993 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -562,6 +562,7 @@ process { // Configuration for BWA_ALN and BWA_SAMSE/SAMPE is the same as for the non-circular mapping withName: CIRCULARMAPPER_REALIGNSAMFILE { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.args = { params.mapping_circularmapper_circularfilter ? "-f true -x true" : "" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ enabled: false From 2a98bd9de7e01b80564154e02cb5f2eb174c7ada Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Aug 2024 15:49:32 +0200 Subject: [PATCH 166/198] update manual tests --- docs/development/manual_tests.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 50334e5b4..7d9f9a8fe 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -305,6 +305,14 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -d nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_sheet /Users/lamnidis/Software/github/jbv2/eager/data/reference/reference_sheet_multiref.csv --mapping_tool circularmapper --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 ``` +```bash +## Multiref with circularmapper PLUS filtering. +## Expect: No elongation for Mammoth MT. +## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. 
(6 files total) +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --fasta_circularmapper_elongatedfasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --fasta_circularmapper_elongatedindex data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 --mapping_circularmapper_circularfilter +``` + + ## Host Removal All possible parameters From 6a076c2c0906cacc5e071b99bb61506597632324 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 9 Aug 2024 10:29:15 +0200 Subject: [PATCH 167/198] update CM modules --- modules.json | 4 ++-- .../circularmapper/circulargenerator/main.nf | 12 ++++++++---- .../circularmapper/circulargenerator/meta.yml | 15 ++++----------- .../nf-core/circularmapper/realignsamfile/main.nf | 3 +++ .../circularmapper/realignsamfile/meta.yml | 9 +++++++++ 5 files changed, 26 insertions(+), 17 deletions(-) diff --git a/modules.json b/modules.json index ed96492a2..a732a3cf9 100644 --- a/modules.json +++ b/modules.json @@ -97,12 +97,12 @@ }, "circularmapper/circulargenerator": { "branch": "master", - "git_sha": "0148d00e72e35cd08b3d829d7de3430bc0c92a5a", + "git_sha": "a7b0131370d9bc38076efad88773bca5537203d0", "installed_by": ["modules"] }, "circularmapper/realignsamfile": { "branch": "master", - "git_sha": "579d2d5f15e126a2190a7b709dfc77696c83688d", + "git_sha": "a7b0131370d9bc38076efad88773bca5537203d0", "installed_by": ["modules"] }, "damageprofiler": { diff --git a/modules/nf-core/circularmapper/circulargenerator/main.nf b/modules/nf-core/circularmapper/circulargenerator/main.nf index 07b722c39..9463ec497 100644 --- a/modules/nf-core/circularmapper/circulargenerator/main.nf +++ b/modules/nf-core/circularmapper/circulargenerator/main.nf @@ -16,8 +16,9 @@ process CIRCULARMAPPER_CIRCULARGENERATOR { tuple val(meta3), val(target) output: - tuple val(meta), path("*_${elongation_factor}.fasta"), emit: fasta - path "versions.yml" , emit: versions + tuple val(meta), path("*_${elongation_factor}.fasta") , emit: fasta + tuple val(meta), path("*${elongation_factor}_elongated") , emit: elongated + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -25,6 +26,7 @@ process CIRCULARMAPPER_CIRCULARGENERATOR { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def full_extension = reference.getName().replaceFirst(reference.getSimpleName(), "") """ circulargenerator \ -e ${elongation_factor} \ @@ -33,8 +35,9 @@ process CIRCULARMAPPER_CIRCULARGENERATOR { $args ## circulargenerator has a hardcoded output name. Rename if necessary to use prefix. 
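## e.g. (a sketch of the rename logic below) for an input 'genome.fa' and elongation factor 500, circulargenerator
## writes 'genome_500.fa' (getSimpleName() -> 'genome', full_extension -> '.fa'), which no longer matches the
## old getBaseName()-based pattern that assumed a literal '.fasta' suffix.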
- if [[ "${reference.getBaseName()}_${elongation_factor}.fasta" != "${prefix}_${elongation_factor}.fasta" ]]; then - mv ${reference.getBaseName()}_${elongation_factor}.fasta ${prefix}_${elongation_factor}.fasta + if [[ "${reference.getSimpleName()}_${elongation_factor}${full_extension}" != "${prefix}_${elongation_factor}.fasta" ]]; then + mv ${reference.getSimpleName()}_${elongation_factor}${full_extension} ${prefix}_${elongation_factor}.fasta + mv ${reference}_${elongation_factor}_elongated ${prefix}.fasta_${elongation_factor}_elongated fi cat <<-END_VERSIONS > versions.yml @@ -48,6 +51,7 @@ process CIRCULARMAPPER_CIRCULARGENERATOR { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_${elongation_factor}.fasta + touch ${prefix}.fasta_${elongation_factor}_elongated cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/circularmapper/circulargenerator/meta.yml b/modules/nf-core/circularmapper/circulargenerator/meta.yml index 2704fbc36..baa39e74b 100644 --- a/modules/nf-core/circularmapper/circulargenerator/meta.yml +++ b/modules/nf-core/circularmapper/circulargenerator/meta.yml @@ -14,57 +14,50 @@ tools: tool_dev_url: "https://github.com/apeltzer/CircularMapper" doi: "no DOI available" licence: ["GPL v3"] - input: - meta: type: map description: | Groovy Map containing reference information e.g. `[ id:'sample1' ]` - - reference: type: file description: Genome fasta file pattern: "*.fasta" - - meta2: type: map description: | Groovy Map containing reference information e.g. `[ id:'sample1' ]` - - elongation_factor: type: integer description: The number of bases that the ends of the target chromosome in the reference genome should be elongated by - - meta3: type: map description: | Groovy Map containing reference information e.g. `[ id:'sample1' ]` - - target: type: string description: The name of the chromosome in the reference genome that should be elongated - output: - #Only when we have meta - meta: type: map description: | Groovy Map containing sample information e.g. `[ id:'sample1', single_end:false ]` - - versions: type: file description: File containing software versions pattern: "versions.yml" - - fasta: type: file description: Genome fasta file pattern: "*.fasta" - + - elongated: + type: file + description: File listing the chromosomes that were elongated + pattern: "*_elongated" authors: - "@apalleja" - "@TCLamnidis" diff --git a/modules/nf-core/circularmapper/realignsamfile/main.nf b/modules/nf-core/circularmapper/realignsamfile/main.nf index 9d74f7b91..6363b8d25 100644 --- a/modules/nf-core/circularmapper/realignsamfile/main.nf +++ b/modules/nf-core/circularmapper/realignsamfile/main.nf @@ -11,6 +11,9 @@ process CIRCULARMAPPER_REALIGNSAMFILE { tuple val(meta), path(bam) tuple val(meta2), path(fasta) tuple val(meta3), val(elongation_factor) + tuple val(meta4), path(elongated_chr_list) + // NOTE: The elongated_chr_list is not used in the script, but is an implicit input that realignsamfile requires when using the `-f true` option. + // In its absence, when `-f true` is set, realignsamfile will remove all @SQ tags from the BAM header, breaking the bamfile. 
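    // For reference, the filter flags are wired in via conf/modules.config (see PATCH 165), e.g.:
    //   ext.args = { params.mapping_circularmapper_circularfilter ? "-f true -x true" : "" }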
output: tuple val(meta), path("*_realigned.bam") , emit: bam diff --git a/modules/nf-core/circularmapper/realignsamfile/meta.yml b/modules/nf-core/circularmapper/realignsamfile/meta.yml index fbb62d76d..94f74069e 100644 --- a/modules/nf-core/circularmapper/realignsamfile/meta.yml +++ b/modules/nf-core/circularmapper/realignsamfile/meta.yml @@ -43,6 +43,15 @@ input: - elongation_factor: type: integer description: The elongation factor used when running circulargenerator, i.e. the number of bases by which the ends of the target chromosome in the reference genome were elongated + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - elongated_chr_list: + type: file + description: File listing the chromosomes that were elongated + pattern: "*_elongated" output: - meta: type: map From 81d79f8411a6281703171b8b99d6f2f1acaebf76 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 9 Aug 2024 10:57:18 +0200 Subject: [PATCH 168/198] add elongated chromosome list needed for circularmapper filtering --- subworkflows/local/circularmapper.nf | 15 +++++++++------ subworkflows/local/elongate_reference.nf | 5 ++++- subworkflows/local/map.nf | 9 +++++---- subworkflows/local/reference_indexing.nf | 4 ++++ workflows/eager.nf | 2 +- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/subworkflows/local/circularmapper.nf b/subworkflows/local/circularmapper.nf index 45726ed0e..46874f987 100644 --- a/subworkflows/local/circularmapper.nf +++ b/subworkflows/local/circularmapper.nf @@ -11,6 +11,7 @@ workflow CIRCULARMAPPER { take: ch_reference // channel (mandatory): [ val(meta), path(index), path(reference) ] ch_elongated_reference // channel (mandatory): [ val(meta), path(elongated_index) ] + ch_elongated_chr_list // channel (mandatory): [ val(meta), path(elongated_chr_list) ] ch_fastq_reads // channel (mandatory): [ val(meta), path(reads) ]. Important: meta REQUIRES a `single_end` entry!
val_elongation_factor // int (mandatory): Elongation factor used for chromosome circularisation @@ -26,9 +27,10 @@ workflow CIRCULARMAPPER { ch_versions = ch_versions.mix( FASTQ_ALIGN_BWAALN_ELONGATED.out.versions.first() ) ch_ref_for_realignsamfile = ch_reference + .join( ch_elongated_chr_list ) .map { - meta, index, reference -> - [ meta, reference ] + meta, index, reference, elongated_chr_list -> + [ meta, reference, elongated_chr_list ] } .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute @@ -48,12 +50,13 @@ workflow CIRCULARMAPPER { } .combine( ch_ref_for_realignsamfile, by: 0 ) .multiMap { - ignore_me, meta, bam, ref_meta, ref_fasta -> - bam: [ meta, bam ] - fasta: [ ref_meta, ref_fasta ] + ignore_me, meta, bam, ref_meta, ref_fasta, elongated_chr_list -> + bam: [ meta, bam ] + fasta: [ ref_meta, ref_fasta ] + chr_list: [ ref_meta, elongated_chr_list ] } - CIRCULARMAPPER_REALIGNSAMFILE( ch_input_for_realignsamfile.bam, ch_input_for_realignsamfile.fasta, [ [], val_elongation_factor ] ) + CIRCULARMAPPER_REALIGNSAMFILE( ch_input_for_realignsamfile.bam, ch_input_for_realignsamfile.fasta, [ [], val_elongation_factor ], ch_input_for_realignsamfile.chr_list ) ch_versions = ch_versions.mix( CIRCULARMAPPER_REALIGNSAMFILE.out.versions.first() ) ch_realigned_bams = CIRCULARMAPPER_REALIGNSAMFILE.out.bam diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 5c07a3e25..42d3b78e7 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -16,6 +16,7 @@ workflow ELONGATE_REFERENCE { ch_multiqc_files = Channel.empty() ch_circular_reference = Channel.empty() ch_elongated_unzipped = Channel.empty() + ch_elongated_chr = Channel.empty() // Check if the provided elongated reference is gzipped, and if so, unzip it. ch_elongated_branches = ch_elongated_reference @@ -79,7 +80,8 @@ workflow ELONGATE_REFERENCE { ch_references_to_elongate.elongation_factor, ch_references_to_elongate.target ) - ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() ) + ch_elongated_chr = CIRCULARMAPPER_CIRCULARGENERATOR.out.elongated + ch_versions = ch_versions.mix( CIRCULARMAPPER_CIRCULARGENERATOR.out.versions.first() ) // Collect newly generated circular references and provided ones without an index, and index them. 
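    // Assumed shape of the newly emitted channel, e.g.:
    //   CIRCULARMAPPER_CIRCULARGENERATOR.out.elongated: [ [id:'Mammoth_MT'], 'Mammoth_MT.fasta_500_elongated' ]
    // where the *_elongated file lists the chromosome headers that circulargenerator extended.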
ch_input_for_circular_indexing = ch_circulargenerator_input.needs_index @@ -105,6 +107,7 @@ workflow ELONGATE_REFERENCE { emit: circular_reference = ch_circular_reference // [ meta, fasta, index ] + elongated_chr_list = ch_elongated_chr // [ meta, elongated_chr_list ] versions = ch_versions mqc = ch_multiqc_files } diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 55b222372..43469c9e2 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -16,9 +16,10 @@ include { CIRCULARMAPPER } from '../../subw workflow MAP { take: - reads // [ [meta], [read1, reads2] ] or [ [meta], [read1] ] - index // [ [meta], [ index ], [ fasta ] ] - elogated_index // [ [meta], circularmapper_elongated_fasta, circularmapper_elongated_index ] + reads // [ [meta], [read1, reads2] ] or [ [meta], [read1] ] + index // [ [meta], [ index ], [ fasta ] ] + elogated_index // [ [meta], circularmapper_elongated_fasta, circularmapper_elongated_index ] + elongated_chr_list // [ [meta], elongated_chr_list ] main: ch_versions = Channel.empty() @@ -123,7 +124,7 @@ workflow MAP { [ meta, elongated_index ] } - CIRCULARMAPPER( index, ch_elongated_reference_for_mapping, reads, params.fasta_circularmapper_elongationfactor ) + CIRCULARMAPPER( index, ch_elongated_reference_for_mapping, elongated_chr_list, reads, params.fasta_circularmapper_elongationfactor ) ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) ch_mapped_lane_bam = CIRCULARMAPPER.out.bam ch_mapped_lane_bai = CIRCULARMAPPER.out.bai diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 5ac5e8866..e085e0f8d 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -142,13 +142,17 @@ workflow REFERENCE_INDEXING { ELONGATE_REFERENCE( ch_reference_for_mapping, ch_reference_to_elongate ) ch_version = ch_versions.mix( ELONGATE_REFERENCE.out.versions ) ch_elongated_indexed_reference = ELONGATE_REFERENCE.out.circular_reference + ch_elongated_chr_list = ELONGATE_REFERENCE.out.elongated_chr_list + } else { ch_elongated_indexed_reference = ch_reference_to_elongate + ch_elongated_chr_list = Channel.empty() } emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex ] elongated_reference = ch_elongated_indexed_reference // [ meta, circularmapper_elongated_fasta, circularmapper_elongated_index ] + elongated_chr_list = ch_elongated_chr_list // [ meta, elongated_chr_list ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] hapmap = ch_hapmap // [ meta, hapmap ] pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] diff --git a/workflows/eager.nf b/workflows/eager.nf index 31bbee84f..2804db5fd 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -185,7 +185,7 @@ workflow EAGER { [ meta, index, fasta ] } - MAP ( ch_reads_for_mapping, ch_reference_for_mapping, REFERENCE_INDEXING.out.elongated_reference ) + MAP ( ch_reads_for_mapping, ch_reference_for_mapping, REFERENCE_INDEXING.out.elongated_reference, REFERENCE_INDEXING.out.elongated_chr_list ) ch_versions = ch_versions.mix( MAP.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( MAP.out.mqc.collect{it[1]}.ifEmpty([]) ) From c18174de1dfdb0fc100afd4a69bd74b4b2d7f5b0 Mon Sep 17 00:00:00 2001 From: scarlhoff Date: Fri, 9 Aug 2024 10:59:55 +0200 Subject: [PATCH 169/198] add raw library merging + publishing --- conf/modules.config | 53 +++++++++++++++++++++++++++++++++++------ workflows/eager.nf | 58 
+++++++++++++++++++++++++++++++++++----------------------- 2 files changed, 78 insertions(+), 33 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 37c5d666b..bc41e822d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -951,7 +951,7 @@ process { // LIBRARY MERGE // - withName: SAMTOOLS_MERGE_LIBRARIES { + withName: ".*MERGE_LIBRARIES:SAMTOOLS_MERGE_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}" } ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" } publishDir = [ @@ -959,32 +959,71 @@ ] } - withName: SAMTOOLS_SORT_MERGED_LIBRARIES { + withName: ".*MERGE_LIBRARIES:SAMTOOLS_SORT_MERGED_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}" } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/final_bams/" }, + path: { "${params.outdir}/final_bams/raw/" }, mode: params.publish_dir_mode, pattern: '*.bam' ] } - withName: SAMTOOLS_INDEX_MERGED_LIBRARIES { + withName: ".*MERGE_LIBRARIES:SAMTOOLS_INDEX_MERGED_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}" } ext.args = { params.fasta_largeref ? "-c" : "" } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/final_bams/" }, + path: { "${params.outdir}/final_bams/raw/" }, mode: params.publish_dir_mode, pattern: '*.{bai,csi}' ] } - withName: SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES { + withName: ".*MERGE_LIBRARIES:SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}" } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/final_bams/" }, + path: { "${params.outdir}/final_bams/raw/" }, + mode: params.publish_dir_mode, + pattern: '*.flagstat' + ] + } + + withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_MERGE_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" } + publishDir = [ + enabled: false + ] + } + + withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_SORT_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/final_bams/for_genotyping/" }, + mode: params.publish_dir_mode, + pattern: '*.bam' + ] + } + + withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_INDEX_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.args = { params.fasta_largeref ? "-c" : "" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/final_bams/for_genotyping/" }, + mode: params.publish_dir_mode, + pattern: '*.{bai,csi}' + ] + } + + withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/final_bams/for_genotyping/" }, mode: params.publish_dir_mode, pattern: '*.flagstat' ] diff --git a/workflows/eager.nf b/workflows/eager.nf index fee976267..f7fc00f1a 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -21,18 +21,19 @@ include { addNewMetaFromAttributes } from '../subworkflows/local/utils_nfcore_ea // // TODO rename to active: index_reference, filter_bam etc.
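// Note: Nextflow DSL2 permits each included workflow to be invoked only once per scope, hence the second,
// aliased instance below: `include { MERGE_LIBRARIES as MERGE_LIBRARIES_GENOTYPING } from '../subworkflows/local/merge_libraries'`.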
-include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing' -include { PREPROCESSING } from '../subworkflows/local/preprocessing' -include { MAP } from '../subworkflows/local/map' -include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' -include { DEDUPLICATE } from '../subworkflows/local/deduplicate' -include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate_damage' -include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter' -include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' -include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' -include { RUN_SEXDETERRMINE } from '../subworkflows/local/run_sex_determination' -include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries' -include { GENOTYPE } from '../subworkflows/local/genotype' +include { REFERENCE_INDEXING } from '../subworkflows/local/reference_indexing' +include { PREPROCESSING } from '../subworkflows/local/preprocessing' +include { MAP } from '../subworkflows/local/map' +include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' +include { DEDUPLICATE } from '../subworkflows/local/deduplicate' +include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate_damage' +include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter' +include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' +include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' +include { RUN_SEXDETERRMINE } from '../subworkflows/local/run_sex_determination' +include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries' +include { MERGE_LIBRARIES as MERGE_LIBRARIES_GENOTYPING } from '../subworkflows/local/merge_libraries' +include { GENOTYPE } from '../subworkflows/local/genotype' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -264,6 +265,15 @@ workflow EAGER { ch_dedupped_flagstat = Channel.empty() } + // + // SUBWORKFLOW: Merge libraries per sample + // + + MERGE_LIBRARIES ( ch_dedupped_bams ) + ch_versions = ch_versions.mix( MERGE_LIBRARIES.out.versions ) + ch_merged_dedup_bams = MERGE_LIBRARIES.out.bam_bai + ch_multiqc_files = ch_multiqc_files.mix( MERGE_LIBRARIES.out.mqc.collect{it[1]}.ifEmpty([]) ) + // // MODULE QUALIMAP // @@ -538,27 +548,23 @@ workflow EAGER { // // SUBWORKFLOW: aDNA Damage Manipulation + // if ( params.run_mapdamage_rescaling || params.run_pmd_filtering || params.run_trim_bam ) { MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking ) - ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions ) ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? 
MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams + + // SUBWORKFLOW: merge libraries for genotyping + MERGE_LIBRARIES_GENOTYPING ( ch_bams_for_library_merge ) + ch_versions = ch_versions.mix( MERGE_LIBRARIES_GENOTYPING.out.versions ) + ch_bams_for_genotyping = MERGE_LIBRARIES_GENOTYPING.out.bam_bai + ch_multiqc_files = ch_multiqc_files.mix( MERGE_LIBRARIES_GENOTYPING.out.mqc.collect{it[1]}.ifEmpty([]) ) } else { - ch_bams_for_library_merge = ch_dedupped_bams + ch_bams_for_genotyping = ch_merged_dedup_bams } - // - // SUBWORKFLOW: MERGE LIBRARIES - // - - // The bams being merged are always the ones specified by params.genotyping_source, - // unless the user skipped damage manipulation, in which case it is the DEDUPLICATION output. - MERGE_LIBRARIES ( ch_bams_for_library_merge ) - ch_versions = ch_versions.mix( MERGE_LIBRARIES.out.versions ) - ch_bams_for_genotyping = MERGE_LIBRARIES.out.bam_bai - ch_multiqc_files = ch_multiqc_files.mix( MERGE_LIBRARIES.out.mqc.collect{it[1]}.ifEmpty([]) ) // Not sure if this is needed, or if it needs to be moved to line 564? - // // SUBWORKFLOW: Genotyping // From d2983efbffe247a62f62809600b3bb4fa325c343 Mon Sep 17 00:00:00 2001 From: scarlhoff Date: Fri, 9 Aug 2024 11:09:52 +0200 Subject: [PATCH 170/198] merged input for bedtools, qualimap, sex det --- conf/modules.config | 20 ++++++++++++-------- workflows/eager.nf | 6 +++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index bc41e822d..f91a741c0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -668,16 +668,16 @@ process { // BEDTOOLS_COVERAGE // withName: SAMTOOLS_VIEW_GENOME { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } publishDir = [ enabled: false ] } withName: BEDTOOLS_COVERAGE_DEPTH { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } ext.args = '-mean -nonamecheck' - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_depth" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_depth" } publishDir = [ path: { "${params.outdir}/mapstats/bedtools" }, mode: params.publish_dir_mode @@ -685,9 +685,9 @@ process { } withName: BEDTOOLS_COVERAGE_BREADTH { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } ext.args = '-nonamecheck' - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_breadth" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_breadth" } publishDir = [ path: { "${params.outdir}/mapstats/bedtools" }, mode: params.publish_dir_mode @@ -880,8 +880,12 @@ process { ] } + // + // QUALIMAP + // + withName: 'QUALIMAP_BAMQC_WITHBED|QUALIMAP_BAMQC_NOBED' { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } publishDir = [ path: { "${params.outdir}/mapstats/qualimap/${meta.reference}/${meta.sample_id}/}" }, mode: params.publish_dir_mode, @@ -928,7 +932,7 @@ process { // RUN SEXDETERRMINE // withName: SAMTOOLS_DEPTH_SEXDETERRMINE { - tag = { "${meta1.reference}|${meta1.sample_id}_${meta1.library_id}" } + tag = { "${meta1.reference}|${meta1.sample_id}" } ext.prefix = { "${meta2.id}_samtoolsdepth" } ext.args = '-aa -q30 -Q30 -H' publishDir = [ @@ -937,7 +941,7 @@ process { } withName: SEXDETERRMINE { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" 
} } ext.prefix = { "${meta.reference}_sexdeterrmine" } publishDir = [ path: { "${params.outdir}/sex_determination/" }, diff --git a/workflows/eager.nf b/workflows/eager.nf index f7fc00f1a..a6db6da16 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -283,7 +283,7 @@ workflow EAGER { .map{ addNewMetaFromAttributes( it, "id" , "reference" , false ) } - ch_qualimap_input = ch_dedupped_bams + ch_qualimap_input = ch_merged_dedup_bams .map { meta, bam, bai -> [ meta, bam ] @@ -466,7 +466,7 @@ workflow EAGER { addNewMetaFromAttributes( it, "id" , "reference" , false ) } - ch_bedtools_prep = ch_dedupped_bams + ch_bedtools_prep = ch_merged_dedup_bams .map { addNewMetaFromAttributes( it, "reference" , "reference" , false ) } @@ -527,7 +527,7 @@ workflow EAGER { // if ( params.run_sexdeterrmine ) { - ch_sexdeterrmine_input = ch_dedupped_bams + ch_sexdeterrmine_input = ch_merged_dedup_bams RUN_SEXDETERRMINE(ch_sexdeterrmine_input, REFERENCE_INDEXING.out.sexdeterrmine_bed ) ch_versions = ch_versions.mix( RUN_SEXDETERRMINE.out.versions ) From fc70db9392669c141caff3ece5f4f5541efcf5b8 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 9 Aug 2024 09:21:31 +0000 Subject: [PATCH 171/198] updated test datasets to nf-core/testdatasets eager branch versions --- conf/test_krakenuniq.config | 2 +- conf/test_malt.config | 4 ++-- conf/test_metaphlan.config | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/test_krakenuniq.config b/conf/test_krakenuniq.config index 9528c336a..510dd3d32 100644 --- a/conf/test_krakenuniq.config +++ b/conf/test_krakenuniq.config @@ -29,5 +29,5 @@ params { // Metagenomics run_metagenomics = true metagenomics_profiling_tool = 'krakenuniq' - metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/data/database/krakenuniq/testdb-krakenuniq.tar.gz' + metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/eager/databases/kraken/eager_test.tar.gz' } diff --git a/conf/test_malt.config b/conf/test_malt.config index b53333d9d..51d951520 100644 --- a/conf/test_malt.config +++ b/conf/test_malt.config @@ -12,8 +12,8 @@ */ params { - config_profile_name = 'malt test profile' - config_profile_description = 'Minimal test dataset to check the metagenomics malt pipeline function' + config_profile_name = 'MALT test profile' + config_profile_description = 'Minimal test dataset to check the metagenomics MALT pipeline function' // Limit resources so that this can run on GitHub Actions max_cpus = 2 diff --git a/conf/test_metaphlan.config b/conf/test_metaphlan.config index b44d3f592..a74a499dc 100644 --- a/conf/test_metaphlan.config +++ b/conf/test_metaphlan.config @@ -12,8 +12,8 @@ */ params { - config_profile_name = 'Metaphlan3 test profile' - config_profile_description = 'Minimal test dataset to check the metagenomics metaphlan3 pipeline function' + config_profile_name = 'MetaPhlAn4 test profile' + config_profile_description = 'Minimal test dataset to check the metagenomics MetaPhlAn4 pipeline function' // Limit resources so that this can run on GitHub Actions max_cpus = 2 @@ -29,5 +29,5 @@ params { // Metagenomics run_metagenomics = true metagenomics_profiling_tool = 'metaphlan' - metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/409834b927c3a4e9314691b1125acee1434f7dd8/data/delete_me/metaphlan4_database.tar.gz' + metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/eager/databases/metaphlan/metaphlan4_database.tar.gz' } From
c06d7ff28e7577a43f3f9cff15e9e30a3b7c0374 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 9 Aug 2024 11:35:34 +0200 Subject: [PATCH 172/198] tmp-fix for wrong module invocation in main WF --- workflows/eager.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 07cd6e365..1ddaaf5a7 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -492,7 +492,7 @@ workflow EAGER { BEDTOOLS_COVERAGE_DEPTH(ch_bedtools_input.withfeature, ch_genome_for_bedtools) ch_versions = ch_versions.mix( SAMTOOLS_VIEW_GENOME.out.versions ) - ch_versions = ch_versions.mix( BEDTOOLS_COVERAGE_BREADTH.out.versions ) + //ch_versions = ch_versions.mix( BEDTOOLS_COVERAGE_BREADTH.out.versions ) ch_versions = ch_versions.mix( BEDTOOLS_COVERAGE_DEPTH.out.versions ) } From f901063ebc4219f81c60870b8d6356b70973285b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 9 Aug 2024 11:49:09 +0200 Subject: [PATCH 173/198] start adding manual chrom_list creation --- docs/development/manual_tests.md | 4 ++-- subworkflows/local/elongate_reference.nf | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 7d9f9a8fe..b83ee686f 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -306,12 +306,12 @@ nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ - ``` ```bash -## Multiref with circularmapper PLUS filtering. +## Circularmapper with circularfilter. ## Expect: No elongation for Mammoth MT. ## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. (6 files total) nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --fasta_circularmapper_elongatedfasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --fasta_circularmapper_elongatedindex data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 --mapping_circularmapper_circularfilter ``` - + ## Host Removal diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 42d3b78e7..70b868cff 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -5,6 +5,7 @@ include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main' include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/index/main' +// include { addNewMetaFromAttributes } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main' workflow ELONGATE_REFERENCE { take: @@ -61,6 +62,28 @@ workflow ELONGATE_REFERENCE { needs_elongation: circularmapper_elongated_fasta == "" && circular_target != "" } + // References that are already elongated, need ch_elongated_chr to be created from the circular target information + ch_needs_elongated_chr_list = ch_circulargenerator_input.ready + .mix( ch_circulargenerator_input.needs_index ) + .join( ch_reference ) + .map { + meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index, fasta, fai, dict, mapindex -> + [ meta, fasta, circular_target ] + } + .collectFile { + meta, fasta, circular_target -> + [ "${fasta.name}_500_elongated", 
circular_target + '\n' ] + } + /* The above gets the right information into the created files, but the channel then also needs a meta, which collectFile doesn't seem able to handle. + TODO Proposed solution: + - Use a map to infer the meta.id fromt he file name (i.e. file name without the suffix. since everything by now is unzipped, it should work). + - Then pull that info out of the ch_reference meta with addNewMetaFromAttributes. + - Join the channels by this meta + - Use a map to give the collected file the meta of the reference. + + This is a bit convoluted, but it should work. Would be simpler if I could create the meta within collectFile. + */ + // Elongate references that need it // Join the original references to the branch of needs_elongation, to get the original fasta files, and elongate them. ch_references_to_elongate = ch_circulargenerator_input.needs_elongation From 7b32ebd22f15bb0a20ad541e22cfd2f659827c0e Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 9 Aug 2024 09:59:51 +0000 Subject: [PATCH 174/198] added validation catch of nonmerging of PE + metagenomics --- nextflow.config | 6 +++--- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index f68d91c57..c9423743f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -145,15 +145,15 @@ params { metagenomics_profiling_tool = null metagenomics_profiling_database = null metagenomics_krakenuniq_ramchunksize = "16G" - metagenomics_kraken2_savereads = false - metagenomics_kraken2_savereadclassifications = false + metagenomics_kraken2_savereads = false + metagenomics_kraken2_savereadclassifications = false metagenomics_kraken2_saveminimizers = false metagenomics_malt_mode = 'BlastN' metagenomics_malt_alignmentmode = 'SemiGlobal' metagenomics_malt_savereads = false metagenomics_malt_minsupportmode = 'percent' metagenomics_malt_minsupportpercent = 0.01 - metagenomics_malt_minsupportreads = 1 + metagenomics_malt_minsupportreads = 1 metagenomics_malt_minpercentidentity = 85 metagenomics_malt_toppercent = 1 metagenomics_malt_maxqueries = 100 diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 8809f7597..795ea8881 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -225,6 +225,7 @@ def validateInputParameters() { if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) { if (params.metagenomics_complexity_entropy == 0.3) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } + if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: Currently no support for unmerged paired end reads inputs into Metagenomics subworkflow. Please rerun without --preprocessing_skippairmerging.") } if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } if ( params.genotyping_source == 'trimmed' && ! 
params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } From 70089d4186fe9b84350754c6f38d3595e936df2 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 9 Aug 2024 14:00:49 +0200 Subject: [PATCH 175/198] update test-datasets and update withName statements --- conf/modules.config | 6 +++--- conf/test_krakenuniq.config | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index b480d9907..ab346297c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -869,7 +869,7 @@ process { ] } - withName: ".*BBMAP_BBDUK" { + withName: BBMAP_BBDUK { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = { "entropymask=f entropy=${params.metagenomics_complexity_entropy}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_complexity" } @@ -921,7 +921,7 @@ process { ] } - withName: ".*KRAKENUNIQ_PRELOADEDKRAKENUNIQ" { + withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { publishDir = [ path: { "${params.outdir}/metagenomics/profiling/krakenuniq/" }, mode: params.publish_dir_mode, @@ -959,7 +959,7 @@ process { ] } - withName: 'MEGAN_RMA2INFO' { + withName: MEGAN_RMA2INFO { tag = {"${meta.id}"} ext.args = "-c2c Taxonomy" ext.prefix = { "${meta.id}" } diff --git a/conf/test_krakenuniq.config b/conf/test_krakenuniq.config index 510dd3d32..bb5257b46 100644 --- a/conf/test_krakenuniq.config +++ b/conf/test_krakenuniq.config @@ -29,5 +29,5 @@ params { // Metagenomics run_metagenomics = true metagenomics_profiling_tool = 'krakenuniq' - metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/eager/databases/kraken/eager_test.tar.gz' + metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/eager/databases/krakenuniq/testdb-krakenuniq.tar.gz' } From 0b0bcf1bd02d365b9c97d452bdf6d30a3dfe3874 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 12 Aug 2024 11:37:23 +0200 Subject: [PATCH 176/198] create elongated_chr_list from circular target --- subworkflows/local/elongate_reference.nf | 40 ++++++++++++++++-------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index 70b868cff..ab8c8972b 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -5,7 +5,7 @@ include { GUNZIP as GUNZIP_ELONGATED_FASTA } from '../../modules/nf-core/gunzip/main' include { CIRCULARMAPPER_CIRCULARGENERATOR } from '../../modules/nf-core/circularmapper/circulargenerator/main' include { BWA_INDEX as BWA_INDEX_CIRCULARISED } from '../../modules/nf-core/bwa/index/main' -// include { addNewMetaFromAttributes } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main' +include { addNewMetaFromAttributes } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main' workflow ELONGATE_REFERENCE { take: @@ -62,8 +62,20 @@ workflow ELONGATE_REFERENCE { needs_elongation: circularmapper_elongated_fasta == "" && circular_target != "" } - // References that are already elongated, need ch_elongated_chr to be created from the circular target information - ch_needs_elongated_chr_list = ch_circulargenerator_input.ready + /* References that are already elongated, need ch_elongated_chr to be created from the circular target information 1) Get the reference information ready for joining with the new channel.
+ 2) Take all subchannels from the multiMap that do not go through CircularGenerator (.ready,.needs_index) and infer the name of the elongated_chr_list expected by RealignSAMFile + 3) Put the circular target in a file of that name FOR EACH REFERENCE. The resulting channel has no meta, so we need to add it. + 4) Add meta, and use to merge back to the reference channel. This way we can take the original reference's meta. + + This is a bit convoluted, but it should work. Would be simpler if I could create the meta within collectFile, but I did not find a way to do that. */ + ch_ref_for_chr_list = ch_reference + .map { + addNewMetaFromAttributes( it, "id", "id", false ) + } + + ch_chr_list_for_already_elongated_ref = ch_circulargenerator_input.ready .mix( ch_circulargenerator_input.needs_index ) .join( ch_reference ) .map { @@ -74,15 +86,17 @@ meta, fasta, circular_target -> [ "${fasta.name}_500_elongated", circular_target + '\n' ] } - /* The above gets the right information into the created files, but the channel then also needs a meta, which collectFile doesn't seem able to handle. - TODO Proposed solution: - - Use a map to infer the meta.id fromt he file name (i.e. file name without the suffix. since everything by now is unzipped, it should work). - - Then pull that info out of the ch_reference meta with addNewMetaFromAttributes. - - Join the channels by this meta - - Use a map to give the collected file the meta of the reference. - - This is a bit convoluted, but it should work. Would be simpler if I could create the meta within collectFile. - */ + .map { + file -> + def id = file.getSimpleName() + [ [id: id ], file ] + } + .join(ch_ref_for_chr_list) + .map { + ignore_me, chr_list, meta, fasta, fai, dict, mapindex -> + [ meta, chr_list ] + } + .dump(tag: "collected_files", pretty:true) // Elongate references that need it // Join the original references to the branch of needs_elongation, to get the original fasta files, and elongate them. ch_references_to_elongate = ch_circulargenerator_input.needs_elongation
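The `collectFile()` workaround in PATCH 176 above deserves a quick illustration: `collectFile()` emits bare files with no meta map attached, so the subworkflow re-derives a minimal meta from the file name and joins it back against a keyed copy of the reference channel. Below is a self-contained sketch of that pattern; the channel contents and names are purely illustrative and not taken from the pipeline.

```nextflow
// Minimal sketch of the "recover meta after collectFile()" pattern,
// assuming a reference channel of shape [ meta, fasta ].
workflow {
    ch_reference = Channel.of( [ [id:'ref1'], file('ref1.fasta') ], [ [id:'ref2'], file('ref2.fasta') ] )

    // Keyed copy of the reference channel to join against later.
    ch_ref_for_join = ch_reference.map { meta, fasta -> [ [id: meta.id], meta, fasta ] }

    ch_reference
        .collectFile { meta, fasta -> [ "${meta.id}", "circular_target\n" ] } // one file per reference, named after meta.id
        .map { f -> [ [id: f.getSimpleName()], f ] }                          // re-derive the minimal meta from the file name
        .join( ch_ref_for_join )                                              // re-attach the full reference meta
        .map { key, chr_list, meta, fasta -> [ meta, chr_list ] }
        .view()
}
```

The `join` works because Groovy maps compare by value, so a key rebuilt from the file name matches the one presumably produced by `addNewMetaFromAttributes` on the other side.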
From 7063fd2586624d5367e19f3d1e0a70936c292d8a Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 12 Aug 2024 11:48:14 +0200 Subject: [PATCH 177/198] Circular target required regardless of elongated fasta --- assets/schema_fasta.json | 2 +- nextflow_schema.json | 2 +- subworkflows/local/reference_indexing.nf | 5 +++-- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/assets/schema_fasta.json b/assets/schema_fasta.json index 4256a542e..54e00de8e 100644 --- a/assets/schema_fasta.json +++ b/assets/schema_fasta.json @@ -46,7 +46,7 @@ "circular_target": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "The headers of the chromosome to be extended by circularmapper must not contain any spaces and no leading '>'." + "errorMessage": "The headers of the chromosome extended by circulargenerator must not contain any spaces and no leading '>'." }, "circularmapper_elongatedfasta": { "type": "string", diff --git a/nextflow_schema.json b/nextflow_schema.json index e64216c1c..67b97576f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -132,7 +132,7 @@ }, "fasta_circular_target": { "type": "string", - "description": "Specify the FASTA header of the target chromosome to extend when using `circularmapper`.", + "description": "Specify the FASTA header of the extended chromosome when using `circularmapper`.", "help_text": "The entry (chromosome, contig, etc.) in your FASTA reference that you'd like to be treated as circular.\n\nApplies only when providing a single FASTA file via `--fasta` (NOT multi-reference input - see reference TSV/CSV input).\n\n> Modifies tool parameter(s):\n> - circulargenerator `-s`", "fa_icon": "fas fa-bullseye" }, diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index e085e0f8d..29e768ba8 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -131,12 +131,13 @@ workflow REFERENCE_INDEXING { // Elongate reference for circularmapper if requested if ( params.mapping_tool == "circularmapper" ) { // Throw errors if required parameters are missing + // A circular target is required even when an elongated reference has been provided. ch_elongated_for_gunzip = ch_reference_to_elongate .filter{ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex -> - circular_target != "" || circularmapper_elongatedfasta != "" + circular_target != "" } - .ifEmpty{ error "[nf-core/eager] ERROR: Mapping with circularmapper requires either a circular target or elongated reference file for at least one reference." } + .ifEmpty{ error "[nf-core/eager] ERROR: Mapping with circularmapper requires a circular target for at least one reference." } // This ELONGATE_REFERENCE subworkflow also checks if the provided reference is gzipped, and unzips it if necessary. ELONGATE_REFERENCE( ch_reference_for_mapping, ch_reference_to_elongate ) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index ba460c49e..f6ec52c00 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -231,7 +231,7 @@ def validateInputParameters() { if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && !
params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } } // From 5d8476fbfa73545581146912145958e1fc094a9e Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 12 Aug 2024 14:42:58 +0200 Subject: [PATCH 178/198] remove dumps --- subworkflows/local/elongate_reference.nf | 1 - subworkflows/local/map.nf | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/subworkflows/local/elongate_reference.nf b/subworkflows/local/elongate_reference.nf index ab8c8972b..bf020b684 100644 --- a/subworkflows/local/elongate_reference.nf +++ b/subworkflows/local/elongate_reference.nf @@ -96,7 +96,6 @@ workflow ELONGATE_REFERENCE { ignore_me, chr_list, meta, fasta, fai, dict, mapindex -> [ meta, chr_list ] } - .dump(tag: "collected_files", pretty:true) // Elongate references that need it // Join the original references to the branch of needs_elongation, to get the original fasta files, and elongate them. diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 43469c9e2..10972a2a7 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -127,8 +127,7 @@ workflow MAP { CIRCULARMAPPER( index, ch_elongated_reference_for_mapping, elongated_chr_list, reads, params.fasta_circularmapper_elongationfactor ) ch_versions = ch_versions.mix ( CIRCULARMAPPER.out.versions ) ch_mapped_lane_bam = CIRCULARMAPPER.out.bam - ch_mapped_lane_bai = CIRCULARMAPPER.out.bai - + ch_mapped_lane_bai = CIRCULARMAPPER.out.bai // [ [ meta ], bai/csi ] } From 302acf3a61f8c52a8208ce10517fd78cb12743e9 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 12 Aug 2024 14:45:07 +0200 Subject: [PATCH 179/198] update manual tests --- docs/development/manual_tests.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index b83ee686f..b1b2e3996 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -299,19 +299,19 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -d ``` ```bash -## Multiref with circularmapper. reference_sheet_multiref.csv edited to include elongated reference and index from first CM manual test for Mammoth_MT, and remove the human reference (save on runtime). Will still evaluate through reference_indexing_multi. -## Expect: No elongation for Mammoth MT. -## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. (6 files total) +## Multiref with circularmapper. reference_sheet_multiref.csv edited to include elongated reference and index from first CM manual test for Mammoth_MT. +## Expect: No elongation for Mammoth MT. Elongation for hs37d5_chr21-MT reference. +## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory PER REFERENCE (3 libraries (from 2 samples) x 2 references x 3 files = 18 files total). +## Also, elongated hs37d5_chr21-MT is not saved, since --save_reference was not specified. But it did get elongated. nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_sheet /Users/lamnidis/Software/github/jbv2/eager/data/reference/reference_sheet_multiref.csv --mapping_tool circularmapper --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 ``` ```bash -## Circularmapper with circularfilter. 
+## Circularmapper with circularfilter, with a provided elongated reference. ## Expect: No elongation for Mammoth MT. -## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. (6 files total) +## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. (6 files total). Ensure files have the @SQ tag of the circular chromosome. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --fasta_circularmapper_elongatedfasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --fasta_circularmapper_elongatedindex data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 --mapping_circularmapper_circularfilter ``` - ## Host Removal From c89ef24266f32d4799fa6b2dc1b614506deda4dc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 12 Aug 2024 14:45:30 +0200 Subject: [PATCH 180/198] fix module configuration application --- conf/modules.config | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d944ed993..fc0bf9b1f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -323,6 +323,9 @@ process { withName: SAMTOOLS_INDEX_BAM_INPUT { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } + publishDir = [ + enabled: false + ] } // @@ -561,21 +564,31 @@ process { // Circular mapping // Configuration for BWA_ALN and BWA_SAMSE/SAMPE is the same as for the non-circular mapping withName: CIRCULARMAPPER_REALIGNSAMFILE { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { params.mapping_circularmapper_circularfilter ? "-f true -x true" : "" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" } + publishDir = [ + enabled: false + ] + } + + withName: ".*MAP:CIRCULARMAPPER:FASTQ_ALIGN_BWAALN_ELONGATED:SAMTOOLS_INDEX" { + tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } + ext.args = { params.fasta_largeref ? "-c" : "" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" } publishDir = [ enabled: false ] } - withName: ".*MAP:FASTQ_ALIGN_BWAALN_ELONGATED:SAMTOOLS_INDEX" { + withName: ".*MAP:CIRCULARMAPPER:SAMTOOLS_INDEX_REALIGNED" { tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { params.fasta_largeref ?
"-c" : "" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" } publishDir = [ enabled: false ] + } // From 32fa8b12a12e99fe90df996fbaf6414b48a911a0 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 12 Aug 2024 15:10:07 +0200 Subject: [PATCH 181/198] update parameter name in CI --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45c6b1106..b9827c75d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: - "-profile test,docker --preprocessing_tool adapterremoval --preprocessing_adapterlist 'https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/adapterremoval/adapterremoval_adapterlist.txt' --sequencing_qc_tool falco --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw'" - "-profile test,docker --mapping_tool bwamem --run_mapdamage_rescaling --run_pmd_filtering --run_trim_bam --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed'" - "-profile test,docker --mapping_tool bowtie2 --damagecalculation_tool mapdamage --damagecalculation_mapdamage_downsample 100 --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw'" - - "-profile test,docker --mapping_tool circularmapper --skip_preprocessing --convert_inputbam --fasta_circular_target 'NC_007596.2' --mapping_circularmapper_elongation_factor 500" + - "-profile test,docker --mapping_tool circularmapper --skip_preprocessing --convert_inputbam --fasta_circular_target 'NC_007596.2' --fasta_circularmapper_elongationfactor 500" - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw'" - "-profile test_humanbam,docker --run_sexdeterrmine --run_genotyping --genotyping_tool 'angsd' --genotyping_source 'raw'" - "-profile test_multiref,docker" ## TODO add damage manipulation here instead once it goes multiref From e198cedba8042cfb3faa64a2750eab348bd93dbd Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 10:07:42 +0200 Subject: [PATCH 182/198] remove taxpast multiQC channels --- subworkflows/local/metagenomics_postprocessing.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/subworkflows/local/metagenomics_postprocessing.nf b/subworkflows/local/metagenomics_postprocessing.nf index 0e31b1d34..19ea332c0 100644 --- a/subworkflows/local/metagenomics_postprocessing.nf +++ b/subworkflows/local/metagenomics_postprocessing.nf @@ -97,7 +97,6 @@ workflow METAGENOMICS_POSTPROCESSING { TAXPASTA_STANDARDISE( ch_standardise_input, [] ) ch_versions = ch_versions.mix(TAXPASTA_STANDARDISE.out.versions) - ch_multiqc_files = ch_multiqc_files.mix(TAXPASTA_STANDARDISE.out.standardised_profile) ch_merge_input = ch_postprocessing_input.merge.map{ meta, reports, count -> [meta, reports] @@ -105,7 +104,6 @@ workflow METAGENOMICS_POSTPROCESSING { TAXPASTA_MERGE( ch_merge_input, [], [] ) ch_versions = ch_versions.mix(TAXPASTA_MERGE.out.versions) - ch_multiqc_files = ch_multiqc_files.mix(TAXPASTA_MERGE.out.merged_profiles) emit: From 753994c760945213f4f552b6024981d231ca8f2d Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Fri, 16 Aug 2024 10:08:48 +0200 Subject: [PATCH 183/198] Apply suggestions from code review --- nextflow_schema.json | 3 ++- subworkflows/local/reference_indexing.nf | 4 ++-- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 67b97576f..ca823548d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -628,7 +628,8 @@ "type": "boolean", "fa_icon": "fas fa-filter", "description": "Turn on to remove reads that did not map to the circularised genome.", - "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on.\n\n> Modifies `-f` and `-x` parameters of CircularMapper's RealignSAMFile" + "help_text": "If you want to filter out reads that don't map to + the elongated/circularised chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on.\n\n> Modifies `-f` and `-x` parameters of CircularMapper's RealignSAMFile" } }, "fa_icon": "fas fa-layer-group" diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 29e768ba8..0080c75cf 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -27,7 +27,7 @@ workflow REFERENCE_INDEXING { // If input (multi-)reference sheet supplied REFERENCE_INDEXING_MULTI ( fasta ) ch_reference_for_mapping = REFERENCE_INDEXING_MULTI.out.reference - ch_reference_to_elongate = REFERENCE_INDEXING_MULTI.out.elongated_reference + ch_reference_to_elongate = REFERENCE_INDEXING_MULTI.out.elongated_reference ch_mitochondrion_header = REFERENCE_INDEXING_MULTI.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_MULTI.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_MULTI.out.pmd_masked_fasta @@ -41,7 +41,7 @@ workflow REFERENCE_INDEXING { } else { // If input FASTA and/or indicies supplied REFERENCE_INDEXING_SINGLE ( fasta, fasta_fai, fasta_dict, fasta_mapperindexdir ) - ch_reference_to_elongate = REFERENCE_INDEXING_SINGLE.out.elongated_reference + ch_reference_to_elongate = REFERENCE_INDEXING_SINGLE.out.elongated_reference ch_mitochondrion_header = REFERENCE_INDEXING_SINGLE.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_SINGLE.out.hapmap ch_pmd_masked_fasta = REFERENCE_INDEXING_SINGLE.out.pmd_masked_fasta diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index f6ec52c00..2d076cf17 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -231,7 +231,7 @@ def validateInputParameters() { if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! 
(params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } - if ( params.fasta && params.mapping_tool == "circularmapper" && ! params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } + if ( params.fasta && params.mapping_tool == "circularmapper" && !params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } } // From 81784cb24c538956efb895dc1a02e5166827f75a Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 16 Aug 2024 10:11:24 +0200 Subject: [PATCH 184/198] Fix linting --- nextflow_schema.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index ca823548d..dd252a9aa 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -628,8 +628,7 @@ "type": "boolean", "fa_icon": "fas fa-filter", "description": "Turn on to remove reads that did not map to the circularised genome.", - "help_text": "If you want to filter out reads that don't map to - the elongated/circularised chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on.\n\n> Modifies `-f` and `-x` parameters of CircularMapper's RealignSAMFile" + "help_text": "If you want to filter out reads that don't map to elongated/circularised chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on.\n\n> Modifies `-f` and `-x` parameters of CircularMapper's RealignSAMFile" } }, "fa_icon": "fas fa-layer-group" From 2196c266496cb45773378d48e0c51ea82e8fb874 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 10:31:14 +0200 Subject: [PATCH 185/198] remove redundant comment --- nextflow.config | 2 +- subworkflows/local/metagenomics_complexityfilter.nf | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index c9423743f..cc88373e4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -140,7 +140,7 @@ params { metagenomics_complexity_tool = 'bbduk' metagenomics_complexity_savefastq = false metagenomics_complexity_entropy = 0.3 - metagenomics_prinseq_mode = 'entropy' + metagenomics_prinseq_mode = 'entropy' // entropy or dust metagenomics_prinseq_dustscore = 0.5 metagenomics_profiling_tool = null metagenomics_profiling_database = null diff --git a/subworkflows/local/metagenomics_complexityfilter.nf b/subworkflows/local/metagenomics_complexityfilter.nf index 8171911fa..7ed6b587f 100644 --- a/subworkflows/local/metagenomics_complexityfilter.nf +++ b/subworkflows/local/metagenomics_complexityfilter.nf @@ -17,7 +17,6 @@ workflow METAGENOMICS_COMPLEXITYFILTER { ch_reads_for_metagenomics = BBMAP_BBDUK.out.reads } else if ( params.metagenomics_complexity_tool == 'prinseq' ) { - // check if e.g. dustscore is set but entropy enabled PRINSEQPLUSPLUS ( ch_bamfiltered_for_metagenomics ) ch_versions = PRINSEQPLUSPLUS.out.versions ch_reads_for_metagenomics = PRINSEQPLUSPLUS.out.good_reads From 373e0cadfbdbe09bff1648fa045f3774271e49d3 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 10:37:01 +0200 Subject: [PATCH 186/198] Apply minor suggestions from code review Co-authored-by: James A. 
Fellows Yates --- docs/output.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/output.md b/docs/output.md index b0da66971..368b1d3d1 100644 --- a/docs/output.md +++ b/docs/output.md @@ -428,13 +428,13 @@ The output system of KrakenUniq can result in other `stdout` or `stderr` logging #### taxpasta -the output created by the `taxpasta merge` command. It combines the results of all the samples analyzed with a given metagenomic classifer by nf-core/eager. The file provides an overview of the classification results for all samples combined +The output created by the `taxpasta merge` command. It combines the results of all the samples analyzed with a given metagenomic classifier by nf-core/eager in a standardised tabular taxon-table format. The file provides an overview of the classification results for all samples combined.
Output files

- `metagenomics_screening/postprocessing/taxpasta/`
  - `{metaphlan,krakenuniq,kraken2}_profiles_all_samples_merged.txt`
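For orientation, the merged tables listed above come out of the taxpasta step wired up in PATCH 182 earlier in this series. The channel preparation there is minimal: the per-sample report count is dropped, and the module's two optional inputs are left empty, exactly as in that diff.

```nextflow
// From subworkflows/local/metagenomics_postprocessing.nf (PATCH 182 above):
// drop the report count before handing the per-sample profiles to taxpasta.
ch_merge_input = ch_postprocessing_input.merge
    .map { meta, reports, count -> [ meta, reports ] }

TAXPASTA_MERGE( ch_merge_input, [], [] )   // the two empty lists are optional module inputs left unset
ch_versions = ch_versions.mix( TAXPASTA_MERGE.out.versions )
```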
@@ -454,7 +454,7 @@ The output directory for maltExtract, as implemented under [HOPS](https://github.com/rhuebler/HOPS)
-The main files of interest are within the `pdf_candidate_profiles` direcotry. The file prefixes declare various levels of confidence in a given sample, with stp1 being less confidently ancient and present than stp2, than stp3. Results are highly dependent upon the taxon being analyzed, as different microbial genera are more liable to cross mapping and contamination than others. +The main files of interest are within the `pdf_candidate_profiles` directory. The file prefixes declare various levels of confidence in a given sample, with stp1 being less confidently ancient and present than stp2, than stp3. Results are highly dependent upon the taxon being analyzed, as different microbial genera are more liable to cross mapping and contamination than others. ### Deduplication From 4dea691471f1b443b7b86a954737beed60e0c1e6 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 11:03:07 +0200 Subject: [PATCH 187/198] update docs and fix paths in test configs --- conf/test_kraken2.config | 6 +++--- conf/test_krakenuniq.config | 6 +++--- conf/test_malt.config | 6 +++--- conf/test_metaphlan.config | 6 +++--- docs/output.md | 6 ++++-- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/conf/test_kraken2.config b/conf/test_kraken2.config index 2176ea58f..295913d10 100644 --- a/conf/test_kraken2.config +++ b/conf/test_kraken2.config @@ -21,13 +21,13 @@ params { max_time = '6.h' // Input data - input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv' + input = params.pipelines_testdata_base_path + 'eager/testdata/Mammoth/samplesheet_v3.tsv' // Genome references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + fasta = params.pipelines_testdata_base_path + 'eager/reference/Mammoth/Mammoth_MT_Krause.fasta' // Metagenomics run_metagenomics = true metagenomics_profiling_tool = 'kraken2' - metagenomics_profiling_database = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/databases/kraken/eager_test.tar.gz' + metagenomics_profiling_database = params.pipelines_testdata_base_path + 'eager/databases/kraken/eager_test.tar.gz' } diff --git a/conf/test_krakenuniq.config b/conf/test_krakenuniq.config index bb5257b46..81ccc0c88 100644 --- a/conf/test_krakenuniq.config +++ b/conf/test_krakenuniq.config @@ -21,13 +21,13 @@ params { max_time = '6.h' // Input data - input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv' + input = params.pipelines_testdata_base_path + 'eager/testdata/Mammoth/samplesheet_v3.tsv' // Genome references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + fasta = params.pipelines_testdata_base_path + 'eager/reference/Mammoth/Mammoth_MT_Krause.fasta' // Metagenomics run_metagenomics = true metagenomics_profiling_tool = 'krakenuniq' - metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/raw/eager/databases/krakenuniq/testdb-krakenuniq.tar.gz' + metagenomics_profiling_database = params.pipelines_testdata_base_path + 'eager/databases/krakenuniq/testdb-krakenuniq.tar.gz' } diff --git a/conf/test_malt.config b/conf/test_malt.config index 51d951520..b1088f992 100644 --- a/conf/test_malt.config +++ b/conf/test_malt.config @@ -21,13 +21,13 @@ params { max_time = '6.h' // Input data - input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv' + input = params.pipelines_testdata_base_path + 
'eager/testdata/Mammoth/samplesheet_v3.tsv' // Genome references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + fasta = params.pipelines_testdata_base_path + 'eager/reference/Mammoth/Mammoth_MT_Krause.fasta' // Metagenomics run_metagenomics = true metagenomics_profiling_tool = 'malt' - metagenomics_profiling_database = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/databases/malt/eager_test.tar.gz' + metagenomics_profiling_database = params.pipelines_testdata_base_path + '/eager/databases/malt/eager_test.tar.gz' } diff --git a/conf/test_metaphlan.config b/conf/test_metaphlan.config index a74a499dc..96ce48df0 100644 --- a/conf/test_metaphlan.config +++ b/conf/test_metaphlan.config @@ -21,13 +21,13 @@ params { max_time = '6.h' // Input data - input = 'https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/samplesheet_v3.tsv' + input = params.pipelines_testdata_base_path + 'eager/testdata/Mammoth/samplesheet_v3.tsv' // Genome references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta' + fasta = params.pipelines_testdata_base_path + 'eager/reference/Mammoth/Mammoth_MT_Krause.fasta' // Metagenomics run_metagenomics = true metagenomics_profiling_tool = 'metaphlan' - metagenomics_profiling_database = 'https://github.com/nf-core/test-datasets/blob/eager/databases/metaphlan/metaphlan4_database.tar.gz' + metagenomics_profiling_database = params.pipelines_testdata_base_path + 'eager/databases/metaphlan/metaphlan4_database.tar.gz' } diff --git a/docs/output.md b/docs/output.md index 368b1d3d1..1f3598cad 100644 --- a/docs/output.md +++ b/docs/output.md @@ -362,7 +362,7 @@ MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produce You will receive output for each library. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. -The main output of MALT is the `.rma6` file format, which can be only loaded into MEGAN and it's related tools. +The main output of MALT is the `.rma6` file format, which can be only loaded into MEGAN and it's related tools. The rma-file is further processed by the taxpasta module to provide a standardised tabular output for the MEGAN classifications You will only receive the `.sam` files if you supply `--metagenomics_malt_savereads` parameters to the pipeline. @@ -380,7 +380,9 @@ You will only receive the `.sam` files if you supply `--metagenomics_malt_savere
-The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn however does not include raw counts by default. Intermediate Bowtie2 output `.bowtie2out.txt`, which presents a condensed representation of the mapping results of your sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line. +The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn however does not include raw counts by default. The profiling file is further processed by the taxpasta module to provide a standardised tabular output for the MetaPhlAn abundance estimates. + +Intermediate Bowtie2 output `.bowtie2out.txt`, which presents a condensed representation of the mapping results of your sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line. #### Kraken2 From edf23800ae79087c7bb6e6c8b1f8fbb5c7151830 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 11:16:34 +0200 Subject: [PATCH 188/198] update docs, include postprocessing in test_configs --- conf/test_malt.config | 1 + conf/test_metaphlan.config | 1 + docs/output.md | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/test_malt.config b/conf/test_malt.config index b1088f992..ae7cd89cb 100644 --- a/conf/test_malt.config +++ b/conf/test_malt.config @@ -30,4 +30,5 @@ params { run_metagenomics = true metagenomics_profiling_tool = 'malt' metagenomics_profiling_database = params.pipelines_testdata_base_path + '/eager/databases/malt/eager_test.tar.gz' + metagenomics_run_postprocessing = true } diff --git a/conf/test_metaphlan.config b/conf/test_metaphlan.config index 96ce48df0..89c34a2d3 100644 --- a/conf/test_metaphlan.config +++ b/conf/test_metaphlan.config @@ -30,4 +30,5 @@ params { run_metagenomics = true metagenomics_profiling_tool = 'metaphlan' metagenomics_profiling_database = params.pipelines_testdata_base_path + 'eager/databases/metaphlan/metaphlan4_database.tar.gz' + metagenomics_run_postprocessing = true } diff --git a/docs/output.md b/docs/output.md index 1f3598cad..ac6ed8d0f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -382,7 +382,7 @@ The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn however does not include raw counts by default. The profiling file is further processed by the taxpasta module to provide a standardised tabular output for the MetaPhlAn abundance estimates. -Intermediate Bowtie2 output `.bowtie2out.txt`, which presents a condensed representation of the mapping results of your sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line. +Raw counts can be inferred from the Bowtie2 output `.bowtie2out.txt`, which presents a condensed representation of the mapping results of the sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line. No hits to the marker genes result in an empty file. #### Kraken2
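A note on the URL refactor in PATCHES 187/188 above: building the test-data locations from `params.pipelines_testdata_base_path` also quietly fixes a broken link, since the old MetaPhlAn database URL used GitHub's `blob/` form, which serves an HTML page rather than the raw archive. The base-path parameter is presumably declared centrally in `nextflow.config` (not visible in this series); the sketch below shows the pattern, and the default value is an assumption taken from the nf-core template rather than a quote from these patches.

```nextflow
// nextflow.config (sketch; the exact default shown here is an assumption)
params {
    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/'
}

// A test profile such as conf/test_kraken2.config can then compose stable raw-content URLs:
params {
    input = params.pipelines_testdata_base_path + 'eager/testdata/Mammoth/samplesheet_v3.tsv'
}
```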
From 3fedfb3c49586ab1d815013664c166facb685b40 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 11:50:22 +0200 Subject: [PATCH 189/198] Update MALT docs, test_config and parameter validation --- conf/test_malt.config | 1 - docs/output.md | 2 +- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 8 ++++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/conf/test_malt.config b/conf/test_malt.config index ae7cd89cb..b1088f992 100644 --- a/conf/test_malt.config +++ b/conf/test_malt.config @@ -30,5 +30,4 @@ params { run_metagenomics = true metagenomics_profiling_tool = 'malt' metagenomics_profiling_database = params.pipelines_testdata_base_path + '/eager/databases/malt/eager_test.tar.gz' - metagenomics_run_postprocessing = true } diff --git a/docs/output.md b/docs/output.md index ac6ed8d0f..7596e64a8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -354,7 +354,7 @@ The saved files are the _good_ files, passing the `dust` or `entropy` filter treshold - `metagenomics_screening/profiling/malt/` - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_savereads`) - - `*.log`: LOG file containing runtime log of MALT. NOTE: If you are running parallel malt runs with `--metagenomics_malt_group_size` set above 0, your log files will be labelled with the name of _one_ of the input files run for each of the parallel executions. + - `*.log`: LOG files containing the log of the MALT execution. NOTE: If you are running parallel malt runs with `--metagenomics_malt_group_size` set above 0, you will obtain a log file named `__-run-malt-run.log` for each group of the parallel executions. The `_runtime_log_concatenated.log` file contains the concatenated logs of all the groups.
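The batching behind `--metagenomics_malt_group_size` described above is not shown in this series, but the documented behaviour (one MALT invocation, and therefore one run log, per group of input files) could be expressed with `collate()`. Everything below is a hypothetical sketch under that assumption, not the pipeline's actual code, and the channel names are invented.

```nextflow
// Hypothetical sketch only. With group_size > 0, reads are batched into
// groups of that size, giving one MALT job (and one per-group log) per batch;
// with the default of 0, everything goes into a single MALT run.
ch_malt_input = params.metagenomics_malt_group_size > 0
    ? ch_reads_for_profiling.collate( params.metagenomics_malt_group_size ) // emits lists of N files
    : ch_reads_for_profiling.collect()                                      // emits one list of all files
```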
diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 795ea8881..f43870378 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -226,6 +226,14 @@ def validateInputParameters() { if (params.metagenomics_complexity_entropy == 0.3) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: Currently no support for unmerged paired end reads inputs into Metagenomics subworkflow. Please rerun without --preprocessing_skippairmerging.") } + if ( + params.metagenomics_run_postprocessing && + params.metagenomics_profiling_tool == 'malt' && + ( + !params.metagenomics_maltextract_taxonlist || + !params.metagenomics_maltextract_ncbidir + ) + ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked MALT with postprocessing but didn't provide the required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags.") } if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } From 58cea8af6b92cc0662cd0eb8343774ca90586ae3 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 12:06:21 +0200 Subject: [PATCH 190/198] remove defaults from schema help_text entries --- nextflow_schema.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fa52ef8aa..1581ea3d8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -797,7 +797,7 @@ "default": 1, "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", "fa_icon": "fas fa-percent", - "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-top`" + "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\".\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-top`" }, "metagenomics_malt_minsupportmode": { "type": "string", @@ -819,14 +819,14 @@ "default": 1, "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained in malt or kraken.
Not compatible with --malt_min_support_mode 'percent'.", "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "For usage in malt: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'.Default: 1. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. \n\n> Modifies tool parameter(s):\n> - MALT: `-sup` \n" + "help_text": "For usage in malt: Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'.\nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. \n\n> Modifies tool parameter(s):\n> - MALT: `-sup` \n" }, "metagenomics_malt_maxqueries": { "type": "integer", "default": 100, "description": "Specify the maximum number of queries a read can have for MALT.", "fa_icon": "fas fa-phone", - "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-mq`" + "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded.\n\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MALT: `-mq`" }, "metagenomics_malt_memorymode": { "type": "string", @@ -847,7 +847,7 @@ "default": 0, "description": "Define how many fastq files should be submitted in the same malt run. Default value of 0 runs all files at once.", "fa_icon": "fas fa-barcode", - "help_text": "Very many (large) fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy and setting this value above the default (0) will spawn at minimum N/metagenomics_malt_group_size jobs where N is the number of samples. Please only use this if it is necessary to avoid runtime limits on your HPC cluster since the overhead of loading a database is high." + "help_text": "Very many (large) fastq files run through MALT at the same time can lead to excessively long runtimes. This parameter allows for parallelization of MALT runs. Please note, MALT is resource heavy: setting this value to N (above the default of 0) will spawn one MALT job for every N input files. Please only use this if it is necessary to avoid runtime limits on your HPC cluster since the overhead of loading a database is high." }, "metagenomics_run_postprocessing": { "type": "boolean", @@ -917,7 +917,7 @@ "type": "number", "default": 85, "description": "Minimum percent identity alignments are required to have to be reported as candidate reads. Recommended to set same as MALT parameter.", "help_text": "Minimum percent identity alignments are required to have to be reported.
Higher values allow fewer mismatches between the read and the reference sequence, and therefore provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and for divergence of a related strain/species from the reference. It is recommended to set this to the same value as the corresponding MALT parameter, or higher.\\n\\nOnly when `--metagenomics_profiling_tool malt` is also supplied.\n\n> Modifies tool parameter(s):\n> - MaltExtract: `--minPI`"
        },
        "metagenomics_maltextract_usetopalignment": {
            "type": "boolean",

From b6d313ae720a831238926dd7a41eb508a1a425f8 Mon Sep 17 00:00:00 2001
From: Merlin Szymanski
Date: Fri, 16 Aug 2024 12:12:40 +0200
Subject: [PATCH 191/198] Apply missed suggestions from code review

Co-authored-by: James A. Fellows Yates
---
 subworkflows/local/metagenomics_profiling.nf | 1 -
 workflows/eager.nf                           | 1 -
 2 files changed, 2 deletions(-)

diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf
index 053d56297..f21cb5590 100644
--- a/subworkflows/local/metagenomics_profiling.nf
+++ b/subworkflows/local/metagenomics_profiling.nf
@@ -62,7 +62,6 @@ workflow METAGENOMICS_PROFILING {
        // so all samples are in one run and so sample-specific metadata
        // unnecessary. Set as database name to prevent `null` job ID and prefix.

-        def label = file(params.metagenomics_profiling_database).getBaseName()

        // For the next step we need the number of analysis-groups for the specified number of input files

diff --git a/workflows/eager.nf b/workflows/eager.nf
index 1ddaaf5a7..05df7f53c 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -488,7 +488,6 @@ workflow EAGER {

    ch_genome_for_bedtools = SAMTOOLS_VIEW_GENOME.out.genome

-
    BEDTOOLS_COVERAGE_DEPTH(ch_bedtools_input.withfeature, ch_genome_for_bedtools)

    ch_versions = ch_versions.mix( SAMTOOLS_VIEW_GENOME.out.versions )

From d14ea294777fee8a716ea05a1548bc6a2f834b66 Mon Sep 17 00:00:00 2001
From: Merlin Szymanski
Date: Fri, 16 Aug 2024 13:17:36 +0200
Subject: [PATCH 192/198] update the manual tests file

---
 docs/development/manual_tests.md | 159 ++++++++++---------------------
 docs/output.md                   |   2 +-
 2 files changed, 53 insertions(+), 108 deletions(-)

diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md
index a2902ace7..4b6c475f4 100644
--- a/docs/development/manual_tests.md
+++ b/docs/development/manual_tests.md
@@ -534,58 +534,29 @@ nextflow run main.nf -profile docker,test --input ~/eager_dsl2_testing/input/onl

 ```bash
 #### Use bbduk to remove low complexity reads _without_ saving the intermediate files
+## Use kraken as example profiler
 ## Expect: NO additional directory created, but the files in the profiling directory contain the 'complexity' suffix

-nextflow run main.nf -profile test,docker \
-    --outdir ./out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool krakenuniq \
-    --metagenomics_profiling_database CUSTOM_KRAKEN_DB \
-    --run_metagenomics_complexityfiltering \
-    --metagenomics_complexity_tool bbduk
-```

-```bash
+
+nextflow run main.nf -profile test_krakenuniq --outdir ./out --run_metagenomics_complexityfiltering --metagenomics_complexity_tool bbduk
+
 #### Use bbduk to remove low complexity reads _with_ saving intermediate files
-## Expect: Additional directory created 'metagenomics_screening/complexity_filter/bbduk' that contains the fastq files
-## with 'complexity' postfix and a bbduk.log file for each library
-nextflow run main.nf -profile test,docker \
-    --outdir ./out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool krakenuniq \
-    --metagenomics_profiling_database CUSTOM_KRAKEN_DB \
-    --run_metagenomics_complexityfiltering \
-    --metagenomics_complexity_tool bbduk \
-    --metagenomics_complexity_savefastq
+## Expect: Additional directory created 'metagenomics/complexity_filter/bbduk' containing 'complexity' fastq.gz files and a bbduk.log file for each library
+
+nextflow run main.nf -profile test_krakenuniq --outdir ./out --run_metagenomics_complexityfiltering --metagenomics_complexity_tool bbduk --metagenomics_complexity_savefastq
 ```

-## Test prinseq
+##### Test prinseq

 ```bash
 #### Use prinseq to remove low complexity reads _without_ saving the intermediate files
 ## Expect: NO additional directory created, but the files in the profiling directory contain the 'complexity_good_out' postfix

-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool krakenuniq \
-    --metagenomics_profiling_database CUSTOM_KRAKEN_DB \
-    --run_metagenomics_complexityfiltering \
-    --metagenomics_complexity_tool prinseq
-```
+nextflow run main.nf -profile test_krakenuniq --outdir out --run_metagenomics_complexityfiltering --metagenomics_complexity_tool prinseq

-```bash
 #### Use prinseq to remove low complexity reads _with_ saving the intermediate files
-## Expect: Additional directory created 'metagenomics_screening/complexity_filter/prinseq' that contains the fastq files
-## with 'complexity_good_out' postfix and a 'complexity.log' file for each library
+## Expect: Additional directory created 'metagenomics/complexity_filter/prinseq' that contains the fastq files with 'complexity_good_out' postfix and a 'complexity.log' file for each library

-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool krakenuniq \
-    --metagenomics_profiling_database CUSTOM_KRAKEN_DB \
-    --run_metagenomics_complexityfiltering \
-    --metagenomics_complexity_tool prinseq
-    --metagenomics_complexity_savefastq
+nextflow run main.nf -profile test_krakenuniq --outdir out --run_metagenomics_complexityfiltering --metagenomics_complexity_tool prinseq --metagenomics_complexity_savefastq
 ```

 #### Profiling

 ##### metaphlan

 ```bash
-## metaphlan with default parameters
-## Expect:
+## Run metaphlan with default parameters
+## Expect: Directory created 'metagenomics/profiling/metaphlan' containing
+# _profile.txt, .bowtie2out.txt and .biom.txt for each library
+## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'

-nextflow run -resume ./main.nf -profile test,docker --outdir out \
---run_metagenomics --metagenomics_profiling_tool metaphlan --metagenomics_profiling_database ./runtest/metaphlandb/
-
-# 20230728: Works
+nextflow run main.nf -profile test_metaphlan --outdir out
 ```

 ##### krakenuniq

 ```bash
-#### Use krakenuniq for metagenomics sequence classification, save only report (default)
-## Use a custom Database with the -profile test dataset
-## Expect: Directory created 'metagenomics_screening/profiling/krakenuniq' that contains one 'krakenuniq.report' file for
-## each analyzed library
+## Run krakenuniq for metagenomics sequence classification, save only report (default)
+## Expect: Directory created 'metagenomics/profiling/krakenuniq' that contains one 'krakenuniq.report' file for each analyzed library
+## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'

-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool krakenuniq \
-    --metagenomics_profiling_database CUSTOM_KRAKEN_DB
+nextflow run main.nf -profile test_krakenuniq --outdir out

-#### Use krakenuniq for metagenomics sequence classification, save fastq files
-## Use a custom Database with the -profile test dataset
+## Use krakenuniq for metagenomics sequence classification, save fastq files
 ## Expect: Directory created 'metagenomics/profiling/krakenuniq' that contains:
 # - 'krakenuniq.report' file
 # - 'krakenuniq.classified.txt' file
 # - 'classified.fastq.gz' file
 # - 'unclassified.fastq.gz' file
 # for each analyzed library
+## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'

-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool krakenuniq \
-    --metagenomics_profiling_database CUSTOM_KRAKEN_DB \
-    --metagenomics_kraken2_savereads \
-    --metagenomics_kraken2_savereadclassifications
+nextflow run main.nf -profile test_krakenuniq --outdir out --metagenomics_kraken2_savereads --metagenomics_kraken2_savereadclassifications
 ```

 ##### kraken2

 ```bash
 #### Use kraken2 for metagenomics sequence classification, save only report (default)
-## Use a custom database with the -profile test dataset
-## Expect: Directory created 'metagenomics_screening/profiling/kraken2' that contains a 'kraken2.report' file
+## Expect: Directory created 'metagenomics/profiling/kraken2' that contains a 'kraken2.report' file
 ## for each analyzed library
+## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'

-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool kraken2 \
-    --metagenomics_profiling_database CUSTOM_KRAKEN2_DB
+nextflow run main.nf -profile test_kraken2 --outdir out

 #### Use kraken2 for metagenomics sequence classification, also save fastq files
-## Use a custom Database with the -profile test dataset
-## Expect: Directory created 'metagenomics_screening/profiling/kraken2' that contains:
+## Expect: Directory created 'metagenomics/profiling/kraken2' that contains:
 # - 'kraken2.report' file
 # - 'kraken2.classifiedreads.txt' file
 # - 'classified.fastq.gz' file
 # - 'unclassified.fastq.gz' file
 # for each analyzed library
+## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+

+nextflow run main.nf -profile test_kraken2 --outdir out --metagenomics_kraken2_savereads --metagenomics_kraken2_savereadclassifications
 ```

 ##### malt

 ```bash
 #### Use MALT for metagenomics sequence classification, save only report (default)
-## Use a custom database with the -profile test dataset
-## Expect: Directory created 'metagenomics_screening/profiling/malt' that contains a '.rma6' file for each analyzed library
-## and a single CUSTOM_MALT_DB-malt-run.log file
+## Expect: Directory created 'metagenomics/profiling/malt' that contains a '.rma6' file for each analyzed library
+## and the -malt-run.log files

-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool malt \
-    --metagenomics_profiling_database CUSTOM_MALT_DB
+nextflow run main.nf -profile test_malt --outdir out

 #### Use MALT for metagenomics sequence classification, save reads
-## Use a custom database with the -profile test dataset
-## Expect: Directory created 'metagenomics_screening/profiling/malt' that contains for each analyzed library:
+## Expect: Directory created 'metagenomics/profiling/malt' that contains for each analyzed library:
 # - a '.rma6' file
 # - a '.blastn.sam' file
-# and a single CUSTOM_MALT_DB-malt-run.log file
+# and the malt-run.log files

-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool malt \
-    --metagenomics_profiling_database CUSTOM_MALT_DB \
-    --metagenomics_malt_savereads
+nextflow run main.nf -profile test_malt --outdir out --metagenomics_malt_savereads
+
+#### Use MALT for metagenomics sequence classification, use the --metagenomics_malt_group_size flag
+## Expect: Directory created 'metagenomics/profiling/malt' that contains for each analyzed library:
+# - a '.rma6' file
+# - a '.blastn.sam' file
+# a concatenated malt-run.log file and, for each group, a separate log file
+
+nextflow run main.nf -profile test_malt --outdir out --metagenomics_malt_savereads --metagenomics_malt_group_size 2
+
+# Run MALT with missing parameters
+# Expect: Exit 1 and an informative error message
+nextflow run main.nf -profile test_malt --metagenomics_run_postprocessing --outdir out
 ```

 #### postprocessing

 ##### maltextract

 ```bash
 ### Create a SummaryTable from the Malt rma6 files
-# Expected: A directory 'metagenomics_screening/postprocessing/maltextract/results' see the docs for the content of this dir
-
-nextflow run main.nf -profile test,docker \
-    --outdir out \
-    --run_metagenomics \
-    --metagenomics_profiling_tool malt \
-    --metagenomics_profiling_database CUSTOM_MALT_DB \
-    --metagenomics_run_postprocessing \
-    --metagenomics_maltextract_ncbidir NCBI_DIR \
-    --metagenomics_maltextract_taxonlist TAXONLISTFILE
-
+# Expect: A directory 'metagenomics/postprocessing/maltextract//'
+# Expect: A directory 'metagenomics/postprocessing/megan_summaries'
+# Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+# Also there will be an AMPS error which occurs if no taxa are found... That is why it is ignored, so it does not break the pipeline
+nextflow run main.nf -profile test_malt --outdir out --metagenomics_run_postprocessing --metagenomics_maltextract_ncbidir NCBI_DIR --metagenomics_maltextract_taxonlist TAXONLISTFILE

 # for generating test data
 mkdir testing && cd testing

diff --git a/docs/output.md b/docs/output.md
index 7596e64a8..30eaf5490 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -420,7 +420,7 @@ You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply

 The main taxonomic classification file from KrakenUniq is the `*report.txt` file. This is an extension of the Kraken2 report with the additional k-mer coverage information that provides more information about the accuracy of hits.

-You will only receive the `*.fastq.gz` and `*.classifiedreads.txt` file if you supply `--metagenomics_kraken2_savereads` and/or `--metagenomics_kraken_save_readclassifications` parameters to the pipeline.
+You will only receive the `*.fastq.gz` and `*.classifiedreads.txt` file if you supply `--metagenomics_kraken2_savereads` and/or `--metagenomics_kraken2_savereadclassifications` parameters to the pipeline.

 :::info
 The output system of KrakenUniq can result in other `stdout` or `stderr` logging information being saved in the report file, therefore you must check your report files before downstream use!
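For orientation between these two patches: the `test_krakenuniq`, `test_kraken2`, `test_malt` and `test_metaphlan` profiles used above replace the long parameter lists of the old commands. The sketch below shows what such a profile could plausibly contain, inferred only from the flags the removed commands set explicitly; the profile body, the `includeConfig` line and the database value are assumptions, not the pipeline's verified configuration.

```nextflow
// Hypothetical sketch of a per-profiler test profile (not the actual config).
// All values are inferred from the command lines removed in the patch above.
profiles {
    test_krakenuniq {
        includeConfig 'conf/test.config' // assumed: reuse the base test dataset
        params {
            run_metagenomics                = true
            metagenomics_profiling_tool     = 'krakenuniq'
            metagenomics_profiling_database = 'CUSTOM_KRAKEN_DB' // placeholder
        }
    }
}
```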
From f9ca61d678223f590a9b44e21f333ad3c9ac7ebb Mon Sep 17 00:00:00 2001
From: Merlin Szymanski
Date: Fri, 16 Aug 2024 13:29:18 +0200
Subject: [PATCH 193/198] Update manual tests, update the docs, fix taxpasta
 output name

---
 conf/modules.config              |  4 ++--
 docs/development/manual_tests.md | 12 ++++++------
 docs/output.md                   | 20 ++++++++++----------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index ab346297c..1b1d46466 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -985,7 +985,7 @@ process {
             mode: params.publish_dir_mode,
             pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}'
         ]
-        ext.args = { "--profiler ${meta.profiler} --output taxpasta_table.tsv" }
+        ext.args = { "--profiler ${meta.profiler} --output ${meta.profiler}_taxpasta_table.tsv" }
     }

     withName: TAXPASTA_STANDARDISE {
@@ -994,7 +994,7 @@ process {
             mode: params.publish_dir_mode,
             pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}'
         ]
-        ext.args = { "--profiler ${meta.profiler} --output taxpasta_table.tsv" }
+        ext.args = { "--profiler ${meta.profiler} --output ${meta.profiler}_taxpasta_table.tsv" }
     }

     withName: 'QUALIMAP_BAMQC_WITHBED|QUALIMAP_BAMQC_NOBED' {

diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md
index 4b6c475f4..09ad0f468 100644
--- a/docs/development/manual_tests.md
+++ b/docs/development/manual_tests.md
@@ -567,7 +567,7 @@
 ## Run metaphlan with default parameters
 ## Expect: Directory created 'metagenomics/profiling/metaphlan' containing
 # _profile.txt, .bowtie2out.txt and .biom.txt for each library
-## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+## Expect: 'metaphlan_taxpasta_table.tsv' in 'metagenomics/postprocessing/taxpasta/'

 nextflow run main.nf -profile test_metaphlan --outdir out
 ```
@@ -577,7 +577,7 @@
 ## Run krakenuniq for metagenomics sequence classification, save only report (default)
 ## Expect: Directory created 'metagenomics/profiling/krakenuniq' that contains one 'krakenuniq.report' file for each analyzed library
-## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+## Expect: 'krakenuniq_taxpasta_table.tsv' in 'metagenomics/postprocessing/taxpasta/'

 nextflow run main.nf -profile test_krakenuniq --outdir out
@@ -588,7 +588,7 @@
 # - 'classified.fastq.gz' file
 # - 'unclassified.fastq.gz' file
 # for each analyzed library
-## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+## Expect: 'krakenuniq_taxpasta_table.tsv' in 'metagenomics/postprocessing/taxpasta/'

 nextflow run main.nf -profile test_krakenuniq --outdir out --metagenomics_kraken2_savereads --metagenomics_kraken2_savereadclassifications
 ```
@@ -599,7 +599,7 @@
 #### Use kraken2 for metagenomics sequence classification, save only report (default)
 ## Expect: Directory created 'metagenomics/profiling/kraken2' that contains a 'kraken2.report' file
 ## for each analyzed library
-## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+## Expect: 'kraken2_taxpasta_table.tsv' in 'metagenomics/postprocessing/taxpasta/'

 nextflow run main.nf -profile test_kraken2 --outdir out
@@ -610,7 +610,7 @@ nextflow run main.nf -profile test_kraken2 --outdir out
 # - 'classified.fastq.gz' file
 # - 'unclassified.fastq.gz' file
 # for each analyzed library
-## Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+## Expect: 'kraken2_taxpasta_table.tsv' in 'metagenomics/postprocessing/taxpasta/'

 nextflow run main.nf -profile test_kraken2 --outdir out --metagenomics_kraken2_savereads --metagenomics_kraken2_savereadclassifications

@@ -654,7 +654,7 @@ nextflow run main.nf -profile test_malt --metagenomics_run_postprocessing --outd
 ### Create a SummaryTable from the Malt rma6 files
 # Expect: A directory 'metagenomics/postprocessing/maltextract//'
 # Expect: A directory 'metagenomics/postprocessing/megan_summaries'
-# Expect: 'taxpasta_table.csv' in 'metagenomics/postprocessing/taxpasta/'
+# Expect: 'megan6_taxpasta_table.tsv' in 'metagenomics/postprocessing/taxpasta/'
 # Also there will be an AMPS error which occurs if no taxa are found... That is why it is ignored, so it does not break the pipeline
 nextflow run main.nf -profile test_malt --outdir out --metagenomics_run_postprocessing --metagenomics_maltextract_ncbidir NCBI_DIR --metagenomics_maltextract_taxonlist TAXONLISTFILE

diff --git a/docs/output.md b/docs/output.md
index 30eaf5490..01627704e 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -307,7 +307,7 @@ You may also receive the files above if metagenomic screening is turned on.
Output files -- `metagenomics_screening/complexity_filter/bbduk` +- `metagenomics/complexity_filter/bbduk` - `*_complexity.fastq.gz`: FASTQ file containing the complexity filtered reads - `*.log`: LOG file containing filter stats @@ -327,7 +327,7 @@ Using complexity-filtered fastq-files as input for metagenomic classifiers can r
Output files -- `metagenomics_screening/complexity_filter/prinseq` +- `metagenomics/complexity_filter/prinseq` - `*_complexity_good_out.fastq.gz`: FASTQ file containing the complexity filtered reads - `*_complexity.log`: LOG file containing filter stats @@ -351,7 +351,7 @@ The saved files are the _good_ files, passing the `dust` or `entropy` filter tre
Output files -- `metagenomics_screening/profiling/malt/` +- `metagenomics/profiling/malt/` - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer - `.blastn.sam`: sparse SAM file containing alignments of each hit (if `--metagenomics_malt_savereads`) - `*.log`: LOG files containing the log of the MALT execution. NOTE: If you are running parallel malt runs with `--metagenomics_malt_group_size` set above 0, you will obtain a log file named `__-run-malt-run.log` for each group of the parallel executions. The `_runtime_log_concatenated.log` file contains the concatenated logs of all the groups. @@ -373,7 +373,7 @@ You will only receive the `.sam` files if you supply `--metagenomics_malt_savere
Output files -- `metagenomics_screening/profiling/metaphlan/` +- `metagenomics/profiling/metaphlan/` - `.biom`: taxonomic profile in BIOM format - `.bowtie2out.txt`: BowTie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn with different parameters) - `_profile.txt`: MetaPhlAn taxonomic profile including abundance estimates @@ -391,7 +391,7 @@ Raw counts can be inferred from the Bowtie2 output `.bowtie2out.txt`, which pres
Output files

-- `metagenomics_screening/profiling/kraken2/`
+- `metagenomics/profiling/kraken2/`
  - `.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
  - `.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
  - `.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 8 columns rather than 6 if `--metagenomics_kraken2_saveminimizers` is specified.

@@ -410,7 +410,7 @@ You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply
Output files

-- `metagenomics_screening/profiling/krakenuniq/`
+- `metagenomics/profiling/krakenuniq/`
  - `.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
  - `.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
  - `.report.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits, with an additional column for k-mer coverage, which allows for more accurate distinguishing between false-positive and true-positive hits

@@ -430,13 +430,13 @@ The output system of KrakenUniq can result in other `stdout` or `stderr` logging

 #### taxpasta

-The output created by the `taxpasta merge` command. It combines the results of all the samples analyzed with a given metagenomic classifer by nf-core/eager in a standardised tabular taxon-table format. The file provides an overview of the classification results for all samples combined
+The output created by the `taxpasta merge` or `taxpasta standardise` command. It combines the results of all the samples analyzed with a given metagenomic classifier by nf-core/eager in a standardised tabular taxon-table format. The file provides an overview of the classification results for all samples combined.
Output files -- `metagenomics_screening/postprocessing/taxpasta/` - - `{metaphlan,krakenuniq,kraken2}_profiles_all_samples_merged.txt` +- `metagenomics/postprocessing/taxpasta/` + - `{metaphlan,krakenuniq,kraken2,megan6}_taxpasta_table.tsv`
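For orientation: given the `ext.args` fixed in the patch above, the underlying taxpasta invocation is along the following lines. This is a sketch only; the input report names are invented for illustration, and `taxpasta standardise` is the corresponding single-report variant.

```bash
# Sketch of the merge call implied by the TAXPASTA_MERGE ext.args above;
# sample report file names are illustrative, not pipeline outputs.
taxpasta merge \
    --profiler kraken2 \
    --output kraken2_taxpasta_table.tsv \
    sample1.kraken2.report.txt sample2.kraken2.report.txt
```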
@@ -447,7 +447,7 @@ The output directory for maltExtract, as implemented under [HOPS](https://github
Output files -- `metagenomics_screening/postprocessing/maltextract/` +- `metagenomics/postprocessing/maltextract/` - `results`: Results output by maltextract - `default`: Directory containing summary TSV tables for all reads - `ancient`: Directory containing summary TSV tables for reads with evidence of aDNA damage From 9d648a19e3d35f00860562c7f4cb42955f5f5f12 Mon Sep 17 00:00:00 2001 From: Merlin Szymanski Date: Fri, 16 Aug 2024 13:38:11 +0200 Subject: [PATCH 194/198] fix linting --- CITATIONS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATIONS.md b/CITATIONS.md index e32c75c4f..41c130a70 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -153,7 +153,7 @@ - [TAXPASTA](https://doi.org/10.21105/joss.05627) > Beber et al., (2023). TAXPASTA: TAXonomic Profile Aggregation and STAndardisation. Journal of Open Source Software, 8(87), 5627, doi: [10.21105/joss.05627](https://doi.org/10.21105/joss.05627) - + - [CircularMapper](https://doi.org/10.1186/s13059-016-0918-z) > Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. doi: [10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z) From bb98c5b96fe21a6ac92c3805b739f1cf76dea00b Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 2 Sep 2024 10:51:57 +0200 Subject: [PATCH 195/198] Missing sentences --- docs/output.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index bc82dc821..32b9f150b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -336,7 +336,9 @@ You may also receive the files above if metagenomic screening is turned on. ### Metagenomics Complexity Filtering -#### Bbduk +#### BBDuk + +[BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) “Duk” stands for Decontamination Using Kmers. BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single high-performance tool.
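For orientation: a minimal BBDuk entropy-filtering call of the kind the pipeline wraps is sketched below. The entropy threshold shown is an assumed example value, not a documented pipeline default, and the file names are illustrative.

```bash
# Illustrative bbduk.sh entropy filter; BBDuk prints its filtering stats
# to stderr, captured here as the per-library log.
bbduk.sh \
    in=library.fastq.gz \
    out=library_complexity.fastq.gz \
    entropy=0.3 \
    2> library.bbduk.log
```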
Output files @@ -358,6 +360,8 @@ Using complexity-filtered fastq-files as input for metagenomic classifiers can r #### PRINSEQ++ +[PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) is a C++ implementation of the prinseq-lite.pl program. It can be used to filter, reformat or trim genomic and metagenomic sequence data. +
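For orientation: the corresponding PRINSEQ++ entropy filter could be invoked as below. The threshold is again an assumed example value; the `-out_name` prefix mirrors the `*_complexity_good_out.fastq.gz` naming described in this document.

```bash
# Illustrative prinseq++ call; reads passing the entropy threshold are
# written to the '_good_out' file, matching the outputs listed above.
prinseq++ \
    -fastq library.fastq.gz \
    -lc_entropy=0.3 \
    -out_name library_complexity
```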
Output files From a79f1cde6a2cdd3a897c1d871a5becc26084cff9 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 2 Sep 2024 10:54:48 +0200 Subject: [PATCH 196/198] Reduce redundant declarations --- subworkflows/local/metagenomics.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index d5b4d3de9..5b9104e35 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -3,10 +3,11 @@ include { METAGENOMICS_PROFILING } from './metagenomics_profiling' include { METAGENOMICS_POSTPROCESSING } from './metagenomics_postprocessing' workflow METAGENOMICS { - take: ch_bamfiltered_for_metagenomics - take: ch_database - take: ch_tax_list - take: ch_ncbi_dir + take: + ch_bamfiltered_for_metagenomics + ch_database + ch_tax_list + ch_ncbi_dir main: // Define channels From cfbba4d61aef57665c921123fe3c213a363d1d58 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 2 Sep 2024 11:00:23 +0200 Subject: [PATCH 197/198] Fix linting --- subworkflows/local/metagenomics.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/metagenomics.nf b/subworkflows/local/metagenomics.nf index 5b9104e35..8b742c7b2 100644 --- a/subworkflows/local/metagenomics.nf +++ b/subworkflows/local/metagenomics.nf @@ -3,7 +3,7 @@ include { METAGENOMICS_PROFILING } from './metagenomics_profiling' include { METAGENOMICS_POSTPROCESSING } from './metagenomics_postprocessing' workflow METAGENOMICS { - take: + take: ch_bamfiltered_for_metagenomics ch_database ch_tax_list From 976c5cffd3c3e95b0c22fbf05a2bb4e17c894b56 Mon Sep 17 00:00:00 2001 From: Selina Carlhoff <73653549+scarlhoff@users.noreply.github.com> Date: Fri, 13 Sep 2024 14:50:48 +0200 Subject: [PATCH 198/198] Apply suggestions from code review Co-authored-by: Thiseas C. Lamnidis --- conf/modules.config | 6 +++--- workflows/eager.nf | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f91a741c0..aa4c9eba7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1006,7 +1006,7 @@ process { tag = { "${meta.reference}|${meta.sample_id}" } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/final_bams/for_genotyping/" }, + path: { "${params.outdir}/final_bams/${params.genotyping_source}/" }, mode: params.publish_dir_mode, pattern: '*.bam' ] @@ -1017,7 +1017,7 @@ process { ext.args = { params.fasta_largeref ? 
"-c" : "" } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/final_bams/for_genotyping/" }, + path: { "${params.outdir}/final_bams/${params.genotyping_source}/" }, mode: params.publish_dir_mode, pattern: '*.{bai,csi}' ] @@ -1027,7 +1027,7 @@ process { tag = { "${meta.reference}|${meta.sample_id}" } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/final_bams/for_genotyping/" }, + path: { "${params.outdir}/final_bams/${params.genotyping_source}/" }, mode: params.publish_dir_mode, pattern: '*.flagstat' ] diff --git a/workflows/eager.nf b/workflows/eager.nf index a6db6da16..65bf4c407 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -554,7 +554,7 @@ workflow EAGER { MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking ) ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions ) - ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams + ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_merged_dedup_bams // SUBWORKFLOW: merge libraries for genotyping MERGE_LIBRARIES_GENOTYPING ( ch_bams_for_library_merge )