diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4a9bc5c79..4ecfbfe33 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint", + "python.linting.pylintPath": "/opt/conda/bin/pylint" }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], - }, - }, + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } } diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 8611458a7..8a3300450 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -64,4 +64,4 @@ jobs: env: NXF_SINGULARITY_CACHEDIR: ./ NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results + run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results diff --git a/.prettierignore b/.prettierignore index 6f762db0b..30f2a16d8 100644 --- a/.prettierignore +++ b/.prettierignore @@ -12,3 +12,4 @@ testing* bin/ *.cff test/ +dev_docs.md diff --git a/conf/modules.config b/conf/modules.config index 45fdb19c8..ea1d8d927 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -883,4 +883,47 @@ process { enabled: true ] } + + // + // LIBRARY MERGE + // + + withName: SAMTOOLS_MERGE_LIBRARIES { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" } + publishDir = [ + enabled: 
false + ] + } + + withName: SAMTOOLS_SORT_MERGED_LIBRARIES { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/final_bams/" }, + mode: params.publish_dir_mode, + pattern: '*.bam' + ] + } + + withName: SAMTOOLS_INDEX_MERGED_LIBRARIES { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.args = { params.fasta_largeref ? "-c" : "" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/final_bams/" }, + mode: params.publish_dir_mode, + pattern: '*.{bai,csi}' + ] + } + + withName: SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/final_bams/" }, + mode: params.publish_dir_mode, + pattern: '*.flagstat' + ] + } } diff --git a/docs/development/dev_docs.md b/docs/development/dev_docs.md index 767fb35e6..e7c0a0f57 100644 --- a/docs/development/dev_docs.md +++ b/docs/development/dev_docs.md @@ -9,36 +9,26 @@ To add new input files or options to the reference sheet, you have to complete a 1. Add your new parameter to nextflow.config. 2. Add parameter description to schema (nf-core schema build). 3. Read in new parameter (params.) as input within the reference_indexing_single local subworkflow. - 1. Add new line to the large `.map{}` operation starting on [line 80(https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_single.nf#L80)] and add check if the file exists. - `def = params. != null ? file(params., checkIfExists: true ) : ""` - 2. Add to the result of the map operation. Double-check the order! - 3. With the `ch_ref_index_single.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. - `: [ meta, ]` - 4. Add your ch_ref_index_single. 
to the final emit. - ` = ch_ref_index_single.` + 1. Add new line to the large `.map{}` operation starting on [line 80](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_single.nf#L80) and add check if the file exists. `def = params. != null ? file(params., checkIfExists: true ) : ""` + 2. Add to the result of the map operation. Double-check the order! + 3. With the `ch_ref_index_single.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. `: [ meta, ]` + 4. Add your ch_ref_index_single. to the final emit. ` = ch_ref_index_single.` ### Multi-reference input workflow 1. Add new column named and test data to the test reference sheet (https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv). 2. Read in new input within the reference_indexing_multi local subworkflow. - 1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate. - `def = row[""] != "" ? file(row[""], checkIfExists: true) : ""` - 2. Add to the result of the `.map{}` operation. Double-check the order! - 3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. - `: [ meta, ]` - 4. Add ch_input_from_referencesheet. to the final emit. - ` = ch_input_from_referencesheet.` + 1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate. `def = row[""] != "" ? file(row[""], checkIfExists: true) : ""` + 2. 
Add to the result of the `.map{}` operation. Double-check the order! + 3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. `: [ meta, ]` + 4. Add ch_input_from_referencesheet. to the final emit. ` = ch_input_from_referencesheet.` ### Combining in the Reference Indexing workflow 1. Add you new parameter channel to the `if` condition selecting between the direct parameter input or the reference sheet input. - 1. below "REFERENCE_INDEXING_MULTI" for reference sheet input - ` = REFERENCE_INDEXING_MULTI.out.` - 2. below "REFERENCE_INDEXING_SINGLE" - ` = REFERENCE_INDEXING_SINGLE.out.` - 3. Filter out options that have not been provided. - ` = .filter{ it[1] != "" }` - 4. Add unzipping of zipped input files with GUNZIP. - 5. Add to the final emit. - ` = ` - 6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.`. + 1. below "REFERENCE_INDEXING_MULTI" for reference sheet input ` = REFERENCE_INDEXING_MULTI.out.` + 2. below "REFERENCE_INDEXING_SINGLE" ` = REFERENCE_INDEXING_SINGLE.out.` + 3. Filter out options that have not been provided. ` = .filter{ it[1] != "" }` + 4. Add unzipping of zipped input files with GUNZIP. + 5. Add to the final emit. ` = ` + 6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.`. 
diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 457172f3f..78899fa80 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -102,6 +102,14 @@ Tool Specific combinations - All together + - Library merge + + - single reference: no damage manipulation ✅ + - single reference: with damage manipulation, on raw data ✅ + - single reference: with damage manipulation (trimming), on trimmed data ✅ + - single reference: with damage manipulation (pmd + trimming), on pmd filtered data ✅ + - multi reference: no damage manipulation ✅ + ### Multi-reference tests ```bash @@ -667,3 +675,52 @@ nextflow run . -profile test,docker \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 ``` + +### LIBRARY_MERGE + +```bash +## Library merge on single reference, no damage manipulation. +## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 6 files total. +## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782). +## Also, check that the bams merged are the deduplication output. +## NOTE: JK2782 seems to have some PG tags repeated, as they apply to each input file separately. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## Library merge on single reference, merge trimmed bams. +## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 6 files total. +## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782). +## Also, check that the bams merged are trimmed. (JK2802 is full udg, but header confirms merged bam is "trimmed") +## NOTE: JK2782 seems to have some PG tags repeated, as they apply to each input file separately. 
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --genotyping_source 'trimmed' -ansi-log false -dump-channels \ + --run_trim_bam \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 +``` + +```bash +## Library merge on single reference, merge pmd bams. Trimming ran but not used downstream. +## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 6 files total. +## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782). +## Also, check that the bams merged are the pmd ones. +## NOTE: JK2782 seems to have some PG tags repeated, as they apply to each input file separately. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --genotyping_source 'pmd' -ansi-log false -dump-channels \ + --run_trim_bam \ + --run_pmd_filtering \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 +``` + +```bash +## Library merge on multi reference. No damage manipulation. +## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 15 files total. (2 refs * 2 samples * 3 files) + BAM input only on one reference (+3) +## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782). +## Also, check that the bams merged are the dedupped ones. +## NOTE: PG tags are repeated for each chromosome in the reference, times each library! Maybe there's some flag missing from samtools MERGE runs? 
+nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume --genotyping_source 'raw' -ansi-log false -dump-channels +``` diff --git a/subworkflows/local/merge_libraries.nf b/subworkflows/local/merge_libraries.nf new file mode 100644 index 000000000..b4b0e473a --- /dev/null +++ b/subworkflows/local/merge_libraries.nf @@ -0,0 +1,52 @@ +// +// Merge libraries of the same sample, then sort, index, and flagstat the merged bam +// + +include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LIBRARIES } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/flagstat/main' + +workflow MERGE_LIBRARIES { + take: + ch_bam_bai // [ [ meta ], bam , bai ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + ch_library_merge_input = ch_bam_bai + .map { WorkflowEager.addNewMetaFromAttributes( it, ["id", "sample_id", "strandedness", "reference"], ["id", "sample_id", "strandedness", "reference"], false ) } + .groupTuple(by: 0) + // Discard library-level metas, and bais. 
Add single_end: true to all metas (no SE/PE distinction from here on) + .map { + meta, lib_metas, bam, bai -> + [ meta + [ 'single_end': true ], bam ] + } + + SAMTOOLS_MERGE_LIBRARIES ( ch_library_merge_input, [[], []], [[], []] ) + ch_versions = ch_versions.mix( SAMTOOLS_MERGE_LIBRARIES.out.versions.first() ) + + SAMTOOLS_SORT_MERGED_LIBRARIES ( SAMTOOLS_MERGE_LIBRARIES.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_SORT_MERGED_LIBRARIES.out.versions.first() ) + + SAMTOOLS_INDEX_MERGED_LIBRARIES ( SAMTOOLS_SORT_MERGED_LIBRARIES.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX_MERGED_LIBRARIES.out.versions.first() ) + + // Join merged sample-level bams and their bais for genotyping + ch_merged_bams = SAMTOOLS_SORT_MERGED_LIBRARIES.out.bam + .join( SAMTOOLS_INDEX_MERGED_LIBRARIES.out.bai ) + + // Not sure if FLAGSTAT is really needed, but added here for completeness + SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES ( ch_merged_bams ) + ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.versions.first() ) + + ch_merged_flagstat = SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.flagstat + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.flagstat ) + + emit: + bam_bai = ch_merged_bams // [ [ meta ], bam , bai ] + flagstat = ch_merged_flagstat // [ [ meta ], flagstat ] + versions = ch_versions + mqc = ch_multiqc_files // Same as flagstat +} diff --git a/workflows/eager.nf b/workflows/eager.nf index 79d58d28b..7cded4a80 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -71,6 +71,7 @@ include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter' include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' +include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries' /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -530,11 +531,22 @@ workflow EAGER { MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking ) ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions ) - ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams + ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams } else { - ch_bams_for_genotyping = ch_dedupped_bams + ch_bams_for_library_merge = ch_dedupped_bams } + // + // SUBWORKFLOW: MERGE LIBRARIES + // + + // The bams being merged are always the ones specified by params.genotyping_source, + // unless the user skipped damage manipulation, in which case it is the DEDUPLICATION output. + MERGE_LIBRARIES ( ch_bams_for_library_merge ) + ch_versions = ch_versions.mix( MERGE_LIBRARIES.out.versions ) + ch_bams_for_genotyping = MERGE_LIBRARIES.out.bam_bai + ch_multiqc_files = ch_multiqc_files.mix( MERGE_LIBRARIES.out.mqc.collect{it[1]}.ifEmpty([]) ) // Not sure if this is needed, or if it needs to be moved to line 564? + // // MODULE: MultiQC //