From fd9776e65f95ca644795876350513bf8ea001cc7 Mon Sep 17 00:00:00 2001
From: Thiseas Christos Lamnidis
Date: Tue, 30 Jan 2024 15:38:47 +0100
Subject: [PATCH] Add library merge SWF

---
 conf/modules.config                   | 43 ++++++++++++++++++++++
 subworkflows/local/merge_libraries.nf | 53 +++++++++++++++++++++++++++
 workflows/eager.nf                    | 16 +++++++-
 3 files changed, 110 insertions(+), 2 deletions(-)
 create mode 100644 subworkflows/local/merge_libraries.nf

diff --git a/conf/modules.config b/conf/modules.config
index 45fdb19c8..008837472 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -883,4 +883,47 @@ process {
             enabled: true
         ]
     }
+
+    //
+    // LIBRARY MERGE
+    //
+
+    withName: SAMTOOLS_MERGE_LIBRARIES {
+        tag = { "${meta.reference}|${meta.sample_id}" }
+        ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" }
+        publishDir = [
+            enabled: false
+        ]
+    }
+
+    withName: SAMTOOLS_SORT_MERGED_LIBRARIES {
+        tag = { "${meta.reference}|${meta.sample_id}" }
+        ext.prefix = { "${meta.sample_id}_${meta.reference}" }
+        publishDir = [
+            path: { "${params.outdir}/library_merge/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.bam'
+        ]
+    }
+
+    withName: SAMTOOLS_INDEX_MERGED_LIBRARIES {
+        tag = { "${meta.reference}|${meta.sample_id}" }
+        ext.args = { params.fasta_largeref ? "-c" : "" }
+        ext.prefix = { "${meta.sample_id}_${meta.reference}" }
+        publishDir = [
+            path: { "${params.outdir}/library_merge/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{bai,csi}'
+        ]
+    }
+
+    withName: SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES {
+        tag = { "${meta.reference}|${meta.sample_id}" }
+        ext.prefix = { "${meta.sample_id}_${meta.reference}" }
+        publishDir = [
+            path: { "${params.outdir}/library_merge/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.flagstat'
+        ]
+    }
 }
diff --git a/subworkflows/local/merge_libraries.nf b/subworkflows/local/merge_libraries.nf
new file mode 100644
index 000000000..aca440d66
--- /dev/null
+++ b/subworkflows/local/merge_libraries.nf
@@ -0,0 +1,53 @@
+//
+// Merge libraries of the same sample, then sort, index, and flagstat the merged bam
+//
+
+include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LIBRARIES } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_SORT as SAMTOOLS_SORT_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/sort/main'
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/index/main'
+include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/flagstat/main'
+
+workflow MERGE_LIBRARIES {
+    take:
+    ch_bam_bai // [ [ meta ], bam , bai ]
+
+    main:
+    ch_versions = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    ch_library_merge_input = ch_bam_bai
+        // TODO add 'id_index' to final meta? (once that also gets added to bam input). Maybe also keep SE/PE? (for now, we assume SE, see comment below)
+        .map { WorkflowEager.addNewMetaFromAttributes( it, ["id", "sample_id", "strandedness", "reference"], ["id", "sample_id", "strandedness", "reference"], false ) }
+        .groupTuple(by: 0)
+        // Discard library-level metas and bais. Add single_end: true to all metas (no SE/PE distinction at this point, right?)
+        .map {
+            meta, lib_metas, bam, bai ->
+                [ meta + [ 'single_end':true ], bam ]
+        }
+
+    SAMTOOLS_MERGE_LIBRARIES ( ch_library_merge_input, [], [] )
+    ch_versions = ch_versions.mix( SAMTOOLS_MERGE_LIBRARIES.out.versions.first() )
+
+    SAMTOOLS_SORT_MERGED_LIBRARIES ( SAMTOOLS_MERGE_LIBRARIES.out.bam )
+    ch_versions = ch_versions.mix( SAMTOOLS_SORT_MERGED_LIBRARIES.out.versions.first() )
+
+    SAMTOOLS_INDEX_MERGED_LIBRARIES ( SAMTOOLS_SORT_MERGED_LIBRARIES.out.bam )
+    ch_versions = ch_versions.mix( SAMTOOLS_INDEX_MERGED_LIBRARIES.out.versions.first() )
+
+    // Join merged sample-level bams and their index for genotyping (csi when params.fasta_largeref makes the indexing step run with -c, bai otherwise)
+    ch_merged_bams = SAMTOOLS_SORT_MERGED_LIBRARIES.out.bam
+        .join( params.fasta_largeref ? SAMTOOLS_INDEX_MERGED_LIBRARIES.out.csi : SAMTOOLS_INDEX_MERGED_LIBRARIES.out.bai )
+
+    // Not sure if FLAGSTAT is really needed, but added here for completeness
+    SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES ( ch_merged_bams )
+    ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.versions.first() )
+
+    ch_merged_flagstat = SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.flagstat
+    ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.flagstat )
+
+    emit:
+    bam_bai = ch_merged_bams // [ [ meta ], bam , bai/csi ]
+    flagstat = ch_merged_flagstat // [ [ meta ], flagstat ]
+    versions = ch_versions
+    mqc = ch_multiqc_files // Same as flagstat
+}
diff --git a/workflows/eager.nf b/workflows/eager.nf
index 89a39b252..f829f4089 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -71,6 +71,7 @@ include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate
 include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter'
 include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination'
 include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage'
+include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -530,11 +531,22 @@ workflow EAGER {
         MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking )
         ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) )
         ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions )
-        ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams
+        ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams
     } else {
-        ch_bams_for_genotyping = ch_dedupped_bams
+        ch_bams_for_library_merge = ch_dedupped_bams
     }
 
+    //
+    // SUBWORKFLOW: MERGE LIBRARIES
+    //
+
+    // The bams being merged are always the ones specified by params.genotyping_source,
+    // unless the user skipped damage manipulation, in which case it is the DEDUPLICATION output.
+    MERGE_LIBRARIES ( ch_bams_for_library_merge )
+    ch_versions = ch_versions.mix( MERGE_LIBRARIES.out.versions )
+    ch_bams_for_genotyping = MERGE_LIBRARIES.out.bam_bai
+    ch_multiqc_files = ch_multiqc_files.mix( MERGE_LIBRARIES.out.mqc.collect{it[1]}.ifEmpty([]) ) // Not sure if this is needed, or if it needs to be moved to line 564?
+
     //
     // MODULE: MultiQC
     //