Commit

Merge pull request #1043 from nf-core/dsl2_library_merge
DSL2: library merge
TCLamnidis authored Feb 2, 2024
2 parents f7bbce7 + eaca1d5 commit 63bf3e7
Showing 8 changed files with 186 additions and 31 deletions.
8 changes: 4 additions & 4 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -18,11 +18,11 @@
"python.linting.flake8Path": "/opt/conda/bin/flake8",
"python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle",
"python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle",
"python.linting.pylintPath": "/opt/conda/bin/pylint",
"python.linting.pylintPath": "/opt/conda/bin/pylint"
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"],
},
},
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"]
}
}
}
2 changes: 1 addition & 1 deletion .github/workflows/download_pipeline.yml
@@ -64,4 +64,4 @@ jobs:
env:
NXF_SINGULARITY_CACHEDIR: ./
NXF_SINGULARITY_HOME_MOUNT: true
run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results
1 change: 1 addition & 0 deletions .prettierignore
@@ -12,3 +12,4 @@ testing*
bin/
*.cff
test/
dev_docs.md
43 changes: 43 additions & 0 deletions conf/modules.config
@@ -883,4 +883,47 @@ process {
enabled: true
]
}

//
// LIBRARY MERGE
//

withName: SAMTOOLS_MERGE_LIBRARIES {
tag = { "${meta.reference}|${meta.sample_id}" }
ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" }
publishDir = [
enabled: false
]
}

withName: SAMTOOLS_SORT_MERGED_LIBRARIES {
tag = { "${meta.reference}|${meta.sample_id}" }
ext.prefix = { "${meta.sample_id}_${meta.reference}" }
publishDir = [
path: { "${params.outdir}/final_bams/" },
mode: params.publish_dir_mode,
pattern: '*.bam'
]
}

withName: SAMTOOLS_INDEX_MERGED_LIBRARIES {
tag = { "${meta.reference}|${meta.sample_id}" }
ext.args = { params.fasta_largeref ? "-c" : "" }
ext.prefix = { "${meta.sample_id}_${meta.reference}" }
publishDir = [
path: { "${params.outdir}/final_bams/" },
mode: params.publish_dir_mode,
pattern: '*.{bai,csi}'
]
}

withName: SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES {
tag = { "${meta.reference}|${meta.sample_id}" }
ext.prefix = { "${meta.sample_id}_${meta.reference}" }
publishDir = [
path: { "${params.outdir}/final_bams/" },
mode: params.publish_dir_mode,
pattern: '*.flagstat'
]
}
}
38 changes: 14 additions & 24 deletions docs/development/dev_docs.md
@@ -9,36 +9,26 @@ To add new input files or options to the reference sheet, you have to complete a
1. Add your new parameter to nextflow.config.
2. Add parameter description to schema (nf-core schema build).
3. Read in new parameter (params.<NEW>) as input within the reference_indexing_single local subworkflow.
1. Add new line to the large `.map{}` operation starting on [line 80(https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_single.nf#L80)] and add check if the file exists.
`def <PARAM_NAME> = params.<NEW> != null ? file(params.<NEW>, checkIfExists: true ) : ""`
2. Add <PARAM_NAME> to the result of the map operation. Double-check the order!
3. With the `ch_ref_index_single.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step.
`<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add your ch_ref_index_single.<NEW_SUBCHANNEL> to the final emit.
`<NEW_EMIT> = ch_ref_index_single.<NEW_SUBCHANNEL>`
1. Add new line to the large `.map{}` operation starting on [line 80](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_single.nf#L80) and add check if the file exists. `def <PARAM_NAME> = params.<NEW> != null ? file(params.<NEW>, checkIfExists: true ) : ""`
2. Add <PARAM_NAME> to the result of the map operation. Double-check the order!
3. With the `ch_ref_index_single.multiMap{}` below, you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. `<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add your ch_ref_index_single.<NEW_SUBCHANNEL> to the final emit. `<NEW_EMIT> = ch_ref_index_single.<NEW_SUBCHANNEL>`
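Put together, steps 1–4 above could look like the following sketch inside `reference_indexing_single.nf`. The parameter `params.my_new_file` and the names `my_new_subchannel`/`my_new_emit` are placeholders for illustration, and the tuple contents are schematic, not the subworkflow's actual channel shape:

```nextflow
ch_ref_index_single = ch_reference
    .map { meta, fasta, fai, dict, mapper_index ->
        // 1. Read the new parameter, checking that the file exists when provided
        def my_new_file = params.my_new_file != null ? file( params.my_new_file, checkIfExists: true ) : ""
        // 2. Append it to the emitted tuple -- the order must match the multiMap below!
        [ meta, fasta, fai, dict, mapper_index, my_new_file ]
    }
    .multiMap { meta, fasta, fai, dict, mapper_index, my_new_file ->
        // 3. Named subchannels, each carrying the reference name in the meta
        reference:         [ meta, fasta, fai, dict ]
        my_new_subchannel: [ meta, my_new_file ]
    }

emit:
// 4. Expose the new subchannel to callers of the subworkflow
my_new_emit = ch_ref_index_single.my_new_subchannel
```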

### Multi-reference input workflow

1. Add new column named <SOFTWARE_FILETYPE> and test data to the test reference sheet (https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv).
2. Read in new input within the reference_indexing_multi local subworkflow.
1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate.
`def <PARAM_NAME> = row["<SOFTWARE_FILETYPE>"] != "" ? file(row["<SOFTWARE_FILETYPE>"], checkIfExists: true) : ""`
2. Add <PARAM_NAME> to the result of the `.map{}` operation. Double-check the order!
3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step.
`<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add ch_input_from_referencesheet.<NEW_SUBCHANNEL> to the final emit.
`<NEW_EMIT> = ch_input_from_referencesheet.<NEW_SUBCHANNEL>`
1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate. `def <PARAM_NAME> = row["<SOFTWARE_FILETYPE>"] != "" ? file(row["<SOFTWARE_FILETYPE>"], checkIfExists: true) : ""`
2. Add <PARAM_NAME> to the result of the `.map{}` operation. Double-check the order!
3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. `<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add ch_input_from_referencesheet.<NEW_SUBCHANNEL> to the final emit. `<NEW_EMIT> = ch_input_from_referencesheet.<NEW_SUBCHANNEL>`
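The reference-sheet path differs mainly in reading values from CSV rows instead of params. A hypothetical new column `software_filetype` would be wired into `reference_indexing_multi.nf` roughly like this (column names and channel shapes are illustrative only):

```nextflow
ch_input_from_referencesheet = Channel.fromPath( params.fasta )
    .splitCsv( header: true )
    .map { row ->
        def meta = [ id: row["reference_name"] ]
        // 1. An empty cell means "not provided"; otherwise check the file exists
        def my_new_file = row["software_filetype"] != "" ? file( row["software_filetype"], checkIfExists: true ) : ""
        // 2. Keep the tuple order in sync with the multiMap below
        [ meta, file( row["fasta"], checkIfExists: true ), my_new_file ]
    }
    .multiMap { meta, fasta, my_new_file ->
        // 3. The reference name travels in the meta of every subchannel
        reference:         [ meta, fasta ]
        my_new_subchannel: [ meta, my_new_file ]
    }

emit:
// 4. Expose the new subchannel
my_new_emit = ch_input_from_referencesheet.my_new_subchannel
```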

### Combining in the Reference Indexing workflow

1. Add your new parameter channel to the `if` condition selecting between the direct parameter input or the reference sheet input.
1. below "REFERENCE_INDEXING_MULTI" for reference sheet input
`<NEW_CHANNEL> = REFERENCE_INDEXING_MULTI.out.<NEW_EMIT>`
2. below "REFERENCE_INDEXING_SINGLE"
`<NEW_CHANNEL> = REFERENCE_INDEXING_SINGLE.out.<NEW_EMIT>`
3. Filter out options that have not been provided.
`<NEW_CHANNEL> = <NEW_CHANNEL>.filter{ it[1] != "" }`
4. Add unzipping of zipped input files with GUNZIP.
5. Add <NEW_CHANNEL> to the final emit.
`<NEW_EMIT> = <NEW_CHANNEL>`
6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.<NEW_EMIT>`.
1. below "REFERENCE_INDEXING_MULTI" for reference sheet input `<NEW_CHANNEL> = REFERENCE_INDEXING_MULTI.out.<NEW_EMIT>`
2. below "REFERENCE_INDEXING_SINGLE" `<NEW_CHANNEL> = REFERENCE_INDEXING_SINGLE.out.<NEW_EMIT>`
3. Filter out options that have not been provided. `<NEW_CHANNEL> = <NEW_CHANNEL>.filter{ it[1] != "" }`
4. Add unzipping of zipped input files with GUNZIP.
5. Add <NEW_CHANNEL> to the final emit. `<NEW_EMIT> = <NEW_CHANNEL>`
6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.<NEW_EMIT>`.
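In the combining subworkflow, the two paths then converge. A sketch of steps 1–5 follows; the branch condition, the `GUNZIP_MY_NEW_FILE` alias, and all channel names are placeholders for illustration:

```nextflow
if ( params.fasta.endsWith('.csv') ) {
    // 1.1 Reference sheet input
    my_new_channel = REFERENCE_INDEXING_MULTI.out.my_new_emit
} else {
    // 1.2 Direct parameter input
    my_new_channel = REFERENCE_INDEXING_SINGLE.out.my_new_emit
}

// 3. Drop references for which no file was provided
my_new_channel = my_new_channel.filter { it[1] != "" }

// 4. Decompress gzipped inputs before use (GUNZIP_MY_NEW_FILE is a hypothetical
//    include alias of the nf-core GUNZIP module)
GUNZIP_MY_NEW_FILE ( my_new_channel.filter { it[1].toString().endsWith(".gz") } )

emit:
// 5. Consumed in eager.nf as REFERENCE_INDEXING.out.my_new_emit
my_new_emit = my_new_channel
```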
57 changes: 57 additions & 0 deletions docs/development/manual_tests.md
@@ -102,6 +102,14 @@ Tool Specific combinations

- All together

- Library merge

- single reference: no damage manipulation ✅
- single reference: with damage manipulation, on raw data ✅
- single reference: with damage manipulation (trimming), on trimmed data ✅
- single reference: with damage manipulation (pmd + trimming), on pmd filtered data ✅
- multi reference: no damage manipulation ✅

### Multi-reference tests

```bash
@@ -667,3 +675,52 @@ nextflow run . -profile test,docker \
--damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \
--damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2
```
### LIBRARY_MERGE
```bash
## Library merge on single reference, no damage manipulation.
## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 6 files total.
## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782).
## Also, check that the bams merged are the deduplication output.
## NOTE: JK2782 seems to have some PG tags repeated, as they apply to each input file separately.
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --genotyping_source 'raw' -ansi-log false -dump-channels
```
```bash
## Library merge on single reference, merge trimmed bams.
## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 6 files total.
## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782).
## Also, check that the bams merged are trimmed. (JK2802 is full udg, but header confirms merged bam is "trimmed")
## NOTE: JK2782 seems to have some PG tags repeated, as they apply to each input file separately.
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --genotyping_source 'trimmed' -ansi-log false -dump-channels \
--run_trim_bam \
--damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \
--damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \
--damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \
--damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2
```
```bash
## Library merge on single reference, merge pmd bams. Trimming ran but not used downstream.
## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 6 files total.
## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782).
## Also, check that the bams merged are the pmd ones.
## NOTE: JK2782 seems to have some PG tags repeated, as they apply to each input file separately.
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --genotyping_source 'pmd' -ansi-log false -dump-channels \
--run_trim_bam \
--run_pmd_filtering \
--damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \
--damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \
--damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \
--damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2
```
```bash
## Library merge on multi reference. No damage manipulation.
## EXPECT: 1 bam.bai/flagstat set per sample/reference combination. 15 files total. (2 refs * 2 samples * 3 files) + BAM input only on one reference (+3)
## Check the headers of the bams to ensure that the correct number of bams are merged (1 for JK2802, 2 for JK2782).
## Also, check that the bams merged are the dedupped ones.
## NOTE: PG tags are repeated for each chromosome in the reference, times each library! Maybe there's some flag missing from samtools MERGE runs?
nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume --genotyping_source 'raw' -ansi-log false -dump-channels
```
52 changes: 52 additions & 0 deletions subworkflows/local/merge_libraries.nf
@@ -0,0 +1,52 @@
//
// Merge libraries of the same sample, then sort, index, and flagstat the merged bam
//

include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LIBRARIES } from '../../modules/nf-core/samtools/merge/main'
include { SAMTOOLS_SORT as SAMTOOLS_SORT_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/sort/main'
include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/index/main'
include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES } from '../../modules/nf-core/samtools/flagstat/main'

workflow MERGE_LIBRARIES {
take:
ch_bam_bai // [ [ meta ], bam , bai ]

main:
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()

ch_library_merge_input = ch_bam_bai
.map { WorkflowEager.addNewMetaFromAttributes( it, ["id", "sample_id", "strandedness", "reference"], ["id", "sample_id", "strandedness", "reference"], false ) }
.groupTuple(by: 0)
// Discard library-level metas and bais. Add single_end: true to all metas (no SE/PE distinction from here on)
.map {
meta, lib_metas, bam, bai ->
[ meta + [ 'single_end': true ], bam ]
}

SAMTOOLS_MERGE_LIBRARIES ( ch_library_merge_input, [[], []], [[], []] )
ch_versions = ch_versions.mix( SAMTOOLS_MERGE_LIBRARIES.out.versions.first() )

SAMTOOLS_SORT_MERGED_LIBRARIES ( SAMTOOLS_MERGE_LIBRARIES.out.bam )
ch_versions = ch_versions.mix( SAMTOOLS_SORT_MERGED_LIBRARIES.out.versions.first() )

SAMTOOLS_INDEX_MERGED_LIBRARIES ( SAMTOOLS_SORT_MERGED_LIBRARIES.out.bam )
ch_versions = ch_versions.mix( SAMTOOLS_INDEX_MERGED_LIBRARIES.out.versions.first() )

// Join merged sample-level bams and their bais for genotyping
ch_merged_bams = SAMTOOLS_SORT_MERGED_LIBRARIES.out.bam
.join( SAMTOOLS_INDEX_MERGED_LIBRARIES.out.bai )

// Not sure if FLAGSTAT is really needed, but added here for completeness
SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES ( ch_merged_bams )
ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.versions.first() )

ch_merged_flagstat = SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.flagstat
ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES.out.flagstat )

emit:
bam_bai = ch_merged_bams // [ [ meta ], bam , bai ]
flagstat = ch_merged_flagstat // [ [ meta ], flagstat ]
versions = ch_versions
mqc = ch_multiqc_files // Same as flagstat
}
16 changes: 14 additions & 2 deletions workflows/eager.nf
@@ -71,6 +71,7 @@ include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate
include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter'
include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination'
include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage'
include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -530,11 +531,22 @@ workflow EAGER {
MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking )
ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) )
ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions )
ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams
ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams
} else {
ch_bams_for_genotyping = ch_dedupped_bams
ch_bams_for_library_merge = ch_dedupped_bams
}

//
// SUBWORKFLOW: MERGE LIBRARIES
//

// The bams being merged are always the ones specified by params.genotyping_source,
// unless the user skipped damage manipulation, in which case it is the DEDUPLICATION output.
MERGE_LIBRARIES ( ch_bams_for_library_merge )
ch_versions = ch_versions.mix( MERGE_LIBRARIES.out.versions )
ch_bams_for_genotyping = MERGE_LIBRARIES.out.bam_bai
ch_multiqc_files = ch_multiqc_files.mix( MERGE_LIBRARIES.out.mqc.collect{it[1]}.ifEmpty([]) ) // Not sure if this is needed, or if it needs to be moved to line 564?

//
// MODULE: MultiQC
//
