Merge pull request #1023 from nf-core/dsl2-add-sharding-of-fastqs-before-alignment

Dsl2 add sharding of fastqs before alignment
nf-core · Nov 11, 2023 · d421158 · d421158
2 parents 6cb0c30 + 6341f02
commit d421158
Show file tree

Hide file tree

Showing 10 changed files with 187 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,10 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v3.0.0dev - [date]
+## v3.0.0dev - [2023-08-25]
 
 ### `Added`
 
+- [#1006](https://github.com/nf-core/eager/issues/1006) Added feature to shard fastqs before mapping, allowing more flexibility in parallelisation of mapping.
+
 ### `Fixed`
 
 ### `Dependencies`

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -34,6 +34,10 @@
 
   > Broad Institute (2019). Picard Toolkit. GitHub Repository: https://broadinstitute.github.io/picard/
 
+- [SeqKit](https://bioinf.shenwei.me/seqkit/)
+
+  > Shen, W., Le, S., Li, Y., & Hu, F. (2016). SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE, 11(10), e0163962. doi:[10.1371/journal.pone.0163962](https://doi.org/10.1371/journal.pone.0163962)
+
 - [bwa](https://doi.org/10.1093/bioinformatics/btp324)
 
   > Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics , 25(14), 1754–1760. doi: [10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324)

diff --git a/conf/modules.config b/conf/modules.config
@@ -371,6 +371,18 @@ process {
         ]
     }
 
+    //
+    // SHARDING FASTQS
+    //
+    // Splits each input FASTQ into shards of `params.fastq_shard_size` reads
+    // before mapping. Shards are intermediate files, so nothing is published.
+    withName: SEQKIT_SPLIT2 {
+        tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
+        ext.prefix = "out"
+        // Closure (not a plain string) so the parameter is resolved lazily at
+        // task time, matching the other param-dependent ext.args in this file.
+        ext.args = { "-s ${params.fastq_shard_size}" }
+        publishDir = [
+            enabled: false
+        ]
+    }
+
     //
     // READ MAPPING
     //
@@ -454,6 +466,7 @@ process {
         publishDir = [
             enabled: false
         ]
+        ext.args = { params.run_fastq_sharding ? "-c -p" : "" }
     }
 
     withName: SAMTOOLS_SORT_MERGED_LANES {

diff --git a/conf/test.config b/conf/test.config
@@ -27,6 +27,10 @@ params {
     // Genome references
     fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'
 
+    // Sharding FASTQ
+    run_fastq_sharding                           = true
+    fastq_shard_size                             = 5000
+
     // BAM filtering
     run_bamfiltering                      = true
     bamfiltering_minreadlength            = 30

diff --git a/modules.json b/modules.json
@@ -204,6 +204,11 @@
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules", "bam_split_by_region"]
+                    },
+                    "seqkit/split2": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
                     }
                 }
             },

diff --git a/modules/nf-core/seqkit/split2/main.nf b/modules/nf-core/seqkit/split2/main.nf
diff --git a/modules/nf-core/seqkit/split2/meta.yml b/modules/nf-core/seqkit/split2/meta.yml
diff --git a/nextflow.config b/nextflow.config
@@ -31,6 +31,10 @@ params {
     max_multiqc_email_size     = '25.MB'
     multiqc_methods_description = null
 
+    // Shard Fastq options
+    run_fastq_sharding                = false
+    fastq_shard_size                  = 1000000
+
     // bedtools options
     run_bedtools_coverage         = false
     mapstats_bedtools_featurefile = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -463,6 +463,19 @@
             "description": "Options for aligning reads against reference genome(s)",
             "default": "",
             "properties": {
+                "run_fastq_sharding": {
+                    "type": "boolean",
+                    "description": "Turn on FastQ sharding.",
+                    "fa_icon": "fas fa-power-off",
+                    "help_text": "Sharding will split the FastQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FastQ files."
+                },
+                "fastq_shard_size": {
+                    "type": "integer",
+                    "default": 1000000,
+                    "minimum": 1,
+                    "description": "Specify the number of reads in each shard when splitting.",
+                    "fa_icon": "fas fa-arrows-alt-v",
+                    "help_text": "Make sure to choose a value that makes sense for your dataset. Small values can create many files, which can end up negatively affecting the overall speed of the mapping process."
+                },
                 "mapping_tool": {
                     "type": "string",
                     "default": "bowtie2",
@@ -1118,9 +1131,6 @@
         {
             "$ref": "#/definitions/mapping"
         },
-        {
-            "$ref": "#/definitions/adna_damage_analysis"
-        },
         {
             "$ref": "#/definitions/bam_filtering"
         },
@@ -1131,25 +1141,25 @@
             "$ref": "#/definitions/deduplication"
         },
         {
-            "$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
+            "$ref": "#/definitions/damage_manipulation"
         },
         {
-            "$ref": "#/definitions/mapping_statistics"
+            "$ref": "#/definitions/genotyping"
         },
         {
-            "$ref": "#/definitions/damage_manipulation"
+            "$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
         },
         {
-            "$ref": "#/definitions/genotyping"
+            "$ref": "#/definitions/mapping_statistics"
         },
         {
             "$ref": "#/definitions/adna_damage_analysis"
         },
         {
-            "$ref": "#/definitions/contamination_estimation"
+            "$ref": "#/definitions/host_removal"
         },
         {
-            "$ref": "#/definitions/host_removal"
+            "$ref": "#/definitions/contamination_estimation"
         },
         {
             "$ref": "#/definitions/feature_annotation_statistics"

diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf
@@ -2,6 +2,7 @@
 // Prepare reference indexing for downstream
 //
 
+include { SEQKIT_SPLIT2                                                                                                             } from '../../modules/nf-core/seqkit/split2/main'
 include { FASTQ_ALIGN_BWAALN                                                                                                        } from '../../subworkflows/nf-core/fastq_align_bwaaln/main'
 include { BWA_MEM                                                                                                                   } from '../../modules/nf-core/bwa/mem/main'
 include { BOWTIE2_ALIGN                                                                                                             } from '../../modules/nf-core/bowtie2/align/main'
@@ -19,6 +20,47 @@ workflow MAP {
     ch_versions       = Channel.empty()
     ch_multiqc_files  = Channel.empty()
 
+    // Optionally split every FASTQ into shards so that the shards can be
+    // mapped as independent tasks; otherwise the reads are used as supplied.
+    if ( params.run_fastq_sharding ) {
+
+        ch_input_for_sharding = reads
+
+        SEQKIT_SPLIT2( ch_input_for_sharding )
+        ch_versions        = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )
+
+        // Emit one element per shard file, record which shard it belongs to in
+        // the meta map, then regroup so that files sharing the same meta and
+        // shard number end up in a single element again.
+        sharded_reads = SEQKIT_SPLIT2.out.reads
+            .transpose()
+            .map {
+                meta, reads ->
+                    // `def` keeps the map local to this closure call; without it
+                    // the variable is shared and can be overwritten concurrently.
+                    def new_meta = meta.clone()
+                    // Dots are escaped so only a literal `.fastq.gz` / `.fq.gz` suffix matches.
+                    new_meta.shard_number = reads.getName().replaceAll(/.*(part_\d+)\.(?:fastq|fq)\.gz/, '$1')
+                    [ new_meta, reads ]
+            }
+            .groupTuple()
+
+        ch_reads_for_combination = sharded_reads
+
+    } else {
+
+        ch_reads_for_combination = reads
+
+    }
+
+    // Combine each read set with each reference index and record the reference
+    // id in the read meta, emitting parallel `reads` and `index` channels.
+    ch_input_for_mapping = ch_reads_for_combination
+        .combine(index)
+        .multiMap {
+            meta, reads, meta2, index ->
+                def new_meta = meta.clone()
+                new_meta.reference = meta2.id
+                reads: [ new_meta, reads ]
+                index: [ meta2, index ]
+        }
+
     if ( params.mapping_tool == 'bwaaln' ) {
         ch_index_for_mapping = index
         ch_reads_for_mapping = reads
@@ -76,8 +118,7 @@ workflow MAP {
     ch_input_for_lane_merge = ch_mapped_lane_bam
                                 .map {
                                     meta, bam ->
-                                    new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry'] }
-
+                                    new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry', 'shard_number'] }
                                     [ new_meta, bam ]
                                 }
                                 .groupTuple()