Skip to content

Commit

Permalink
Merge pull request #1023 from nf-core/dsl2-add-sharding-of-fastqs-bef…
Browse files Browse the repository at this point in the history
…ore-alignment

Dsl2 add sharding of fastqs before alignment
  • Loading branch information
shyama-mama authored Nov 11, 2023
2 parents 6cb0c30 + 6341f02 commit d421158
Show file tree
Hide file tree
Showing 10 changed files with 187 additions and 12 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v3.0.0dev - [date]
## v3.0.0dev - [2023-08-25]

### `Added`

- [#1006](https://github.com/nf-core/eager/issues/1006) Added feature to shard fastqs before mapping, allowing more flexibility in parallelisation of mapping.

### `Fixed`

### `Dependencies`
Expand Down
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@

> Broad Institute (2019). Picard Toolkit. GitHub Repository: https://broadinstitute.github.io/picard/
- [SeqKit](https://bioinf.shenwei.me/seqkit/)

> Shen, W., Le, S., Li, Y., & Hu, F. (2016). SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE, 11(10), e0163962. doi:[10.1371/journal.pone.0163962](https://doi.org/10.1371/journal.pone.0163962)
- [bwa](https://doi.org/10.1093/bioinformatics/btp324)

> Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25(14), 1754–1760. doi:[10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324)
Expand Down
13 changes: 13 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,18 @@ process {
]
}

//
// SHARDING FASTQS
//
withName: SEQKIT_SPLIT2 {
    // Shards are intermediate files only, so they are never published.
    publishDir = [ enabled: false ]

    // Fixed output prefix handed to the module.
    // NOTE(review): how the module uses `ext.prefix` is not visible here — confirm against main.nf.
    ext.prefix = "out"

    // Shard size is taken from the pipeline parameter `fastq_shard_size`.
    ext.args = "-s ${params.fastq_shard_size}"

    // Task label: sample, library and lane of the FASTQ being split.
    tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
}

//
// READ MAPPING
//
Expand Down Expand Up @@ -454,6 +466,7 @@ process {
publishDir = [
enabled: false
]
ext.args = { params.run_fastq_sharding ? "-c -p" : "" }
}

withName: SAMTOOLS_SORT_MERGED_LANES {
Expand Down
4 changes: 4 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ params {
// Genome references
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'

// Sharding FASTQ
run_fastq_sharding = true
fastq_shard_size = 5000

// BAM filtering
run_bamfiltering = true
bamfiltering_minreadlength = 30
Expand Down
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules", "bam_split_by_region"]
},
"seqkit/split2": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
}
}
},
Expand Down
53 changes: 53 additions & 0 deletions modules/nf-core/seqkit/split2/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions modules/nf-core/seqkit/split2/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ params {
max_multiqc_email_size = '25.MB'
multiqc_methods_description = null

// Shard Fastq options
run_fastq_sharding = false
fastq_shard_size = 1000000

// bedtools options
run_bedtools_coverage = false
mapstats_bedtools_featurefile = null
Expand Down
28 changes: 19 additions & 9 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,19 @@
"description": "Options for aligning reads against reference genome(s)",
"default": "",
"properties": {
"run_fastq_sharding": {
"type": "boolean",
"description": "Turn on FastQ sharding.",
"fa_icon": "fas fa-power-off",
"help_text": "Sharding will split the FastQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FastQ files."
},
"fastq_shard_size": {
"type": "integer",
"default": 1000000,
"description": "Specify the number of reads in each shard when splitting.",
"fa_icon": "fas fa-arrows-alt-v",
"help_text": "Make sure to choose a value that makes sense for your dataset. Small values can create many files, which can end up negatively affecting the overall speed of the mapping process."
},
"mapping_tool": {
"type": "string",
"default": "bowtie2",
Expand Down Expand Up @@ -1118,9 +1131,6 @@
{
"$ref": "#/definitions/mapping"
},
{
"$ref": "#/definitions/adna_damage_analysis"
},
{
"$ref": "#/definitions/bam_filtering"
},
Expand All @@ -1131,25 +1141,25 @@
"$ref": "#/definitions/deduplication"
},
{
"$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
"$ref": "#/definitions/damage_manipulation"
},
{
"$ref": "#/definitions/mapping_statistics"
"$ref": "#/definitions/genotyping"
},
{
"$ref": "#/definitions/damage_manipulation"
"$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
},
{
"$ref": "#/definitions/genotyping"
"$ref": "#/definitions/mapping_statistics"
},
{
"$ref": "#/definitions/adna_damage_analysis"
},
{
"$ref": "#/definitions/contamination_estimation"
"$ref": "#/definitions/host_removal"
},
{
"$ref": "#/definitions/host_removal"
"$ref": "#/definitions/contamination_estimation"
},
{
"$ref": "#/definitions/feature_annotation_statistics"
Expand Down
45 changes: 43 additions & 2 deletions subworkflows/local/map.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Prepare reference indexing for downstream
//

include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main'
include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main'
include { BWA_MEM } from '../../modules/nf-core/bwa/mem/main'
include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main'
Expand All @@ -19,6 +20,47 @@ workflow MAP {
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()

//
// Optionally shard the input FASTQs, then pair every read set with every reference index.
//
// Result: `ch_input_for_mapping`, a multiMap output with two branches
//   - reads: [ meta + [reference: <index meta id>], fastq(s) ]
//   - index: [ index meta, index ]
//
if ( params.run_fastq_sharding ) {

    ch_input_for_sharding = reads

    SEQKIT_SPLIT2( ch_input_for_sharding )
    ch_versions = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )

    // Flatten to one element per shard file, label each with its shard name (`part_<digits>`)
    // and regroup, so that files sharing the same meta and shard number travel together.
    sharded_reads = SEQKIT_SPLIT2.out.reads
        .transpose()
        .map {
            meta, shard_fastq ->
                // Dots are escaped so only a literal `.fastq.gz` / `.fq.gz` suffix matches.
                // `meta + [...]` builds a new map: the incoming meta is not mutated and no
                // undeclared (script-global) variable is shared between closure invocations.
                def shard_name = shard_fastq.getName().replaceAll(/.*(part_\d+)\.(?:fastq|fq)\.gz/, '$1')
                [ meta + [ shard_number: shard_name ], shard_fastq ]
        }
        .groupTuple()

    ch_reads_to_combine = sharded_reads

} else {

    ch_reads_to_combine = reads

}

// Same pairing logic for sharded and unsharded input: every read set is combined with every
// index, and the reads' meta records which reference it will be mapped against.
ch_input_for_mapping = ch_reads_to_combine
    .combine(index)
    .multiMap {
        meta, fastqs, ref_meta, ref_index ->
            reads: [ meta + [ reference: ref_meta.id ], fastqs ]
            index: [ ref_meta, ref_index ]
    }

if ( params.mapping_tool == 'bwaaln' ) {
ch_index_for_mapping = index
ch_reads_for_mapping = reads
Expand Down Expand Up @@ -76,8 +118,7 @@ workflow MAP {
ch_input_for_lane_merge = ch_mapped_lane_bam
.map {
meta, bam ->
new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry'] }

new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry', 'shard_number'] }
[ new_meta, bam ]
}
.groupTuple()
Expand Down

0 comments on commit d421158

Please sign in to comment.