diff --git a/conf/modules.config b/conf/modules.config index 19cc03080..fa6201399 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -231,6 +231,12 @@ process { ] } + withName: 'GUNZIP_PMDFASTA|GUNZIP_PMDSNP|GUNZIP_SNPBED' { + publishDir = [ + enabled: false + ] + } + withName: SAMTOOLS_FAIDX { publishDir = [ path: { "${params.outdir}/reference/${meta.id}/" }, @@ -669,6 +675,16 @@ process { // // DAMAGE MANIPULATION // + + withName: BEDTOOLS_MASKFASTA { + ext.prefix = { "${meta.id}.masked" } + publishDir = [ + path: { "${params.outdir}/reference/masked_reference/" }, + mode: params.publish_dir_mode, + pattern: '*.masked.fa' + ] + } + withName: MAPDAMAGE2 { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = { [ diff --git a/docs/development/dev_docs.md b/docs/development/dev_docs.md new file mode 100644 index 000000000..767fb35e6 --- /dev/null +++ b/docs/development/dev_docs.md @@ -0,0 +1,44 @@ +# Documentation and how tos for developing eager + +## How to add new input files and options to the reference sheet + +To add new input files or options to the reference sheet, you have to complete all the following steps. + +### Single-reference input workflow + +1. Add your new parameter to nextflow.config. +2. Add parameter description to schema (nf-core schema build). +3. Read in new parameter (params.) as input within the reference_indexing_single local subworkflow. + 1. Add new line to the large `.map{}` operation starting on [line 80(https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_single.nf#L80)] and add check if the file exists. + `def = params. != null ? file(params., checkIfExists: true ) : ""` + 2. Add to the result of the map operation. Double-check the order! + 3. With the `ch_ref_index_single.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. + `: [ meta, ]` + 4. Add your ch_ref_index_single. to the final emit. + ` = ch_ref_index_single.` + +### Multi-reference input workflow + +1. Add new column named and test data to the test reference sheet (https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv). +2. Read in new input within the reference_indexing_multi local subworkflow. + 1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate. + `def = row[""] != "" ? file(row[""], checkIfExists: true) : ""` + 2. Add to the result of the `.map{}` operation. Double-check the order! + 3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. + `: [ meta, ]` + 4. Add ch_input_from_referencesheet. to the final emit. + ` = ch_input_from_referencesheet.` + +### Combining in the Reference Indexing workflow + +1. Add you new parameter channel to the `if` condition selecting between the direct parameter input or the reference sheet input. + 1. below "REFERENCE_INDEXING_MULTI" for reference sheet input + ` = REFERENCE_INDEXING_MULTI.out.` + 2. below "REFERENCE_INDEXING_SINGLE" + ` = REFERENCE_INDEXING_SINGLE.out.` + 3. Filter out options that have not been provided. + ` = .filter{ it[1] != "" }` + 4. Add unzipping of zipped input files with GUNZIP. + 5. Add to the final emit. + ` = ` + 6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.`. diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index d339d4ba7..457172f3f 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -92,6 +92,8 @@ Tool Specific combinations - with default parameters - with stricter threshold + - with fasta masking + - with fasta masking for 1 of 2 references - BAM trimming @@ -615,6 +617,18 @@ nextflow run . -profile test,docker --run_pmd_filtering -resume --outdir ./resul # JK2802_JK2802_AGAATAACCTACCA_pmdfiltered.bam: 30 ``` +```bash +## PMD filtering with fasta masking +## Expect: damage_manipulation directory with *.masked.fa and bam and bai and flagstat per library +nextflow run . -profile test_humanbam,docker --run_pmd_filtering --damage_manipulation_pmdtools_reference_mask https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz -resume --outdir ./results +``` + +```bash +## PMD filtering with fasta masking for 1 of 2 references +## Expect: damage_manipulation directory with hs37d5_chr21-MT.masked.fa and bam and bai and flagstat per library and reference (22 files total). hs37d5_chr21-MT first masked with 1240K.pos.list_hs37d5.0based.bed.gz from reference sheet, PMD filtering run with masked reference fasta for hs37d5 and non-masked reference fasta for Mammoth_MT +nextflow run . -profile test_multiref,docker --run_pmd_filtering --outdir ./results +``` + ## BAM trimming ```bash diff --git a/docs/output.md b/docs/output.md index a6de7067a..9b6791e01 100644 --- a/docs/output.md +++ b/docs/output.md @@ -464,11 +464,13 @@ Be advised that this process introduces reference bias in the resulting rescaled - `*_pmdfiltered.bam`: Reads aligned to a reference genome that pass the post-mortem-damage threshold, in BAM format. - `*_pmdfiltered.bam.{bai,csi}`: Index file corresponding to the BAM file. + - `*_pmdfiltered.flagstat`: Statistics of aligned reads after from SAMtools `flagstat`, after filtering for post-mortem damage. [pmdtools](https://github.com/pontussk/PMDtools) implements a likelihood framework incorporating a postmortem damage (PMD) score, base quality scores and biological polymorphism to identify degraded DNA sequences that are unlikely to originate from modern contamination. Using the model, each sequence is assigned a PMD score, for which positive values indicate support for the sequence being genuinely ancient. By filtering a BAM file for damaged reads in this way, it is possible to ameliorate the effects of present-day contamination, and isolate sequences of likely ancient origin to be used downstream. +By default, all positions are assessed for damage, but it is possible to provide a FASTA file masked for specific references (`--damage_manipulation_pmdtools_masked_reference`) or a BED file to mask the reference FASTA within nf-core/eager (`--damage_manipulation_pmdtools_reference_mask`). This can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. ### Base Trimming diff --git a/modules.json b/modules.json index 6af228714..ed9b289f8 100644 --- a/modules.json +++ b/modules.json @@ -35,6 +35,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "bedtools/maskfasta": { + "branch": "master", + "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", + "installed_by": ["modules"] + }, "bowtie2/align": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", diff --git a/modules/nf-core/bedtools/maskfasta/environment.yml b/modules/nf-core/bedtools/maskfasta/environment.yml new file mode 100644 index 000000000..55ce727a7 --- /dev/null +++ b/modules/nf-core/bedtools/maskfasta/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.30.0 diff --git a/modules/nf-core/bedtools/maskfasta/main.nf b/modules/nf-core/bedtools/maskfasta/main.nf new file mode 100644 index 000000000..2239eda5c --- /dev/null +++ b/modules/nf-core/bedtools/maskfasta/main.nf @@ -0,0 +1,36 @@ +process BEDTOOLS_MASKFASTA { + tag "$meta.id" + label 'process_single' + + conda 'modules/nf-core/bedtools/maskfasta/environment.yml' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0' : + 'biocontainers/bedtools:2.30.0--hc088bd4_0' }" + + input: + tuple val(meta), path(bed) + path fasta + + output: + tuple val(meta), path("*.fa"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bedtools \\ + maskfasta \\ + $args \\ + -fi $fasta \\ + -bed $bed \\ + -fo ${prefix}.fa + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/maskfasta/meta.yml b/modules/nf-core/bedtools/maskfasta/meta.yml new file mode 100644 index 000000000..94f81aba7 --- /dev/null +++ b/modules/nf-core/bedtools/maskfasta/meta.yml @@ -0,0 +1,46 @@ +name: bedtools_maskfasta +description: masks sequences in a FASTA file based on intervals defined in a feature file. +keywords: + - bed + - fasta + - maskfasta +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: Bed feature file + pattern: "*.{bed}" + - fasta: + type: file + description: Input fasta file + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Output masked fasta file + pattern: "*.{fa}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index 90fa73892..d0de5cbd1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -185,6 +185,7 @@ params { damage_manipulation_rescale_length_3p = 0 run_pmd_filtering = false damage_manipulation_pmdtools_threshold = 3 + damage_manipulation_pmdtools_masked_reference = null damage_manipulation_pmdtools_reference_mask = null run_trim_bam = false damage_manipulation_bamutils_trim_double_stranded_none_udg_left = 0 diff --git a/nextflow_schema.json b/nextflow_schema.json index db9ba0805..38abbccf9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -780,12 +780,20 @@ "description": "Specify PMDScore threshold for PMDtools.", "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis.\n\n> Modifies PMDtools parameter: `--threshold`" }, + "damage_manipulation_pmdtools_masked_reference": { + "type": "string", + "fa_icon": "fas fa-mask", + "help_text": "Supplying a FASTA file will use this file as reference for `samtools calmd` prior to PMD filtering. /nSetting the SNPs that are part of the used capture set as `N` can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition.", + "description": "Specify a masked FASTA file with positions to be used with pmdtools.", + "pattern": "^\\S+\\.fa?(\\sta)$", + "format": "file-path" + }, "damage_manipulation_pmdtools_reference_mask": { "type": "string", "fa_icon": "fas fa-mask", "help_text": "Supplying a bedfile to this parameter activates masking of the reference fasta at the contained sites prior to running PMDtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. \nThis can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a transition SNP to be counted as damage. Masking of the reference is done using `bedtools maskfasta`.", "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", - "pattern": "^\\S+\\.bed$", + "pattern": "^\\S+\\.bed?(\\.gz)$", "format": "file-path" }, "run_trim_bam": { @@ -960,8 +968,7 @@ "fa_icon": "fab fa-creative-commons-sampling-plus" }, "skip_qualimap": { - "type": "boolean", - "default": "false" + "type": "boolean" }, "snpcapture_bed": { "type": "string", @@ -1155,6 +1162,9 @@ { "$ref": "#/definitions/adna_damage_analysis" }, + { + "$ref": "#/definitions/feature_annotation_statistics" + }, { "$ref": "#/definitions/host_removal" }, @@ -1162,7 +1172,7 @@ "$ref": "#/definitions/contamination_estimation" }, { - "$ref": "#/definitions/feature_annotation_statistics" + "$ref": "#/definitions/contamination_estimation" } ] } diff --git a/subworkflows/local/manipulate_damage.nf b/subworkflows/local/manipulate_damage.nf index d96fd2d43..b54be730e 100644 --- a/subworkflows/local/manipulate_damage.nf +++ b/subworkflows/local/manipulate_damage.nf @@ -2,6 +2,7 @@ // Calculate PMD scores, trim, or rescale DNA damage from mapped reads. // +include { BEDTOOLS_MASKFASTA } from '../../modules/nf-core/bedtools/maskfasta/main' include { MAPDAMAGE2 } from '../../modules/nf-core/mapdamage2/main' include { PMDTOOLS_FILTER } from '../../modules/nf-core/pmdtools/filter/main' include { BAMUTIL_TRIMBAM } from '../../modules/nf-core/bamutil/trimbam/main' @@ -13,8 +14,9 @@ include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED } from '../../ // TODO: Add required channels and channel manipulations for reference-dependent bed masking before pmdtools. Requires multi-ref support before implementation. workflow MANIPULATE_DAMAGE { take: - ch_bam_bai // [ [ meta ], bam , bai ] - ch_fasta // [ [ meta ], fasta ] + ch_bam_bai // [ [ meta ], bam , bai ] + ch_fasta // [ [ meta ], fasta ] + ch_pmd_masking // [ [ meta ], masked_fasta, bed_for_masking ] main: ch_versions = Channel.empty() @@ -35,7 +37,7 @@ workflow MANIPULATE_DAMAGE { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) } - .combine(ch_refs, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta], fasta ] + .combine( ch_refs, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta], fasta ] if ( params.run_mapdamage_rescaling ) { ch_mapdamage_prep = ch_input_for_damage_manipulation @@ -71,17 +73,52 @@ workflow MANIPULATE_DAMAGE { } if ( params.run_pmd_filtering ) { - // TODO Add module to produce Masked reference from given references and bed file (with meta specifying the reference it matches)? - // if ( params.pmdtools_reference_mask) { - // MASK_REFERENCE_BY_BED() - // } - - ch_pmdtools_input = ch_input_for_damage_manipulation - .multiMap { - ignore_me, meta, bam, bai, ref_meta, fasta -> - bam: [ meta, bam, bai ] - fasta: fasta - } + ch_masking_prep = ch_pmd_masking + .combine( ch_fasta, by: 0 ) // [ [meta], masked_fasta, bed, fasta ] + .branch { + meta, masked_fasta, bed, fasta -> + alreadymasked: masked_fasta != "" + tobemasked: masked_fasta == "" && bed != "" + nomasking: masked_fasta == "" && bed == "" + } + + ch_masking_input = ch_masking_prep.tobemasked + .multiMap{ + meta, masked_fasta, bed, fasta -> + bed: [ meta, bed ] + fasta: fasta + } + + BEDTOOLS_MASKFASTA( ch_masking_input.bed, ch_masking_input.fasta ) + ch_masking_output = BEDTOOLS_MASKFASTA.out.fasta + ch_versions = ch_versions.mix( BEDTOOLS_MASKFASTA.out.versions.first() ) + + ch_already_masked = ch_masking_prep.alreadymasked + .map { + meta, masked_fasta, bed, fasta -> + [ meta, masked_fasta ] + } + + ch_no_masking = ch_masking_prep.nomasking + .map { + meta, masked_fasta, bed, fasta -> + [ meta, fasta ] + } + + ch_pmd_fastas = ch_masking_output.mix( ch_already_masked, ch_no_masking ) + .map { + // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + } + + ch_pmdtools_input = ch_bam_bai + .map { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) } + .combine( ch_pmd_fastas, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta] fasta ] + .multiMap { + combine_meta, meta, bam, bai, ref_meta, fasta -> + bam: [ meta, bam, bai ] + fasta: fasta + } PMDTOOLS_FILTER( ch_pmdtools_input.bam, params.damage_manipulation_pmdtools_threshold, ch_pmdtools_input.fasta ) ch_versions = ch_versions.mix( PMDTOOLS_FILTER.out.versions.first() ) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 13357d4ea..145f53ff9 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -2,9 +2,9 @@ // Prepare reference indexing for downstream // -include { REFERENCE_INDEXING_SINGLE } from '../../subworkflows/local/reference_indexing_single.nf' -include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/reference_indexing_multi.nf' -include { GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' +include { REFERENCE_INDEXING_SINGLE } from '../../subworkflows/local/reference_indexing_single.nf' +include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/reference_indexing_multi.nf' +include { GUNZIP as GUNZIP_PMDBED; GUNZIP as GUNZIP_PMDFASTA; GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' workflow REFERENCE_INDEXING { take: @@ -26,7 +26,8 @@ workflow REFERENCE_INDEXING { ch_reference_for_mapping = REFERENCE_INDEXING_MULTI.out.reference ch_mitochondrion_header = REFERENCE_INDEXING_MULTI.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_MULTI.out.hapmap - ch_pmd_mask = REFERENCE_INDEXING_MULTI.out.pmd_mask + ch_pmd_masked_fasta = REFERENCE_INDEXING_MULTI.out.pmd_masked_fasta + ch_pmd_bed_for_masking = REFERENCE_INDEXING_MULTI.out.pmd_bed_for_masking ch_snp_capture_bed = REFERENCE_INDEXING_MULTI.out.snp_capture_bed ch_pileupcaller_snp = REFERENCE_INDEXING_MULTI.out.pileupcaller_snp ch_sexdeterrmine_bed = REFERENCE_INDEXING_MULTI.out.sexdeterrmine_bed @@ -37,7 +38,8 @@ workflow REFERENCE_INDEXING { REFERENCE_INDEXING_SINGLE ( fasta, fasta_fai, fasta_dict, fasta_mapperindexdir ) ch_mitochondrion_header = REFERENCE_INDEXING_SINGLE.out.mitochondrion_header ch_hapmap = REFERENCE_INDEXING_SINGLE.out.hapmap - ch_pmd_mask = REFERENCE_INDEXING_SINGLE.out.pmd_mask + ch_pmd_masked_fasta = REFERENCE_INDEXING_SINGLE.out.pmd_masked_fasta + ch_pmd_bed_for_masking = REFERENCE_INDEXING_SINGLE.out.pmd_bed_for_masking ch_snp_capture_bed = REFERENCE_INDEXING_SINGLE.out.snp_capture_bed ch_pileupcaller_snp = REFERENCE_INDEXING_SINGLE.out.pileupcaller_snp ch_sexdeterrmine_bed = REFERENCE_INDEXING_SINGLE.out.sexdeterrmine_bed @@ -46,17 +48,49 @@ workflow REFERENCE_INDEXING { ch_versions = ch_versions.mix( REFERENCE_INDEXING_SINGLE.out.versions ) } - // Filter out input options that are not provided + // Filter out input options that are not provided and unzip if necessary ch_mitochondrion_header = ch_mitochondrion_header .filter{ it[1] != "" } ch_hapmap = ch_hapmap .filter{ it[1] != "" } - ch_pmd_mask = ch_pmd_mask - .filter{ it[1] != "" && it[2] != "" } + ch_pmd_masked_fasta = ch_pmd_masked_fasta + .branch { + meta, pmd_masked_fasta -> + input: pmd_masked_fasta != "" + skip: true + } + ch_pmd_masked_fasta_gunzip = ch_pmd_masked_fasta.input + .branch { + meta, pmd_masked_fasta -> + forgunzip: pmd_masked_fasta.extension == "gz" + skip: true + } + GUNZIP_PMDFASTA( ch_pmd_masked_fasta_gunzip.forgunzip ) + ch_pmd_masked_fasta = ch_pmd_masked_fasta_gunzip.skip.mix( GUNZIP_PMDFASTA.out.gunzip ).mix( ch_pmd_masked_fasta.skip ) + ch_version = ch_versions.mix( GUNZIP_PMDFASTA.out.versions.first() ) + + ch_pmd_bed_for_masking = ch_pmd_bed_for_masking + .branch { + meta, pmd_bed_for_masking -> + input: pmd_bed_for_masking != "" + skip: true + } + ch_pmd_bed_for_masking_gunzip = ch_pmd_bed_for_masking.input + .branch { + meta, pmd_bed_for_masking -> + forgunzip: pmd_bed_for_masking.extension == "gz" + skip: true + } + GUNZIP_PMDBED( ch_pmd_bed_for_masking_gunzip.forgunzip ) + ch_pmd_bed_for_masking = ch_pmd_bed_for_masking_gunzip.skip.mix( GUNZIP_PMDBED.out.gunzip ).mix( ch_pmd_bed_for_masking.skip ) + ch_version = ch_versions.mix( GUNZIP_PMDBED.out.versions.first() ) + + ch_pmd_masking = ch_pmd_masked_fasta + .combine( by: 0, ch_pmd_bed_for_masking ) - ch_capture_bed = ch_snp_capture_bed //optional + ch_capture_bed = ch_snp_capture_bed //bed file input is optional, so no filtering .branch { meta, capture_bed -> input: capture_bed != "" @@ -70,6 +104,7 @@ workflow REFERENCE_INDEXING { } GUNZIP_SNPBED( ch_capture_bed_gunzip.forgunzip ) ch_capture_bed = GUNZIP_SNPBED.out.gunzip.mix( ch_capture_bed_gunzip.skip ).mix( ch_capture_bed.skip ) + ch_version = ch_versions.mix( GUNZIP_SNPBED.out.versions.first() ) ch_pileupcaller_snp = ch_pileupcaller_snp .filter{ it[1] != "" && it[2] != "" } @@ -84,7 +119,8 @@ workflow REFERENCE_INDEXING { reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] hapmap = ch_hapmap // [ meta, hapmap ] - pmd_mask = ch_pmd_mask // [ meta, masked_fasta, capture_bed ] + pmd_masking = ch_pmd_masking // [ meta, pmd_masked_fasta, pmd_bed_for_masking ] + pmd_bed_for_masking = ch_pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] snp_capture_bed = ch_capture_bed // [ meta, capture_bed ] pileupcaller_snp = ch_pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 1ca630498..c7c79e052 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -29,22 +29,23 @@ workflow REFERENCE_INDEXING_MULTI { ch_splitreferencesheet_for_branch = ch_splitreferencesheet_for_map .map { row -> - def meta = [:] - meta.id = row["reference_name"] - def fasta = file(row["fasta"], checkIfExists: true) // mandatory parameter! - def fai = row["fai"] != "" ? file(row["fai"], checkIfExists: true) : "" - def dict = row["dict"] != "" ? file(row["dict"], checkIfExists: true) : "" - def mapper_index = row["mapper_index"] != "" ? file(row["mapper_index"], checkIfExists: true) : "" - def circular_target = row["circular_target"] - def mitochondrion = row["mitochondrion_header"] - def capture_bed = row["snpcapture_bed"] != "" ? file(row["snpcapture_bed"], checkIfExists: true) : "" - def pileupcaller_bed = row["pileupcaller_bedfile"] != "" ? file(row["pileupcaller_bedfile"], checkIfExists: true) : "" - def pileupcaller_snp = row["pileupcaller_snpfile"] != "" ? file(row["pileupcaller_snpfile"], checkIfExists: true) : "" - def hapmap = row["hapmap_file"] != "" ? file(row["hapmap_file"], checkIfExists: true) : "" - def pmd_mask = row["pmdtools_masked_fasta"] != "" ? file(row["pmdtools_masked_fasta"], checkIfExists: true) : "" - def sexdet_bed = row["sexdeterrmine_snp_bed"] != "" ? file(row["sexdeterrmine_snp_bed"], checkIfExists: true) : "" - def bedtools_feature = row["bedtools_feature_file"] != "" ? file(row["bedtools_feature_file"], checkIfExists: true) : "" - [ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature ] + def meta = [:] + meta.id = row["reference_name"] + def fasta = file(row["fasta"], checkIfExists: true) // mandatory parameter! + def fai = row["fai"] != "" ? file(row["fai"], checkIfExists: true) : "" + def dict = row["dict"] != "" ? file(row["dict"], checkIfExists: true) : "" + def mapper_index = row["mapper_index"] != "" ? file(row["mapper_index"], checkIfExists: true) : "" + def circular_target = row["circular_target"] + def mitochondrion = row["mitochondrion_header"] + def capture_bed = row["snpcapture_bed"] != "" ? file(row["snpcapture_bed"], checkIfExists: true) : "" + def pileupcaller_bed = row["pileupcaller_bedfile"] != "" ? file(row["pileupcaller_bedfile"], checkIfExists: true) : "" + def pileupcaller_snp = row["pileupcaller_snpfile"] != "" ? file(row["pileupcaller_snpfile"], checkIfExists: true) : "" + def hapmap = row["hapmap_file"] != "" ? file(row["hapmap_file"], checkIfExists: true) : "" + def pmd_masked_fasta = row["pmdtools_masked_fasta"] != "" ? file(row["pmdtools_masked_fasta"], checkIfExists: true) : "" + def pmd_bed_for_masking = row["pmdtools_bed_for_masking"] != "" ? file(row["pmdtools_bed_for_masking"], checkIfExists: true) : "" + def sexdet_bed = row["sexdeterrmine_snp_bed"] != "" ? file(row["sexdeterrmine_snp_bed"], checkIfExists: true) : "" + def bedtools_feature = row["bedtools_feature_file"] != "" ? file(row["bedtools_feature_file"], checkIfExists: true) : "" + [ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature ] } @@ -61,11 +62,12 @@ workflow REFERENCE_INDEXING_MULTI { ch_input_from_referencesheet = ch_splitreferencesheet_for_branch .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature -> generated: [ meta, fasta, fai, dict, mapper_index, circular_target ] mitochondrion_header: [ meta, mitochondrion ] angsd_hapmap: [ meta, hapmap ] - pmd_mask: [ meta, pmd_mask, capture_bed ] + pmd_masked_fasta: [ meta, pmd_masked_fasta ] + pmd_bed_for_masking: [ meta, pmd_bed_for_masking ] snp_bed: [ meta, capture_bed ] pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed: [ meta, sexdet_bed ] @@ -211,7 +213,8 @@ ch_input_from_referencesheet = ch_splitreferencesheet_for_branch reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex, circular_target ] mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] - pmd_mask = ch_input_from_referencesheet.pmd_mask // [ meta, pmd_mask, capture_bed ] + pmd_masked_fasta = ch_input_from_referencesheet.pmd_masked_fasta // [ meta, pmd_masked_fasta ] + pmd_bed_for_masking = ch_input_from_referencesheet.pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] snp_capture_bed = ch_input_from_referencesheet.snp_bed // [ meta, capture_bed ] pileupcaller_snp = ch_input_from_referencesheet.pileupcaller_snp // [ meta, pileupcaller_snp, pileupcaller_bed ] sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 43ebbf262..161d8e91b 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -81,37 +81,40 @@ workflow REFERENCE_INDEXING_SINGLE { meta, fasta, fai, dict, mapper_index -> //TODO: add params for snpcapturebed, snpeigenstrat and sexdetbed once implemented def contamination_estimation_angsd_hapmap = params.contamination_estimation_angsd_hapmap != null ? file( params.contamination_estimation_angsd_hapmap, checkIfExists: true ) : "" - def pmd_mask = params.damage_manipulation_pmdtools_reference_mask != null ? file(params.damage_manipulation_pmdtools_reference_mask, checkIfExists: true ) : "" + def pmd_masked_fasta = params.damage_manipulation_pmdtools_masked_reference != null ? file(params.damage_manipulation_pmdtools_masked_reference, checkIfExists: true ) : "" + def pmd_bed_for_masking = params.damage_manipulation_pmdtools_reference_mask != null ? file(params.damage_manipulation_pmdtools_reference_mask, checkIfExists: true ) : "" def capture_bed = params.snpcapture_bed != null ? file(params.snpcapture_bed, checkIfExists: true ) : "" def pileupcaller_bed = "" def pileupcaller_snp = "" def sexdet_bed = "" def bedtools_feature = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" - [ meta, fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature ] + [ meta, fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature ] } ch_ref_index_single = ch_reference_for_mapping .multiMap{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature -> - reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] - mito_header: [ meta, mitochondrion_header ] - hapmap: [ meta, contamination_estimation_angsd_hapmap ] - pmd_mask: [ meta, pmd_mask, capture_bed ] - snp_bed: [ meta, capture_bed ] - pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] - sexdeterrmine_bed: [ meta, sexdet_bed ] - bedtools_feature: [ meta, bedtools_feature ] + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature -> + reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] + mito_header: [ meta, mitochondrion_header ] + hapmap: [ meta, contamination_estimation_angsd_hapmap ] + pmd_masked_fasta: [ meta, pmd_masked_fasta ] + pmd_bed_for_masking: [ meta, pmd_bed_for_masking ] + snp_bed: [ meta, capture_bed ] + pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed: [ meta, sexdet_bed ] + bedtools_feature: [ meta, bedtools_feature ] } emit: - reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex, circular_target ] - mitochondrion_header = ch_ref_index_single.mito_header // [ meta, mito_header ] - hapmap = ch_ref_index_single.hapmap // [ meta, hapmap ] - pmd_mask = ch_ref_index_single.pmd_mask // [ meta, pmd_mask, capture_bed ] - snp_capture_bed = ch_ref_index_single.snp_bed // [ meta, capture_bed ] - pileupcaller_snp = ch_ref_index_single.pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] - sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] - bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] + reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex, circular_target ] + mitochondrion_header = ch_ref_index_single.mito_header // [ meta, mito_header ] + hapmap = ch_ref_index_single.hapmap // [ meta, hapmap ] + pmd_masked_fasta = ch_ref_index_single.pmd_masked_fasta // [ meta, pmd_masked_fasta ] + pmd_bed_for_masking = ch_ref_index_single.pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] + snp_capture_bed = ch_ref_index_single.snp_bed // [ meta, capture_bed ] + pileupcaller_snp = ch_ref_index_single.pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] + bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] versions = ch_versions } diff --git a/workflows/eager.nf b/workflows/eager.nf index 57a4f33e1..89a39b252 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -525,10 +525,9 @@ workflow EAGER { // // SUBWORKFLOW: aDNA Damage Manipulation - // TODO: Add pmd_mask and snp_capture input if ( params.run_mapdamage_rescaling || params.run_pmd_filtering || params.run_trim_bam ) { - MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta ) + MANIPULATE_DAMAGE( ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking ) ch_multiqc_files = ch_multiqc_files.mix( MANIPULATE_DAMAGE.out.flagstat.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MANIPULATE_DAMAGE.out.versions ) ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams