Merge pull request #62 from cidgoh/multiqc_config

add customized multiqc config file
cidgoh · Jun 25, 2024 · dd06226 · dd06226
2 parents 206152f + 5641947
commit dd06226
Show file tree

Hide file tree

Showing 28 changed files with 710 additions and 366 deletions.
diff --git a/.github/workflows/awsfulltest.yml → .github/workflows/archive/awsfulltest.yml b/.github/workflows/awsfulltest.yml → .github/workflows/archive/awsfulltest.yml
diff --git a/.github/workflows/awstest.yml → .github/workflows/archive/awstest.yml b/.github/workflows/awstest.yml → .github/workflows/archive/awstest.yml
diff --git a/.github/workflows/branch.yml → .github/workflows/archive/branch.yml b/.github/workflows/branch.yml → .github/workflows/archive/branch.yml
diff --git a/.github/workflows/ci.yml → .github/workflows/archive/ci.yml b/.github/workflows/ci.yml → .github/workflows/archive/ci.yml
diff --git a/.github/workflows/clean-up.yml → .github/workflows/archive/clean-up.yml b/.github/workflows/clean-up.yml → .github/workflows/archive/clean-up.yml
diff --git a/.github/workflows/download_pipeline.yml → ...b/workflows/archive/download_pipeline.yml b/.github/workflows/download_pipeline.yml → ...b/workflows/archive/download_pipeline.yml
diff --git a/.github/workflows/fix-linting.yml → .github/workflows/archive/fix-linting.yml b/.github/workflows/fix-linting.yml → .github/workflows/archive/fix-linting.yml
diff --git a/.github/workflows/linting.yml → .github/workflows/archive/linting.yml b/.github/workflows/linting.yml → .github/workflows/archive/linting.yml
diff --git a/.github/workflows/linting_comment.yml → ...hub/workflows/archive/linting_comment.yml b/.github/workflows/linting_comment.yml → ...hub/workflows/archive/linting_comment.yml
diff --git a/.github/workflows/release-announcements.yml → ...rkflows/archive/release-announcements.yml b/.github/workflows/release-announcements.yml → ...rkflows/archive/release-announcements.yml
diff --git a/README.md b/README.md
@@ -11,6 +11,8 @@
 [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
 
+# BACPAQ
+
 ## Introduction
 
 **`bacpaq`** is a bioinformatics best-practice pipeline for bacterial genomic analysis for short-reads (Illumina) and long-reads (Oxford Nanopore) sequencing data. Currently `bacpaq` supports WGS-based analyses, however, we plan to integrate Microbiome (Amplicon and Shotgun Metagenomics) analyses in future.

diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml
@@ -1,13 +1,13 @@
-id: "nf-core-bacpaq-methods-description"
+id: "cidgoh-bacpaq-methods-description"
 description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication."
-section_name: "nf-core/bacpaq Methods Description"
-section_href: "https://github.com/nf-core/bacpaq"
+section_name: "cidgoh/bacpaq Methods Description"
+section_href: "https://github.com/cidgoh/bacpaq"
 plot_type: "html"
 ## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline
 ## You can inject any metadata available in the Nextflow '${workflow}' object
 data: |
   <h4>Methods</h4>
-  <p>Data was processed using nf-core/bacpaq v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (<a href="https://doi.org/10.1038/s41587-020-0439-x">Ewels <em>et al.</em>, 2020</a>), utilising reproducible software environments from the Bioconda (<a href="https://doi.org/10.1038/s41592-018-0046-7">Grüning <em>et al.</em>, 2018</a>) and Biocontainers (<a href="https://doi.org/10.1093/bioinformatics/btx192">da Veiga Leprevost <em>et al.</em>, 2017</a>) projects.</p>
+  <p>Data was processed using cidgoh/bacpaq v${workflow.manifest.version} ${doi_text} built using the nf-core template of workflows (<a href="https://doi.org/10.1038/s41587-020-0439-x">Ewels <em>et al.</em>, 2020</a>), utilising reproducible software environments from the Bioconda (<a href="https://doi.org/10.1038/s41592-018-0046-7">Grüning <em>et al.</em>, 2018</a>) and Biocontainers (<a href="https://doi.org/10.1093/bioinformatics/btx192">da Veiga Leprevost <em>et al.</em>, 2017</a>) projects.</p>
   <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (<a href="https://doi.org/10.1038/nbt.3820">Di Tommaso <em>et al.</em>, 2017</a>) with the following command:</p>
   <pre><code>${workflow.commandLine}</code></pre>
   <p>${tool_citations}</p>

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,15 +1,228 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/nf-core/bacpaq/tree/dev" target="_blank">nf-core/bacpaq</a>
-  analysis pipeline. For information about how to interpret these results, please see the
-  <a href="https://nf-co.re/bacpaq/dev/docs/output" target="_blank">documentation</a>.
+  bacpaq is a bioinformatics best-practice pipeline for bacterial genomic analysis for short-reads (Illumina) and long-reads (Oxford Nanopore) sequencing data. Currently bacpaq supports WGS-based analyses, however, we plan to integrate Microbiome (Amplicon and Shotgun Metagenomics) analyses in future.
+
 report_section_order:
-  "nf-core-bacpaq-methods-description":
+  "cidgoh-bacpaq-methods-description":
     order: -1000
   software_versions:
     order: -1001
-  "nf-core-bacpaq-summary":
+  "cidgoh-bacpaq-summary":
     order: -1002
 
 export_plots: true
 
 disable_version_detection: true
+data_format: "yaml"
+
+# Modules MultiQC is allowed to run. Each module is listed once; showing FastQC
+# as two report sections is handled by the two "fastqc" entries in top_modules.
+run_modules:
+  - fastqc
+  - fastp
+  - trimmomatic
+  - porechop
+  - kraken
+  - bracken
+  - quast
+  - prokka
+  - bakta
+  - busco
+
+# Module order
+# Each entry names a MultiQC module plus per-section overrides (display name,
+# description, and glob filters selecting which files feed that section).
+# "fastqc" appears twice so raw and preprocessed reads get separate sections.
+top_modules:
+  # FastQC on raw reads: every FastQC file except those matching *trimmed*.
+  - "fastqc":
+      name: "FastQC: raw reads"
+      path_filters_exclude:
+        - "*trimmed*"
+  # fastp section, fed only by files matching *fastp.json.
+  - "fastp":
+      name: "FastP"
+      info: "Read preprocessing."
+      path_filters:
+        - "*fastp.json"
+  # Trimmomatic section, fed only by files matching *trimmomatic.log.
+  - "trimmomatic":
+      name: "Trimmomatic"
+      info: "Adapter trimming for Illumina reads."
+      path_filters:
+        - "*trimmomatic.log"
+  # Porechop section, fed only by files matching *porechop.log.
+  - "porechop":
+      name: "Porechop"
+      info: "Adapter trimming for Oxford Nanopore reads."
+      path_filters:
+        - "*porechop.log"
+  # Second FastQC section: only files matching *trimmed*. A distinct anchor is
+  # set so it does not share an anchor with the raw-reads section.
+  - "fastqc":
+      name: "FastQC: after preprocessing"
+      anchor: "FastQC_trimmed"
+      info: "After trimming and, if requested, contamination removal."
+      path_filters:
+        - "*trimmed*"
+  # Kraken2 section, fed only by files matching *.kraken2.report.txt.
+  - "kraken":
+      name: "Kraken2"
+      anchor: "Kraken2"
+      target: "Kraken2"
+      # Kraken 2 publication (Wood et al., 2019, Genome Biology).
+      doi: "10.1186/s13059-019-1891-0"
+      path_filters:
+        - "*.kraken2.report.txt"
+      # NOTE(review): top_n is usually set under a top-level `kraken:` key;
+      # confirm it is honoured inside a top_modules entry.
+      top_n: 10
+  # Centrifuge section: reuses the "kraken" module because Centrifuge emits
+  # Kraken-style reports; fed only by files matching *.centrifuge_kreport.txt.
+  - "kraken":
+      name: "Centrifuge"
+      anchor: "centrifuge"
+      target: "Centrifuge"
+      doi: "10.1101/gr.210641.116"
+      # The note about the plot title lives in `extra` below, so `info` ends
+      # after the tool description.
+      info: "is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index."
+      extra: "ℹ️: plot title will say Kraken2 due to Centrifuge producing the same output format as Kraken. If activated, see the actual Kraken2 results in the section above."
+      path_filters:
+        - "*.centrifuge_kreport.txt"
+  # Bracken section, fed only by files matching *.bracken.report.tsv.
+  - "bracken":
+      name: "bracken"
+      anchor: "bracken"
+      target: "bracken"
+      info: "Estimates the abundance of species in metagenomic samples."
+      # NOTE(review): contents_re / num_lines look like search-pattern (`sp`)
+      # options rather than section options; confirm they take effect here.
+      contents_re: ^(\d{1,3}\.\d{1,2})\t(\d+)\t(\d+)\t((\d+)\t(\d+)\t)?([URDKPCOFGS-]\d{0,2})\t(\d+)(\s+)root
+      num_lines: 1
+      path_filters:
+        - "*.bracken.report.tsv"
+  # QUAST section: every QUAST file except those matching *rawassemblies.tsv.
+  - "quast":
+      name: "QUAST: assembly"
+      info: "Assembly statistics of binned assemblies."
+      path_filters_exclude:
+        - "*rawassemblies.tsv"
+      # NOTE(review): contents / num_lines look like search-pattern (`sp`)
+      # options; confirm they take effect inside a top_modules entry.
+      contents: "Assembly\t"
+      num_lines: 2
+
+  # Prokka annotation section.
+  - "prokka":
+      name: "Prokka"
+      info: "Prokka is a software tool for the rapid annotation of prokaryotic genomes."
+      # NOTE(review): contents / fn look like search-pattern (`sp`) options;
+      # confirm they take effect inside a top_modules entry.
+      contents: "contigs:"
+      fn: "*.txt"
+
+  # Bakta annotation section.
+  - "bakta":
+      name: "Bakta"
+      info: "Bakta is a software tool for the rapid annotation of prokaryotic genomes."
+      # NOTE(review): contents / fn look like search-pattern (`sp`) options;
+      # confirm they take effect inside a top_modules entry.
+      contents: "Bakta:"
+      fn: "*.txt"
+
+  # BUSCO completeness section; `info` is shown to report readers.
+  - "busco":
+      info: "assesses genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs. In case BUSCO's automated lineage selection was used, only generic results for the selected domain are shown and only for genome bins and kept, unbinned contigs for which the BUSCO analysis was successful, i.e. not for contigs for which no BUSCO genes could be found. Bins for which a specific virus lineage was selected are also not shown."
+      # NOTE(review): fn / contents / num_lines look like search-pattern (`sp`)
+      # options; confirm they take effect inside a top_modules entry.
+      fn: short_summary*
+      contents: "BUSCO version is:"
+      num_lines: 1
+
+# Search-pattern overrides: filename regexes used to find each module's inputs.
+sp:
+  kraken:
+    # Alternation must be a group. "[kraken2|centrifuge]" is a character class
+    # that matches any ONE of those characters, so nearly every *report.txt
+    # file would be picked up. Single quotes keep the backslash literal.
+    fn_re: '.*(kraken2|centrifuge).*report\.txt'
+  quast:
+    fn_re: "report.*.tsv"
+
+# clean names
+# File-name suffixes used when deriving sample names from input files.
+fn_clean_exts:
+  - ".gz"
+  - ".fastq"

+# Extra regex-based rule: the pattern matches "_T" or "_TT" and everything
+# that follows it in the name.
+extra_fn_clean_exts:
+  - type: regex
+    pattern: "_T{1,2}.*"
+
+## Prettification
+# Link target and hover title for the report logo.
+# NOTE(review): no `custom_logo` image path is set in this file; confirm
+# whether the URL/title have any effect without one.
+custom_logo_url: https://github.com/cidgoh/bacpaq/
+custom_logo_title: "cidgoh/bacpaq"

+## Report Title
+title: "BACPAQ"

+## Tool specific configuration
+# Presumably makes Prokka sample names come from file names; verify against
+# the MultiQC Prokka module documentation.
+prokka_fn_snames: True
+
+## General Stats customisation
+# Column visibility per section (True = shown, False = hidden). A bare boolean
+# instead of a mapping applies to every column of that section.
+table_columns_visible:
+  "FastQC: raw reads":
+    avg_sequence_length: True
+  "FastQC: after preprocessing":
+    avg_sequence_length: True
+  "fastp":
+    pct_duplication: False
+    after_filtering_q30_rate: False
+    after_filtering_q30_bases: False
+    # Visibility is a boolean; the numeric ordering weight for this column
+    # belongs in table_columns_placement only.
+    filtering_result_passed_filter_reads: True
+    after_filtering_gc_content: False
+    pct_surviving: True
+    pct_adapter: True
+  "Kraken2": True
+  "Centrifuge": True
+  "QUAST: assembly":
+    N75: True
+    L50: True
+    L75: True
+    "Largest contig": True
+    "Total length": True
+    N50: True
+  "Prokka": True
+  "Bakta": True
+
+# Numeric ordering weights for General Stats columns, grouped by section.
+# Weights rise from the read-QC sections (1000s) to annotation (20000s).
+# NOTE(review): Prokka and Bakta use identical weights, and the Kraken2 column
+# is keyed "% Top 5" although that section sets top_n: 10; confirm intended.
+table_columns_placement:
+  "FastQC: raw reads":
+    percent_duplicates: 1000
+    percent_gc: 1100
+    avg_sequence_length: 1200
+    median_sequence_length: 1300
+    total_sequences: 1400
+    percent_fails: 1500
+  "FastQC: after preprocessing":
+    percent_duplicates: 2000
+    percent_gc: 2100
+    avg_sequence_length: 2200
+    median_sequence_length: 2300
+    total_sequences: 2400
+    percent_fails: 2500
+  "fastp":
+    pct_duplication: 3000
+    after_filtering_q30_rate: 3100
+    after_filtering_q30_bases: 3200
+    filtering_result_passed_filter_reads: 3300
+    after_filtering_gc_content: 3400
+    pct_surviving: 3500
+    pct_adapter: 3600
+  "Kraken2":
+    "% root": 8000
+    "% Top 5": 8100
+    "% Unclassified": 8200
+  "Centrifuge":
+    "% root": 9000
+    "% Top 5": 9100
+    "% Unclassified": 9200
+  "QUAST: assembly":
+    "N50": 10000
+    "Total length": 11000
+  Prokka:
+    contigs: 20000
+    bases: 21000
+    CDS: 22000
+    organism: 23000
+    plasmid: 24000
+  Bakta:
+    contigs: 20000
+    bases: 21000
+    CDS: 22000
+    organism: 23000
+    plasmid: 24000
+
+# Column title overrides: the two FastQC sections expose the same column IDs,
+# so each title is suffixed with "(raw)" or "(processed)" to tell them apart.
+table_columns_name:
+  "FastQC: raw reads":
+    percent_duplicates: "% Dups (raw)"
+    percent_gc: "% GC (raw)"
+    avg_sequence_length: "Avg. length (raw)"
+    median_sequence_length: "Median length (raw)"
+    total_sequences: "M Seqs (raw)"
+    percent_fails: "% Fails (raw)"
+  "FastQC: after preprocessing":
+    percent_duplicates: "% Dups (processed)"
+    percent_gc: "% GC (processed)"
+    avg_sequence_length: "Avg. length (processed)"
+    median_sequence_length: "Median length (processed)"
+    total_sequences: "M Seqs (processed)"
+    percent_fails: "% Fails (processed)"
+
+# Header overrides for the general_stats_table: marks "Total length" and N50
+# as hidden.
+# NOTE(review): table_columns_visible sets these same two columns to True for
+# "QUAST: assembly"; confirm which setting is meant to win.
+custom_table_header_config:
+  general_stats_table:
+    "Total length":
+      hidden: True
+    N50:
+      hidden: True