Skip to content

Commit

Permalink
Merge pull request #9 from phac-nml/assembly
Browse files Browse the repository at this point in the history
Adding Assembly Stub Output and Generating IRIDA Next-Compliant JSON Output
  • Loading branch information
apetkau authored Nov 8, 2023
2 parents 8117887 + bfdb364 commit d230c4d
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 20 deletions.
19 changes: 17 additions & 2 deletions bin/irida-next-output.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ def main(argv=None):
epilog="Example: python irida-next-output.py --json-output output.json *.json *.json.gz",
)
parser.add_argument("files", nargs="+")
parser.add_argument(
"--summary-file",
action="store",
dest="summary_file",
type=str,
help="pipeline summary file",
default=None,
required=True,
)
parser.add_argument(
"--json-output",
action="store",
Expand All @@ -41,6 +50,9 @@ def main(argv=None):
sys.stderr.write(f"Error: --json-output [{json_output_file}] exists")
return 1

# Not checking for the existence of the summary file
# because the path may be relative to the outdir, which we don't have here.

input_files = args.files
if isinstance(input_files, str):
input_files = [input_files]
Expand All @@ -55,7 +67,10 @@ def main(argv=None):
},
}

output_metadata = {"files": {"samples": {}}, "metadata": {"samples": {}}}
output_metadata = {
"files": {"global": [{"path": str(args.summary_file)}], "samples": {}},
"metadata": {"samples": {}},
}

for f in input_files:
_open = get_open(f)
Expand All @@ -64,7 +79,7 @@ def main(argv=None):
output_metadata["files"]["samples"] |= sample_metadata["files"]["samples"]
output_metadata["metadata"]["samples"] |= sample_metadata["metadata"]["samples"]

data_json = json.dumps(output_metadata, indent=4)
data_json = json.dumps(output_metadata, sort_keys=True, indent=4)
_open = get_open(json_output_file)
with _open(json_output_file, "wt") as oh:
oh.write(data_json)
Expand Down
77 changes: 77 additions & 0 deletions bin/simplify_irida_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python

import json
import argparse
import sys
import gzip
from mimetypes import guess_type
from functools import partial
from pathlib import Path


def flatten_dictionary(dictionary):
    """Flatten nested dicts/lists into a single-level dict with dotted keys.

    Nested dictionary keys are joined with ".", and list elements are
    indexed starting at 1, e.g. ``{"a": [{"b": 1}]}`` -> ``{"a.1.b": 1}``.

    :param dictionary: the (possibly nested) dictionary to flatten
    :return: a new flat dictionary mapping dotted key paths to leaf values
    """
    result = {}

    def flatten(item, name=""):
        if isinstance(item, dict):
            for key, value in item.items():
                flatten(value, f"{name}{key}.")
        elif isinstance(item, list):
            # 1-based indices because biologists count from 1.
            for index, element in enumerate(item, start=1):
                flatten(element, f"{name}{index}.")
        else:
            # [:-1] drops the "." appended by the previous recursion level.
            result[name[:-1]] = item

    flatten(dictionary)
    return result


def main():
    """Simplify an IRIDA Next JSON file by flattening per-sample metadata.

    Reads the input JSON (plain or gzip-compressed), flattens each
    sample's metadata dictionary with flatten_dictionary(), and writes
    the simplified JSON to the path given by --json-output.

    :return: 0 on success, 1 if the output file already exists
    """
    parser = argparse.ArgumentParser(
        description="Simplifies JSON files for use with IRIDA Next",
        epilog="Example: python simplify_irida_json.py --json-output output.json input.json",
    )
    parser.add_argument("input")
    parser.add_argument(
        "--json-output",
        action="store",
        dest="json_output",
        type=str,
        help="JSON output file",
        default=None,
        required=True,
    )

    args = parser.parse_args()

    json_output_location = Path(args.json_output)
    if json_output_location.exists():
        # Bug fix: the f-prefix was missing, so the message printed the
        # literal "{json_output_location}" instead of the path.
        sys.stderr.write(f"Error: --json-output [{json_output_location}] exists!\n")
        return 1

    json_input_file = args.input

    # Handle both gzip-compressed and plain-text input transparently;
    # guess_type() reports "gzip" for *.gz filenames.
    encoding = guess_type(json_input_file)[1]
    open_file = partial(gzip.open, mode="rt") if encoding == "gzip" else open

    with open_file(json_input_file) as input_file:
        input_json = json.load(input_file)

    # Flatten each sample's (possibly nested) metadata dictionary in place.
    samples = input_json["metadata"]["samples"]
    for sample in samples:
        samples[sample] = flatten_dictionary(samples[sample])

    json_data = json.dumps(input_json, sort_keys=True, indent=4)
    with open(json_output_location, "w") as output_file:
        output_file.write(json_data)

    print("Output written to " + str(json_output_location) + "!")

    return 0


# Allow use both as a command-line script and as an importable module;
# main()'s return code becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
28 changes: 28 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@

process {

// Publish directory names
assembly_directory_name = "assembly"
summary_directory_name = "summary"

publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
Expand All @@ -26,6 +30,30 @@ process {
]
}

withName: ASSEMBLY_STUB {
publishDir = [
path: { ["${params.outdir}", "${task.assembly_directory_name}"].join(File.separator) },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: GENERATE_SUMMARY {
publishDir = [
path: { ["${params.outdir}", "${task.summary_directory_name}"].join(File.separator) },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: IRIDA_NEXT_OUTPUT {
publishDir = [
path: { "${params.outdir}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
Expand Down
33 changes: 33 additions & 0 deletions modules/local/assembly_stub/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Stub assembly process: writes a fixed two-record FASTA per sample so
// downstream steps (sample JSON generation, summary) can be exercised
// without running a real assembler.
process ASSEMBLY_STUB {
tag "$meta.id"
label 'process_single'

container 'docker.io/python:3.9.17'

input:
// meta: sample metadata map (meta.id used below); reads: paired FASTQs
tuple val(meta), path(reads)

output:
// gzip -n (below) omits the embedded timestamp so output is reproducible
tuple val(meta), path("*.assembly.fa.gz"), emit: assembly
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
cat <<-EOF > ${prefix}.assembly.fa
>${meta.id}-stub-assembly
ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT
EOF
gzip -n ${prefix}.assembly.fa
cat <<-END_VERSIONS > versions.yml
"${task.process}":
assembly_stub : 0.1.0.dev0
END_VERSIONS
"""
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
process SAMPLE_METADATA {
process GENERATE_SAMPLE_JSON {
tag "$meta.id"
label 'process_single'

container 'docker.io/python:3.9.17'

input:
tuple val(meta), path(reads)
tuple val(meta), path(reads), path(assembly)

output:
tuple val(meta), path("*.json.gz"), emit: json
Expand All @@ -17,18 +17,23 @@ process SAMPLE_METADATA {
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def assembly_path = ["${task.assembly_directory_name}", "${assembly}"].join(File.separator)
"""
reads_1=`basename ${reads[0]}`
reads_2=`basename ${reads[1]}`
cat <<-EOF > "${meta.id}.json"
{
"files": {
"samples": {}
"samples": {
"${meta.id}": [
{
"path": "${assembly_path}"
}
]
}
},
"metadata": {
"samples": {
"${meta.id}": {
"reads": ["\${reads_1}", "\${reads_2}"]
"reads": ["${reads[0]}", "${reads[1]}"]
}
}
}
Expand All @@ -38,7 +43,7 @@ process SAMPLE_METADATA {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
irida-next-output : 0.1.0.dev0
generate_sample_json : 0.1.0.dev0
END_VERSIONS
"""
}
38 changes: 38 additions & 0 deletions modules/local/generate_summary/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Collates per-sample read/assembly information into a single
// gzip-compressed, human-readable summary (summary.txt.gz) for the run.
process GENERATE_SUMMARY {
label 'process_single'
container 'docker.io/python:3.9.17'

input:
// List of [meta, [reads_1, reads_2], assembly] tuples, one per sample.
val summaries

output:
path("summary.txt.gz"), emit: summary
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
// sort(false) returns a sorted copy instead of mutating the input list
// (Groovy's sort{} mutates in place by default); sorting by sample id
// keeps the summary deterministic across runs.
def sorted_summaries = summaries.sort(false) { it[0].id }

// Generate summary text:
def summary_text = "IRIDANEXT-EXAMPLE-NF Pipeline Summary\n\nSUCCESS!\n"

// TODO: Consider the possibility of code injection.
// Should probably be moved to file processing through Python.
for (summary in sorted_summaries) {
summary_text += "\n${summary[0].id}:\n"
summary_text += "    reads.1: ${summary[1][0]}\n"
summary_text += "    reads.2: ${summary[1][1]}\n"
summary_text += "    assembly: ${summary[2]}\n"
}

// 'def' was missing here: an undeclared variable leaks into the shared
// script binding instead of staying local to this task.
def version_text = "\"${task.process}\":\n    generate_summary : 0.1.0.dev0"

"""
echo "${summary_text}" > summary.txt
gzip -n summary.txt
echo "${version_text}" > versions.yml
"""
}
5 changes: 3 additions & 2 deletions modules/local/irida-next-output/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ process IRIDA_NEXT_OUTPUT {
path(samples_data)

output:
path("output.json.gz"), emit: output_json
path("iridanext.output.json.gz"), emit: output_json
path "versions.yml", emit: versions

when:
Expand All @@ -19,7 +19,8 @@ process IRIDA_NEXT_OUTPUT {
"""
irida-next-output.py \\
$args \\
--json-output output.json.gz \\
--summary-file ${task.summary_directory_name}/summary.txt.gz \\
--json-output iridanext.output.json.gz \\
${samples_data}
cat <<-END_VERSIONS > versions.yml
Expand Down
33 changes: 33 additions & 0 deletions modules/local/simplify_irida_json/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Runs simplify_irida_json.py to flatten per-sample metadata in an
// IRIDA Next JSON file, then gzips the result.
process SIMPLIFY_IRIDA_JSON {
tag "$meta.id"
label 'process_single'

container 'docker.io/python:3.9.17'

input:
// meta: sample metadata map; json: the per-sample IRIDA Next JSON file
tuple val(meta), path(json)

output:
tuple val(meta), path("*.simple.json.gz") , emit: simple_json
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
// Use ${prefix} (honours task.ext.prefix overrides) instead of the
// previously hard-coded ${meta.id}; gzip -n omits the timestamp so the
// compressed output is reproducible (consistent with ASSEMBLY_STUB).
"""
simplify_irida_json.py \\
$args \\
--json-output ${prefix}.simple.json \\
${json}
gzip -n ${prefix}.simple.json
cat <<-END_VERSIONS > versions.yml
"${task.process}":
simplify_irida_json : 0.1.0.dev0
END_VERSIONS
"""
}
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ params {
validationSchemaIgnoreParams = 'genomes,igenomes_base'
validationShowHiddenParams = false
validate_params = true

}

// Load base.config by default for all pipelines
Expand Down
3 changes: 2 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -244,5 +244,6 @@
{
"$ref": "#/definitions/generic_options"
}
]
],
"properties": {}
}
Loading

0 comments on commit d230c4d

Please sign in to comment.