Skip to content

Commit

Permalink
Merge pull request #9 from phac-nml/assembly
Browse files Browse the repository at this point in the history
Adding Assembly Stub Output and Generating IRIDA Next-Compliant JSON Output
  • Loading branch information
apetkau authored Nov 8, 2023
2 parents 8117887 + bfdb364 commit d230c4d
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 20 deletions.
19 changes: 17 additions & 2 deletions bin/irida-next-output.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ def main(argv=None):
epilog="Example: python irida-next-output.py --json-output output.json *.json *.json.gz",
)
parser.add_argument("files", nargs="+")
parser.add_argument(
"--summary-file",
action="store",
dest="summary_file",
type=str,
help="pipeline summary file",
default=None,
required=True,
)
parser.add_argument(
"--json-output",
action="store",
Expand All @@ -41,6 +50,9 @@ def main(argv=None):
sys.stderr.write(f"Error: --json-output [{json_output_file}] exists")
return 1

# Not checking for the existence of the summary file
# because the path may be relative to the outdir, which we don't have here.

input_files = args.files
if isinstance(input_files, str):
input_files = [input_files]
Expand All @@ -55,7 +67,10 @@ def main(argv=None):
},
}

output_metadata = {"files": {"samples": {}}, "metadata": {"samples": {}}}
output_metadata = {
"files": {"global": [{"path": str(args.summary_file)}], "samples": {}},
"metadata": {"samples": {}},
}

for f in input_files:
_open = get_open(f)
Expand All @@ -64,7 +79,7 @@ def main(argv=None):
output_metadata["files"]["samples"] |= sample_metadata["files"]["samples"]
output_metadata["metadata"]["samples"] |= sample_metadata["metadata"]["samples"]

data_json = json.dumps(output_metadata, indent=4)
data_json = json.dumps(output_metadata, sort_keys=True, indent=4)
_open = get_open(json_output_file)
with _open(json_output_file, "wt") as oh:
oh.write(data_json)
Expand Down
77 changes: 77 additions & 0 deletions bin/simplify_irida_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python

import json
import argparse
import sys
import gzip
from mimetypes import guess_type
from functools import partial
from pathlib import Path


def flatten_dictionary(dictionary):
    """Flatten nested dicts/lists into a single-level dict with dotted keys.

    Nested dictionary keys are joined with ".", and list elements are
    indexed starting at 1, e.g. ``{"a": [{"b": 1}]}`` -> ``{"a.1.b": 1}``.

    :param dictionary: the (possibly nested) dictionary to flatten
    :return: a new flat dictionary mapping dotted key paths to leaf values
    """
    result = {}

    def flatten(item, name=""):
        if isinstance(item, dict):
            for key, value in item.items():
                flatten(value, f"{name}{key}.")
        elif isinstance(item, list):
            # 1-based indices because biologists count from 1.
            for index, element in enumerate(item, start=1):
                flatten(element, f"{name}{index}.")
        else:
            # [:-1] drops the "." appended by the previous recursion level.
            result[name[:-1]] = item

    flatten(dictionary)
    return result


def main():
    """Simplify an IRIDA Next JSON file by flattening per-sample metadata.

    Reads the input JSON (plain or gzip-compressed), flattens each
    sample's metadata dictionary with flatten_dictionary(), and writes
    the simplified JSON to the path given by --json-output.

    :return: 0 on success, 1 if the output file already exists
    """
    parser = argparse.ArgumentParser(
        description="Simplifies JSON files for use with IRIDA Next",
        epilog="Example: python simplify_irida_json.py --json-output output.json input.json",
    )
    parser.add_argument("input")
    parser.add_argument(
        "--json-output",
        action="store",
        dest="json_output",
        type=str,
        help="JSON output file",
        default=None,
        required=True,
    )

    args = parser.parse_args()

    json_output_location = Path(args.json_output)
    if json_output_location.exists():
        # Bug fix: the f-prefix was missing, so the message printed the
        # literal "{json_output_location}" instead of the path.
        sys.stderr.write(f"Error: --json-output [{json_output_location}] exists!\n")
        return 1

    json_input_file = args.input

    # Handle both gzip-compressed and plain-text input transparently;
    # guess_type() reports "gzip" for *.gz filenames.
    encoding = guess_type(json_input_file)[1]
    open_file = partial(gzip.open, mode="rt") if encoding == "gzip" else open

    with open_file(json_input_file) as input_file:
        input_json = json.load(input_file)

    # Flatten each sample's (possibly nested) metadata dictionary in place.
    samples = input_json["metadata"]["samples"]
    for sample in samples:
        samples[sample] = flatten_dictionary(samples[sample])

    json_data = json.dumps(input_json, sort_keys=True, indent=4)
    with open(json_output_location, "w") as output_file:
        output_file.write(json_data)

    print("Output written to " + str(json_output_location) + "!")

    return 0


# Allow use both as a command-line script and as an importable module;
# main()'s return code becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
28 changes: 28 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@

process {

// Publish directory names
assembly_directory_name = "assembly"
summary_directory_name = "summary"

publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
Expand All @@ -26,6 +30,30 @@ process {
]
}

withName: ASSEMBLY_STUB {
publishDir = [
path: { ["${params.outdir}", "${task.assembly_directory_name}"].join(File.separator) },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: GENERATE_SUMMARY {
publishDir = [
path: { ["${params.outdir}", "${task.summary_directory_name}"].join(File.separator) },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: IRIDA_NEXT_OUTPUT {
publishDir = [
path: { "${params.outdir}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
Expand Down
33 changes: 33 additions & 0 deletions modules/local/assembly_stub/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Stub assembly process: writes a fixed two-record FASTA per sample so
// downstream steps (sample JSON generation, summary) can be exercised
// without running a real assembler.
process ASSEMBLY_STUB {
tag "$meta.id"
label 'process_single'

container 'docker.io/python:3.9.17'

input:
// meta: sample metadata map (meta.id used below); reads: paired FASTQs
tuple val(meta), path(reads)

output:
// gzip -n (below) omits the embedded timestamp so output is reproducible
tuple val(meta), path("*.assembly.fa.gz"), emit: assembly
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
cat <<-EOF > ${prefix}.assembly.fa
>${meta.id}-stub-assembly
ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT
EOF
gzip -n ${prefix}.assembly.fa
cat <<-END_VERSIONS > versions.yml
"${task.process}":
assembly_stub : 0.1.0.dev0
END_VERSIONS
"""
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
process SAMPLE_METADATA {
process GENERATE_SAMPLE_JSON {
tag "$meta.id"
label 'process_single'

container 'docker.io/python:3.9.17'

input:
tuple val(meta), path(reads)
tuple val(meta), path(reads), path(assembly)

output:
tuple val(meta), path("*.json.gz"), emit: json
Expand All @@ -17,18 +17,23 @@ process SAMPLE_METADATA {
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def assembly_path = ["${task.assembly_directory_name}", "${assembly}"].join(File.separator)
"""
reads_1=`basename ${reads[0]}`
reads_2=`basename ${reads[1]}`
cat <<-EOF > "${meta.id}.json"
{
"files": {
"samples": {}
"samples": {
"${meta.id}": [
{
"path": "${assembly_path}"
}
]
}
},
"metadata": {
"samples": {
"${meta.id}": {
"reads": ["\${reads_1}", "\${reads_2}"]
"reads": ["${reads[0]}", "${reads[1]}"]
}
}
}
Expand All @@ -38,7 +43,7 @@ process SAMPLE_METADATA {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
irida-next-output : 0.1.0.dev0
generate_sample_json : 0.1.0.dev0
END_VERSIONS
"""
}
38 changes: 38 additions & 0 deletions modules/local/generate_summary/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Collates per-sample read/assembly information into a single
// gzip-compressed, human-readable summary (summary.txt.gz) for the run.
process GENERATE_SUMMARY {
label 'process_single'
container 'docker.io/python:3.9.17'

input:
// List of [meta, [reads_1, reads_2], assembly] tuples, one per sample.
val summaries

output:
path("summary.txt.gz"), emit: summary
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
// sort(false) returns a sorted copy instead of mutating the input list
// (Groovy's sort{} mutates in place by default); sorting by sample id
// keeps the summary deterministic across runs.
def sorted_summaries = summaries.sort(false) { it[0].id }

// Generate summary text:
def summary_text = "IRIDANEXT-EXAMPLE-NF Pipeline Summary\n\nSUCCESS!\n"

// TODO: Consider the possibility of code injection.
// Should probably be moved to file processing through Python.
for (summary in sorted_summaries) {
summary_text += "\n${summary[0].id}:\n"
summary_text += "    reads.1: ${summary[1][0]}\n"
summary_text += "    reads.2: ${summary[1][1]}\n"
summary_text += "    assembly: ${summary[2]}\n"
}

// 'def' was missing here: an undeclared variable leaks into the shared
// script binding instead of staying local to this task.
def version_text = "\"${task.process}\":\n    generate_summary : 0.1.0.dev0"

"""
echo "${summary_text}" > summary.txt
gzip -n summary.txt
echo "${version_text}" > versions.yml
"""
}
5 changes: 3 additions & 2 deletions modules/local/irida-next-output/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ process IRIDA_NEXT_OUTPUT {
path(samples_data)

output:
path("output.json.gz"), emit: output_json
path("iridanext.output.json.gz"), emit: output_json
path "versions.yml", emit: versions

when:
Expand All @@ -19,7 +19,8 @@ process IRIDA_NEXT_OUTPUT {
"""
irida-next-output.py \\
$args \\
--json-output output.json.gz \\
--summary-file ${task.summary_directory_name}/summary.txt.gz \\
--json-output iridanext.output.json.gz \\
${samples_data}
cat <<-END_VERSIONS > versions.yml
Expand Down
33 changes: 33 additions & 0 deletions modules/local/simplify_irida_json/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Runs simplify_irida_json.py to flatten per-sample metadata in an
// IRIDA Next JSON file, then gzips the result.
process SIMPLIFY_IRIDA_JSON {
tag "$meta.id"
label 'process_single'

container 'docker.io/python:3.9.17'

input:
// meta: sample metadata map; json: the per-sample IRIDA Next JSON file
tuple val(meta), path(json)

output:
tuple val(meta), path("*.simple.json.gz") , emit: simple_json
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
// Use ${prefix} (honours task.ext.prefix overrides) instead of the
// previously hard-coded ${meta.id}; gzip -n omits the timestamp so the
// compressed output is reproducible (consistent with ASSEMBLY_STUB).
"""
simplify_irida_json.py \\
$args \\
--json-output ${prefix}.simple.json \\
${json}
gzip -n ${prefix}.simple.json
cat <<-END_VERSIONS > versions.yml
"${task.process}":
simplify_irida_json : 0.1.0.dev0
END_VERSIONS
"""
}
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ params {
validationSchemaIgnoreParams = 'genomes,igenomes_base'
validationShowHiddenParams = false
validate_params = true

}

// Load base.config by default for all pipelines
Expand Down
3 changes: 2 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -244,5 +244,6 @@
{
"$ref": "#/definitions/generic_options"
}
]
],
"properties": {}
}
Loading

0 comments on commit d230c4d

Please sign in to comment.