Skip to content

Commit

Permalink
Merge pull request #87 from sanger-bentley-group/dev
Browse files Browse the repository at this point in the history
Release Version 1.0.0-rc3
  • Loading branch information
HarryHung authored Jan 19, 2024
2 parents 02885cc + 20c9308 commit f6ca913
Show file tree
Hide file tree
Showing 15 changed files with 182 additions and 109 deletions.
124 changes: 68 additions & 56 deletions README.md

Large diffs are not rendered by default.

47 changes: 32 additions & 15 deletions bin/generate_overall_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,33 +24,34 @@


# Check argv and save to global variables
if len(sys.argv) != 4:
sys.exit('Usage: generate_overall_report.py INPUT_PATTERN ARIBA_METADATA OUTPUT_FILE')
if len(sys.argv) != 5:
sys.exit('Usage: generate_overall_report.py INPUT_PATTERN ARIBA_METADATA RESISTANCE_TO_MIC OUTPUT_FILE')
INPUT_PATTERN = sys.argv[1]
ARIBA_METADATA = sys.argv[2]
OUTPUT_FILE = sys.argv[3]
RESISTANCE_TO_MIC = sys.argv[3]
OUTPUT_FILE = sys.argv[4]


def main():
output_columns = get_output_columns()
df_output = get_df_output(output_columns)
ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique())
df_resistance_to_mic = pd.read_csv(RESISTANCE_TO_MIC, sep='\t', index_col='drug')

output_columns = get_output_columns(COLUMNS_BY_CATEGORY, ariba_targets)
df_output = get_df_output(INPUT_PATTERN, output_columns, df_resistance_to_mic)

# Saving df_output to OUTPUT_FILE in csv format
df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_')


# Get output columns based on COLUMNS_BY_CATEGORY and ARIBA metadata
def get_output_columns():
output_columns = list(chain.from_iterable(COLUMNS_BY_CATEGORY.values()))
add_ariba_columns(output_columns)
def get_output_columns(columns_by_category, ariba_targets):
output_columns = list(chain.from_iterable(columns_by_category.values()))
add_ariba_columns(output_columns, ariba_targets)
return output_columns


# Based on ARIBA metadata, add additional output columns
def add_ariba_columns(output_columns):
# Get all targets in ARIBA metadata
ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique())

def add_ariba_columns(output_columns, ariba_targets):
# Adding special cases if certain targets exist
if 'TET' in ariba_targets:
ariba_targets.add('DOX')
Expand All @@ -72,14 +73,14 @@ def add_ariba_columns(output_columns):
output_columns.extend([f'{pili}', f'{pili}_Determinant'])


# Generating df_output based on all sample reports with columns in the order of output_columns
def get_df_output(output_columns):
# Generating df_output based on all sample reports with columns in the order of output_columns, add inferred MIC range
def get_df_output(input_pattern, output_columns, df_resistance_to_mic):
# Generate an empty dataframe as df_manifest based on output_columns
df_manifest = pd.DataFrame(columns=output_columns)

# Generate a dataframe for each sample report and then concat df_manifest and all dataframes into df_output
dfs = [df_manifest]
reports = glob.glob(INPUT_PATTERN)
reports = glob.glob(input_pattern)
for report in reports:
df = pd.read_csv(report, dtype=str)
dfs.append(df)
Expand All @@ -88,6 +89,22 @@ def get_df_output(output_columns):
# Ensure column order in df_output is the same as output_columns
df_output = df_output[output_columns]

df_output = add_inferred_mic(df_output, df_resistance_to_mic)

return df_output

# Add inferred MIC (minimum inhibitory concentration) based on resistance phenotypes if the drug exists in the lookup table
def add_inferred_mic(df_output, df_resistance_to_mic):
"""Insert an inferred {drug}_MIC column before each matching {drug}_Res column.

df_resistance_to_mic is indexed by drug; each row maps a resistance
phenotype label to an MIC range string (per the lookup table, the
phenotype columns are S / I / R — TODO confirm against the TSV header).
Returns df_output with the new columns inserted (also mutated in place).
"""
# One dict per drug: {phenotype_label: mic_string}
all_resistance_to_mic = df_resistance_to_mic.to_dict('index')

for drug, resistance_to_mic in all_resistance_to_mic.items():
res_col_name = f'{drug}_Res'

# Only infer an MIC when this report actually has a resistance column for the drug
if res_col_name in df_output:
res_col_index = df_output.columns.get_loc(res_col_name)
# na_action='ignore' leaves missing phenotypes as NaN instead of mapping them
mic_series = df_output[res_col_name].map(resistance_to_mic, na_action='ignore')
# Inserting at the Res column's index places the MIC column immediately before it
df_output.insert(res_col_index, f'{drug}_MIC', mic_series)

return df_output


Expand Down
7 changes: 7 additions & 0 deletions bin/save_databases_info.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,18 @@ add_url_db () {
jq -n --arg url "$URL" --arg save_time "$SAVE_TIME" '. = {"url": $url, "save_time": $save_time}'
}

# Emit a JSON object describing the resistance-phenotype-to-MIC lookup table:
# its file name and MD5 checksum (for provenance in the databases-info JSON).
# Reads the RESISTANCE_TO_MIC environment variable set by the caller.
add_resistance_to_mic () {
TABLE="$RESISTANCE_TO_MIC"
# First field of md5sum output is the checksum itself
TABLE_MD5=$(md5sum "$RESISTANCE_TO_MIC" | awk '{ print $1 }')
jq -n --arg table "$TABLE" --arg table_md5 "$TABLE_MD5" '. = {"table": $table, "table_md5": $table_md5}'
}

jq -n \
--argjson bwa_db "$(add_bwa_db)" \
--argjson ariba_db "$(add_ariba_db)" \
--argjson seroba_db "$(add_seroba_db)" \
--argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \
--argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \
--argjson poppunk_ext "$(add_url_db "${POPPUNK_EXT_PATH}/${POPPUNK_EXT_JSON}")" \
--argjson resistance_to_mic "$(add_resistance_to_mic)"\
'$ARGS.named' > "$JSON_FILE"
10 changes: 10 additions & 0 deletions data/resistance_to_MIC.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
drug S I R
CHL <= 4 - >= 8
CLI <= 0.25 0.5 >= 1
COT <= 0.5 1-2 >= 4
DOX <= 0.25 0.5 >= 1
ERY <= 0.25 0.5 >= 1
LFX <= 2 4 >= 8
RIF <= 1 2 >= 4
TET <= 1 2 >= 4
VAN <= 1 - -
2 changes: 1 addition & 1 deletion download_test_input
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ else
exit 1
fi

echo "Downloading... Please wait ..."
echo "Downloading... Please wait..."
eval $GET && tar -xf $TARGET && rm -f $TARGET; status=$?

if [ $status -ne 0 ]; then
Expand Down
6 changes: 3 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env nextflow

// Version of this release
pipelineVersion = '1.0.0-rc2'
pipelineVersion = '1.0.0-rc3'

// Import workflow modules
include { PIPELINE } from "$projectDir/workflows/pipeline"
Expand Down Expand Up @@ -41,11 +41,11 @@ workflow {
INIT()
} else if (params.version) {
workflowSelectMessage('version')
PRINT_VERSION(pipelineVersion)
PRINT_VERSION(params.resistance_to_mic, pipelineVersion)
} else {
workflowSelectMessage('pipeline')
PIPELINE()
SAVE_INFO(PIPELINE.out.databases_info, pipelineVersion)
SAVE_INFO(PIPELINE.out.databases_info, params.resistance_to_mic, pipelineVersion)
}
}

Expand Down
9 changes: 8 additions & 1 deletion modules/info.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ process DATABASES {
path seroba_db_path
path poppunk_db_path
path poppunk_ext_path
path resistance_to_mic

output:
path(json), emit: json
Expand All @@ -59,6 +60,7 @@ process DATABASES {
POPPUNK_JSON="$poppunk_json"
POPPUNK_EXT_PATH="$poppunk_ext_path"
POPPUNK_EXT_JSON="$poppunk_ext_json"
RESISTANCE_TO_MIC="$resistance_to_mic"
JSON_FILE="$json"
source save_databases_info.sh
Expand Down Expand Up @@ -170,7 +172,7 @@ process PARSE {
|╔═══════════════════════════╤═════════════════════════════════════════════════════════════════════╗
|${coreTextRow('Software', 'Version')}
|╠═══════════════════════════╪═════════════════════════════════════════════════════════════════════╣
|${coreTextRow('GPS Unified Pipeline', json.pipeline.version)}
|${coreTextRow('GPS Pipeline', json.pipeline.version)}
|${coreTextRow('Nextflow', json.nextflow.version)}
|╚═══════════════════════════╧═════════════════════════════════════════════════════════════════════╝
|""".stripMargin()
Expand Down Expand Up @@ -216,6 +218,11 @@ process PARSE {
|${dbTextRow('Metadata', json.ariba_db.metadata)}
|${dbTextRow('Metadata MD5', json.ariba_db.metadata_md5)}
|${dbTextRow('Created', json.ariba_db.create_time)}
|╠═══════════════╧═════════════════════════════════════════════════════════════════════════════════╣
|║ Resistance phenotypes to MIC (minimum inhibitory concentration) lookup table ║
|╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢
|${dbTextRow('Table', json.resistance_to_mic.table)}
|${dbTextRow('Table MD5', json.resistance_to_mic.table_md5)}
|╚═══════════════╧═════════════════════════════════════════════════════════════════════════════════╝
|""".stripMargin()

Expand Down
17 changes: 9 additions & 8 deletions modules/messages.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ void startMessage(String pipelineVersion) {
|
|╔══════════════════════════════════════════════════════════════════════════════════════════╗
|║ ║░
|║ ____ ____ ____ _ _ _ __ _ _ ____ _ _ _ ║░
|║ / ___| _ \/ ___| | | | |_ __ (_)/ _(_) ___ __| | | _ \(_)_ __ ___| (_)_ __ ___ ║░
|║ | | _| |_) \___ \ | | | | '_ \| | |_| |/ _ \/ _` | | |_) | | '_ \ / _ | | | '_ \ / _ \ ║░
|║ | |_| | __/ ___) | | |_| | | | | | _| | __| (_| | | __/| | |_) | __| | | | | | __/ ║░
|║ \____|_| |____/ \___/|_| |_|_|_| |_|\___|\__,_| |_| |_| .__/ \___|_|_|_| |_|\___| ║░
|${String.format('║ v %-57s |_| ║░', pipelineVersion)}
|║ ██████╗ ██████╗ ███████╗ ██████╗ ██╗██████╗ ███████╗██╗ ██╗███╗ ██╗███████╗ ║░
|║ ██╔════╝ ██╔══██╗██╔════╝ ██╔══██╗██║██╔══██╗██╔════╝██║ ██║████╗ ██║██╔════╝ ║░
|║ ██║ ███╗██████╔╝███████╗ ██████╔╝██║██████╔╝█████╗ ██║ ██║██╔██╗ ██║█████╗ ║░
|║ ██║ ██║██╔═══╝ ╚════██║ ██╔═══╝ ██║██╔═══╝ ██╔══╝ ██║ ██║██║╚██╗██║██╔══╝ ║░
|║ ╚██████╔╝██║ ███████║ ██║ ██║██║ ███████╗███████╗██║██║ ╚████║███████╗ ║░
|║ ╚═════╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚═╝ ╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝╚══════╝ ║░
|${String.format('║ v %-86s║░', pipelineVersion)}
|╚══════════════════════════════════════════════════════════════════════════════════════════╝░
| ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
/$.stripMargin()
Expand Down Expand Up @@ -92,7 +93,7 @@ void endMessage(String selectedWorkflow) {
""".stripMargin()
failMessage = '''
|The pipeline has failed.
|If you think it is caused by a bug, submit an issue at \"https://github.com/HarryHung/gps-unified-pipeline/issues\".
|If you think it is caused by a bug, submit an issue at \"https://github.com/sanger-bentley-group/gps-pipeline/issues\".
'''.stripMargin()
break
case 'init':
Expand All @@ -111,7 +112,7 @@ void endMessage(String selectedWorkflow) {
'''.stripMargin()
failMessage = '''
|Failed to get version information on pipeline, tools or databases.
|If you think it is caused by a bug, submit an issue at \"https://github.com/HarryHung/gps-unified-pipeline/issues\"
|If you think it is caused by a bug, submit an issue at \"https://github.com/sanger-bentley-group/gps-pipeline/issues\"
'''.stripMargin()
break
}
Expand Down
3 changes: 2 additions & 1 deletion modules/output.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ process GENERATE_OVERALL_REPORT {
input:
path '*'
path ariba_metadata
path resistance_to_mic

output:
path "$overall_report", emit: report
Expand All @@ -37,6 +38,6 @@ process GENERATE_OVERALL_REPORT {
input_pattern='*_report.csv'
overall_report='results.csv'
"""
generate_overall_report.py '$input_pattern' $ariba_metadata $overall_report
generate_overall_report.py '$input_pattern' $ariba_metadata $resistance_to_mic $overall_report
"""
}
8 changes: 2 additions & 6 deletions modules/validate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ validParams = [
depth: 'int_float',
ariba_ref: 'path_fasta',
ariba_metadata: 'path_tsv',
resistance_to_mic: 'path_tsv',
lite: 'boolean'
]

Expand All @@ -50,11 +51,6 @@ void validate(Map params) {
validParams.put("singularity_cachedir", "path")
}

// Add params.maxretries when workflow.profile contains 'lsf'
if (workflow.profile.split(',').contains('lsf')) {
validParams.put("maxretries", "int")
}

// For initialisation, skip input and output directories checks
// For version, skip all file paths related checks
skippedParams = []
Expand Down Expand Up @@ -167,7 +163,7 @@ void validate(Map params) {
default:
log.error("""
|Unknown value type \"${validParams[key]}\"
|Please submit an issue at \"https://github.com/HarryHung/gps-unified-pipeline/issues\"}
|Please submit an issue at \"https://github.com/sanger-bentley-group/gps-pipeline/issues\"}
""".stripMargin())
System.exit(1)
}
Expand Down
2 changes: 1 addition & 1 deletion nextflow
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# limitations under the License.

[[ "$NXF_DEBUG" == 'x' ]] && set -x
NXF_VER=${NXF_VER:-'23.10.0'}
NXF_VER=${NXF_VER:-'23.10.1'}
NXF_ORG=${NXF_ORG:-'nextflow-io'}
NXF_HOME=${NXF_HOME:-$HOME/.nextflow}
NXF_PROT=${NXF_PROT:-'https'}
Expand Down
20 changes: 10 additions & 10 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ params {
ref_genome = "$projectDir/data/ATCC_700669_v1.fa"

// Default links for PopPUNK Database and External Clusters
poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz"
poppunk_ext_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6_external_clusters.csv"
poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v8_ref.tar.gz"
poppunk_ext_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v8_external_clusters.csv"

// Default values for QC
spneumo_percentage = 60.00
Expand All @@ -51,7 +51,10 @@ params {

// Default ARIBA reference sequences and metadata paths
ariba_ref = "$projectDir/data/ariba_ref_sequences.fasta"
ariba_metadata = "$projectDir/data/ariba_metadata.tsv"
ariba_metadata = "$projectDir/data/ariba_metadata.tsv"

// Default resistance phenotypes to MIC (minimum inhibitory concentration) lookup table
resistance_to_mic = "$projectDir/data/resistance_to_MIC.tsv"

// Toggle for removing .bam and .sam files mid-run to reduce storage requirement
// Warning: This will break the -resume function of Nextflow
Expand Down Expand Up @@ -88,7 +91,7 @@ process {
container = 'staphb/bcftools:1.16'
}
withLabel: poppunk_container {
container = 'staphb/poppunk:2.6.0'
container = 'staphb/poppunk:2.6.3'
}
withLabel: spn_pbp_amr_container {
container = 'sangerbentleygroup/spn-pbp-amr:23.10.2'
Expand Down Expand Up @@ -138,36 +141,33 @@ profiles {
// Singularity as container engine, execute by LSF
lsf {
params.singularity_cachedir = "$projectDir/singularity_cache"
params.maxretries = 4
params.kraken2_memory_mapping = false

process {
executor = 'lsf'
scratch = true
time = {30.min * task.attempt}
maxRetries = 4

withLabel: farm_low {
cpus = 1
memory = {1.GB * task.attempt}
errorStrategy = 'retry'
maxRetries = params.maxretries
}
withLabel: farm_mid {
cpus = 8
memory = {4.GB * task.attempt}
errorStrategy = 'retry'
maxRetries = params.maxretries
}
withLabel: farm_high {
cpus = 32
memory = {16.GB * task.attempt}
errorStrategy = 'retry'
maxRetries = params.maxretries
}
withLabel: farm_high_fallible {
cpus = 32
memory = {16.GB * task.attempt}
errorStrategy = { task.attempt <= params.maxretries ? 'retry' : 'ignore' }
maxRetries = params.maxretries
errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'ignore' }
}
withLabel: farm_slow {
time = {2.hour * task.attempt}
Expand Down
Loading

0 comments on commit f6ca913

Please sign in to comment.