Skip to content

Commit

Permalink
Merge pull request #87 from sanger-bentley-group/dev
Browse files Browse the repository at this point in the history
Release Version 1.0.0-rc3
  • Loading branch information
HarryHung authored Jan 19, 2024
2 parents 02885cc + 20c9308 commit f6ca913
Show file tree
Hide file tree
Showing 15 changed files with 182 additions and 109 deletions.
124 changes: 68 additions & 56 deletions README.md

Large diffs are not rendered by default.

47 changes: 32 additions & 15 deletions bin/generate_overall_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,33 +24,34 @@


# Check argv and save to global variables
if len(sys.argv) != 4:
sys.exit('Usage: generate_overall_report.py INPUT_PATTERN ARIBA_METADATA OUTPUT_FILE')
if len(sys.argv) != 5:
sys.exit('Usage: generate_overall_report.py INPUT_PATTERN ARIBA_METADATA RESISTANCE_TO_MIC OUTPUT_FILE')
INPUT_PATTERN = sys.argv[1]
ARIBA_METADATA = sys.argv[2]
OUTPUT_FILE = sys.argv[3]
RESISTANCE_TO_MIC = sys.argv[3]
OUTPUT_FILE = sys.argv[4]


def main():
output_columns = get_output_columns()
df_output = get_df_output(output_columns)
ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique())
df_resistance_to_mic = pd.read_csv(RESISTANCE_TO_MIC, sep='\t', index_col='drug')

output_columns = get_output_columns(COLUMNS_BY_CATEGORY, ariba_targets)
df_output = get_df_output(INPUT_PATTERN, output_columns, df_resistance_to_mic)

# Saving df_output to OUTPUT_FILE in csv format
df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_')


# Get output columns based on COLUMNS_BY_CATEGORY and ARIBA metadata
def get_output_columns():
output_columns = list(chain.from_iterable(COLUMNS_BY_CATEGORY.values()))
add_ariba_columns(output_columns)
def get_output_columns(columns_by_category, ariba_targets):
output_columns = list(chain.from_iterable(columns_by_category.values()))
add_ariba_columns(output_columns, ariba_targets)
return output_columns


# Based on ARIBA metadata, add additional output columns
def add_ariba_columns(output_columns):
# Get all targets in ARIBA metadata
ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique())

def add_ariba_columns(output_columns, ariba_targets):
# Adding special cases if certain targets exist
if 'TET' in ariba_targets:
ariba_targets.add('DOX')
Expand All @@ -72,14 +73,14 @@ def add_ariba_columns(output_columns):
output_columns.extend([f'{pili}', f'{pili}_Determinant'])


# Generating df_output based on all sample reports with columns in the order of output_columns
def get_df_output(output_columns):
# Generating df_output based on all sample reports with columns in the order of output_columns, add inferred MIC range
def get_df_output(input_pattern, output_columns, df_resistance_to_mic):
# Generate an empty dataframe as df_manifest based on output_columns
df_manifest = pd.DataFrame(columns=output_columns)

# Generate a dataframe for each sample report and then concat df_manifest and all dataframes into df_output
dfs = [df_manifest]
reports = glob.glob(INPUT_PATTERN)
reports = glob.glob(input_pattern)
for report in reports:
df = pd.read_csv(report, dtype=str)
dfs.append(df)
Expand All @@ -88,6 +89,22 @@ def get_df_output(output_columns):
# Ensure column order in df_output is the same as output_columns
df_output = df_output[output_columns]

df_output = add_inferred_mic(df_output, df_resistance_to_mic)

return df_output

# Add inferred MIC (minimum inhibitory concentration) based on resistance phenotypes if the drug exists in the lookup table
def add_inferred_mic(df_output, df_resistance_to_mic):
"""Insert an inferred {drug}_MIC column before each matching {drug}_Res column.

df_resistance_to_mic is indexed by drug; each row maps a resistance
phenotype label to an MIC range string (per the lookup table, the
phenotype columns are S / I / R — TODO confirm against the TSV header).
Returns df_output with the new columns inserted (also mutated in place).
"""
# One dict per drug: {phenotype_label: mic_string}
all_resistance_to_mic = df_resistance_to_mic.to_dict('index')

for drug, resistance_to_mic in all_resistance_to_mic.items():
res_col_name = f'{drug}_Res'

# Only infer an MIC when this report actually has a resistance column for the drug
if res_col_name in df_output:
res_col_index = df_output.columns.get_loc(res_col_name)
# na_action='ignore' leaves missing phenotypes as NaN instead of mapping them
mic_series = df_output[res_col_name].map(resistance_to_mic, na_action='ignore')
# Inserting at the Res column's index places the MIC column immediately before it
df_output.insert(res_col_index, f'{drug}_MIC', mic_series)

return df_output


Expand Down
7 changes: 7 additions & 0 deletions bin/save_databases_info.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,18 @@ add_url_db () {
jq -n --arg url "$URL" --arg save_time "$SAVE_TIME" '. = {"url": $url, "save_time": $save_time}'
}

# Emit a JSON object describing the resistance-phenotype-to-MIC lookup table:
# its file name and MD5 checksum (for provenance in the databases-info JSON).
# Reads the RESISTANCE_TO_MIC environment variable set by the caller.
add_resistance_to_mic () {
TABLE="$RESISTANCE_TO_MIC"
# First field of md5sum output is the checksum itself
TABLE_MD5=$(md5sum "$RESISTANCE_TO_MIC" | awk '{ print $1 }')
jq -n --arg table "$TABLE" --arg table_md5 "$TABLE_MD5" '. = {"table": $table, "table_md5": $table_md5}'
}

jq -n \
--argjson bwa_db "$(add_bwa_db)" \
--argjson ariba_db "$(add_ariba_db)" \
--argjson seroba_db "$(add_seroba_db)" \
--argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \
--argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \
--argjson poppunk_ext "$(add_url_db "${POPPUNK_EXT_PATH}/${POPPUNK_EXT_JSON}")" \
--argjson resistance_to_mic "$(add_resistance_to_mic)"\
'$ARGS.named' > "$JSON_FILE"
10 changes: 10 additions & 0 deletions data/resistance_to_MIC.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
drug S I R
CHL <= 4 - >= 8
CLI <= 0.25 0.5 >= 1
COT <= 0.5 1-2 >= 4
DOX <= 0.25 0.5 >= 1
ERY <= 0.25 0.5 >= 1
LFX <= 2 4 >= 8
RIF <= 1 2 >= 4
TET <= 1 2 >= 4
VAN <= 1 - -
2 changes: 1 addition & 1 deletion download_test_input
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ else
exit 1
fi

echo "Downloading... Please wait ..."
echo "Downloading... Please wait..."
eval $GET && tar -xf $TARGET && rm -f $TARGET; status=$?

if [ $status -ne 0 ]; then
Expand Down
6 changes: 3 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env nextflow

// Version of this release
pipelineVersion = '1.0.0-rc2'
pipelineVersion = '1.0.0-rc3'

// Import workflow modules
include { PIPELINE } from "$projectDir/workflows/pipeline"
Expand Down Expand Up @@ -41,11 +41,11 @@ workflow {
INIT()
} else if (params.version) {
workflowSelectMessage('version')
PRINT_VERSION(pipelineVersion)
PRINT_VERSION(params.resistance_to_mic, pipelineVersion)
} else {
workflowSelectMessage('pipeline')
PIPELINE()
SAVE_INFO(PIPELINE.out.databases_info, pipelineVersion)
SAVE_INFO(PIPELINE.out.databases_info, params.resistance_to_mic, pipelineVersion)
}
}

Expand Down
9 changes: 8 additions & 1 deletion modules/info.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ process DATABASES {
path seroba_db_path
path poppunk_db_path
path poppunk_ext_path
path resistance_to_mic

output:
path(json), emit: json
Expand All @@ -59,6 +60,7 @@ process DATABASES {
POPPUNK_JSON="$poppunk_json"
POPPUNK_EXT_PATH="$poppunk_ext_path"
POPPUNK_EXT_JSON="$poppunk_ext_json"
RESISTANCE_TO_MIC="$resistance_to_mic"
JSON_FILE="$json"
source save_databases_info.sh
Expand Down Expand Up @@ -170,7 +172,7 @@ process PARSE {
|╔═══════════════════════════╤═════════════════════════════════════════════════════════════════════╗
|${coreTextRow('Software', 'Version')}
|╠═══════════════════════════╪═════════════════════════════════════════════════════════════════════╣
|${coreTextRow('GPS Unified Pipeline', json.pipeline.version)}
|${coreTextRow('GPS Pipeline', json.pipeline.version)}
|${coreTextRow('Nextflow', json.nextflow.version)}
|╚═══════════════════════════╧═════════════════════════════════════════════════════════════════════╝
|""".stripMargin()
Expand Down Expand Up @@ -216,6 +218,11 @@ process PARSE {
|${dbTextRow('Metadata', json.ariba_db.metadata)}
|${dbTextRow('Metadata MD5', json.ariba_db.metadata_md5)}
|${dbTextRow('Created', json.ariba_db.create_time)}
|╠═══════════════╧═════════════════════════════════════════════════════════════════════════════════╣
|║ Resistance phenotypes to MIC (minimum inhibitory concentration) lookup table ║
|╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢
|${dbTextRow('Table', json.resistance_to_mic.table)}
|${dbTextRow('Table MD5', json.resistance_to_mic.table_md5)}
|╚═══════════════╧═════════════════════════════════════════════════════════════════════════════════╝
|""".stripMargin()

Expand Down
17 changes: 9 additions & 8 deletions modules/messages.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ void startMessage(String pipelineVersion) {
|
|╔══════════════════════════════════════════════════════════════════════════════════════════╗
|║ ║░
|║ ____ ____ ____ _ _ _ __ _ _ ____ _ _ _ ║░
|║ / ___| _ \/ ___| | | | |_ __ (_)/ _(_) ___ __| | | _ \(_)_ __ ___| (_)_ __ ___ ║░
|║ | | _| |_) \___ \ | | | | '_ \| | |_| |/ _ \/ _` | | |_) | | '_ \ / _ | | | '_ \ / _ \ ║░
|║ | |_| | __/ ___) | | |_| | | | | | _| | __| (_| | | __/| | |_) | __| | | | | | __/ ║░
|║ \____|_| |____/ \___/|_| |_|_|_| |_|\___|\__,_| |_| |_| .__/ \___|_|_|_| |_|\___| ║░
|${String.format('║ v %-57s |_| ║░', pipelineVersion)}
|║ ██████╗ ██████╗ ███████╗ ██████╗ ██╗██████╗ ███████╗██╗ ██╗███╗ ██╗███████╗ ║░
|║ ██╔════╝ ██╔══██╗██╔════╝ ██╔══██╗██║██╔══██╗██╔════╝██║ ██║████╗ ██║██╔════╝ ║░
|║ ██║ ███╗██████╔╝███████╗ ██████╔╝██║██████╔╝█████╗ ██║ ██║██╔██╗ ██║█████╗ ║░
|║ ██║ ██║██╔═══╝ ╚════██║ ██╔═══╝ ██║██╔═══╝ ██╔══╝ ██║ ██║██║╚██╗██║██╔══╝ ║░
|║ ╚██████╔╝██║ ███████║ ██║ ██║██║ ███████╗███████╗██║██║ ╚████║███████╗ ║░
|║ ╚═════╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚═╝ ╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝╚══════╝ ║░
|${String.format('║ v %-86s║░', pipelineVersion)}
|╚══════════════════════════════════════════════════════════════════════════════════════════╝░
| ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
/$.stripMargin()
Expand Down Expand Up @@ -92,7 +93,7 @@ void endMessage(String selectedWorkflow) {
""".stripMargin()
failMessage = '''
|The pipeline has failed.
|If you think it is caused by a bug, submit an issue at \"https://github.com/HarryHung/gps-unified-pipeline/issues\".
|If you think it is caused by a bug, submit an issue at \"https://github.com/sanger-bentley-group/gps-pipeline/issues\".
'''.stripMargin()
break
case 'init':
Expand All @@ -111,7 +112,7 @@ void endMessage(String selectedWorkflow) {
'''.stripMargin()
failMessage = '''
|Failed to get version information on pipeline, tools or databases.
|If you think it is caused by a bug, submit an issue at \"https://github.com/HarryHung/gps-unified-pipeline/issues\"
|If you think it is caused by a bug, submit an issue at \"https://github.com/sanger-bentley-group/gps-pipeline/issues\"
'''.stripMargin()
break
}
Expand Down
3 changes: 2 additions & 1 deletion modules/output.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ process GENERATE_OVERALL_REPORT {
input:
path '*'
path ariba_metadata
path resistance_to_mic

output:
path "$overall_report", emit: report
Expand All @@ -37,6 +38,6 @@ process GENERATE_OVERALL_REPORT {
input_pattern='*_report.csv'
overall_report='results.csv'
"""
generate_overall_report.py '$input_pattern' $ariba_metadata $overall_report
generate_overall_report.py '$input_pattern' $ariba_metadata $resistance_to_mic $overall_report
"""
}
8 changes: 2 additions & 6 deletions modules/validate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ validParams = [
depth: 'int_float',
ariba_ref: 'path_fasta',
ariba_metadata: 'path_tsv',
resistance_to_mic: 'path_tsv',
lite: 'boolean'
]

Expand All @@ -50,11 +51,6 @@ void validate(Map params) {
validParams.put("singularity_cachedir", "path")
}

// Add params.maxretries when workflow.profile contains 'lsf'
if (workflow.profile.split(',').contains('lsf')) {
validParams.put("maxretries", "int")
}

// For initialisation, skip input and output directories checks
// For version, skip all file paths related checks
skippedParams = []
Expand Down Expand Up @@ -167,7 +163,7 @@ void validate(Map params) {
default:
log.error("""
|Unknown value type \"${validParams[key]}\"
|Please submit an issue at \"https://github.com/HarryHung/gps-unified-pipeline/issues\"}
|Please submit an issue at \"https://github.com/sanger-bentley-group/gps-pipeline/issues\"}
""".stripMargin())
System.exit(1)
}
Expand Down
2 changes: 1 addition & 1 deletion nextflow
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# limitations under the License.

[[ "$NXF_DEBUG" == 'x' ]] && set -x
NXF_VER=${NXF_VER:-'23.10.0'}
NXF_VER=${NXF_VER:-'23.10.1'}
NXF_ORG=${NXF_ORG:-'nextflow-io'}
NXF_HOME=${NXF_HOME:-$HOME/.nextflow}
NXF_PROT=${NXF_PROT:-'https'}
Expand Down
20 changes: 10 additions & 10 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ params {
ref_genome = "$projectDir/data/ATCC_700669_v1.fa"

// Default links for PopPUNK Database and External Clusters
poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz"
poppunk_ext_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6_external_clusters.csv"
poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v8_ref.tar.gz"
poppunk_ext_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v8_external_clusters.csv"

// Default values for QC
spneumo_percentage = 60.00
Expand All @@ -51,7 +51,10 @@ params {

// Default ARIBA reference sequences and metadata paths
ariba_ref = "$projectDir/data/ariba_ref_sequences.fasta"
ariba_metadata = "$projectDir/data/ariba_metadata.tsv"
ariba_metadata = "$projectDir/data/ariba_metadata.tsv"

// Default resistance phenotypes to MIC (minimum inhibitory concentration) lookup table
resistance_to_mic = "$projectDir/data/resistance_to_MIC.tsv"

// Toggle for removing .bam and .sam files mid-run to reduce storage requirement
// Warning: This will break the -resume function of Nextflow
Expand Down Expand Up @@ -88,7 +91,7 @@ process {
container = 'staphb/bcftools:1.16'
}
withLabel: poppunk_container {
container = 'staphb/poppunk:2.6.0'
container = 'staphb/poppunk:2.6.3'
}
withLabel: spn_pbp_amr_container {
container = 'sangerbentleygroup/spn-pbp-amr:23.10.2'
Expand Down Expand Up @@ -138,36 +141,33 @@ profiles {
// Singularity as container engine, execute by LSF
lsf {
params.singularity_cachedir = "$projectDir/singularity_cache"
params.maxretries = 4
params.kraken2_memory_mapping = false

process {
executor = 'lsf'
scratch = true
time = {30.min * task.attempt}
maxRetries = 4

withLabel: farm_low {
cpus = 1
memory = {1.GB * task.attempt}
errorStrategy = 'retry'
maxRetries = params.maxretries
}
withLabel: farm_mid {
cpus = 8
memory = {4.GB * task.attempt}
errorStrategy = 'retry'
maxRetries = params.maxretries
}
withLabel: farm_high {
cpus = 32
memory = {16.GB * task.attempt}
errorStrategy = 'retry'
maxRetries = params.maxretries
}
withLabel: farm_high_fallible {
cpus = 32
memory = {16.GB * task.attempt}
errorStrategy = { task.attempt <= params.maxretries ? 'retry' : 'ignore' }
maxRetries = params.maxretries
errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'ignore' }
}
withLabel: farm_slow {
time = {2.hour * task.attempt}
Expand Down
Loading

0 comments on commit f6ca913

Please sign in to comment.