Skip to content

Commit

Permalink
CI tests, list of serovars -l option
Browse files Browse the repository at this point in the history
  • Loading branch information
kbessonov1984 committed Sep 4, 2024
1 parent 77f0896 commit f76294e
Show file tree
Hide file tree
Showing 9 changed files with 84,984 additions and 18 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ Removed the following entries
7. Madelia,"1,6,14,25",y,"1,7",,H,FALSE,enterica
8. Soahanina,"6,14,24",z,"e,n,x",,H,FALSE,enterica
9. Chichiri,"6,14,24","z4,z24",-,,H,TRUE,enterica
10. II 4:a:z39,"1,4,12,[27]",a,z39,,B,FALSE,salamae

# 1.1.1

Expand Down
30 changes: 28 additions & 2 deletions sistr/sistr_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def init_parser():
parser.add_argument('-f',
'--output-format',
default='json',
help='Output format (json, csv, pickle)')
help='Output format (json, csv, tab, pickle)')
parser.add_argument('-o',
'--output-prediction',
help='SISTR serovar prediction output path')
Expand Down Expand Up @@ -93,6 +93,9 @@ def init_parser():
type=int,
default=1,
help='Number of parallel threads to run sistr_cmd analysis.')
parser.add_argument('-l', '--list-of-serovars',
type=str, required=False,
help='A path to a single column text file containing list of serovar(s) to check serovar prediction against. Report predicted serovar is Y (present) and N (absent) in the list')
parser.add_argument('-v',
'--verbose',
action='count',
Expand Down Expand Up @@ -183,12 +186,14 @@ def infer_o_antigen(prediction):
else:
counter_o_antigens = Counter(series_o_antigens)
most_common_o_antigen = counter_o_antigens.most_common(1)[0][0]
# For O24 and O25 antigens, we need to remove those antigens as we do not do any testing in the lab
if any([True if antigen in most_common_o_antigen else False for antigen in ['24','25'] ]):
logging.info(f"Cleaning most common O antigen {most_common_o_antigen} ....")
logging.info(f"Cleaning O antigen from 24 and 25 antigens {most_common_o_antigen} ....")
for pattern in [',\[24\]', ',\[25\]', ',24', ',25','\[1\],','1,']:
most_common_o_antigen = re.sub(pattern,'',most_common_o_antigen)
logging.info(f"Reporting final O-antigen result {most_common_o_antigen}")
prediction.o_antigen = most_common_o_antigen
prediction.antigenic_profile=f"{prediction.o_antigen}:{prediction.h1}:{prediction.h2}"

def download_to_file(url,file):
with open(file, 'wb') as f:
Expand Down Expand Up @@ -246,6 +251,16 @@ def setup_sistr_dbs():

def sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args):
blast_runner = None
serovars_selected_list = []

if args.list_serovars_file:
if os.path.exists(args.list_serovars_file):
with open(args.list_serovars_file) as fp:
serovars_selected_list = [l.rstrip() for l in fp.readlines()]
logging.info(f"Selected serovars list to check predictions against from {args.list_serovars_file} is {serovars_selected_list}")
else:
logging.warning(f"File {args.list_serovars_file} does not exist in path specified. Would not check against list of provided serovars ...")

try:
assert os.path.exists(input_fasta), "Input fasta file '%s' must exist!" % input_fasta
if genome_name is None or genome_name == '':
Expand Down Expand Up @@ -285,6 +300,17 @@ def sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args):
overall_serovar_call(prediction, serovar_predictor)
print(f"sistr_cmd.py L280: overall_serovar: {prediction.serovar}"); #raise Exception;
infer_o_antigen(prediction)
# If a list of reportable serovars was provided, check the predicted serovar against it
if serovars_selected_list:
prediction.serovar_in_list = "N"
for selected_serovar in serovars_selected_list:
if selected_serovar == prediction.serovar:
print(selected_serovar, prediction.serovar)
prediction.serovar_in_list = "Y"
break


print(f"L288: sistr_cmd.py antigenic_formula: {prediction.antigenic_profile}")
logging.info('%s | Antigen gene BLAST serovar prediction: "%s" serogroup=%s %s:%s:%s',
genome_name,
prediction.serovar_antigen,
Expand Down
3 changes: 2 additions & 1 deletion sistr/src/cgmlst/extras/hclust_cutree.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def profiles_to_np_array(profiles_csv_path):
"""
"""
df = pd.read_csv(profiles_csv_path, index_col=0)
df = pd.read_hdf(profiles_csv_path, key='cgmlst')
#df = pd.read_csv(profiles_csv_path, index_col=0)
arr = np.array(df, dtype=np.float64)
genomes = df.index
markers = df.columns
Expand Down
10 changes: 5 additions & 5 deletions sistr/src/serovar_prediction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ def predict_serovar_from_antigen_blast(self):
if spp_roman:
self.serovar = '{} {}:{}:{}'.format(spp_roman, o_antigen, self.h1, self.h2)
else:
self.serovar = '{}:{}:{}'.format(o_antigen, self.h1, self.h2)
self.serovar = '{}:{}:{}'.format(o_antigen, self.h1, self.h2)
return self.serovar

def get_serovar_prediction(self):
Expand Down Expand Up @@ -622,7 +622,7 @@ def overall_serovar_call(serovar_prediction, antigen_predictor):
serovars_from_antigen = [serovars_from_antigen]
if cgmlst_serovar is not None:
if cgmlst_serovar in serovars_from_antigen:
logging.info(f"Antigen predictor has multiple serovar results {antigen_predictor.serovar}, but assigned cgmlst serovar {cgmlst_serovar} ...")
logging.info(f"Antigen predictor has multiple serovar results {antigen_predictor.serovar}, but assigned final cgmlst serovar {cgmlst_serovar} ...")
serovar_prediction.serovar = cgmlst_serovar

elif 'mash_match' in serovar_prediction.__dict__:
Expand All @@ -631,16 +631,16 @@ def overall_serovar_call(serovar_prediction, antigen_predictor):
mash_dist = float(spd['mash_distance'])
if mash_serovar in serovars_from_antigen:
serovar_prediction.serovar = mash_serovar
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned mash serovar {mash_serovar} ...")
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned final mash serovar {mash_serovar} ...")
else:
if mash_dist <= MASH_DISTANCE_THRESHOLD:
serovar_prediction.serovar = mash_serovar
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned mash serovar {mash_serovar} ...")
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned final mash serovar {mash_serovar} ...")
else:
logging.info(f"MASH serovar prediction was NOT assigned as mash distance {mash_dist} > {MASH_DISTANCE_THRESHOLD} ")

if serovar_prediction.serovar is None:
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar} and it will be assigned as final serovar ...")
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar} and it will be assigned as a final serovar ...")
serovar_prediction.serovar = serovar_prediction.serovar_antigen

if serovar_prediction.h1 is None:
Expand Down
Loading

0 comments on commit f76294e

Please sign in to comment.