-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathhydrogen_bond_propensity_report.py
512 lines (424 loc) · 21.8 KB
/
hydrogen_bond_propensity_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-08-10: Created by Andy Maloney, the Cambridge Crystallographic Data Centre
# 2020-08-21: made available by the Cambridge Crystallographic Data Centre
# 2022-12-21: Updated by Joanna S. Stevens, the Cambridge Crystallographic Data Centre
# 2025-01-08: Updated by Pablo Martinez-Bulit, the Cambridge Crystallographic Data Centre
#
"""
hydrogen_bond_propensity_report.py
- Writes a .docx report of a hydrogen bond propensity calculation
"""
import argparse
import os
import sys
import subprocess
import csv
import string
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from ccdc import io
from ccdc.diagram import DiagramGenerator
from ccdc.search import SubstructureSearch, ConnserSubstructure
from ccdc.descriptors import CrystalDescriptors
try:
import warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=DeprecationWarning)
import docxtpl
from docx.shared import Cm
except ImportError:
error_message = """
The python-docx-template templating engine needed by this script could not
be found. Please run "{} -m pip install docxtpl"
to try to fix the issue.\nYou may need administrator's rights to do this.
""".format(sys.executable)
raise ImportError(error_message)
SCRIPT_DIR = os.path.dirname(__file__)
TEMPLATE_FILENAME = 'hydrogen_bond_propensity_report.docx'
TEMPLATE_FILE = os.path.join(SCRIPT_DIR, TEMPLATE_FILENAME)
###############################################################################
class PropensityCalc:
"HBP Calculator"
def __init__(self, crystal, work_directory, min_donor_coordination, min_acceptor_coordination, fg_count):
self.crystal = crystal
self.directory = work_directory
self.min_donor_coordination = min_donor_coordination
self.min_acceptor_coordination = min_acceptor_coordination
self.fg_count = fg_count
self.settings = self._hbp_settings()
self.hbp = CrystalDescriptors.HBondPropensities()
def _hbp_settings(self):
settings = CrystalDescriptors.HBondPropensities.Settings()
settings.hbond_criterion.require_hydrogens = True
settings.hbond_criterion.path_length_range = (3, 999)
settings.working_directory = self.directory
return settings
def calculate(self):
self.hbp.settings = self.settings
# Set up the target structure for the calculation
self.hbp.set_target(self.crystal)
print(self.hbp.functional_groups)
# Generate Training Dataset
self.hbp.match_fitting_data(count=self.fg_count) # set to >300, preferably 500 for better representation of functional groups
self.hbp.analyse_fitting_data()
for d in self.hbp.donors:
print(d.identifier, d.npositive, d.nnegative)
for a in self.hbp.acceptors:
print(a.identifier, a.npositive, a.nnegative)
# Perform regression
model = self.hbp.perform_regression()
print(model.equation)
print('Area under ROC curve: {} -- {}'.format(round(model.area_under_roc_curve, 3), model.advice_comment))
propensities = self.hbp.calculate_propensities()
intra_flag = True if len(self.hbp.intra_propensities) > 0 else False
intra_count = len([p for p in propensities if not p.is_intermolecular and p.is_observed])
intra_obs = True if intra_count > 0 else False
# Use default coordination cutoffs unless user overrides
groups = self.hbp.generate_hbond_groupings(min_donor_prob=self.min_donor_coordination,
min_acceptor_prob=self.min_acceptor_coordination)
observed_group = self.hbp.target_hbond_grouping()
return (self.hbp.functional_groups,
self.hbp.fitting_data,
self.hbp.donors,
self.hbp.acceptors,
model,
propensities,
intra_flag,
groups,
observed_group,
self.min_donor_coordination,
self.min_acceptor_coordination,
intra_count,
intra_obs)
def make_diagram(mol, directory):
# Generates a diagram from a given structure
molecule_diagram_generator = DiagramGenerator()
molecule_diagram_generator.settings.line_width = 1.6
molecule_diagram_generator.settings.font_size = 12
molecule_diagram_generator.settings.image_height = 300
img = molecule_diagram_generator.image(mol)
fname = str(os.path.join(directory, '%s_diagram.png' % mol.identifier))
if img:
img.save(fname)
return fname
def generate_component_pics(input_mol, molecule_diagram_generator, directory, docx_template):
non_ch_molecule_no = 0
component_diagrams = []
donor_acceptor_check = []
for m in input_mol.components: # generates an image for every molecule in the asymmetric unit
atoms = m.atoms
donor_acceptor_check = [atom for atom in atoms if atom.is_donor or atom.is_acceptor]
if donor_acceptor_check:
img = molecule_diagram_generator.image(m, highlight_atoms=None, label_atoms=donor_acceptor_check)
non_ch_molecule_no += 1
fname = str(os.path.join(directory, '%s_component_%s.png' % (input_mol.identifier, non_ch_molecule_no)))
if img:
img.save(fname)
non_ch_img = add_picture_subdoc(fname, docx_template)
component_diagrams.append(non_ch_img)
return component_diagrams
def make_diagram_components(crystal, molecule, directory, docx_template):
# Generates a diagram for each component in a given structure, with D/A atoms labelled if component has a donor/acceptor
molecule_diagram_generator = DiagramGenerator()
molecule_diagram_generator.settings.line_width = 1.6
molecule_diagram_generator.settings.font_size = 10
molecule_diagram_generator.settings.image_height = 300
molecule_diagram_generator.settings.shrink_symbols = False
asymmetric_molecule = crystal.asymmetric_unit_molecule
# If no Z and Z' information (e.g. mol2), 'None' is returned for crystal.z_prime
z_prime_flag = False
if crystal.z_prime:
z_prime_flag = True
# Generates an image for every molecule
if z_prime_flag and crystal.z_prime < 1:
component_diagrams = generate_component_pics(molecule, molecule_diagram_generator, directory, docx_template)
# Generates an image for every molecule in the asymmetric unit
else:
component_diagrams = generate_component_pics(asymmetric_molecule, molecule_diagram_generator, directory, docx_template)
return component_diagrams
def fg_diagram(mol, directory, con):
# Create highlighted functional group diagrams
diagram_generator = DiagramGenerator()
diagram_generator.settings.shrink_symbols = False
diagram_generator.settings.element_coloring = False
searcher = SubstructureSearch()
searcher.add_substructure(ConnserSubstructure(os.path.join(directory, "%s.con" % con)))
hits = searcher.search(mol)
selection = hits[0].match_atoms()
img = diagram_generator.image(mol, highlight_atoms=selection)
fname = str(os.path.join(directory, '%s.png' % con))
if img:
img.save(fname)
return fname
def add_picture_subdoc(picture_location, docx_template, cm=7):
# This function adds a picture to the .docx file
return docxtpl.InlineImage(
docx_template, image_descriptor=picture_location, width=Cm(cm))
def launch_word_processor(output_file):
"""Open the default application for output_file across platforms."""
if sys.platform == 'win32':
os.startfile(output_file)
elif sys.platform.startswith('linux'):
subprocess.Popen(['xdg-open', output_file])
else:
subprocess.Popen(['open', output_file])
def coordination_scores_calc(crystal, directory):
# Calculate coordination scores for the target structure
# Provide settings for the calculation
settings = CrystalDescriptors.HBondCoordination.Settings()
settings.working_directory = directory
# Set up the coordination scores calculator
coordination_calc = CrystalDescriptors.HBondCoordination(settings)
predictions = coordination_calc.predict(crystal)
return predictions
def format_scores(scores, das, d_type):
# Reformat the coordination scores to make report writing easier
formatted_scores = {}
for da in das:
preds = scores.predictions_for_label(da.label, d_type)[1]
formatted_scores[da.label] = preds
return formatted_scores
def normalize_molecule(molecule):
# Normalise bond types for the input structure (important for cifs)
if any(bond.bond_type == 'Unknown' for bond in molecule.bonds):
print('Unknown type bonds in molecule will be auto assigned')
molecule.assign_bond_types(which='unknown')
molecule.standardise_aromatic_bonds()
molecule.standardise_delocalised_bonds()
def chart_output(groups, work_directory, structure):
# Write out the data points of the HBP chart to a file
with open(os.path.join(work_directory, '%s_chart_data.csv' % structure), 'w') as outfile:
csv_writer = csv.writer(outfile)
csv_writer.writerow(['Mean Propensity', 'Mean Coordination Score', 'Hydrogen Bonds'])
for group in groups:
csv_writer.writerow(
[group.hbond_score,
group.coordination_score,
'; '.join(['%s - %s' % (g.donor.label, g.acceptor.label) for g in group.hbonds])]
)
def hbp_landscape(groups, observed_groups, crystal, directory, work_directory, docx_template):
# Generate the HBP chart
# 11 colours for alternative networks
chart_colours = {'#00AEEF': 'lightblue', '#D63868': 'violetred', '#6E3776': 'plum', '#B5B800': 'pear',
'#1C5797': 'darkblue', '#FF9A0F': 'orange', '#128A00': 'green', '#D2A106': 'yellow',
'#062D56': 'navy', '#00A5A8': 'turquoise', '#EB050C': 'red'}
hbp_colours = list(chart_colours.keys())
# If 10 pairs or less in network with highest number of pairs start colour at 0 - static legend
# otherwise use colours relative to observed (orange, 6th colour, index5) - dynamic legend
# For half-transparent white edges for triangles, use (1, 1, 1, 0.5) or matplotlib.colors.colorConverter.to_rgba('white', alpha=.5)
colour_pairs = []
count_pairs = []
mpair_flag = False
obs_pairs = len(observed_groups.hbonds)
# Check highest number of pairs for alternative networks
# If no alternative network generated, use number of pairs in observed structure
try:
num_alt_pairs = [len(group.hbonds) for group in groups]
max_alt_pairs = max(num_alt_pairs)
highest_pairs = max(max_alt_pairs, obs_pairs)
except:
highest_pairs = obs_pairs
# Static legend
if highest_pairs <= 10 or obs_pairs < 6:
for i, colour in enumerate(hbp_colours):
figure_i = plt.scatter([group.hbond_score for group in groups if len(group.hbonds) == i],
[abs(group.coordination_score) for group in groups if len(group.hbonds) == i],
c=hbp_colours[i], marker='v', s=75, edgecolors=(1, 1, 1, 0.5), linewidth=0.3, clip_on=False)
colour_pairs.append(i)
count_pairs.append(len([group for group in groups if len(group.hbonds) == i]))
# Dynamic legend
else:
relative_range = range(5, -6, -1)
for i, (value, colour) in enumerate(zip(relative_range, hbp_colours)):
r = obs_pairs - value
if i == 0:
figure_i = plt.scatter([group.hbond_score for group in groups if len(group.hbonds) <= r],
[abs(group.coordination_score) for group in groups if len(group.hbonds) <= r],
c=hbp_colours[i], marker='v', s=75, edgecolors=(1, 1, 1, 0.5), linewidth=0.3, clip_on=False)
count_pairs.append(len([group for group in groups if len(group.hbonds) <= r]))
mpair_flag = True
elif i in range(1, 10):
figure_i = plt.scatter([group.hbond_score for group in groups if len(group.hbonds) == r],
[abs(group.coordination_score) for group in groups if len(group.hbonds) == r],
c=hbp_colours[i], marker='v', s=75, edgecolors=(1, 1, 1, 0.5), linewidth=0.3, clip_on=False)
count_pairs.append(len([group for group in groups if len(group.hbonds) == r]))
elif i == 10:
figure_i = plt.scatter([group.hbond_score for group in groups if len(group.hbonds) >= r],
[abs(group.coordination_score) for group in groups if len(group.hbonds) >= r],
c=hbp_colours[i], marker='v', s=75, edgecolors=(1, 1, 1, 0.5), linewidth=0.3, clip_on=False)
count_pairs.append(len([group for group in groups if len(group.hbonds) >= r]))
mpair_flag = True
colour_pairs.append(r)
# Observed network (plotted after other networks so point on top)
ax2 = plt.scatter(observed_groups.hbond_score,
abs(observed_groups.coordination_score),
c='white', marker='o', s=75, edgecolors='black', linewidth=1.6, clip_on=False)
plt.origin = 'upper'
plt.xlim(0, 1.0)
plt.ylim(1.0, 0)
ax = plt.gca()
ax.xaxis.tick_top()
ax.yaxis.tick_left()
ax.set_xlabel('Mean H-Bond Propensity')
ax.set_ylabel('Mean H-Bond Co-ordination')
ax.xaxis.set_label_position('top')
figure_location = os.path.join(directory, '%s_standard_chart.png' % crystal.identifier)
plt.savefig(figure_location, bbox_inches='tight', pad_inches=0.0)
chart = add_picture_subdoc(figure_location, docx_template, cm=12.8)
diagram_file = make_diagram(crystal.molecule, directory)
diagram = add_picture_subdoc(diagram_file, docx_template)
con_files = [f[:-4] for f in os.listdir(work_directory) if f.endswith(".con")]
fg_pics = [fg_diagram(crystal.molecule, work_directory, con) for con in con_files]
fg_diagrams = {con: add_picture_subdoc(os.path.join(work_directory, '%s.png' % con), docx_template)
for con in con_files}
return chart, diagram, fg_diagrams, colour_pairs, obs_pairs, count_pairs, mpair_flag
def main(structure, directory, csdrefcode, min_donor_coordination, min_acceptor_coordination, fg_count, noopen=False):
# This looks for the .docx template that is used to generate the report from
if os.path.isfile(TEMPLATE_FILE):
docx_template = docxtpl.DocxTemplate(TEMPLATE_FILE)
else:
print('Error! {} not found!'.format(TEMPLATE_FILENAME))
quit()
# This loads up the CSD if a refcode is requested, otherwise loads the structural file supplied
if csdrefcode:
try:
crystal = io.CrystalReader('CSD').crystal(str(structure))
except RuntimeError:
print('Error! %s is not in the database!' % str(structure))
quit()
else:
crystal = io.CrystalReader(str(structure))[0]
# If there are atoms without sites, or there are no atoms in the structure, then HBP cannot be performed
molecule = crystal.molecule
if not molecule.all_atoms_have_sites or len(molecule.atoms) == 0:
print('Error! Not all atoms in %s have sites!' % str(structure))
quit()
# Bond types need to be standardised
normalize_molecule(molecule)
crystal.molecule = molecule
# Set up a work directory for the HBP files
work_directory = os.path.join(directory, str(structure).split('.')[0])
hbp_calculator = PropensityCalc(crystal, work_directory, min_donor_coordination,
min_acceptor_coordination, fg_count)
# Get all the necessary data from a HBP calculation
(functional_groups, fitting_data, donors, acceptors, model, propensities, intra_flag, groups, observed_groups,
min_donor_coordination, min_acceptor_coordination, intra_count, intra_obs) = hbp_calculator.calculate()
# Calculate the coordination scores separately
coordination_scores = coordination_scores_calc(crystal, work_directory)
mean_coordination = abs(observed_groups.coordination_score) ## abs() not built into in jinja2 (docx) template
# Create the HBP chart output as a separate file
chart_output(groups, work_directory, structure)
# Set up some dictionaries and fill them with coordination score data, as well as extra information for the report
dscores = {}
ascores = {}
dcoord_bg = {}
acoord_bg = {}
for d in donors:
coord_cols = [i for i in range(len(coordination_scores.predictions_for_label(d.label, 'd')[1]))]
dscores[d.label] = [round((coordination_scores.predictions_for_label(d.label, 'd')[1])[j], 3)
for j in coord_cols]
dcoord_bg[d.label] = ['FFFFFF' for j in coord_cols]
coord_value = coordination_scores.predictions_for_label(d.label, 'd')[0]
if coord_value == dscores[d.label].index(max(dscores[d.label])):
dcoord_bg[d.label][coord_value] = '7FFF00'
else:
dcoord_bg[d.label][coord_value] = 'FF0000'
for a in acceptors:
ascores[a.label] = [round((coordination_scores.predictions_for_label(a.label, 'a')[1])[k], 3)
for k in coord_cols]
acoord_bg[a.label] = ['FFFFFF' for j in coord_cols]
coord_value = coordination_scores.predictions_for_label(a.label, 'a')[0]
if coord_value == ascores[a.label].index(max(ascores[a.label])):
acoord_bg[a.label][coord_value] = '7FFF00'
else:
acoord_bg[a.label][coord_value] = 'FF0000'
# Store letters to use as component identifier by using component number as index
abc = list(string.ascii_uppercase)
# Generate more information for the report
coefficients = model.coefficients
don = list(set(list("%s_d" % p.donor_label.split(" ")[0] for p in propensities)))
acc = list(set(list("%s_a" % p.acceptor_label.split(" ")[0] for p in propensities)))
# Generate HBP landscape chart
chart, diagram, fg_diagrams, colour_pairs, obs_pairs, count_pairs, mpair_flag = hbp_landscape(groups, observed_groups, crystal, directory, work_directory, docx_template)
# Save 2D diagrams of each chemical component in the asu with non-CH labelled for the SI hbp figure
component_diagrams = make_diagram_components(crystal, molecule, directory, docx_template)
# The context is the information that is given to the template to allow it to be populated
context = {
# Title page
'identifier': crystal.identifier,
'chart': chart,
'propensities': propensities,
'intra_flag': intra_flag,
'coord_cols': coord_cols,
'donors': donors,
'acceptors': acceptors,
'dscores': dscores,
'dbg': dcoord_bg,
'ascores': ascores,
'abg': acoord_bg,
'diagram': diagram,
'don': don,
'acc': acc,
'fg_diagrams': fg_diagrams,
'functional_groups': functional_groups,
'data': fitting_data,
'len_data': len(fitting_data),
'coefficients': coefficients,
'model': model,
'abc': abc,
'colour_pairs': colour_pairs,
'obs_pairs': obs_pairs,
'count_pairs': count_pairs,
'min_donor_coordination': min_donor_coordination,
'min_acceptor_coordination': min_acceptor_coordination,
'mpair_flag': mpair_flag,
'component_diagrams': component_diagrams,
'observed_groups': observed_groups,
'mean_coordination': mean_coordination,
'intra_count': intra_count,
'intra_obs': intra_obs
}
# Send all the information to the template file then open up the final report
docx_template.render(context)
output_file = os.path.join(directory, '%s_propensity_report.docx' % crystal.identifier)
docx_template.save(output_file)
if not noopen:
launch_word_processor(output_file)
print('Output file written to %s' % output_file)
if __name__ == '__main__':
# Set up the necessary arguments to run the script
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=__doc__)
parser.add_argument('input_structure', type=str,
help='Refcode or mol2 file of the component to be screened')
parser.add_argument('-o', '--output_directory', default=os.getcwd(),
help='the working directory for the calculation')
parser.add_argument('-d', '--min_donor_coordination', default=0.1, type=float,
help='Cut-off donor coordination likelihood for HBP chart')
parser.add_argument('-a', '--min_acceptor_coordination', default=0.1, type=float,
help='Cut-off acceptor coordination likelihood for HBP chart')
parser.add_argument('-c', '--fg_count', default=500, type=int,
help='Target functional group count for HBP hits')
parser.add_argument('-n', '--noopen', action='store_true',
help='Do not automatically open the generated output file.')
args = parser.parse_args()
refcode = False
if not os.path.isfile(args.input_structure):
if len(str(args.input_structure).split('.')) == 1:
refcode = True
else:
parser.error('%s - file not found.' % args.input_structure)
if not refcode:
args.directory = os.path.dirname(os.path.abspath(args.input_structure))
elif not os.path.isdir(args.output_directory):
os.makedirs(args.output_directory)
main(args.input_structure, args.output_directory, refcode, args.min_donor_coordination, args.min_acceptor_coordination, args.fg_count, args.noopen)