-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathgenerateBIPSPIModel.py
125 lines (98 loc) · 6.79 KB
/
generateBIPSPIModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
'''
This script is used to generate BIPSPI models.
Models are trained from pdb files which are contained in the directory pdbsIndir
that has to be indicated in ./configFiles/configFile.cfg, (e.g. pdbsIndir ~/Tesis/rriPredMethod/data/develData/pdbFiles)
For each pdb in dataset, there must be 4 files included in pdbsIndir directory, and they have to follow the following
name standard:
prefix_l_u.pdb File for the ligand in unbound conformation. Features are computed from this file
prefix_r_u.pdb File for the receptor in unbound conformation. Features are computed from this file
prefix_l_b.pdb File for the ligand in bound conformation. Residue contacts are computed from this file
prefix_r_b.pdb File for the receptor in bound conformation. Residue contacts are computed from this file
prefix are generally PDBIDs, but can be any string provided it is shared by ligand and receptor.
if no unbound pdbs available, use symlinks prefix_r_b.pdb --> prefix_r_u.pdb; prefix_l_b.pdb --> prefix_l_u.pdb
Computed features are hierarchically stored in the computedFeatsRootDir directory that has to be indicated in
./configFiles/configFile.cfg (e.g. computedFeatsRootDir ~/Tesis/rriPredMethod/data/develData/computedFeatures)
Ready to train and predict complexes are stored in the codifiedDataRootDir directory that has to be indicated in
./configFiles/configFile.cfg (e.g. codifiedDataRootDir ~/Tesis/rriPredMethod/data/develData/codifiedInput).
Complexes are saved as joblib.dump files of the class ./codifyComplex/ComplexCodified. See that class for more
info.
Results obtained in cross validation are stored in the resultsRootDir directory that has to be indicated in
./configFiles/configFile.cfg (e.g. resultsRootDir ~/Tesis/rriPredMethod/data/develData/results).
Three files per complex are generated:
prefix.res.tab.gz Residue-residue contact predictions
prefix.res.tab.lig.gz binding site predictions for ligand
prefix.res.tab.rec.gz binding site predictions for receptor
The trained models are stored in the savedModelsPath that has to be indicated in
./configFiles/configFile.cfg (e.g. savedModelsPath ~/Tesis/rriPredMethod/data/develData/modelsComputed).
Models are saved as joblib.dump files of the Class xgboost.XGBClassifier
To run this script, edit config file
load conda environment
source activate xgbpred
and execute
python generateBIPSPIModel.py
'''
from __future__ import print_function
import os
import computeFeatures.computeFeatsForPdbs as pComCode
import codifyComplexes.codifyPDBsForTraining as pCodifyAll
import trainAndTest.trainAndTest as pTrainTest
from utils import myMakeDir, checkFreeMemory
from Config import Configuration
# Amount of free memory budgeted per worker process when codifyStep decides how many workers
# to launch (free memory is divided by this value). Presumably expressed in GiB, matching the
# units returned by checkFreeMemory -- TODO confirm.
GiB_PER_PROC=16
# Configuration object read by every function of this script. It is instantiated when the module
# is loaded, i.e. before the command line is parsed in the __main__ section.
conf= Configuration()
# If True, main() trains a second model (protocol "<modelType>_2") whose codification step
# receives the predictions of the first model as feedback.
USE_2_STEPS= True
def computeFeatures(methodProtocol=conf.modelType, isHomeSet=conf.checkHomoInteractionInTraining):
    '''
    Launch the feature computation step by delegating to
    pComCode.computeFeaturesAllPdbsOneDir.

    :param methodProtocol: protocol name forwarded to the feature computation. Defaults to the
                           value conf.modelType had when this function was defined.
    :param isHomeSet: flag forwarded to the feature computation. Defaults to the value
                      conf.checkHomoInteractionInTraining had when this function was defined.
    '''
    # Number of parallel workers: one more than the number of psiBlastNThrs-sized
    # groups that fit in conf.ncpu.
    nWorkers = conf.ncpu // conf.psiBlastNThrs + 1
    pComCode.computeFeaturesAllPdbsOneDir(ncpu=nWorkers,
                                          methodProtocol=methodProtocol,
                                          isHomeSet=isHomeSet)
def codifyStep(methodProtocol=conf.modelType, feedbackPaths=None):
    '''
    Codify the complexes with pCodifyAll.BenchmarkCodificator, unless the codificator
    reports that all of them are already codified.

    :param methodProtocol: protocol name, passed to the codificator as environType. Defaults
                           to the value conf.modelType had when this function was defined.
    :param feedbackPaths: passed to the codificator as feedback_paths (main() uses the
                          predictions path of a previous training step). None by default.
    :return: the path returned by the codificator (codifyAll or getCodifiedPath).
    '''
    # Worker count is bounded both by free memory (GiB_PER_PROC per worker, plus one)
    # and by the configured number of cpus.
    memoryBoundWorkers = int(1 + checkFreeMemory() // GiB_PER_PROC)
    nWorkers = min(memoryBoundWorkers, conf.ncpu)

    codificator = pCodifyAll.BenchmarkCodificator(feedback_paths=feedbackPaths,
                                                  environType=methodProtocol,
                                                  ncpu=nWorkers,
                                                  overridePrevComp=False)

    if codificator.checkIfAllCodified():
        alreadyCodifiedPath = codificator.getCodifiedPath()
        print("All complexes already codified")
        return alreadyCodifiedPath

    # No complex is skipped.
    return codificator.codifyAll(skipComplexesList=[], samplingFold=3)
def trainAndTest(inputRoot, outputRoot, methodProtocol=conf.modelType, saveModelPath=None, isLastStep=False):
    '''
    Run pTrainTest.TrainAndTestWorker over the codified complexes and return the
    directory where its predictions are written.

    :param inputRoot: directory that contains the "sampledInputs" (used as training data path)
                      and "allInputs" (used as test path) subdirectories.
    :param outputRoot: root directory for results; predictions go to outputRoot/methodProtocol.
    :param methodProtocol: name of the results subdirectory. Defaults to the value
                           conf.modelType had when this function was defined.
    :param saveModelPath: forwarded to the worker as saveModelFname. None by default.
    :param isLastStep: forwarded to the worker as isLastStep.
    :return: the predictions directory, outputRoot/methodProtocol.
    '''
    trainDataDir = os.path.join(inputRoot, "sampledInputs")
    testDataDir = os.path.join(inputRoot, "allInputs")
    predictOutputPath = os.path.join(outputRoot, methodProtocol)

    myMakeDir(predictOutputPath)
    # Only warn when previous results are present; execution continues anyway.
    if os.listdir(predictOutputPath):
        print("Warning: predictions outpath is not empty %s"%predictOutputPath)

    worker = pTrainTest.TrainAndTestWorker(
        trainDataPath=trainDataDir,
        testPath=testDataDir,
        outputPath=predictOutputPath,
        nFolds=conf.N_KFOLD,
        isLastStep=isLastStep,
        saveModelFname=saveModelPath,
        verbose=True,
        numProc=conf.ncpu)
    worker.computeTrainAndTest()
    return predictOutputPath
def main(resultsRoot= None, saveModelName=None):
    '''
    Run the whole pipeline: feature computation, codification and training/testing.
    When USE_2_STEPS is True, a second codification + training round is executed using
    the predictions of the first round as feedback.

    :param resultsRoot: root directory for the predictions. If None, conf.resultsRootDir
                        (with "~" expanded) is used.
    :param saveModelName: path prefix for the saved models; "." + model type (plus "_2" for
                          the second step) is appended to it. If None, no model path is
                          passed to the training steps and no directory is created.
    '''
    if resultsRoot is None:
        resultsRoot = os.path.expanduser(conf.resultsRootDir)

    # Pass the configuration values explicitly: the defaults of computeFeatures and codifyStep
    # were bound when those functions were defined, which happens before the command line is
    # parsed in the __main__ section, whereas trainAndTest below receives the current value.
    modelType = conf.modelType
    computeFeatures(methodProtocol=modelType, isHomeSet=conf.checkHomoInteractionInTraining)
    outpathCodif = codifyStep(methodProtocol=modelType)

    # saveModelName is optional (it defaults to None), so only create the directory when given.
    if saveModelName is not None:
        myMakeDir(saveModelName)

    saveModelNameTmp = None if saveModelName is None else saveModelName+"."+modelType
    # The first round is the last one only when no second step follows.
    predictOutPath_1 = trainAndTest(outpathCodif, resultsRoot, methodProtocol=modelType,
                                    saveModelPath=saveModelNameTmp, isLastStep=not USE_2_STEPS)

    if USE_2_STEPS:
        modelType_2 = modelType+"_2"
        outpathCodif = codifyStep(methodProtocol=modelType_2, feedbackPaths=predictOutPath_1)
        saveModelNameTmp = None if saveModelName is None else saveModelName+"."+modelType_2
        trainAndTest(outpathCodif, resultsRoot, methodProtocol=modelType_2,
                     saveModelPath=saveModelNameTmp, isLastStep=True)
if __name__=="__main__":
    # Customize the command line fields exposed by the Configuration argument parser,
    # parse the command line and launch the pipeline.
    parser = Configuration.getArgParser()

    # (field name, keyword arguments for parser.modify_field), applied in this order.
    # NOTE(review): if modify_field forwards _type=bool to argparse's type=, any non-empty
    # string (including "False") would be parsed as True -- confirm how _type is handled.
    fieldOverrides = [
        ("pdbsIndir", dict(help="Directory where training pdbs are located",
                           _type=Configuration.file_path)),
        ("wdir", dict(help="Directory where partial results and final results will be saved",
                      _type=Configuration.file_path)),
        ("tmp", dict(help="Temporary directory",
                     _type=Configuration.file_path)),
        ("ncpu", dict(help="Number of cpus for trainng. Each complex in a cross-validation fold is computed in an indepented worker. NCPU workers are computed in parallel")),
        ("modelType", dict(help="The type of model to train depending on input options. Struct,sequence or mixed (one seq and one struct)",
                           choices=["struct", "mixed", "seq"])),
        ("checkHomoInteractionInTraining", dict(help="Corrects contact maps to consider all homologous residues as contact if one does. For heter-datasets impact is minorSet it to 'True' "
                                                     "if training homo-complexes. For hetero complexes, it has little impact",
                                                _type=bool)),
        ("N_KFOLD", dict(help="Type of cross validation. -1 for leave-one-complex out, positive values for k= N_KFOLD cross-validation.",
                         _type=Configuration.int_or_filePath)),
        ("scopeFamiliesFname", dict(help="Filename containing the familes of the protein chains of ligand and receptor",
                                    _type=Configuration.file_path)),
    ]
    for fieldName, fieldKwargs in fieldOverrides:
        parser.modify_field(fieldName, **fieldKwargs)

    parser.parse_args()
    # Models are saved using "<savedModelsPath>/model" as path prefix.
    main(saveModelName=os.path.join(conf.savedModelsPath, "model"))