-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdnn.py
175 lines (153 loc) · 8.74 KB
/
dnn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import argparse
import matplotlib.pyplot as plt
#matplotlib.use("Agg")
import sys
import numpy as np
import keras
from keras.layers import Dense, Flatten, InputLayer, Conv1D, MaxPooling1D
from keras.models import Sequential
import pandas as pd
# Genes: ["BMP4", "Bactine2", "Crabp2a", "DeltaC", "DeltaD", "FGF8a", "Her7", "Hoxa1a", "Hoxa5a", "Hoxb1b", "Hoxd4a", "Notch1a", "RPL13a1", "RPL13a2", "Rarab"]
# On research plate:
## PS: regular doses
## PC: variable doses (less reliable)
# Not reliable: Notch1a/PC, BMP4/PC & Crabp2a
# PS: Notch1a
def convert(y):
my = np.mean(y)
sy = np.std(y)
return (y - my) / sy, my, sy
def back_convert(y, my, sy):
return y * sy + my
def read_file(df, test_dataset=None, validation=False, exclude=None):
''' Function to load the data from the input file and dividing the dataset into train/test if necessary.'''
# Loading the data
gene = np.array(df["Gene"])
condition = np.array(df["Condition"]).astype(str)
wellID = np.array(df["WellID"])
y = np.array(df["Theoretical_N0_log"])
x = np.array(df.loc[:, df.columns.str.startswith("F")])
# Excluding gene/conditions if required
if exclude != [()]:
idx_del = np.logical_or.reduce([((gene == g) & np.char.startswith(condition, c)) for (g, c) in exclude])
gene = gene[~idx_del]
condition = condition[~idx_del]
wellID = wellID[~idx_del]
y = y[~idx_del]
x = x[~idx_del]
# Changing the range of values to be low and around 0
y, my, sy = convert(y)
if validation:
# If this is the validation step, there is no train/test, the full dataset is returned
return wellID, (gene, condition, np.expand_dims(x, axis=2), y), my, sy
else:
# Getting the test indexes
idx_test = np.logical_or.reduce([((gene == g) & np.char.startswith(condition, c)) for (g, c) in test_dataset])
# Creating the train dataset
gene_train = gene[~idx_test]
condition_train = condition[~idx_test]
x_train = np.expand_dims(x[~idx_test], axis=2)
y_train = y[~idx_test]
# Creating the test dataset
gene_test = gene[idx_test]
condition_test = condition[idx_test]
x_test = np.expand_dims(x[idx_test], axis=2)
y_test = y[idx_test]
# Returning train and test datasets
return wellID, (gene_train, condition_train, x_train, y_train), (gene_test, condition_test, x_test, y_test), my, sy
#def display_plot(model, x, y, my, sy, gene, condition):
def display_plot(model, x, y, my, sy, gene):
''' Function to display the theoretical vs the predicted. '''
# Setting the colors
hsv = plt.get_cmap("hsv")
color = hsv(np.linspace(0, 1.0, np.unique(gene).shape[0] + 1))
# Converting back the data to the original range
real = np.log(back_convert(y, my, sy))
pred = np.log(back_convert(model.predict(x).T[0], my, sy))
# Plotting each gene results
for i, g in enumerate(np.unique(gene)):
idx = np.where(gene == g)
#plt.scatter(real[idx], pred[idx], label=g, c=color[i], edgecolors=["k" if b else color[i] for b in np.char.startswith(condition[idx], "PC")]) # Circling the test PC homologue
plt.scatter(real[idx], pred[idx], label=g, c=color[i])
plt.plot([-3, 3], [-3, 3], "-")
plt.legend(loc=0)
plt.grid("on")
plt.axis("equal")
plt.show()
# Main
if __name__ == "__main__":
# Setting the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config1", action="store_true", dest="config1", help="Plate2 is used for training/testing and plate1 for validation.")
parser.add_argument("--config2", action="store_true", dest="config2", help="Plate1 is used for training/testing and plate2 for validation.")
options = parser.parse_args()
# Setting variables
cycles_col = ["F" + str(i).zfill(2) for i in range(40)]
# Setting gene/condition lists
test_gene_condition = [("FGF8a", "PS"), ("Rarab", "PS"), ("Hoxd4a", "PS")]
#test_gene_condition = [("FGF8a", "PS"), ("Her7", "PC"), ("Crabp2a", "PS")] # Only found test set where val_loss > loss from the beginning for config1
# Loading plate1 dataset
QV_filepath = "Data/plate1.final_data.Gompertz_with_bump.tsv"
QV_df = pd.read_csv(QV_filepath, sep="\t", usecols=["WellID", "Sigmoid_curve", "Condition", "Gene", "Theoretical_N0_log"] + cycles_col)
QV_df = QV_df.loc[(QV_df.Sigmoid_curve) & QV_df.Condition.isin(["PS" + str(x) for x in range(1, 13)]), :].copy()
# Filtering out gene/condition without the 6 replicates showing a sigmoid curve
QV_df = QV_df.groupby(["Gene", "Condition"]).filter(lambda row: row["WellID"].count() == 6).copy()
# Loading research dataset
research_filepath = "Data/Plate2.final_data.Gompertz_with_bump.REF.tsv"
research_df = pd.read_csv(research_filepath, sep="\t", usecols=["WellID", "Sigmoid_curve", "Pre_amplification", "Condition", "Gene", "Theoretical_N0_log"] + cycles_col)
research_df = research_df.loc[(research_df.Sigmoid_curve) & (research_df.Pre_amplification == 0), :].copy()
# Filtering out gene/condition without the 6 replicates showing a sigmoid curve
research_df = research_df.groupby(["Gene", "Condition"]).filter(lambda row: row["WellID"].count() == 6).copy()
# Setting the training/testing plate and validation one
if options.config1:
ttp = research_df
excluded_gene_condition_ttp = [("Notch1a", "PC"), ("Notch1a", "PS"), ("BMP4", "PC"), ("Crabp2a", "PC")]
vp = QV_df
excluded_gene_condition_vp = [("Bactine2", "PS")]
else: # Config2
ttp = QV_df
excluded_gene_condition_ttp = [()]
vp = research_df
excluded_gene_condition_vp = [("Notch1a", "PC"), ("Notch1a", "PS"), ("BMP4", "PC"), ("Crabp2a", "PC"), ("Bactine2", "PS"), ("Bactine2", "PC")] # To validate on the research plate, we discard BActine2 to be fair with other metohds (used as reference for Cq method)
# Loading the train/test dataset
wellID_ttp, (gene_train, condition_train, x_train, y_train), (gene_test, condition_test, x_test, y_test), my, sy = read_file(ttp, test_dataset=test_gene_condition, exclude=excluded_gene_condition_ttp)
# Creating the deep neural network
model = Sequential() # Initialization
model.add(InputLayer(input_shape=(40, 1))) # Providing the shape of our data
# Chaining layer transformations on top of the input
model.add(Conv1D(5, kernel_size=5, strides=1, activation="sigmoid")) # Convolutional layer with "kernel_size" for the size of the kernel to slide, "strides" for the steps size of the sliding window
model.add(MaxPooling1D(pool_size=2, strides=2)) # Smoothing data from values around
model.add(Flatten())
model.add(Dense(3, activation="sigmoid")) # Summarizing data to a vector of 3
model.add(Dense(1)) # Getting result as one value
# Printing the summary of how the data will get transformed at each stage of the model
model.summary()
# Compiling the model
model.compile(loss=keras.losses.mse, # Mean Squared Error
optimizer=keras.optimizers.Adam(), # A stochastic gradient descent method based on adaptive estimation of first-order and second-order moments
metrics=["mae"]) # Mean Absolute Error
# Fitting the model to the data
nb_epoch = 2500
history = model.fit(x_train, y_train,
batch_size=128, # Takes curves by batch of 128
epochs=nb_epoch,
verbose=2,
validation_data=(x_test, y_test))
# Plotting loss and metric
for k in history.history.keys():
plt.plot(range(nb_epoch), history.history[k], label=k)
plt.legend()
plt.show()
# Displaying the correlation between theoretical and predicted for the train set
display_plot(model, x_train, y_train, my, sy, gene_train)
# Displaying the correlation between theoretical and predicted for the test set
display_plot(model, x_test, y_test, my, sy, gene_test)
# Validating the model on the other plate
wellID_vp, (gene_val, condition_val, x_val, y_val), my, sy = read_file(vp, validation=True, exclude=excluded_gene_condition_vp)
# Displaying the correlation between theoretical and predicted for the validation set
display_plot(model, x_val, y_val, my, sy, gene_val)
# Reporting the predicted values
if options.config1:
np.savetxt("DL_predictions.plate1.tsv", np.column_stack((wellID_vp, back_convert(model.predict(x_val).T[0], my, sy))), delimiter="\t", fmt="%s")
else: # Config2
np.savetxt("DL_predictions.plate2.tsv", np.column_stack((wellID_vp, back_convert(model.predict(x_val).T[0], my, sy))), delimiter="\t", fmt="%s")