|
@@ -7,80 +7,50 @@
|
7
|
7
|
"# DeepDrug3D"
|
8
|
8
|
]
|
9
|
9
|
},
|
|
10
|
+ {
|
|
11
|
+ "cell_type": "markdown",
|
|
12
|
+ "metadata": {},
|
|
13
|
+ "source": [
|
|
14
|
+ "## Importing libraries"
|
|
15
|
+ ]
|
|
16
|
+ },
|
10
|
17
|
{
|
11
|
18
|
"cell_type": "code",
|
12
|
|
- "execution_count": 1,
|
13
|
|
- "metadata": {},
|
14
|
|
- "outputs": [
|
15
|
|
- {
|
16
|
|
- "name": "stderr",
|
17
|
|
- "output_type": "stream",
|
18
|
|
- "text": [
|
19
|
|
- "Using TensorFlow backend.\n"
|
20
|
|
- ]
|
21
|
|
- }
|
22
|
|
- ],
|
|
19
|
+ "execution_count": 6,
|
|
20
|
+ "metadata": {},
|
|
21
|
+ "outputs": [],
|
23
|
22
|
"source": [
|
24
|
23
|
"import numpy as np\n",
|
25
|
|
- "\n",
|
|
24
|
+ "import tensorflow as tf\n",
|
26
|
25
|
"from sklearn.preprocessing import LabelEncoder\n",
|
27
|
26
|
"from keras.models import Sequential\n",
|
28
|
|
- "from keras import optimizers\n",
|
29
|
|
- "from keras.layers import Dense, Flatten, TimeDistributedn, Dropout\n",
|
|
27
|
+ "from keras import optimizers, callbacks\n",
|
|
28
|
+ "from keras.layers import Dense, Flatten, TimeDistributed, Dropout\n",
|
30
|
29
|
"from keras import Input, Model\n",
|
31
|
30
|
"from keras.layers import add, Activation\n",
|
|
31
|
+ "from keras.layers.advanced_activations import LeakyReLU\n",
|
32
|
32
|
"#from keras.utils import plot_model # Needs pydot.\n",
|
33
|
|
- "from keras.layers import Conv3D, MaxPooling3D"
|
|
33
|
+ "from keras.layers import Convolution3D, MaxPooling3D"
|
34
|
34
|
]
|
35
|
35
|
},
|
36
|
36
|
{
|
37
|
37
|
"cell_type": "markdown",
|
38
|
38
|
"metadata": {},
|
39
|
39
|
"source": [
|
40
|
|
- "## Create pocket lists\n",
|
41
|
|
- "4 pockets are created :\n",
|
42
|
|
- " + control\n",
|
43
|
|
- " + steroid\n",
|
44
|
|
- " + heme\n",
|
45
|
|
- " + nucleotide"
|
|
40
|
+ "### Callback used to store the model predictions after each epoch in order to plot ROC curves"
|
46
|
41
|
]
|
47
|
42
|
},
|
48
|
43
|
{
|
49
|
44
|
"cell_type": "code",
|
50
|
|
- "execution_count": 2,
|
51
|
|
- "metadata": {},
|
52
|
|
- "outputs": [
|
53
|
|
- {
|
54
|
|
- "data": {
|
55
|
|
- "text/plain": [
|
56
|
|
- "''"
|
57
|
|
- ]
|
58
|
|
- },
|
59
|
|
- "execution_count": 2,
|
60
|
|
- "metadata": {},
|
61
|
|
- "output_type": "execute_result"
|
62
|
|
- }
|
63
|
|
- ],
|
64
|
|
- "source": [
|
65
|
|
- "with open(\"control.list\", \"r\") as filin:\n",
|
66
|
|
- " control = filin.read()\n",
|
67
|
|
- "control = control.split(\"\\n\")\n",
|
68
|
|
- "control.pop()\n",
|
69
|
|
- "\n",
|
70
|
|
- "with open(\"steroid.list\", \"r\") as filin:\n",
|
71
|
|
- " steroid = filin.read()\n",
|
72
|
|
- "steroid = steroid.split(\"\\n\")\n",
|
73
|
|
- "steroid.pop()\n",
|
74
|
|
- "\n",
|
75
|
|
- "with open(\"heme.list\", \"r\") as filin:\n",
|
76
|
|
- " heme = filin.read()\n",
|
77
|
|
- "heme = heme.split(\"\\n\")\n",
|
78
|
|
- "heme.pop()\n",
|
79
|
|
- "\n",
|
80
|
|
- "with open(\"nucleotide.list\", \"r\") as filin:\n",
|
81
|
|
- " nucleotide = filin.read()\n",
|
82
|
|
- "nucleotide = nucleotide.split(\"\\n\")\n",
|
83
|
|
- "nucleotide.pop()"
|
|
45
|
+ "execution_count": null,
|
|
46
|
+ "metadata": {},
|
|
47
|
+ "outputs": [],
|
|
48
|
+ "source": [
|
|
49
|
+ "class prediction_history(callbacks.Callback):\n",
|
|
50
|
+ " def __init__(self):\n",
|
|
51
|
+ " self.predhis = []\n",
|
|
52
|
+ " def on_epoch_end(self, epoch, logs={}):\n",
|
|
53
|
+ " self.predhis.append(model.predict(predictor_train))"
|
84
|
54
|
]
|
85
|
55
|
},
|
86
|
56
|
{
|
|
@@ -92,85 +62,169 @@
|
92
|
62
|
},
|
93
|
63
|
{
|
94
|
64
|
"cell_type": "code",
|
95
|
|
- "execution_count": 3,
|
|
65
|
+ "execution_count": null,
|
96
|
66
|
"metadata": {},
|
97
|
67
|
"outputs": [],
|
98
|
68
|
"source": [
|
99
|
|
- "data_onehot = np.ndarray(shape=(2219, 14, 32, 32, 32)) # initializing empty array\n",
|
100
|
|
- "indices = np.random.permutation(2219)\n",
|
101
|
|
- "output = np.ndarray(shape=(2219, 3)) # softmax 3, {steroid=1, heme=1, nucleotide=1}\n",
|
102
|
|
- "lmin = len(steroid)\n",
|
103
|
|
- "lmid = len(heme)\n",
|
104
|
|
- "lmax = len(nucleotide)"
|
|
69
|
+ "def in_out_lists(size=1000):\n",
|
|
70
|
+ " \"\"\"\n",
|
|
71
|
+ " returns a tuple of array used as input and output for the model\n",
|
|
72
|
+ " Arguments:\n",
|
|
73
|
+ " - size, int: default 1000, size of the lists to be created\n",
|
|
74
|
+ " \n",
|
|
75
|
+ " Returns:\n",
|
|
76
|
+ " - tuple (data_onehot, output):\n",
|
|
77
|
+ " -data_onehot, ndarray: containing one-hot encoded pockets\n",
|
|
78
|
+ " -output, ndarray: containing size-3 vectors for classification\n",
|
|
79
|
+ " \"\"\"\n",
|
|
80
|
+ " with open(\"control.list\", \"r\") as filin:\n",
|
|
81
|
+ " control = filin.read()\n",
|
|
82
|
+ " control = control.split(\"\\n\")\n",
|
|
83
|
+ " control.pop()\n",
|
|
84
|
+ "\n",
|
|
85
|
+ " with open(\"steroid.list\", \"r\") as filin:\n",
|
|
86
|
+ " steroid = filin.read()\n",
|
|
87
|
+ " steroid = steroid.split(\"\\n\")\n",
|
|
88
|
+ " steroid.pop()\n",
|
|
89
|
+ "\n",
|
|
90
|
+ " with open(\"heme.list\", \"r\") as filin:\n",
|
|
91
|
+ " heme = filin.read()\n",
|
|
92
|
+ " heme = heme.split(\"\\n\")\n",
|
|
93
|
+ " heme.pop()\n",
|
|
94
|
+ "\n",
|
|
95
|
+ " with open(\"nucleotide.list\", \"r\") as filin:\n",
|
|
96
|
+ " nucleotide = filin.read()\n",
|
|
97
|
+ " nucleotide = nucleotide.split(\"\\n\")\n",
|
|
98
|
+ " nucleotide.pop()\n",
|
|
99
|
+ " \n",
|
|
100
|
+ " lmin = len(heme)\n",
|
|
101
|
+ " lmid = len(nucleotide)\n",
|
|
102
|
+ " lmax = len(control)\n",
|
|
103
|
+ " tot_size = lmin + lmid + lmax\n",
|
|
104
|
+ " data_onehot = np.ndarray(shape=(size, 14, 32, 32, 32)) # initializing empty array\n",
|
|
105
|
+ "\n",
|
|
106
|
+ " np.random.seed(9001)\n",
|
|
107
|
+ " indices = np.random.permutation(tot_size)\n",
|
|
108
|
+ " indices = indices[:size]\n",
|
|
109
|
+ " output = np.ndarray(shape=(size, 3)) # softmax 3, {steroid=1, heme=1, nucleotide=1}\n",
|
|
110
|
+ "\n",
|
|
111
|
+ " n = -1\n",
|
|
112
|
+ " for i in indices:\n",
|
|
113
|
+ " n += 1\n",
|
|
114
|
+ " if i < lmin:\n",
|
|
115
|
+ " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+heme[i]+\".npy\")\n",
|
|
116
|
+ " output[n,] = [1,0,0]\n",
|
|
117
|
+ " elif i > lmin and i < (lmin + lmid):\n",
|
|
118
|
+ " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+nucleotide[i - lmin]+\".npy\")\n",
|
|
119
|
+ " output[n,] = [0,1,0]\n",
|
|
120
|
+ " else:\n",
|
|
121
|
+ " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+control[i - (lmin+lmid) - 1]+\".npy\")\n",
|
|
122
|
+ " output[n,] = [0,0,1]\n",
|
|
123
|
+ " \n",
|
|
124
|
+ " return (data_onehot, output)"
|
|
125
|
+ ]
|
|
126
|
+ },
|
|
127
|
+ {
|
|
128
|
+ "cell_type": "markdown",
|
|
129
|
+ "metadata": {},
|
|
130
|
+ "source": [
|
|
131
|
+ "### Defining different models to test and compare"
|
105
|
132
|
]
|
106
|
133
|
},
|
107
|
134
|
{
|
108
|
135
|
"cell_type": "code",
|
109
|
|
- "execution_count": 4,
|
|
136
|
+ "execution_count": null,
|
110
|
137
|
"metadata": {},
|
111
|
138
|
"outputs": [],
|
112
|
139
|
"source": [
|
113
|
|
- "n = -1\n",
|
114
|
|
- "for i in indices:\n",
|
115
|
|
- " n += 1\n",
|
116
|
|
- " if i < lmin:\n",
|
117
|
|
- " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+steroid[i]+\".npy\")\n",
|
118
|
|
- " output[n,] = [1,0,0]\n",
|
119
|
|
- " elif i > lmin and i < (lmin + lmid):\n",
|
120
|
|
- " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+heme[i - lmin]+\".npy\")\n",
|
121
|
|
- " output[n,] = [0,1,0]\n",
|
122
|
|
- " else:\n",
|
123
|
|
- " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+nucleotide[i - (lmin+lmid) - 1]+\".npy\")\n",
|
124
|
|
- " output[n,] = [0,0,1]"
|
|
140
|
+ "def model_heavy(): # créer un objet modèle\n",
|
|
141
|
+ " \"\"\"\n",
|
|
142
|
+ " Return a simple sequentiel model\n",
|
|
143
|
+ " \n",
|
|
144
|
+ " Returns :\n",
|
|
145
|
+ " - model : keras.Model\n",
|
|
146
|
+ " \"\"\"\n",
|
|
147
|
+ " inputs = Input(shape=(14,32,32,32))\n",
|
|
148
|
+ " conv_1 = Conv3D(64, (28, 28, 28), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(inputs)\n",
|
|
149
|
+ " conv_2 = Conv3D(64, (26, 26, 26), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(conv_1)\n",
|
|
150
|
+ " drop_1 = Dropout(0.2)(conv_2)\n",
|
|
151
|
+ " maxpool = MaxPooling3D()(drop_1)\n",
|
|
152
|
+ " drop_2 = Dropout(0.4)(maxpool)\n",
|
|
153
|
+ " dense = Dense(512)(drop_2)\n",
|
|
154
|
+ " drop_3 = Dropout(0.4)(dense)\n",
|
|
155
|
+ " flatters = Flatten()(drop_3)\n",
|
|
156
|
+ " #output = TimeDistributed(Dense(3, activation='softmax'))(drop_3)\n",
|
|
157
|
+ " output = Dense(3, activation='softmax')(flatters)\n",
|
|
158
|
+ " model = Model(inputs=inputs, outputs=output)\n",
|
|
159
|
+ " my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
|
|
160
|
+ " print(model.summary)\n",
|
|
161
|
+ " model.compile(optimizer=my_opt, loss=\"categorical_crossentropy\",\n",
|
|
162
|
+ " metrics=[\"accuracy\"])\n",
|
|
163
|
+ " return model"
|
125
|
164
|
]
|
126
|
165
|
},
|
127
|
166
|
{
|
128
|
167
|
"cell_type": "code",
|
129
|
|
- "execution_count": 5,
|
|
168
|
+ "execution_count": 8,
|
130
|
169
|
"metadata": {},
|
131
|
170
|
"outputs": [],
|
132
|
171
|
"source": [
|
133
|
|
- "X_train = data_onehot[0:1664,]\n",
|
134
|
|
- "Y_train = output[0:1664,]\n",
|
135
|
|
- "X_test = data_onehot[1664:,]\n",
|
136
|
|
- "Y_test = output[1664:,]"
|
|
172
|
+ "def model_new(): # créer un objet modèle\n",
|
|
173
|
+ " \"\"\"\n",
|
|
174
|
+ " Return a simple sequentiel model\n",
|
|
175
|
+ " \n",
|
|
176
|
+ " Returns :\n",
|
|
177
|
+ " - model : keras.Model\n",
|
|
178
|
+ " \"\"\"\n",
|
|
179
|
+ " inputs = Input(shape=(14,32,32,32))\n",
|
|
180
|
+ " conv_1 = Convolution3D(filters=64, kernel_size=5, padding=\"valid\", data_format='channels_first')(inputs)\n",
|
|
181
|
+ " activation_1 = LeakyReLU(alpha = 0.1)(conv_1)\n",
|
|
182
|
+ " drop_1 = Dropout(0.2)(activation_1)\n",
|
|
183
|
+ " conv_2 = Convolution3D(filters=64, kernel_size=3, padding=\"valid\", data_format='channels_first')(drop_1)\n",
|
|
184
|
+ " activation_2 = LeakyReLU(alpha = 0.1)(conv_2)\n",
|
|
185
|
+ " maxpool = MaxPooling3D(pool_size=(2,2,2),\n",
|
|
186
|
+ " strides=None,\n",
|
|
187
|
+ " padding='valid',\n",
|
|
188
|
+ " data_format='channels_first')(activation_2)\n",
|
|
189
|
+ " drop_2 = Dropout(0.4)(maxpool)\n",
|
|
190
|
+ " flatters = Flatten()(drop_2)\n",
|
|
191
|
+ " dense = Dense(128)(flatters)\n",
|
|
192
|
+ " activation_3 = LeakyReLU(alpha = 0.1)(dense)\n",
|
|
193
|
+ " drop_3 = Dropout(0.4)(activation_3)\n",
|
|
194
|
+ " output = Dense(3, activation='softmax')(drop_3)\n",
|
|
195
|
+ " model = Model(inputs=inputs, outputs=output)\n",
|
|
196
|
+ " my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
|
|
197
|
+ " print(model.summary)\n",
|
|
198
|
+ " model.compile(optimizer=my_opt, loss=\"categorical_crossentropy\",\n",
|
|
199
|
+ " metrics=[\"accuracy\"])\n",
|
|
200
|
+ " return model"
|
137
|
201
|
]
|
138
|
202
|
},
|
139
|
203
|
{
|
140
|
204
|
"cell_type": "code",
|
141
|
|
- "execution_count": 14,
|
142
|
|
- "metadata": {},
|
143
|
|
- "outputs": [
|
144
|
|
- {
|
145
|
|
- "data": {
|
146
|
|
- "text/plain": [
|
147
|
|
- "(1, 14, 32, 32, 32)"
|
148
|
|
- ]
|
149
|
|
- },
|
150
|
|
- "execution_count": 14,
|
151
|
|
- "metadata": {},
|
152
|
|
- "output_type": "execute_result"
|
153
|
|
- }
|
154
|
|
- ],
|
155
|
|
- "source": [
|
156
|
|
- "def model_sequential(): # créer un objet modèle\n",
|
|
205
|
+ "execution_count": null,
|
|
206
|
+ "metadata": {},
|
|
207
|
+ "outputs": [],
|
|
208
|
+ "source": [
|
|
209
|
+ "def model_light(): # créer un objet modèle\n",
|
157
|
210
|
" \"\"\"\n",
|
158
|
211
|
" Return a simple sequentiel model\n",
|
159
|
212
|
" \n",
|
160
|
213
|
" Returns :\n",
|
161
|
214
|
" - model : keras.Model\n",
|
162
|
215
|
" \"\"\"\n",
|
163
|
|
- " inputs = Input(shape=(32,32,32,14)) # 759 aa, 21 car onehot\n",
|
164
|
|
- " conv_1 = Conv3D(64, (28, 28, 28), padding=\"same\", activation=\"LeakyReLU\",\n",
|
165
|
|
- " kernel_initializer=\"he_normal\")(inputs)\n",
|
166
|
|
- " conv_2 = Conv3D(64, (26, 26, 26), padding=\"same\", activation=\"LeakyReLU\",\n",
|
167
|
|
- " kernel_initializer=\"he_normal\")(conv_1)\n",
|
|
216
|
+ " inputs = Input(shape=(14,32,32,32))\n",
|
|
217
|
+ " conv_1 = Conv3D(32, (28, 28, 28), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(inputs)\n",
|
|
218
|
+ " conv_2 = Conv3D(64, (26, 26, 26), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(conv_1)\n",
|
168
|
219
|
" drop_1 = Dropout(0.2)(conv_2)\n",
|
169
|
220
|
" maxpool = MaxPooling3D()(drop_1)\n",
|
170
|
|
- " drop_2 = Dropout(0.4)(maxpool)\n",
|
171
|
|
- " dense = Dense(512)(drop_2)\n",
|
172
|
|
- " drop_3 = Dropout(0.4)(dense)\n",
|
173
|
|
- " output = TimeDistributed(Dense(3, activation='softmax'))(drop_3)\n",
|
|
221
|
+ " drop_2 = Dropout(0.3)(maxpool)\n",
|
|
222
|
+ " maxpool_2 = MaxPooling3D()(drop_2)\n",
|
|
223
|
+ " drop_3 = Dropout(0.3)(maxpool_2)\n",
|
|
224
|
+ " dense = Dense(256)(drop_3)\n",
|
|
225
|
+ " drop_4 = Dropout(0.4)(dense)\n",
|
|
226
|
+ " flatters = Flatten()(drop_4)\n",
|
|
227
|
+ " output = Dense(3, activation='softmax')(flatters)\n",
|
174
|
228
|
" model = Model(inputs=inputs, outputs=output)\n",
|
175
|
229
|
" my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
|
176
|
230
|
" print(model.summary)\n",
|
|
@@ -179,12 +233,92 @@
|
179
|
233
|
" return model"
|
180
|
234
|
]
|
181
|
235
|
},
|
|
236
|
+ {
|
|
237
|
+ "cell_type": "markdown",
|
|
238
|
+ "metadata": {},
|
|
239
|
+ "source": [
|
|
240
|
+ "## Create pocket lists\n",
|
|
241
|
+ "4 lists are created :\n",
|
|
242
|
+ " + control\n",
|
|
243
|
+ " + steroid\n",
|
|
244
|
+ " + heme\n",
|
|
245
|
+ " + nucleotide"
|
|
246
|
+ ]
|
|
247
|
+ },
|
|
248
|
+ {
|
|
249
|
+ "cell_type": "code",
|
|
250
|
+ "execution_count": null,
|
|
251
|
+ "metadata": {},
|
|
252
|
+ "outputs": [],
|
|
253
|
+ "source": [
|
|
254
|
+ "data = in_out_lists(1400)\n",
|
|
255
|
+ "pockets = np.cumsum(data[1], axis=0)[-1]"
|
|
256
|
+ ]
|
|
257
|
+ },
|
|
258
|
+ {
|
|
259
|
+ "cell_type": "code",
|
|
260
|
+ "execution_count": null,
|
|
261
|
+ "metadata": {},
|
|
262
|
+ "outputs": [],
|
|
263
|
+ "source": [
|
|
264
|
+ "print(\"with random seed=9001 and a 1400 pockets dataset the rates are:\\n\\\n",
|
|
265
|
+ " {} heme, {} nucleotide, {} control\\n\\\n",
|
|
266
|
+ " Total avaible dataset are composed of the following proportions:\\n\\\n",
|
|
267
|
+ " {} heme, {} nucleotide, {} control\".format(pockets[0]/1400, pockets[1]/1400,pockets[2]/1400,\n",
|
|
268
|
+ " 0.145, 0.380, 0.475))"
|
|
269
|
+ ]
|
|
270
|
+ },
|
182
|
271
|
{
|
183
|
272
|
"cell_type": "code",
|
184
|
273
|
"execution_count": null,
|
185
|
274
|
"metadata": {},
|
186
|
275
|
"outputs": [],
|
187
|
|
- "source": []
|
|
276
|
+ "source": [
|
|
277
|
+ "data_onehot = data[0]\n",
|
|
278
|
+ "output = data[1]\n",
|
|
279
|
+ "X_train = data_onehot[0:1000,]\n",
|
|
280
|
+ "Y_train = output[0:1000,]\n",
|
|
281
|
+ "X_test = data_onehot[1000:,]\n",
|
|
282
|
+ "Y_test = output[1000:,]"
|
|
283
|
+ ]
|
|
284
|
+ },
|
|
285
|
+ {
|
|
286
|
+ "cell_type": "code",
|
|
287
|
+ "execution_count": null,
|
|
288
|
+ "metadata": {},
|
|
289
|
+ "outputs": [],
|
|
290
|
+ "source": [
|
|
291
|
+ "my_model = model_new()"
|
|
292
|
+ ]
|
|
293
|
+ },
|
|
294
|
+ {
|
|
295
|
+ "cell_type": "code",
|
|
296
|
+ "execution_count": null,
|
|
297
|
+ "metadata": {},
|
|
298
|
+ "outputs": [],
|
|
299
|
+ "source": [
|
|
300
|
+ "tf.test.is_gpu_available()\n",
|
|
301
|
+ "#my_model.fit(X_train, Y_train, epochs=50, batch_size=30)"
|
|
302
|
+ ]
|
|
303
|
+ },
|
|
304
|
+ {
|
|
305
|
+ "cell_type": "code",
|
|
306
|
+ "execution_count": null,
|
|
307
|
+ "metadata": {},
|
|
308
|
+ "outputs": [],
|
|
309
|
+ "source": [
|
|
310
|
+ "history_mild_2mp = mild_model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=30, batch_size=32)\n",
|
|
311
|
+ "my_model.save('new_model_e30_b32_t1000.h5')"
|
|
312
|
+ ]
|
|
313
|
+ },
|
|
314
|
+ {
|
|
315
|
+ "cell_type": "code",
|
|
316
|
+ "execution_count": null,
|
|
317
|
+ "metadata": {},
|
|
318
|
+ "outputs": [],
|
|
319
|
+ "source": [
|
|
320
|
+ "#predictions=prediction_history()"
|
|
321
|
+ ]
|
188
|
322
|
}
|
189
|
323
|
],
|
190
|
324
|
"metadata": {
|