123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# DeepDrug3D"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Importing libraries"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import tensorflow as tf\n",
- "from sklearn.preprocessing import LabelEncoder\n",
- "from keras.models import Sequential\n",
- "from keras import optimizers, callbacks\n",
- "from keras.layers import Dense, Flatten, TimeDistributed, Dropout\n",
- "from keras import Input, Model\n",
- "from keras.layers import add, Activation\n",
- "#from keras.utils import plot_model # Needs pydot.\n",
- "from keras.layers import Conv3D, MaxPooling3D"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Callback used to store the model predictions at the end of each epoch, in order to plot the ROC curve"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "class prediction_history(callbacks.Callback):\n",
- " def __init__(self):\n",
- " self.predhis = []\n",
- " def on_epoch_end(self, epoch, logs={}):\n",
- " self.predhis.append(model.predict(predictor_train))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Creating inputs and outputs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def in_out_lists(size=1000):\n",
- " \"\"\"\n",
- " returns a tuple of array used as input and output for the model\n",
- " Arguments:\n",
- " - size, int: default 1000, size of the lists to be created\n",
- " \n",
- " Returns:\n",
- " - tuple (data_onehot, output):\n",
- " -data_onehot, ndarray: containing one-hot encoded pockets\n",
- " -output, ndarray: containing size-3 vectors for classification\n",
- " \"\"\"\n",
- " with open(\"control.list\", \"r\") as filin:\n",
- " control = filin.read()\n",
- " control = control.split(\"\\n\")\n",
- " control.pop()\n",
- "\n",
- " with open(\"steroid.list\", \"r\") as filin:\n",
- " steroid = filin.read()\n",
- " steroid = steroid.split(\"\\n\")\n",
- " steroid.pop()\n",
- "\n",
- " with open(\"heme.list\", \"r\") as filin:\n",
- " heme = filin.read()\n",
- " heme = heme.split(\"\\n\")\n",
- " heme.pop()\n",
- "\n",
- " with open(\"nucleotide.list\", \"r\") as filin:\n",
- " nucleotide = filin.read()\n",
- " nucleotide = nucleotide.split(\"\\n\")\n",
- " nucleotide.pop()\n",
- " \n",
- " lmin = len(heme)\n",
- " lmid = len(nucleotide)\n",
- " lmax = len(control)\n",
- " tot_size = lmin + lmid + lmax\n",
- " data_onehot = np.ndarray(shape=(size, 14, 32, 32, 32)) # initializing empty array\n",
- "\n",
- " np.random.seed(9001)\n",
- " indices = np.random.permutation(tot_size)\n",
- " indices = indices[:size]\n",
- " output = np.ndarray(shape=(size, 3)) # softmax 3, {steroid=1, heme=1, nucleotide=1}\n",
- "\n",
- " n = -1\n",
- " for i in indices:\n",
- " n += 1\n",
- " if i < lmin:\n",
- " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+heme[i]+\".npy\")\n",
- " output[n,] = [1,0,0]\n",
- " elif i > lmin and i < (lmin + lmid):\n",
- " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+nucleotide[i - lmin]+\".npy\")\n",
- " output[n,] = [0,1,0]\n",
- " else:\n",
- " data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+control[i - (lmin+lmid) - 1]+\".npy\")\n",
- " output[n,] = [0,0,1]\n",
- " \n",
- " return (data_onehot, output)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Defining different models to test and compare"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def model_heavy(): # créer un objet modèle\n",
- " \"\"\"\n",
- " Return a simple sequentiel model\n",
- " \n",
- " Returns :\n",
- " - model : keras.Model\n",
- " \"\"\"\n",
- " inputs = Input(shape=(14,32,32,32))\n",
- " conv_1 = Conv3D(64, (28, 28, 28), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(inputs)\n",
- " conv_2 = Conv3D(64, (26, 26, 26), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(conv_1)\n",
- " drop_1 = Dropout(0.2)(conv_2)\n",
- " maxpool = MaxPooling3D()(drop_1)\n",
- " drop_2 = Dropout(0.4)(maxpool)\n",
- " dense = Dense(512)(drop_2)\n",
- " drop_3 = Dropout(0.4)(dense)\n",
- " flatters = Flatten()(drop_3)\n",
- " #output = TimeDistributed(Dense(3, activation='softmax'))(drop_3)\n",
- " output = Dense(3, activation='softmax')(flatters)\n",
- " model = Model(inputs=inputs, outputs=output)\n",
- " my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
- " print(model.summary)\n",
- " model.compile(optimizer=my_opt, loss=\"categorical_crossentropy\",\n",
- " metrics=[\"accuracy\"])\n",
- " return model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def model_light(): # créer un objet modèle\n",
- " \"\"\"\n",
- " Return a simple sequentiel model\n",
- " \n",
- " Returns :\n",
- " - model : keras.Model\n",
- " \"\"\"\n",
- " inputs = Input(shape=(14,32,32,32))\n",
- " conv_1 = Conv3D(32, (28, 28, 28), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(inputs)\n",
- " conv_2 = Conv3D(64, (26, 26, 26), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(conv_1)\n",
- " drop_1 = Dropout(0.2)(conv_2)\n",
- " maxpool = MaxPooling3D()(drop_1)\n",
- " drop_2 = Dropout(0.3)(maxpool)\n",
- " maxpool_2 = MaxPooling3D()(drop_2)\n",
- " drop_3 = Dropout(0.3)(maxpool_2)\n",
- " dense = Dense(256)(drop_3)\n",
- " drop_4 = Dropout(0.4)(dense)\n",
- " flatters = Flatten()(drop_4)\n",
- " output = Dense(3, activation='softmax')(flatters)\n",
- " model = Model(inputs=inputs, outputs=output)\n",
- " my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
- " print(model.summary)\n",
- " model.compile(optimizer=my_opt, loss=\"categorical_crossentropy\",\n",
- " metrics=[\"accuracy\"])\n",
- " return model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create pocket lists\n",
- "Four lists are created:\n",
- " + control\n",
- " + steroid\n",
- " + heme\n",
- " + nucleotide"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data = in_out_lists(1400)\n",
- "pockets = np.cumsum(data[1], axis=0)[-1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"with random seed=9001 and a 1400 pockets dataset the rates are:\\n\\\n",
- " {} heme, {} nucleotide, {} control\\n\\\n",
- " Total avaible dataset are composed of the following proportions:\\n\\\n",
- " {} heme, {} nucleotide, {} control\".format(pockets[0]/1400, pockets[1]/1400,pockets[2]/1400,\n",
- " 0.145, 0.380, 0.475))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_onehot = data[0]\n",
- "output = data[1]\n",
- "X_train = data_onehot[0:1000,]\n",
- "Y_train = output[0:1000,]\n",
- "X_test = data_onehot[1000:,]\n",
- "Y_test = output[1000:,]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "my_model = model_light()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.test.is_gpu_available()\n",
- "#my_model.fit(X_train, Y_train, epochs=50, batch_size=30)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "history_mild_2mp = mild_model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=30, batch_size=32)\n",
- "my_model.save('light_model_2mp_e30_b32.h5')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#predictions=prediction_history()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
|