nzimmermann
/
deepdrug3D


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
							{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DeepDrug3D"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from keras.models import Sequential\n",
    "from keras import optimizers, callbacks\n",
    "from keras.layers import Dense, Flatten, TimeDistributed, Dropout\n",
    "from keras import Input, Model\n",
    "from keras.layers import add, Activation\n",
    "from keras.layers.advanced_activations import LeakyReLU\n",
    "#from keras.utils import plot_model  # Needs pydot.\n",
    "from keras.layers import Convolution3D, MaxPooling3D"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### used to store model prediction in order to plot roc curve"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class prediction_history(callbacks.Callback):\n",
    "    def __init__(self):\n",
    "        self.predhis = []\n",
    "    def on_epoch_end(self, epoch, logs={}):\n",
    "        self.predhis.append(model.predict(predictor_train))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Creating input and ouputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def in_out_lists(size=1000):\n",
    "    \"\"\"\n",
    "    returns a tuple of array used as input and output for the model\n",
    "    Arguments:\n",
    "        - size, int: default 1000, size of the lists to be created\n",
    "        \n",
    "    Returns:\n",
    "        - tuple (data_onehot, output):\n",
    "            -data_onehot, ndarray: containing one-hot encoded pockets\n",
    "            -output, ndarray: containing size-3 vectors for classification\n",
    "    \"\"\"\n",
    "    with open(\"control.list\", \"r\") as filin:\n",
    "        control = filin.read()\n",
    "        control = control.split(\"\\n\")\n",
    "        control.pop()\n",
    "\n",
    "    with open(\"steroid.list\", \"r\") as filin:\n",
    "        steroid = filin.read()\n",
    "        steroid = steroid.split(\"\\n\")\n",
    "        steroid.pop()\n",
    "\n",
    "    with open(\"heme.list\", \"r\") as filin:\n",
    "        heme = filin.read()\n",
    "        heme = heme.split(\"\\n\")\n",
    "        heme.pop()\n",
    "\n",
    "    with open(\"nucleotide.list\", \"r\") as filin:\n",
    "        nucleotide = filin.read()\n",
    "        nucleotide = nucleotide.split(\"\\n\")\n",
    "        nucleotide.pop()\n",
    "    \n",
    "    lmin = len(heme)\n",
    "    lmid = len(nucleotide)\n",
    "    lmax = len(control)\n",
    "    tot_size = lmin + lmid + lmax\n",
    "    data_onehot = np.ndarray(shape=(size, 14, 32, 32, 32)) # initializing empty array\n",
    "\n",
    "    np.random.seed(9001)\n",
    "    indices = np.random.permutation(tot_size)\n",
    "    indices = indices[:size]\n",
    "    output = np.ndarray(shape=(size, 3)) # softmax 3, {steroid=1, heme=1, nucleotide=1}\n",
    "\n",
    "    n = -1\n",
    "    for i in indices:\n",
    "        n += 1\n",
    "        if i < lmin:\n",
    "            data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+heme[i]+\".npy\")\n",
    "            output[n,] = [1,0,0]\n",
    "        elif i > lmin and i < (lmin + lmid):\n",
    "            data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+nucleotide[i - lmin]+\".npy\")\n",
    "            output[n,] = [0,1,0]\n",
    "        else:\n",
    "            data_onehot[n,] = np.load(\"deepdrug3d_voxel_data/\"+control[i - (lmin+lmid) - 1]+\".npy\")\n",
    "            output[n,] = [0,0,1]\n",
    "    \n",
    "    return (data_onehot, output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Defining different model to test and compare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_heavy(): # créer un objet modèle\n",
    "    \"\"\"\n",
    "    Return a simple sequentiel model\n",
    "    \n",
    "    Returns :\n",
    "        - model : keras.Model\n",
    "    \"\"\"\n",
    "    inputs = Input(shape=(14,32,32,32))\n",
    "    conv_1 = Conv3D(64, (28, 28, 28), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(inputs)\n",
    "    conv_2 = Conv3D(64, (26, 26, 26), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(conv_1)\n",
    "    drop_1 = Dropout(0.2)(conv_2)\n",
    "    maxpool = MaxPooling3D()(drop_1)\n",
    "    drop_2 = Dropout(0.4)(maxpool)\n",
    "    dense = Dense(512)(drop_2)\n",
    "    drop_3 = Dropout(0.4)(dense)\n",
    "    flatters = Flatten()(drop_3)\n",
    "    #output = TimeDistributed(Dense(3, activation='softmax'))(drop_3)\n",
    "    output = Dense(3, activation='softmax')(flatters)\n",
    "    model = Model(inputs=inputs, outputs=output)\n",
    "    my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
    "    print(model.summary)\n",
    "    model.compile(optimizer=my_opt, loss=\"categorical_crossentropy\",\n",
    "                  metrics=[\"accuracy\"])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_new(): # créer un objet modèle\n",
    "    \"\"\"\n",
    "    Return a simple sequentiel model\n",
    "    \n",
    "    Returns :\n",
    "        - model : keras.Model\n",
    "    \"\"\"\n",
    "    inputs = Input(shape=(14,32,32,32))\n",
    "    conv_1 = Convolution3D(filters=64, kernel_size=5, padding=\"valid\", data_format='channels_first')(inputs)\n",
    "    activation_1 = LeakyReLU(alpha = 0.1)(conv_1)\n",
    "    drop_1 = Dropout(0.2)(activation_1)\n",
    "    conv_2 = Convolution3D(filters=64, kernel_size=3, padding=\"valid\", data_format='channels_first')(drop_1)\n",
    "    activation_2 = LeakyReLU(alpha = 0.1)(conv_2)\n",
    "    maxpool = MaxPooling3D(pool_size=(2,2,2),\n",
    "                            strides=None,\n",
    "                            padding='valid',\n",
    "                            data_format='channels_first')(activation_2)\n",
    "    drop_2 = Dropout(0.4)(maxpool)\n",
    "    flatters = Flatten()(drop_2)\n",
    "    dense = Dense(128)(flatters)\n",
    "    activation_3 = LeakyReLU(alpha = 0.1)(dense)\n",
    "    drop_3 = Dropout(0.4)(activation_3)\n",
    "    output = Dense(3, activation='softmax')(drop_3)\n",
    "    model = Model(inputs=inputs, outputs=output)\n",
    "    my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
    "    print(model.summary)\n",
    "    model.compile(optimizer=my_opt, loss=\"categorical_crossentropy\",\n",
    "                  metrics=[\"accuracy\"])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_light(): # créer un objet modèle\n",
    "    \"\"\"\n",
    "    Return a simple sequentiel model\n",
    "    \n",
    "    Returns :\n",
    "        - model : keras.Model\n",
    "    \"\"\"\n",
    "    inputs = Input(shape=(14,32,32,32))\n",
    "    conv_1 = Conv3D(32, (28, 28, 28), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(inputs)\n",
    "    conv_2 = Conv3D(64, (26, 26, 26), padding=\"same\", activation=\"relu\", kernel_initializer=\"he_normal\")(conv_1)\n",
    "    drop_1 = Dropout(0.2)(conv_2)\n",
    "    maxpool = MaxPooling3D()(drop_1)\n",
    "    drop_2 = Dropout(0.3)(maxpool)\n",
    "    maxpool_2 = MaxPooling3D()(drop_2)\n",
    "    drop_3 = Dropout(0.3)(maxpool_2)\n",
    "    dense = Dense(256)(drop_3)\n",
    "    drop_4 = Dropout(0.4)(dense)\n",
    "    flatters = Flatten()(drop_4)\n",
    "    output = Dense(3, activation='softmax')(flatters)\n",
    "    model = Model(inputs=inputs, outputs=output)\n",
    "    my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)\n",
    "    print(model.summary)\n",
    "    model.compile(optimizer=my_opt, loss=\"categorical_crossentropy\",\n",
    "                  metrics=[\"accuracy\"])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create pocket lists\n",
    "4 lists are created :\n",
    "  + control\n",
    "  + steroid\n",
    "  + heme\n",
    "  + nucleotide"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = in_out_lists(1400)\n",
    "pockets = np.cumsum(data[1], axis=0)[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"with random seed=9001 and a 1400 pockets dataset the rates are:\\n\\\n",
    "      {} heme, {} nucleotide, {} control\\n\\\n",
    "      Total avaible dataset are composed of the following proportions:\\n\\\n",
    "      {} heme, {} nucleotide, {} control\".format(pockets[0]/1400, pockets[1]/1400,pockets[2]/1400,\n",
    "                                                0.145, 0.380, 0.475))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_onehot = data[0]\n",
    "output = data[1]\n",
    "X_train = data_onehot[0:1000,]\n",
    "Y_train = output[0:1000,]\n",
    "X_test = data_onehot[1000:,]\n",
    "Y_test = output[1000:,]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_model = model_new()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.test.is_gpu_available()\n",
    "#my_model.fit(X_train, Y_train, epochs=50, batch_size=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "history_mild_2mp = mild_model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=30, batch_size=32)\n",
    "my_model.save('new_model_e30_b32_t1000.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#predictions=prediction_history()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}