Deep-learning project: learning protein–ligand binding pockets

DeepDrug.py

#!/usr/bin/env python
# coding: utf-8
# # DeepDrug3D
# ## Importing libraries
# In[6]:
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras import optimizers, callbacks
from keras.layers import Dense, Flatten, TimeDistributed, Dropout
from keras import Input, Model
from keras.layers import add, Activation
from keras.layers.advanced_activations import LeakyReLU
#from keras.utils import plot_model # Needs pydot.
from keras.layers import Conv3D, Convolution3D, MaxPooling3D  # Conv3D is needed by model_heavy/model_light
# ### Callback used to store model predictions in order to plot ROC curves
# In[ ]:
class prediction_history(callbacks.Callback):
    """Records the model's predictions at the end of every epoch."""
    def __init__(self, predictor_train):
        self.predhis = []
        self.predictor_train = predictor_train  # data to predict on after each epoch
    def on_epoch_end(self, epoch, logs=None):
        # self.model is set by Keras when the callback is attached to fit()
        self.predhis.append(self.model.predict(self.predictor_train))
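# A minimal usage sketch (hypothetical; the real training call appears at the
# end of this script): the callback goes into fit() through the `callbacks`
# argument, and `predhis` then holds one prediction array per epoch.
#   pred_cb = prediction_history(X_train)
#   my_model.fit(X_train, Y_train, epochs=30, batch_size=32, callbacks=[pred_cb])
#   probs_last_epoch = pred_cb.predhis[-1]  # shape: (n_samples, 3)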
# ### Creating inputs and outputs
# In[ ]:
def in_out_lists(size=1000):
    """
    Returns a tuple of arrays used as input and output for the model.
    Arguments:
    - size, int: default 1000, number of pockets to load
    Returns:
    - tuple (data_onehot, output):
        - data_onehot, ndarray: containing one-hot encoded pockets
        - output, ndarray: containing size-3 vectors for classification
    """
    with open("control.list", "r") as filin:
        control = filin.read().split("\n")
        control.pop()  # drop the trailing empty string after the last newline
    with open("steroid.list", "r") as filin:
        steroid = filin.read().split("\n")
        steroid.pop()
    with open("heme.list", "r") as filin:
        heme = filin.read().split("\n")
        heme.pop()
    with open("nucleotide.list", "r") as filin:
        nucleotide = filin.read().split("\n")
        nucleotide.pop()
    # Note: the steroid list is read but not sampled below; only the heme,
    # nucleotide and control classes feed the 3-way output.
    lmin = len(heme)
    lmid = len(nucleotide)
    lmax = len(control)
    tot_size = lmin + lmid + lmax
    data_onehot = np.ndarray(shape=(size, 14, 32, 32, 32))  # initializing empty array
    np.random.seed(9001)
    indices = np.random.permutation(tot_size)
    indices = indices[:size]
    output = np.ndarray(shape=(size, 3))  # one-hot target: [heme, nucleotide, control]
    n = -1
    for i in indices:
        n += 1
        if i < lmin:
            data_onehot[n,] = np.load("deepdrug3d_voxel_data/" + heme[i] + ".npy")
            output[n,] = [1, 0, 0]
        elif i < lmin + lmid:  # was `i > lmin and ...`, which skipped i == lmin
            data_onehot[n,] = np.load("deepdrug3d_voxel_data/" + nucleotide[i - lmin] + ".npy")
            output[n,] = [0, 1, 0]
        else:
            data_onehot[n,] = np.load("deepdrug3d_voxel_data/" + control[i - (lmin + lmid)] + ".npy")
            output[n,] = [0, 0, 1]
    return (data_onehot, output)
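# Quick shape sanity check, a minimal sketch assuming the .list files and the
# deepdrug3d_voxel_data/ directory sit in the working directory (the _check
# names are illustrative):
# In[ ]:
X_check, y_check = in_out_lists(size=10)
print(X_check.shape)  # expected: (10, 14, 32, 32, 32)
print(y_check.shape)  # expected: (10, 3)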
# ### Defining different models to test and compare
# In[ ]:
def model_heavy():  # build a model object
    """
    Returns a compiled Keras model (functional API).
    Returns:
    - model: keras.Model
    """
    inputs = Input(shape=(14, 32, 32, 32))
    conv_1 = Conv3D(64, (28, 28, 28), padding="same", activation="relu",
                    kernel_initializer="he_normal", data_format="channels_first")(inputs)
    conv_2 = Conv3D(64, (26, 26, 26), padding="same", activation="relu",
                    kernel_initializer="he_normal", data_format="channels_first")(conv_1)
    drop_1 = Dropout(0.2)(conv_2)
    maxpool = MaxPooling3D(data_format="channels_first")(drop_1)
    drop_2 = Dropout(0.4)(maxpool)
    dense = Dense(512)(drop_2)
    drop_3 = Dropout(0.4)(dense)
    flatters = Flatten()(drop_3)
    #output = TimeDistributed(Dense(3, activation='softmax'))(drop_3)
    output = Dense(3, activation='softmax')(flatters)
    model = Model(inputs=inputs, outputs=output)
    my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.summary()
    model.compile(optimizer=my_opt, loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model
# In[8]:
def model_new():  # build a model object
    """
    Returns a compiled Keras model (functional API).
    Returns:
    - model: keras.Model
    """
    inputs = Input(shape=(14, 32, 32, 32))
    conv_1 = Convolution3D(filters=64, kernel_size=5, padding="valid", data_format='channels_first')(inputs)
    activation_1 = LeakyReLU(alpha=0.1)(conv_1)
    drop_1 = Dropout(0.2)(activation_1)
    conv_2 = Convolution3D(filters=64, kernel_size=3, padding="valid", data_format='channels_first')(drop_1)
    activation_2 = LeakyReLU(alpha=0.1)(conv_2)
    maxpool = MaxPooling3D(pool_size=(2, 2, 2),
                           strides=None,
                           padding='valid',
                           data_format='channels_first')(activation_2)
    drop_2 = Dropout(0.4)(maxpool)
    flatters = Flatten()(drop_2)
    dense = Dense(128)(flatters)
    activation_3 = LeakyReLU(alpha=0.1)(dense)
    drop_3 = Dropout(0.4)(activation_3)
    output = Dense(3, activation='softmax')(drop_3)
    model = Model(inputs=inputs, outputs=output)
    my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.summary()
    model.compile(optimizer=my_opt, loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model
# In[ ]:
def model_light():  # build a model object
    """
    Returns a compiled Keras model (functional API).
    Returns:
    - model: keras.Model
    """
    inputs = Input(shape=(14, 32, 32, 32))
    conv_1 = Conv3D(32, (28, 28, 28), padding="same", activation="relu",
                    kernel_initializer="he_normal", data_format="channels_first")(inputs)
    conv_2 = Conv3D(64, (26, 26, 26), padding="same", activation="relu",
                    kernel_initializer="he_normal", data_format="channels_first")(conv_1)
    drop_1 = Dropout(0.2)(conv_2)
    maxpool = MaxPooling3D(data_format="channels_first")(drop_1)
    drop_2 = Dropout(0.3)(maxpool)
    maxpool_2 = MaxPooling3D(data_format="channels_first")(drop_2)
    drop_3 = Dropout(0.3)(maxpool_2)
    dense = Dense(256)(drop_3)
    drop_4 = Dropout(0.4)(dense)
    flatters = Flatten()(drop_4)
    output = Dense(3, activation='softmax')(flatters)
    model = Model(inputs=inputs, outputs=output)
    my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.summary()
    model.compile(optimizer=my_opt, loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model
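# A quick, hedged way to compare the three architectures before training is by
# parameter count (count_params() is standard Keras); uncomment to run:
# In[ ]:
# for build in (model_heavy, model_new, model_light):
#     print(build.__name__, "->", build().count_params(), "parameters")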
# ## Create pocket lists
# 4 lists are created:
# + control
# + steroid
# + heme
# + nucleotide
# (the steroid list is read but not used by the 3-class output; see in_out_lists)
# In[ ]:
data = in_out_lists(1400)
pockets = np.cumsum(data[1], axis=0)[-1]  # per-class counts in the sample
# In[ ]:
print("With random seed=9001 and a 1400-pocket dataset the rates are:\n"
      " {} heme, {} nucleotide, {} control\n"
      "The total available dataset is composed of the following proportions:\n"
      " {} heme, {} nucleotide, {} control".format(
          pockets[0]/1400, pockets[1]/1400, pockets[2]/1400,
          0.145, 0.380, 0.475))
# In[ ]:
data_onehot = data[0]
output = data[1]
X_train = data_onehot[0:1000,]
Y_train = output[0:1000,]
X_test = data_onehot[1000:,]
Y_test = output[1000:,]
# In[ ]:
my_model = model_new()
# In[ ]:
tf.test.is_gpu_available()
#my_model.fit(X_train, Y_train, epochs=50, batch_size=30)
# In[ ]:
history_mild_2mp = my_model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=30, batch_size=32)
my_model.save('new_model_e30_b32_t1000.h5')
# In[ ]:
#predictions = prediction_history(X_train)
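# Sketch of the intended ROC analysis, assuming the training above has run
# (roc_curve and auc are standard sklearn.metrics calls): one-vs-rest ROC per
# class on the held-out test set.
# In[ ]:
from sklearn.metrics import roc_curve, auc
probs = my_model.predict(X_test)
for c, name in enumerate(["heme", "nucleotide", "control"]):
    fpr, tpr, _ = roc_curve(Y_test[:, c], probs[:, c])
    print(name, "AUC =", auc(fpr, tpr))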