# Projet de deep-learning : apprentissage de poches de liaison protéines-ligands.
#
# DeepDrug.py (9.0 KB)
#!/usr/bin/env python
# coding: utf-8
# # DeepDrug3D
# ## Importing library
# In[6]:
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential, load_model
from keras import optimizers, callbacks
from keras import Input, Model
from keras.layers import Dense, Flatten, TimeDistributed, Dropout
from keras.layers import add, Activation
from keras.layers import Convolution3D, MaxPooling3D
from keras.layers.advanced_activations import LeakyReLU
#from keras.utils import plot_model # Needs pydot.
# ### used to store model prediction in order to plot roc curve
# ### Creating inputs and outputs
# In[ ]:
  20. def in_out_lists(size=1000):
  21. """
  22. returns a tuple of array used as input and output for the model
  23. Arguments:
  24. - size, int: default 1000, size of the lists to be created
  25. Returns:
  26. - tuple (data_onehot, output):
  27. -data_onehot, ndarray: containing one-hot encoded pockets
  28. -output, ndarray: containing size-3 vectors for classification
  29. """
  30. with open("control.list", "r") as filin:
  31. control = filin.read()
  32. control = control.split("\n")
  33. control.pop()
  34. with open("steroid.list", "r") as filin:
  35. steroid = filin.read()
  36. steroid = steroid.split("\n")
  37. steroid.pop()
  38. with open("heme.list", "r") as filin:
  39. heme = filin.read()
  40. heme = heme.split("\n")
  41. heme.pop()
  42. with open("nucleotide.list", "r") as filin:
  43. nucleotide = filin.read()
  44. nucleotide = nucleotide.split("\n")
  45. nucleotide.pop()
  46. lmin = len(heme)
  47. lmid = len(nucleotide)
  48. lmax = len(control)
  49. tot_size = lmin + lmid + lmax
  50. data_onehot = np.ndarray(shape=(size, 14, 32, 32, 32)) # initializing empty array
  51. np.random.seed(9001)
  52. indices = np.random.permutation(tot_size)
  53. indices = indices[:size]
  54. output = np.ndarray(shape=(size, 3)) # softmax 3, {steroid=1, heme=1, nucleotide=1}
  55. n = -1
  56. for i in indices:
  57. n += 1
  58. if i < lmin:
  59. data_onehot[n,] = np.load("deepdrug3d_voxel_data/"+heme[i]+".npy")
  60. output[n,] = [1,0,0]
  61. elif i > lmin and i < (lmin + lmid):
  62. data_onehot[n,] = np.load("deepdrug3d_voxel_data/"+nucleotide[i - lmin]+".npy")
  63. output[n,] = [0,1,0]
  64. else:
  65. data_onehot[n,] = np.load("deepdrug3d_voxel_data/"+control[i - (lmin+lmid) - 1]+".npy")
  66. output[n,] = [0,0,1]
  67. return (data_onehot, output)
# ### Defining different models to test and compare
# In[ ]:
  70. def model_heavy(): # créer un objet modèle
  71. """
  72. Return a simple sequentiel model
  73. Returns :
  74. - model : keras.Model
  75. """
  76. inputs = Input(shape=(14,32,32,32))
  77. conv_1 = Conv3D(64, (28, 28, 28), padding="same", activation="relu", kernel_initializer="he_normal")(inputs)
  78. conv_2 = Conv3D(64, (26, 26, 26), padding="same", activation="relu", kernel_initializer="he_normal")(conv_1)
  79. drop_1 = Dropout(0.2)(conv_2)
  80. maxpool = MaxPooling3D()(drop_1)
  81. drop_2 = Dropout(0.4)(maxpool)
  82. dense = Dense(512)(drop_2)
  83. drop_3 = Dropout(0.4)(dense)
  84. flatters = Flatten()(drop_3)
  85. #output = TimeDistributed(Dense(3, activation='softmax'))(drop_3)
  86. output = Dense(3, activation='softmax')(flatters)
  87. model = Model(inputs=inputs, outputs=output)
  88. my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
  89. print(model.summary)
  90. model.compile(optimizer=my_opt, loss="categorical_crossentropy",
  91. metrics=["accuracy"])
  92. return model
# In[8]:
  94. def model_fast_k32(): # créer un objet modèle
  95. """
  96. Return a simple sequentiel model
  97. Returns :
  98. - model : keras.Model
  99. """
  100. inputs = Input(shape=(14,32,32,32))
  101. conv_1 = Convolution3D(filters=64, kernel_size=32, padding="valid", data_format='channels_first')(inputs)
  102. activation_1 = LeakyReLU(alpha = 0.1)(conv_1)
  103. drop_1 = Dropout(0.2)(activation_1)
  104. conv_2 = Convolution3D(filters=128, kernel_size=32, padding="valid", data_format='channels_first')(drop_1)
  105. activation_2 = LeakyReLU(alpha = 0.1)(conv_2)
  106. maxpool = MaxPooling3D(pool_size=(2,2,2),
  107. strides=None,
  108. padding='valid',
  109. data_format='channels_first')(activation_2)
  110. drop_2 = Dropout(0.4)(maxpool)
  111. flatters = Flatten()(drop_2)
  112. dense = Dense(256)(flatters)
  113. activation_3 = LeakyReLU(alpha = 0.1)(dense)
  114. drop_3 = Dropout(0.4)(activation_3)
  115. output = Dense(3, activation='softmax')(drop_3)
  116. model = Model(inputs=inputs, outputs=output)
  117. my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
  118. print(model.summary)
  119. model.compile(optimizer=my_opt, loss="categorical_crossentropy",
  120. metrics=["accuracy"])
  121. return model
  122. def model_fast_k16(): # créer un objet modèle
  123. """
  124. Return a simple sequentiel model
  125. Returns :
  126. - model : keras.Model
  127. """
  128. inputs = Input(shape=(14,32,32,32))
  129. conv_1 = Convolution3D(filters=64, kernel_size=16, padding="valid", data_format='channels_first')(inputs)
  130. activation_1 = LeakyReLU(alpha = 0.1)(conv_1)
  131. drop_1 = Dropout(0.2)(activation_1)
  132. conv_2 = Convolution3D(filters=128, kernel_size=16, padding="valid", data_format='channels_first')(drop_1)
  133. activation_2 = LeakyReLU(alpha = 0.1)(conv_2)
  134. maxpool = MaxPooling3D(pool_size=(2,2,2),
  135. strides=None,
  136. padding='valid',
  137. data_format='channels_first')(activation_2)
  138. drop_2 = Dropout(0.4)(maxpool)
  139. flatters = Flatten()(drop_2)
  140. dense = Dense(256)(flatters)
  141. activation_3 = LeakyReLU(alpha = 0.1)(dense)
  142. drop_3 = Dropout(0.4)(activation_3)
  143. output = Dense(3, activation='softmax')(drop_3)
  144. model = Model(inputs=inputs, outputs=output)
  145. my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
  146. print(model.summary)
  147. model.compile(optimizer=my_opt, loss="categorical_crossentropy",
  148. metrics=["accuracy"])
  149. return model
# In[ ]:
  151. def model_light(): # créer un objet modèle
  152. """
  153. Return a simple sequentiel model
  154. Returns :
  155. - model : keras.Model
  156. """
  157. inputs = Input(shape=(14,32,32,32))
  158. conv_1 = Conv3D(32, (28, 28, 28), padding="same", activation="relu", kernel_initializer="he_normal")(inputs)
  159. conv_2 = Conv3D(64, (26, 26, 26), padding="same", activation="relu", kernel_initializer="he_normal")(conv_1)
  160. drop_1 = Dropout(0.2)(conv_2)
  161. maxpool = MaxPooling3D()(drop_1)
  162. drop_2 = Dropout(0.3)(maxpool)
  163. maxpool_2 = MaxPooling3D()(drop_2)
  164. drop_3 = Dropout(0.3)(maxpool_2)
  165. dense = Dense(256)(drop_3)
  166. drop_4 = Dropout(0.4)(dense)
  167. flatters = Flatten()(drop_4)
  168. output = Dense(3, activation='softmax')(flatters)
  169. model = Model(inputs=inputs, outputs=output)
  170. my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
  171. print(model.summary)
  172. model.compile(optimizer=my_opt, loss="categorical_crossentropy",
  173. metrics=["accuracy"])
  174. return model
# ## Create pocket lists
# 4 lists are created:
# + control
# + steroid
# + heme
# + nucleotide
# In[ ]:
# Draw a 1000-pocket sample, report its class balance, split 60/40
# train/test, then train the k16 fast model.
sample = 1000
data = in_out_lists(sample)
# Last row of the column-wise cumulative sum of the one-hot outputs
# == per-class counts (heme, nucleotide, control) in the sample.
pockets = np.cumsum(data[1], axis=0)[-1]
# In[ ]:
# The 0.145/0.380/0.475 figures are the full dataset's class proportions.
print("with random seed=9001 and a {} pockets dataset the rates are:\n {} heme, {} nucleotide, {} control\n Total avaible dataset are composed of the following proportions:\n {} heme, {} nucleotide, {} control".format(sample, pockets[0]/sample,
      pockets[1]/sample,pockets[2]/sample,
      0.145, 0.380, 0.475))
# In[ ]:
# 60/40 train/test split; in_out_lists already shuffled the sample,
# so a contiguous slice is an unbiased split.
train = int(sample*0.6)
data_onehot = data[0]
output = data[1]
X_train = data_onehot[0:train,]
Y_train = output[0:train,]
X_test = data_onehot[train:,]
Y_test = output[train:,]
# In[ ]:
my_model = model_fast_k16()
# In[ ]:
# NOTE(review): tf.test.is_gpu_available() is deprecated in modern TF;
# tf.config.list_physical_devices('GPU') is the replacement — confirm TF version.
tf.test.is_gpu_available()
# In[ ]:
my_model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=50, batch_size=32)
#my_model.save('new_model_e50_b32_t1600.h5')
#my_model = load_model('new_model_e50_b32_t1600.h5')
  205. # ## Testing steroids
  206. with open("steroid.list", "r") as filin:
  207. steroid = filin.read()
  208. steroid = steroid.split("\n")
  209. steroid.pop()
  210. X_steroid = np.ndarray(shape=(69, 14, 32, 32, 32))
  211. i = -1
  212. for pocket in steroid:
  213. i += 1
  214. X_steroid[i,] = np.load("deepdrug3d_voxel_data/"+pocket+".npy")
  215. Y_pred_steroid = my_model.predict(X_steroid)
  216. Y_pred_steroid = Y_pred_steroid.round()
  217. steroid_predict = Y_pred_steroid.cumsum(axis=0)
  218. print("On 69 steroid-binded pockets, prediction are the following:\n\
  219. predicted as heme:\t{}\npredicted as nucleotide:\t{}\n\
  220. predicted as control:\t{}\n".format(steroid_predict[0],
  221. steroid_predict[1],
  222. steroid_predict[2]))