Projet de deep-learning : apprentissage de poches de liaison protéine-ligand.

DeepDrug.py 5.8KB

  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # # DeepDrug3D
  4. # ## Importing library
  5. # In[ ]:
  6. import numpy as np
  7. import tensorflow as tf
  8. from sklearn.preprocessing import LabelEncoder
  9. from keras.models import Sequential
  10. from keras import optimizers, callbacks
  11. from keras.layers import Dense, Flatten, TimeDistributed, Dropout
  12. from keras import Input, Model
  13. from keras.layers import add, Activation
  14. #from keras.utils import plot_model # Needs pydot.
  15. from keras.layers import Conv3D, MaxPooling3D
  16. # ### used to store model prediction in order to plot roc curve
  17. # In[ ]:
  18. class prediction_history(callbacks.Callback):
  19. def __init__(self):
  20. self.predhis = []
  21. def on_epoch_end(self, epoch, logs={}):
  22. self.predhis.append(model.predict(predictor_train))
  23. # ### Creating input and ouputs
  24. # In[ ]:
  25. def in_out_lists(size=1000):
  26. """
  27. returns a tuple of array used as input and output for the model
  28. Arguments:
  29. - size, int: default 1000, size of the lists to be created
  30. Returns:
  31. - tuple (data_onehot, output):
  32. -data_onehot, ndarray: containing one-hot encoded pockets
  33. -output, ndarray: containing size-3 vectors for classification
  34. """
  35. with open("control.list", "r") as filin:
  36. control = filin.read()
  37. control = control.split("\n")
  38. control.pop()
  39. with open("steroid.list", "r") as filin:
  40. steroid = filin.read()
  41. steroid = steroid.split("\n")
  42. steroid.pop()
  43. with open("heme.list", "r") as filin:
  44. heme = filin.read()
  45. heme = heme.split("\n")
  46. heme.pop()
  47. with open("nucleotide.list", "r") as filin:
  48. nucleotide = filin.read()
  49. nucleotide = nucleotide.split("\n")
  50. nucleotide.pop()
  51. lmin = len(heme)
  52. lmid = len(nucleotide)
  53. lmax = len(control)
  54. tot_size = lmin + lmid + lmax
  55. data_onehot = np.ndarray(shape=(size, 14, 32, 32, 32)) # initializing empty array
  56. np.random.seed(9001)
  57. indices = np.random.permutation(tot_size)
  58. indices = indices[:size]
  59. output = np.ndarray(shape=(size, 3)) # softmax 3, {steroid=1, heme=1, nucleotide=1}
  60. n = -1
  61. for i in indices:
  62. n += 1
  63. if i < lmin:
  64. data_onehot[n,] = np.load("deepdrug3d_voxel_data/"+heme[i]+".npy")
  65. output[n,] = [1,0,0]
  66. elif i > lmin and i < (lmin + lmid):
  67. data_onehot[n,] = np.load("deepdrug3d_voxel_data/"+nucleotide[i - lmin]+".npy")
  68. output[n,] = [0,1,0]
  69. else:
  70. data_onehot[n,] = np.load("deepdrug3d_voxel_data/"+control[i - (lmin+lmid) - 1]+".npy")
  71. output[n,] = [0,0,1]
  72. return (data_onehot, output)
  73. # ### Defining different model to test and compare
  74. # In[ ]:
  75. def model_heavy(): # créer un objet modèle
  76. """
  77. Return a simple sequentiel model
  78. Returns :
  79. - model : keras.Model
  80. """
  81. inputs = Input(shape=(14,32,32,32))
  82. conv_1 = Conv3D(64, (28, 28, 28), padding="same", activation="relu", kernel_initializer="he_normal")(inputs)
  83. conv_2 = Conv3D(64, (26, 26, 26), padding="same", activation="relu", kernel_initializer="he_normal")(conv_1)
  84. drop_1 = Dropout(0.2)(conv_2)
  85. maxpool = MaxPooling3D()(drop_1)
  86. drop_2 = Dropout(0.4)(maxpool)
  87. dense = Dense(512)(drop_2)
  88. drop_3 = Dropout(0.4)(dense)
  89. flatters = Flatten()(drop_3)
  90. #output = TimeDistributed(Dense(3, activation='softmax'))(drop_3)
  91. output = Dense(3, activation='softmax')(flatters)
  92. model = Model(inputs=inputs, outputs=output)
  93. my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
  94. print(model.summary)
  95. model.compile(optimizer=my_opt, loss="categorical_crossentropy",
  96. metrics=["accuracy"])
  97. return model
  98. # In[ ]:
  99. def model_light(): # créer un objet modèle
  100. """
  101. Return a simple sequentiel model
  102. Returns :
  103. - model : keras.Model
  104. """
  105. inputs = Input(shape=(14,32,32,32))
  106. conv_1 = Conv3D(32, (28, 28, 28), padding="same", activation="relu", kernel_initializer="he_normal")(inputs)
  107. conv_2 = Conv3D(64, (26, 26, 26), padding="same", activation="relu", kernel_initializer="he_normal")(conv_1)
  108. drop_1 = Dropout(0.2)(conv_2)
  109. maxpool = MaxPooling3D()(drop_1)
  110. drop_2 = Dropout(0.3)(maxpool)
  111. maxpool_2 = MaxPooling3D()(drop_2)
  112. drop_3 = Dropout(0.3)(maxpool_2)
  113. dense = Dense(256)(drop_3)
  114. drop_4 = Dropout(0.4)(dense)
  115. flatters = Flatten()(drop_4)
  116. output = Dense(3, activation='softmax')(flatters)
  117. model = Model(inputs=inputs, outputs=output)
  118. my_opt = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
  119. print(model.summary)
  120. model.compile(optimizer=my_opt, loss="categorical_crossentropy",
  121. metrics=["accuracy"])
  122. return model
  123. # ## Create pocket lists
  124. # 4 lists are created :
  125. # + control
  126. # + steroid
  127. # + heme
  128. # + nucleotide
  129. # In[ ]:
  130. data = in_out_lists(1400)
  131. pockets = np.cumsum(data[1], axis=0)[-1]
  132. # In[ ]:
  133. print("with random seed=9001 and a 1400 pockets dataset the rates are:\n {} heme, {} nucleotide, {} control\n Total avaible dataset are composed of the following proportions:\n {} heme, {} nucleotide, {} control".format(pockets[0]/1400, pockets[1]/1400,pockets[2]/1400,
  134. 0.145, 0.380, 0.475))
  135. # In[ ]:
  136. data_onehot = data[0]
  137. output = data[1]
  138. X_train = data_onehot[0:1000,]
  139. Y_train = output[0:1000,]
  140. X_test = data_onehot[1000:,]
  141. Y_test = output[1000:,]
  142. # In[ ]:
  143. my_model = model_light()
  144. # In[ ]:
  145. tf.test.is_gpu_available()
  146. #my_model.fit(X_train, Y_train, epochs=50, batch_size=30)
  147. # In[ ]:
  148. history_mild_2mp = my_model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=30, batch_size=32)
  149. my_model.save('light_model_2mp_e30_b32.h5')
  150. # In[ ]:
  151. #predictions=prediction_history()