projet de deep-learning. Apprentissage de poches de liaison de protéines-ligands

secondary_struct-checkpoint.ipynb 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 18,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import keras\n",
  10. "import numpy as np\n",
  11. "import pandas as pd\n",
  12. "\n",
  13. "from sklearn.preprocessing import LabelEncoder\n",
  14. "from keras.models import Sequential\n",
  15. "from keras.layers import Dense, Flatten, TimeDistributed\n",
  16. "from keras import Input, Model\n",
  17. "from keras.layers import add, Activation\n",
  18. "#from keras.utils import plot_model # Needs pydot.\n",
  19. "from keras.layers import Conv1D, AveragePooling1D"
  20. ]
  21. },
  22. {
  23. "cell_type": "code",
  24. "execution_count": 19,
  25. "metadata": {},
  26. "outputs": [],
  27. "source": [
  28. "def file_to_dataframe(filename):\n",
  29. " \"\"\"\n",
  30. " Returns a pandas DataFrame with ncol = len(longest_sequence) and\n",
  31. " nrow = number of sequences.\n",
  32. " \n",
  33. " Arguments :\n",
  34. " - filename : str\n",
  35. " path to file\n",
  36. " The file may contain any sequences, but there must be exactly one\n",
  37. " sequence per line.\n",
  38. " \n",
  39. " Sequences shorter than longest_sequence are right-padded with '0' characters.\n",
  40. " \"\"\"\n",
  41. " filin = open(filename)\n",
  42. " fastas0 = filin.read()\n",
  43. " fastas0 = fastas0.split('\\n')\n",
  44. " nmax = len(max(fastas0, key=len))\n",
  45. " fastas = []\n",
  46. " for fasta in fastas0:\n",
  47. " fastas.append(fasta + (nmax - len(fasta)) * '0')\n",
  48. " \n",
  49. " seqs = pd.DataFrame(index=range(len(fastas)), columns=range(nmax))\n",
  50. " \n",
  51. " for i, fasta in enumerate(fastas):\n",
  52. " seqs.loc[i] = pd.Series(list(fasta))\n",
  53. " \n",
  54. " return seqs"
  55. ]
  56. },
  57. {
  58. "cell_type": "code",
  59. "execution_count": 20,
  60. "metadata": {},
  61. "outputs": [],
  62. "source": [
  63. "def model_sequential(): # create a model object\n",
  64. " \"\"\"\n",
  65. " Return a simple sequential model\n",
  66. " \n",
  67. " Returns :\n",
  68. " - model : keras.Model\n",
  69. " \"\"\"\n",
  70. " inputs = Input(shape=(759,21)) # 759 aa, 21 one-hot characters\n",
  71. " conv_1 = Conv1D(25, (5), padding=\"same\", activation=\"relu\",\n",
  72. " kernel_initializer=\"he_normal\")(inputs)\n",
  73. " conv_2 = Conv1D(35, (5), padding=\"same\", activation=\"relu\",\n",
  74. " kernel_initializer=\"he_normal\")(conv_1)\n",
  75. " output = TimeDistributed(Dense(4, activation='softmax'))(conv_2)\n",
  76. " model = Model(inputs=inputs, outputs=output)\n",
  77. " print(model.summary)\n",
  78. " model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\",\n",
  79. " metrics=[\"accuracy\"])\n",
  80. " return model"
  81. ]
  82. },
  83. {
  84. "cell_type": "code",
  85. "execution_count": 21,
  86. "metadata": {},
  87. "outputs": [],
  88. "source": [
  89. "def char_to_onehot(df, ncol=0):\n",
  90. " \"\"\"\n",
  91. " Converts the given str-encoded dataframe into a one-hot encoded array.\n",
  92. " \n",
  93. " * Arguments:\n",
  94. " - df : pandas.DataFrame, the dataframe containing the sequences, each cell\n",
  95. " containing a string.\n",
  96. " - ncol : int, optional. The label of the column used to fit the LabelEncoder\n",
  97. " which will transform the df cells into int (categorical).\n",
  98. " \"\"\"\n",
  99. " classes = LabelEncoder()\n",
  100. " classes.fit(df[ncol])\n",
  101. " df_categorical = df.apply(func=classes.transform, axis=0)\n",
  102. " df_onehot = keras.utils.to_categorical(df_categorical)\n",
  103. " return df_onehot"
  104. ]
  105. },
  106. {
  107. "cell_type": "code",
  108. "execution_count": 22,
  109. "metadata": {},
  110. "outputs": [],
  111. "source": [
  112. "def true_accuracy(predictions, onehot_Y_test):\n",
  113. " \"\"\"\n",
  114. " Computes the accuracy ignoring the \"0\" of the DataFrame.\n",
  115. " \n",
  116. " * Arguments :\n",
  117. " - predictions : numpy.array, output of the model, onehot encoded.\n",
  118. " - onehot_Y_test : numpy.array, the true values, onehot encoded.\n",
  119. " \n",
  120. " * Returns :\n",
  121. " - acc_corr : float, corrected accuracy in percent, not considering '0' positions\n",
  122. " \"\"\"\n",
  123. " tp = 0\n",
  124. " tn = 0\n",
  125. " fn = 0\n",
  126. " fp = 0\n",
  127. " tot = 0\n",
  128. "\n",
  129. " for i in range(len(predictions)):\n",
  130. " for j in range(len(predictions[i])):\n",
  131. " if onehot_Y_test[i, j, 3] != 0.:\n",
  132. " predmax = -1\n",
  133. " predict_class = -1\n",
  134. " true_class = -1\n",
  135. " for k in range(len(predictions[i, j])):\n",
  136. " if predmax < predictions[i, j, k]:\n",
  137. " predmax = predictions[i, j, k]\n",
  138. " predict_class = k\n",
  139. " if onehot_Y_test[i, j, k] == 1.:\n",
  140. " true_class = k\n",
  141. " if predict_class == true_class:\n",
  142. " tp = tp+1\n",
  143. " tot = tot + 1\n",
  144. " acc_corr = tp/tot*100\n",
  145. " return acc_corr"
  146. ]
  147. },
  148. {
  149. "cell_type": "code",
  150. "execution_count": 23,
  151. "metadata": {},
  152. "outputs": [],
  153. "source": [
  154. "# Create dataframe from files\n",
  155. "fasta = file_to_dataframe(\"data/train.fasta\")\n",
  156. "fasta = fasta.replace(\"X\", \"A\") # 'X' in fasta sequences is replaced with 'A'\n",
  157. "dssp = file_to_dataframe(\"data/train.dssp\")"
  158. ]
  159. },
  160. {
  161. "cell_type": "code",
  162. "execution_count": 24,
  163. "metadata": {},
  164. "outputs": [],
  165. "source": [
  166. "# Writes Dataframes to csv files\n",
  167. "fasta.to_csv(path_or_buf=\"data/train_fasta.csv\")\n",
  168. "dssp.to_csv(path_or_buf=\"data/train_dssp.csv\")"
  169. ]
  170. },
  171. {
  172. "cell_type": "code",
  173. "execution_count": 25,
  174. "metadata": {},
  175. "outputs": [],
  176. "source": [
  177. "# To onehot\n",
  178. "fasta_onehot = char_to_onehot(fasta, 0)\n",
  179. "dssp_onehot = char_to_onehot(dssp, 2)\n"
  180. ]
  181. },
  182. {
  183. "cell_type": "code",
  184. "execution_count": 26,
  185. "metadata": {},
  186. "outputs": [],
  187. "source": [
  188. "# Split the dataset into train and test subsets\n",
  189. "fasta_train = fasta_onehot[0:1000,]\n",
  190. "dssp_train = dssp_onehot[0:1000,]\n",
  191. "fasta_test = fasta_onehot[1000:,]\n",
  192. "dssp_test = dssp_onehot[1000:]"
  193. ]
  194. },
  195. {
  196. "cell_type": "code",
  197. "execution_count": 27,
  198. "metadata": {},
  199. "outputs": [
  200. {
  201. "name": "stdout",
  202. "output_type": "stream",
  203. "text": [
  204. "<bound method Network.summary of <keras.engine.training.Model object at 0x7f68cfb7b0d0>>\n",
  205. "Epoch 1/30\n",
  206. "1000/1000 [==============================] - 2s 2ms/step - loss: 0.7467 - acc: 0.7447\n",
  207. "Epoch 2/30\n",
  208. "1000/1000 [==============================] - 1s 974us/step - loss: 0.2610 - acc: 0.8944\n",
  209. "Epoch 3/30\n",
  210. "1000/1000 [==============================] - 1s 964us/step - loss: 0.2053 - acc: 0.9086\n",
  211. "Epoch 4/30\n",
  212. "1000/1000 [==============================] - 1s 994us/step - loss: 0.1899 - acc: 0.9154\n",
  213. "Epoch 5/30\n",
  214. "1000/1000 [==============================] - 1s 986us/step - loss: 0.1817 - acc: 0.9193\n",
  215. "Epoch 6/30\n",
  216. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1790 - acc: 0.9201\n",
  217. "Epoch 7/30\n",
  218. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1759 - acc: 0.9217\n",
  219. "Epoch 8/30\n",
  220. "1000/1000 [==============================] - 1s 960us/step - loss: 0.1743 - acc: 0.9222\n",
  221. "Epoch 9/30\n",
  222. "1000/1000 [==============================] - 1s 977us/step - loss: 0.1735 - acc: 0.9226\n",
  223. "Epoch 10/30\n",
  224. "1000/1000 [==============================] - 1s 963us/step - loss: 0.1727 - acc: 0.9228\n",
  225. "Epoch 11/30\n",
  226. "1000/1000 [==============================] - 1s 999us/step - loss: 0.1712 - acc: 0.9239\n",
  227. "Epoch 12/30\n",
  228. "1000/1000 [==============================] - 1s 963us/step - loss: 0.1702 - acc: 0.9243\n",
  229. "Epoch 13/30\n",
  230. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1695 - acc: 0.9249\n",
  231. "Epoch 14/30\n",
  232. "1000/1000 [==============================] - 1s 978us/step - loss: 0.1687 - acc: 0.9252\n",
  233. "Epoch 15/30\n",
  234. "1000/1000 [==============================] - 1s 981us/step - loss: 0.1678 - acc: 0.9257\n",
  235. "Epoch 16/30\n",
  236. "1000/1000 [==============================] - 1s 985us/step - loss: 0.1669 - acc: 0.9260\n",
  237. "Epoch 17/30\n",
  238. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1661 - acc: 0.9267\n",
  239. "Epoch 18/30\n",
  240. "1000/1000 [==============================] - 1s 1000us/step - loss: 0.1656 - acc: 0.9269\n",
  241. "Epoch 19/30\n",
  242. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1650 - acc: 0.9273\n",
  243. "Epoch 20/30\n",
  244. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1640 - acc: 0.9277\n",
  245. "Epoch 21/30\n",
  246. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1637 - acc: 0.9281\n",
  247. "Epoch 22/30\n",
  248. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1630 - acc: 0.9285\n",
  249. "Epoch 23/30\n",
  250. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1621 - acc: 0.9289\n",
  251. "Epoch 24/30\n",
  252. "1000/1000 [==============================] - 1s 992us/step - loss: 0.1619 - acc: 0.9289\n",
  253. "Epoch 25/30\n",
  254. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1616 - acc: 0.9291\n",
  255. "Epoch 26/30\n",
  256. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1616 - acc: 0.9291\n",
  257. "Epoch 27/30\n",
  258. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1608 - acc: 0.9296\n",
  259. "Epoch 28/30\n",
  260. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1604 - acc: 0.9297\n",
  261. "Epoch 29/30\n",
  262. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1601 - acc: 0.9298\n",
  263. "Epoch 30/30\n",
  264. "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1597 - acc: 0.9300\n"
  265. ]
  266. },
  267. {
  268. "data": {
  269. "text/plain": [
  270. "<keras.callbacks.History at 0x7f68cfcf6050>"
  271. ]
  272. },
  273. "execution_count": 27,
  274. "metadata": {},
  275. "output_type": "execute_result"
  276. }
  277. ],
  278. "source": [
  279. "model = model_sequential()\n",
  280. "model.fit(fasta_train, dssp_train, epochs=30, batch_size=30)"
  281. ]
  282. },
  283. {
  284. "cell_type": "code",
  285. "execution_count": 30,
  286. "metadata": {},
  287. "outputs": [],
  288. "source": [
  289. "predictions = model.predict(fasta_test, batch_size=30)"
  290. ]
  291. },
  292. {
  293. "cell_type": "code",
  294. "execution_count": 31,
  295. "metadata": {},
  296. "outputs": [
  297. {
  298. "name": "stdout",
  299. "output_type": "stream",
  300. "text": [
  301. "accuracy : 68.06409539780138\n"
  302. ]
  303. }
  304. ],
  305. "source": [
  306. "print(\"accuracy : {}\".format(true_accuracy(predictions, dssp_test)))"
  307. ]
  308. },
  309. {
  310. "cell_type": "code",
  311. "execution_count": null,
  312. "metadata": {},
  313. "outputs": [],
  314. "source": []
  315. },
  316. {
  317. "cell_type": "code",
  318. "execution_count": null,
  319. "metadata": {},
  320. "outputs": [],
  321. "source": []
  322. },
  323. {
  324. "cell_type": "code",
  325. "execution_count": null,
  326. "metadata": {},
  327. "outputs": [],
  328. "source": []
  329. },
  330. {
  331. "cell_type": "code",
  332. "execution_count": null,
  333. "metadata": {},
  334. "outputs": [],
  335. "source": []
  336. },
  337. {
  338. "cell_type": "code",
  339. "execution_count": null,
  340. "metadata": {},
  341. "outputs": [],
  342. "source": []
  343. },
  344. {
  345. "cell_type": "code",
  346. "execution_count": null,
  347. "metadata": {},
  348. "outputs": [],
  349. "source": []
  350. },
  351. {
  352. "cell_type": "code",
  353. "execution_count": null,
  354. "metadata": {},
  355. "outputs": [],
  356. "source": []
  357. },
  358. {
  359. "cell_type": "code",
  360. "execution_count": null,
  361. "metadata": {},
  362. "outputs": [],
  363. "source": []
  364. }
  365. ],
  366. "metadata": {
  367. "kernelspec": {
  368. "display_name": "Python 3",
  369. "language": "python",
  370. "name": "python3"
  371. },
  372. "language_info": {
  373. "codemirror_mode": {
  374. "name": "ipython",
  375. "version": 3
  376. },
  377. "file_extension": ".py",
  378. "mimetype": "text/x-python",
  379. "name": "python",
  380. "nbconvert_exporter": "python",
  381. "pygments_lexer": "ipython3",
  382. "version": "3.7.4"
  383. }
  384. },
  385. "nbformat": 4,
  386. "nbformat_minor": 4
  387. }