123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "import keras\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "\n",
- "from sklearn.preprocessing import LabelEncoder\n",
- "from keras.models import Sequential\n",
- "from keras.layers import Dense, Flatten, TimeDistributed\n",
- "from keras import Input, Model\n",
- "from keras.layers import add, Activation\n",
- "#from keras.utils import plot_model # Needs pydot.\n",
- "from keras.layers import Conv1D, AveragePooling1D"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "def file_to_dataframe(filename):\n",
- " \"\"\"\n",
- " Returns a pandas DataFrame with ncol = len(longest_sequence) and\n",
- " nrow = number of sequences.\n",
- " \n",
- " Arguments :\n",
- " - filename : str\n",
- " path to file\n",
- " Takes a path to a file containing sequences; there must be exactly one\n",
- " sequence per line.\n",
- " \n",
- " Sequences shorter than longest_sequence are right-padded with the '0' character.\n",
- " \"\"\"\n",
- " filin = open(filename)\n",
- " fastas0 = filin.read()\n",
- " fastas0 = fastas0.split('\\n')\n",
- " nmax = len(max(fastas0, key=len))\n",
- " fastas = []\n",
- " for fasta in fastas0:\n",
- " fastas.append(fasta + (nmax - len(fasta)) * '0')\n",
- " \n",
- " seqs = pd.DataFrame(index=range(len(fastas)), columns=range(nmax))\n",
- " \n",
- " for i, fasta in enumerate(fastas):\n",
- " seqs.loc[i] = pd.Series(list(fasta))\n",
- " \n",
- " return seqs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "def model_sequential(): # create a model object\n",
- " \"\"\"\n",
- " Return a simple sequential model\n",
- " \n",
- " Returns :\n",
- " - model : keras.Model\n",
- " \"\"\"\n",
- " inputs = Input(shape=(759,21)) # 759 amino acids, 21 one-hot encoded characters\n",
- " conv_1 = Conv1D(25, (5), padding=\"same\", activation=\"relu\",\n",
- " kernel_initializer=\"he_normal\")(inputs)\n",
- " conv_2 = Conv1D(35, (5), padding=\"same\", activation=\"relu\",\n",
- " kernel_initializer=\"he_normal\")(conv_1)\n",
- " output = TimeDistributed(Dense(4, activation='softmax'))(conv_2)\n",
- " model = Model(inputs=inputs, outputs=output)\n",
- " print(model.summary)\n",
- " model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\",\n",
- " metrics=[\"accuracy\"])\n",
- " return model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "def char_to_onehot(df, ncol=0):\n",
- " \"\"\"\n",
- " Converts the given str-encoded dataframe into a one-hot encoded array object\n",
- " \n",
- " * Arguments:\n",
- " - df : pandas.DataFrame, the dataframe containing the sequences, each cell\n",
- " containing a string.\n",
- " - ncol : int, optional. The label of the column used to fit the LabelEncoder\n",
- " which will transform the df cells into int (categorical).\n",
- " \"\"\"\n",
- " classes = LabelEncoder()\n",
- " classes.fit(df[ncol])\n",
- " df_categorical = df.apply(func=classes.transform, axis=0)\n",
- " df_onehot = keras.utils.to_categorical(df_categorical)\n",
- " return df_onehot"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "def true_accuracy(predictions, onehot_Y_test):\n",
- " \"\"\"\n",
- " Computes the accuracy ignoring the \"0\" of the DataFrame.\n",
- " \n",
- " * Arguments :\n",
- " - predictions : numpy.array, output of the model, onehot encoded.\n",
- " - onehot_Y_test : numpy.array, the true values, onehot encoded.\n",
- " \n",
- " * Returns :\n",
- " - acc_corr : float, corrected accuracy in percent, not considering '0' predictions\n",
- " \"\"\"\n",
- " tp = 0\n",
- " tn = 0\n",
- " fn = 0\n",
- " fp = 0\n",
- " tot = 0\n",
- "\n",
- " for i in range(len(predictions)):\n",
- " for j in range(len(predictions[i])):\n",
- " if onehot_Y_test[i, j, 3] != 0.:\n",
- " predmax = -1\n",
- " predict_class = -1\n",
- " true_class = -1\n",
- " for k in range(len(predictions[i, j])):\n",
- " if predmax < predictions[i, j, k]:\n",
- " predmax = predictions[i, j, k]\n",
- " predict_class = k\n",
- " if onehot_Y_test[i, j, k] == 1.:\n",
- " true_class = k\n",
- " if predict_class == true_class:\n",
- " tp = tp+1\n",
- " tot = tot + 1\n",
- " acc_corr = tp/tot*100\n",
- " return acc_corr"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create dataframe from files\n",
- "fasta = file_to_dataframe(\"data/train.fasta\")\n",
- "fasta = fasta.replace(\"X\", \"A\") # 'X' in fasta sequences is replaced with 'A'\n",
- "dssp = file_to_dataframe(\"data/train.dssp\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Writes Dataframes to csv files\n",
- "fasta.to_csv(path_or_buf=\"data/train_fasta.csv\")\n",
- "dssp.to_csv(path_or_buf=\"data/train_dssp.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "# To onehot\n",
- "fasta_onehot = char_to_onehot(fasta, 0)\n",
- "dssp_onehot = char_to_onehot(dssp, 2)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Divides the dataset into train and test subsets\n",
- "fasta_train = fasta_onehot[0:1000,]\n",
- "dssp_train = dssp_onehot[0:1000,]\n",
- "fasta_test = fasta_onehot[1000:,]\n",
- "dssp_test = dssp_onehot[1000:]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "<bound method Network.summary of <keras.engine.training.Model object at 0x7f68cfb7b0d0>>\n",
- "Epoch 1/30\n",
- "1000/1000 [==============================] - 2s 2ms/step - loss: 0.7467 - acc: 0.7447\n",
- "Epoch 2/30\n",
- "1000/1000 [==============================] - 1s 974us/step - loss: 0.2610 - acc: 0.8944\n",
- "Epoch 3/30\n",
- "1000/1000 [==============================] - 1s 964us/step - loss: 0.2053 - acc: 0.9086\n",
- "Epoch 4/30\n",
- "1000/1000 [==============================] - 1s 994us/step - loss: 0.1899 - acc: 0.9154\n",
- "Epoch 5/30\n",
- "1000/1000 [==============================] - 1s 986us/step - loss: 0.1817 - acc: 0.9193\n",
- "Epoch 6/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1790 - acc: 0.9201\n",
- "Epoch 7/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1759 - acc: 0.9217\n",
- "Epoch 8/30\n",
- "1000/1000 [==============================] - 1s 960us/step - loss: 0.1743 - acc: 0.9222\n",
- "Epoch 9/30\n",
- "1000/1000 [==============================] - 1s 977us/step - loss: 0.1735 - acc: 0.9226\n",
- "Epoch 10/30\n",
- "1000/1000 [==============================] - 1s 963us/step - loss: 0.1727 - acc: 0.9228\n",
- "Epoch 11/30\n",
- "1000/1000 [==============================] - 1s 999us/step - loss: 0.1712 - acc: 0.9239\n",
- "Epoch 12/30\n",
- "1000/1000 [==============================] - 1s 963us/step - loss: 0.1702 - acc: 0.9243\n",
- "Epoch 13/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1695 - acc: 0.9249\n",
- "Epoch 14/30\n",
- "1000/1000 [==============================] - 1s 978us/step - loss: 0.1687 - acc: 0.9252\n",
- "Epoch 15/30\n",
- "1000/1000 [==============================] - 1s 981us/step - loss: 0.1678 - acc: 0.9257\n",
- "Epoch 16/30\n",
- "1000/1000 [==============================] - 1s 985us/step - loss: 0.1669 - acc: 0.9260\n",
- "Epoch 17/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1661 - acc: 0.9267\n",
- "Epoch 18/30\n",
- "1000/1000 [==============================] - 1s 1000us/step - loss: 0.1656 - acc: 0.9269\n",
- "Epoch 19/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1650 - acc: 0.9273\n",
- "Epoch 20/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1640 - acc: 0.9277\n",
- "Epoch 21/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1637 - acc: 0.9281\n",
- "Epoch 22/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1630 - acc: 0.9285\n",
- "Epoch 23/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1621 - acc: 0.9289\n",
- "Epoch 24/30\n",
- "1000/1000 [==============================] - 1s 992us/step - loss: 0.1619 - acc: 0.9289\n",
- "Epoch 25/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1616 - acc: 0.9291\n",
- "Epoch 26/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1616 - acc: 0.9291\n",
- "Epoch 27/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1608 - acc: 0.9296\n",
- "Epoch 28/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1604 - acc: 0.9297\n",
- "Epoch 29/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1601 - acc: 0.9298\n",
- "Epoch 30/30\n",
- "1000/1000 [==============================] - 1s 1ms/step - loss: 0.1597 - acc: 0.9300\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "<keras.callbacks.History at 0x7f68cfcf6050>"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model = model_sequential()\n",
- "model.fit(fasta_train, dssp_train, epochs=30, batch_size=30)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "predictions = model.predict(fasta_test, batch_size=30)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "accuracy : 68.06409539780138\n"
- ]
- }
- ],
- "source": [
- "print(\"accuracy : {}\".format(true_accuracy(predictions, dssp_test)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
|