In [18]:
import keras
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Flatten, TimeDistributed
from keras import Input, Model
from keras.layers import add, Activation
#from keras.utils import plot_model  # Needs pydot.
from keras.layers import Conv1D, AveragePooling1D

In [19]:
def file_to_dataframe(filename):
    """
    Returns a pandas dataframe ncol = len(longest_sequence),
                               nrow = number of sequences
                               
    Arguments :
        - filename : str
            path to file
    Takes a path to a files containing any sequences, must be 1 sequence
    per line.
    
    Sequences shorter than longest_sequence are completed with '0' char.
    """
    filin = open(filename)
    fastas0 = filin.read()
    fastas0 = fastas0.split('\n')
    nmax = len(max(fastas0, key=len))
    fastas = []
    for fasta in fastas0:
        fastas.append(fasta + (nmax - len(fasta)) * '0')
    
    seqs = pd.DataFrame(index=range(len(fastas)), columns=range(nmax))
    
    for i, fasta in enumerate(fastas):
        seqs.loc[i] = pd.Series(list(fasta))
    
    return seqs

In [20]:
def model_sequential(): # créer un objet modèle
    """
    Return a simple sequentiel model
    
    Returns :
        - model : keras.Model
    """
    inputs = Input(shape=(759,21)) # 759 aa, 21 car onehot
    conv_1 = Conv1D(25, (5), padding="same", activation="relu",
                        kernel_initializer="he_normal")(inputs)
    conv_2 = Conv1D(35, (5), padding="same", activation="relu",
                        kernel_initializer="he_normal")(conv_1)
    output = TimeDistributed(Dense(4, activation='softmax'))(conv_2)
    model = Model(inputs=inputs, outputs=output)
    print(model.summary)
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

In [21]:
def char_to_onehot(df, ncol=0):
    """
    Returns the given str-encoded dataframe into a onehot encoded dataframe as an array object
    
    * Arguments:
        - df : pandas.DataFrame, the dataframe containing the sequences, cell 
                               containing strings.
        - ncol : int, Optionnal. the number of col used to build the LabelEncoder
                    which will transform the df cells into int(categorical).
    """
    classes = LabelEncoder()
    classes.fit(df[ncol])
    df_categorical = df.apply(func=classes.transform, axis=0)
    df_onehot = keras.utils.to_categorical(df_categorical)
    return df_onehot

In [22]:
def true_accuracy(predictions, onehot_Y_test):
    """
    Computes the accuracy ignoring the "0" of the DataFrame.
    
    * Arguments :
        - predictions : numpy.array, output of the model, onehot encoded.
        - onehot_Y_test : numpy.array, the true values, onehot encoded.
        
    * Returns :
        - acc_coor : int, accuracy corrected, not considering '0' predictions
    """
    tp = 0
    tn = 0
    fn = 0
    fp = 0
    tot = 0

    for i in range(len(predictions)):
        for j in range(len(predictions[i])):
            if onehot_Y_test[i, j, 3] != 0.:
                predmax = -1
                predict_class = -1
                true_class = -1
                for k in range(len(predictions[i, j])):
                    if predmax < predictions[i, j, k]:
                        predmax = predictions[i, j, k]
                        predict_class = k
                    if onehot_Y_test[i, j, k] == 1.:
                        true_class = k
                if predict_class == true_class:
                    tp = tp+1
                tot = tot + 1
    acc_corr = tp/tot*100
    return acc_corr

In [23]:
# Create dataframe from files
fasta = file_to_dataframe("data/train.fasta")
fasta = fasta.replace("X", "A") # 'X' in fasta sequences are replace with 'A'
dssp = file_to_dataframe("data/train.dssp")

In [24]:
# Writes Dataframes to csv files
fasta.to_csv(path_or_buf="data/train_fasta.csv")
dssp.to_csv(path_or_buf="data/train_dssp.csv")

In [25]:
# To onehot
fasta_onehot = char_to_onehot(fasta, 0)
dssp_onehot = char_to_onehot(dssp, 2)


In [26]:
# Divides the dataset in train and test subsets
fasta_train = fasta_onehot[0:1000,]
dssp_train = dssp_onehot[0:1000,]
fasta_test = fasta_onehot[1000:,]
dssp_test = dssp_onehot[1000:]

In [27]:
model = model_sequential()
model.fit(fasta_train, dssp_train, epochs=30, batch_size=30)

<bound method Network.summary of <keras.engine.training.Model object at 0x7f68cfb7b0d0>>
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f68cfcf6050>

In [30]:
predictions = model.predict(fasta_test, batch_size=30)

In [31]:
print("accuracy : {}".format(true_accuracy(predictions, dssp_test)))

accuracy : 68.06409539780138
