nzimmermann
/
projet_court_nz


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
							"""
This module handles protein conformations and allows manipulating them.

Conformations are loaded from a PDB file and are then converted to protein
block (PB) sequences using the pbxplore package.

Distance matrices can be produced using 2 methods: identity between blocks for
the simplest, or dissimilarity using a substitution matrix.

Both matrices can be clustered with the k-medoids algorithm.

Generated clusters can be visualised with the `visualise` function.
"""

import pandas as pd
import numpy as np
import pbxplore as pbx
from scipy.spatial.distance import squareform, pdist, jaccard
from pyclustering.cluster.kmedoids import kmedoids
import matplotlib.pyplot as plt

class Conformations:
    """
    An instance of the class Conformations contains different conformations
    of the same protein, encoded as 1D sequences of protein blocks (PBs), in
    a pandas dataframe.

    Attributes :
        - df : pd.DataFrame
            one row per chain read from the structure file(s), one column per
            position of the PB sequence
        - filename : the value given to the constructor, stored unchanged
        - identity_dist : distance matrix computed by `identity`
            holds np.array(None) until `identity` has been called
        - dissimilarity_dist : distance matrix computed by `dissimilarity`
            holds np.array(None) until `dissimilarity` has been called
    """

    # PB letters as numbers: distances are computed on numeric arrays.
    # 'Z' is deliberately the only block mapped to 0.
    _PB_CODES = dict(zip('Zabcdefghijklmnop', range(17)))

    def __init__(self, filename):
        """
        Reads every chain of the given structure file(s) and stores its PB
        sequence as one row of the dataframe.

        Arguments :
            - filename : str (or path-like), or an iterable of them
                a .pdb file or several pdb files containing the conformations
        """
        self.df = pd.DataFrame()
        self.filename = filename
        self.identity_dist = np.array(None)
        self.dissimilarity_dist = np.array(None)

        # A single path is wrapped in a list; an iterable of paths is passed
        # on as a list so that several files can really be given.
        if isinstance(filename, (str, bytes)) or hasattr(filename, '__fspath__'):
            paths = [filename]
        else:
            paths = list(filename)

        # Collect all sequences first and build the dataframe once:
        # DataFrame.append was removed from pandas and copied the whole frame
        # at every call.
        sequences = []
        for _chain_name, chain in pbx.chains_from_files(paths):
            dihedrals = chain.get_phi_psi_angles()
            sequences.append(pd.Series(list(pbx.assign(dihedrals))))
        if sequences:
            self.df = pd.DataFrame(sequences).reset_index(drop=True)

    def identity(self):
        """
        Computes a distance matrix between all conformations based on
        whether the PB at each position is identical or not (scipy 'jaccard'
        metric applied to the numerically encoded sequences).
        Stores the matrix in self.identity_dist and returns it as a
        numpy.ndarray object.
        """
        codes = self._PB_CODES
        # Values that are not a known PB are left as they are, so an unknown
        # string still fails loudly at the conversion to float.
        encoded = np.array([[codes.get(pb, pb) for pb in row]
                            for row in self.df.to_numpy()],
                           dtype=float).reshape(self.df.shape)
        dist = squareform(pdist(encoded, metric='jaccard'))
        self.identity_dist = dist

        return dist

    def dissimilarity(self, matrix=None):
        """
        Returns a matrix of the distance between all conformations computed
        according to a substitution matrix of the protein blocks.
        The result is also stored in self.dissimilarity_dist.

        Arguments :
            - matrix : pd.DataFrame, optional
                square substitution matrix whose columns are labelled with
                the PB letters; rows are taken to be in the same order as the
                columns. The given object is not modified.
                If no substitution matrix is specified, the file
                "data/PBs_substitution_matrix.dat" is read (path relative to
                the current working directory).

        Raises :
            - KeyError if a block of the sequences is not a column of the
              substitution matrix
            - ValueError if the substitution matrix is not square
        """
        if matrix is None:
            matrix = pd.read_table("data/PBs_substitution_matrix.dat",
                                   index_col=False, sep='\t')
            matrix = matrix/100 # because in this file weights were
                                # multiplied by 100.

        scores = matrix.to_numpy(dtype=float)
        if scores.ndim != 2 or scores.shape[0] != scores.shape[1]:
            raise ValueError("the substitution matrix must be square")
        position = {pb: pos for pos, pb in enumerate(matrix.columns)}

        blocks = self.df.to_numpy()
        nrow = blocks.shape[0]
        dissimilarity = np.zeros((nrow, nrow))
        if nrow > 1:
            # Sequences as row/column numbers of the substitution matrix.
            codes = np.array([[position[pb] for pb in row] for row in blocks],
                             dtype=int).reshape(blocks.shape)
            for i in range(1, nrow): # compute each pairwise distance once only
                # One score per (previous conformation, position): row of the
                # substitution matrix given by the block of conformation j,
                # column by the block of conformation i; summed over positions.
                dissimilarity[i, :i] = scores[codes[:i], codes[i]].sum(axis=1)
        dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix

        return self.dist_from_dissimilarity(dissimilarity)

    def dist_from_dissimilarity(self, diss_matrix):
        """
        Using the substitution matrix from the PBxplore package, the obtained
        dissimilarity matrix has both positive and negative values. Low values
        represent strong differences as identical PB substitutions have high
        positive values.
        This function returns a distance matrix from the dissimilarity matrix
        and stores it in self.dissimilarity_dist: the sign is flipped, the
        values are rescaled to [0, 1] using the minimum and the range of the
        whole matrix (diagonal included), then the diagonal is set to 0.
        If all values are equal (e.g. a single conformation) the result is a
        matrix of zeros.

        Arguments :
            - diss_matrix : ndarray
                obtained from Conformations.dissimilarity
        """
        flipped = -np.asarray(diss_matrix, dtype=float)
        spread = np.ptp(flipped) if flipped.size else 0
        if spread == 0:
            # Nothing to rescale; dividing would only produce NaN.
            dist = np.zeros(flipped.shape)
        else:
            dist = (flipped - np.min(flipped))/spread
            np.fill_diagonal(dist, 0)

        self.dissimilarity_dist = dist

        return dist

    def small_kmedoids(self, matrix, ncluster):
        """
        Returns clusters and medoids computed with kmedoids on a distance
        matrix, as a tuple (clusters, medoids). Returns None after printing
        an error message if the clustering cannot be run.

        Arguments :
            - matrix : str, ('identity' or 'dissimilarity')
                       corresponding to the desired distance matrix, which
                       must have been computed beforehand.
                       An already computed square distance matrix (array) is
                       also accepted.
            - ncluster number of clusters to be computed

        Raises :
            - ValueError if matrix is a str other than 'identity' or
              'dissimilarity'
        """
        if isinstance(matrix, str):
            stored = {'identity': self.identity_dist,
                      'dissimilarity': self.dissimilarity_dist}
            if matrix not in stored:
                raise ValueError("matrix must be 'identity' or 'dissimilarity'")
            distances = stored[matrix]
            # Before computation the attributes hold np.array(None), which
            # has no dimension.
            if np.ndim(distances) != 2:
                print("Error : distance matrix from " + matrix + " hasn't "
                      "been computed yet")
                return None
        else:
            distances = np.asarray(matrix)

        nb_confs = distances.shape[0]
        if ncluster > nb_confs:
            print("Error : number of desired clusters > number of objects")
            return None
        if ncluster < 1:
            print("Error : number of desired clusters must be at least 1")
            return None

        # Draw without replacement so that the same conformation is never
        # picked twice as a starting medoid.
        initial_medoids = np.random.choice(nb_confs, size=ncluster,
                                           replace=False).tolist()
        kmed1 = kmedoids(distances, initial_medoids,
                         data_type='distance_matrix')
        kmed1.process()

        clusters = kmed1.get_clusters()
        medoids = kmed1.get_medoids()

        return (clusters, medoids)

def visualise(clusters, output_name=None):
    """
    Generate an image to visualise clusters: one bar per conformation, whose
    height and color are given by the number of its cluster (starting at 1).
    Can currently display up to seven different colors: clusters after the
    sixth are all drawn in black.
    The figure is closed before returning and is never displayed, so without
    output_name nothing is kept.

    Arguments :
        - clusters : list of lists
            output of the small_kmedoids method (indexes of the conformations
            of each cluster)

        - output_name : str
            desired filename for the image output, written in the "results"
            directory (created if missing); if None don't save file
    """
    import os # only needed here, to create the output directory

    nb_confs = sum(len(members) for members in clusters)
    confs = np.arange(nb_confs)
    group = np.zeros(nb_confs)

    for label, members in enumerate(clusters, start=1):
        for conf in members:
            group[conf] = label

    palette = {1: 'b', 2: 'g', 3: 'y', 4: 'r', 5: 'c', 6: 'm'}
    # Exactly one color per bar: an index that belongs to no cluster (group
    # 0, bar of height 0) also gets one so that colors stay aligned with bars.
    color = [palette.get(int(label), 'k') for label in group]

    plt.bar(confs, group, width=1.0, color=color)

    try:
        if output_name is not None:
            target = "results/"+output_name
            os.makedirs(os.path.dirname(target), exist_ok=True)
            plt.savefig(target)
    finally:
        # Close the figure even if saving failed.
        plt.close()