#!/bin/python3
# nzimmermann / projet_court_nz

import pandas as pd
import numpy as np
import pbxplore as pbx
import sys
from scipy.spatial.distance import squareform, pdist, jaccard 
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES


class Conformations:
    """
    Conformations of a single protein, each encoded as a 1D sequence of
    protein blocks (PB) and stored together in a pandas dataframe.

    Attributes :
        - df : pd.DataFrame with one row per conformation and one column per
          position of the PB sequence.
    """

    # Integer code of each PB letter; pdist needs numeric input.
    _PB_TO_INT = {'Z': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6,
                  'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13,
                  'n': 14, 'o': 15, 'p': 16}

    def __init__(self, filename):
        """
        Read every chain of `filename` with PBxplore, assign its protein
        blocks from the phi/psi dihedral angles and store one row per chain
        in `self.df`.

        Arguments :
            - filename : path of the structure file handed to
              pbx.chains_from_files
        """
        # Collect all rows first and build the frame once: DataFrame.append
        # was removed in pandas 2.0 and copied the whole frame on every call.
        rows = []
        for _chain_name, chain in pbx.chains_from_files([filename]):
            dihedrals = chain.get_phi_psi_angles()
            rows.append(list(pbx.assign(dihedrals)))
        self.df = pd.DataFrame(rows)

    def identity(self):
        """
        Returns the square matrix of pairwise distances between all
        conformations, computed with the 'jaccard' metric of scipy's pdist on
        the integer-coded PB sequences.
        Returns the matrix as a numpy.ndarray object.

        NOTE(review): 'Z' is coded as 0 and the Jaccard metric gives zeros a
        special role, so this is not a plain "fraction of differing positions"
        - confirm that this is the intended measure.
        """
        dfnum = self.df.replace(self._PB_TO_INT)

        return squareform(pdist(dfnum, metric='jaccard'))

    def dissimilarity(self, matrix=None):
        """
        Returns a square matrix of pairwise scores between all conformations:
        for each pair, the substitution weights of the two PB found at every
        position are summed. The diagonal is left at 0.

        Arguments :
            - matrix : square pd.DataFrame of substitution weights whose
              columns are the PB letters (rows are taken in the same order).
              If None, "data/PBs_substitution_matrix.dat" (path relative to
              the working directory) is read and divided by 100, because the
              weights in that file were multiplied by 100.

        Raises :
            - ValueError if the substitution matrix is not square
            - KeyError if a conformation holds a PB absent from the matrix
        """
        # `matrix == None` is an element-wise comparison on a DataFrame and
        # cannot be used as a condition; test identity instead.
        if matrix is None:
            matrix = pd.read_table("data/PBs_substitution_matrix.dat",
                                   index_col=False, sep='\t')
            matrix = matrix / 100

        labels = matrix.columns
        weights = matrix.to_numpy(dtype=float)
        if weights.shape != (len(labels), len(labels)):
            raise ValueError("substitution matrix must be square, got shape "
                             "{}".format(weights.shape))

        nrow = self.df.shape[0]
        scores = np.zeros((nrow, nrow))
        if nrow == 0:
            return scores

        # Translate every PB letter into its position in the matrix, once.
        letters = self.df.to_numpy()
        codes = labels.get_indexer(letters.ravel()).reshape(letters.shape)
        if (codes < 0).any():
            unknown = sorted({str(pb) for pb in letters[codes < 0]})
            raise KeyError("protein blocks missing from the substitution "
                           "matrix: {}".format(unknown))

        for i in range(1, nrow):  # compute each pairwise score once only
            # weights[row, col]: row is the PB of conformation j (< i),
            # column is the PB of conformation i, summed over all positions.
            scores[i, :i] = weights[codes[:i], codes[i]].sum(axis=1)

        return scores + scores.T  # fills the whole matrix

    @staticmethod
    def dist_from_dissimilarity(diss_matrix):
        """
        Using the substitution matrix from the PBxplore package, the obtained
        dissimilarity matrix has both positive and negative values. Low values
        represent strong differences, as identical PB substitutions have high
        positive values.
        This function returns a distance matrix from the dissimilarity matrix:
        the sign is flipped, the values are min-max scaled to [0, 1] and the
        diagonal is set to 0.

        Arguments :
            - diss_matrix : ndarray obtained from Conformations.dissimilarity

        An empty matrix is returned unchanged and a constant matrix yields an
        all-zero distance matrix (there is no spread to scale by).
        """
        flipped = -np.asarray(diss_matrix, dtype=float)
        if flipped.size == 0:
            return flipped

        spread = np.ptp(flipped)
        if spread == 0:
            return np.zeros_like(flipped)

        dist = (flipped - np.min(flipped)) / spread
        np.fill_diagonal(dist, 0.0)  # a conformation is at distance 0 of itself

        return dist

if __name__ == "__main__":
    # Script entry point: exactly one command-line argument (the structure
    # file) is expected after the program name.
    n_args = len(sys.argv) - 1
    if n_args != 1:
        sys.exit("Error : usage '$ python3 projet8 md.pdb'")

    # `confs` stays a module-level name on purpose.
    pdb_path = sys.argv[1]
    confs = Conformations(pdb_path)

    # Show the PB sequences, one conformation per row.
    print(confs.df)