nzimmermann
/
projet_court_nz


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
							#!/bin/python3

import pandas as pd
import numpy as np
import pbxplore as pbx
import sys
from scipy.spatial.distance import squareform, pdist, jaccard 
from pyclustering.cluster.kmedoids import kmedoids # for kmedoids method

class Conformations:
    """
    Several conformations of one protein, each encoded as a 1D sequence of
    protein blocks (PB) and stored as one row of a pandas DataFrame.

    Attributes
    ----------
    df : pd.DataFrame
        One row per chain yielded by PBxplore for the input file, one column
        per sequence position; each cell holds a one-letter protein block.
    """

    def __init__(self, filename):
        """
        Read ``filename`` with PBxplore and build ``self.df``.

        Every chain yielded by ``pbx.chains_from_files`` is converted to its
        protein-block sequence (from its phi/psi angles) and becomes one row
        of the DataFrame, in the order the chains are yielded.

        Parameters
        ----------
        filename : str
            Path of the structure file handed to PBxplore.
        """
        # Collect the rows first and build the frame once: DataFrame.append
        # copied the whole frame on every call and no longer exists in
        # pandas >= 2.0.
        rows = []
        for _chain_name, chain in pbx.chains_from_files([filename]):
            dihedrals = chain.get_phi_psi_angles()
            pb_seq = pbx.assign(dihedrals)
            rows.append(pd.Series(list(pb_seq)))
        self.df = pd.DataFrame(rows).reset_index(drop=True)

    def identity(self):
        """
        Return the pairwise identity-based distance between conformations.

        For two conformations the distance is the number of positions whose
        protein blocks differ, divided by the number of positions where at
        least one of the two has an assigned block (i.e. is not 'Z').  When
        no position qualifies the distance is 0.  This is the value SciPy's
        'jaccard' metric historically produced for the integer-encoded
        sequences ('Z' encoded as 0).

        Returns
        -------
        numpy.ndarray
            Symmetric square matrix with a zero diagonal, one row/column per
            conformation.
        """
        # NOTE(review): computed with NumPy instead of pdist(..., 'jaccard')
        # because recent SciPy releases are documented to cast non-boolean
        # input to booleans for that metric, which would make every assigned
        # block look identical -- confirm against the installed SciPy.
        blocks = self.df.to_numpy(dtype=object)
        n_conf = blocks.shape[0]
        assigned = blocks != "Z"
        distances = np.zeros((n_conf, n_conf))
        for i in range(1, n_conf):
            # Compare conformation i against all previous ones at once.
            differing = (blocks[:i] != blocks[i]).sum(axis=1)
            compared = (assigned[:i] | assigned[i]).sum(axis=1)
            distances[i, :i] = np.divide(differing, compared,
                                         out=np.zeros(i),
                                         where=compared > 0)
        # Only the lower triangle was filled; mirror it.
        return distances + distances.T

    def dissimilarity(self, matrix=None):
        """
        Return the pairwise score between conformations computed from a
        protein-block substitution matrix.

        The score of a pair is the sum, over all sequence positions, of the
        substitution-matrix entry for the two blocks found at that position.

        Parameters
        ----------
        matrix : pd.DataFrame, optional
            Square substitution matrix whose column labels are the protein
            blocks; rows are taken to be in the same order as the columns.
            The caller's object is not modified.  When omitted, the matrix
            is read from "data/PBs_substitution_matrix.dat" (tab separated)
            and divided by 100, because the weights in that file were
            multiplied by 100.

        Returns
        -------
        numpy.ndarray
            Symmetric square matrix with a zero diagonal, one row/column per
            conformation.

        Raises
        ------
        ValueError
            If ``matrix`` is not square.
        KeyError
            If a block of ``self.df`` is not a column label of ``matrix``.
        """
        # `is None`: comparing a DataFrame with `==` is element-wise and
        # cannot be used as a condition.
        if matrix is None:
            matrix = pd.read_table("data/PBs_substitution_matrix.dat",
                                   index_col=False, sep='\t')
            matrix = matrix / 100  # weights are stored multiplied by 100
        n_labels = len(matrix.columns)
        if matrix.shape[0] != n_labels:
            raise ValueError(
                "substitution matrix must be square, got shape "
                "{}".format(matrix.shape))

        # Work on positional codes: row p of the matrix corresponds to column
        # label p, so there is no need to relabel (and mutate) the matrix.
        code_of = {label: pos for pos, label in enumerate(matrix.columns)}
        weights = matrix.to_numpy(dtype=float)
        blocks = self.df.to_numpy(dtype=object)
        codes = np.array([[code_of[pb] for pb in row] for row in blocks],
                         dtype=int).reshape(blocks.shape)

        n_conf = blocks.shape[0]
        # Start from zeros so the accumulated sums are well defined.
        dissimilarity = np.zeros((n_conf, n_conf))
        for i in range(1, n_conf):
            # Entry [i, j] sums weights[row = block of j, column = block of i]
            # over all positions, for every j < i at once.
            dissimilarity[i, :i] = weights[codes[:i], codes[i]].sum(axis=1)
        # Only the lower triangle was filled; mirror it.
        dissimilarity = dissimilarity + dissimilarity.T

        return dissimilarity


if __name__ == "__main__":
    # The script takes exactly one command-line argument, which is handed
    # to Conformations; any other argument count aborts with a usage message.
    if len(sys.argv) == 2:
        confs = Conformations(sys.argv[1])
        # Show the protein-block table built from the input file.
        print(confs.df)
    else:
        sys.exit("Error : usage '$ python3 projet8 md.pdb'")