#!/bin/python3
"""Compare conformations of one protein encoded as protein-block sequences."""

import sys

import numpy as np
import pandas as pd
import pbxplore as pbx
from scipy.spatial.distance import squareform, pdist, jaccard
from pyclustering.cluster.kmedoids import kmedoids  # for kmedoids method


class Conformations:
    """Conformations of the same protein, stored as protein-block sequences.

    Each conformation is encoded as a 1D sequence of protein blocks (PBs)
    and stored as one row of a pandas DataFrame.

    Attributes
    ----------
    df : pd.DataFrame
        One row per conformation, one column per sequence position; each
        cell holds the one-letter PB assigned at that position.
    """

    # Numeric code for every PB letter; pdist needs numeric input.
    _PB_TO_NUM = {'Z': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6,
                  'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12,
                  'm': 13, 'n': 14, 'o': 15, 'p': 16}

    def __init__(self, filename):
        """Read every chain of *filename* and store its PB sequence.

        Parameters
        ----------
        filename : str
            Path of the structure file handed to
            ``pbx.chains_from_files``.
        """
        # Collect all sequences first and build the frame once:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        rows = []
        for _chain_name, chain in pbx.chains_from_files([filename]):
            dihedrals = chain.get_phi_psi_angles()
            pb_seq = pbx.assign(dihedrals)
            rows.append(list(pb_seq))
        self.df = pd.DataFrame(rows)

    def identity(self):
        """Return the pairwise Jaccard distance between conformations.

        The PB letters are first replaced by numeric codes, because
        ``pdist`` only works on numeric data.

        Returns
        -------
        numpy.ndarray
            Square matrix with one row and one column per conformation.
        """
        dfnum = self.df.replace(self._PB_TO_NUM).astype(float)
        # NOTE(review): SciPy documents the Jaccard metric for boolean
        # vectors, and newer releases are documented as converting numeric
        # input to boolean first -- confirm this still reflects per-position
        # identity on the installed SciPy ('hamming' may be what is meant).
        return squareform(pdist(dfnum, metric='jaccard'))

    def dissimilarity(self, matrix=None):
        """Return pairwise scores summed from a PB substitution matrix.

        For every pair of distinct conformations, the substitution weight
        of the two PBs found at each position is summed over all positions.
        The diagonal is left at 0.

        Parameters
        ----------
        matrix : pd.DataFrame, optional
            Substitution matrix whose rows and columns are both labelled
            with PB letters.  When omitted, the matrix is read from
            ``data/PBs_substitution_matrix.dat`` and divided by 100.

        Returns
        -------
        numpy.ndarray
            Symmetric square matrix with one row and one column per
            conformation.

        Raises
        ------
        KeyError
            If a sequence holds a value that is not a label of *matrix*.
        """
        # `is None`: comparing a DataFrame with `==` is elementwise and
        # cannot be used as an `if` condition.
        if matrix is None:
            matrix = pd.read_table("data/PBs_substitution_matrix.dat",
                                   index_col=False, sep='\t')
            # In this file the weights were multiplied by 100.
            matrix = matrix / 100
            matrix.index = matrix.columns

        nrow = self.df.shape[0]
        # Start from zeros: the scores are accumulated, so the buffer must
        # not contain uninitialised memory.
        scores = np.zeros((nrow, nrow))

        # Translate every PB into its position along each matrix axis once,
        # instead of doing a label lookup per cell inside the loops.
        seqs = self.df.to_numpy()
        flat = seqs.ravel()
        col_pos = matrix.columns.get_indexer(flat).reshape(seqs.shape)
        row_pos = matrix.index.get_indexer(flat).reshape(seqs.shape)
        if (col_pos < 0).any() or (row_pos < 0).any():
            raise KeyError("a sequence contains a protein block that is "
                           "missing from the substitution matrix")
        weights = matrix.to_numpy(dtype=float)

        # Fill the strict lower triangle: entry (i, j) is the sum over
        # positions k of matrix[seq_i[k]][seq_j[k]] (column, then row).
        for i in range(1, nrow):
            scores[i, :i] = weights[row_pos[:i], col_pos[i]].sum(axis=1)

        # Mirror the lower triangle onto the upper one.
        return scores + scores.T


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("Error : usage '$ python3 projet8 md.pdb'")

    confs = Conformations(sys.argv[1])
    print(confs.df)