#!/bin/python3 import pandas as pd import numpy as np import pbxplore as pbx import sys from scipy.spatial.distance import squareform, pdist, jaccard from pyclustering.cluster.kmedoids import kmedoids from pyclustering.cluster import cluster_visualizer from pyclustering.utils import read_sample from pyclustering.samples.definitions import FCPS_SAMPLES class Conformations: """ An instance of the class conformations contains differents conformations of the same protein, encoded as 1D sequences of protein bloc, in a pandas dataframe. """ pass def __init__(self, filename): """ filename : a .pdb file or several pdb file containing the conformations Attribute : - df : pd.DataFrame object - filename : a list of str, paths to the structure files - simple_dist : matrix distance computed with the identity method Initiality empty, must be computed - dissimilarity_dist : matrix distance computed with the dissimilarity method, initialy empty because the computation time is long Each row of the dataframe is a conformation and each column a position of the sequence. """ self.df = pd.DataFrame() self.filename = filename self.identity_dist = np.array(None) self.dissimilarity_dist = np.array(None) for chain_name, chain in pbx.chains_from_files([filename]): dihedrals = chain.get_phi_psi_angles() pb_seq = pbx.assign(dihedrals) self.df = self.df.append(pd.Series(list(pb_seq)), ignore_index=True) def identity(self): """ Computes a distance matrix between all conformations based on wether the PB at each position is identical or not. Returns the matrix as a numpy.ndarray object. """ dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7, 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14, 'o':15,'p':16} # dict to transform PB to int, # necessary for the pdsit function dfnum = self.df.replace(dict_str_to_float) dist = squareform(pdist(dfnum, metric ='jaccard')) self.identity_dist = dist return dist def dissimilarity(self, matrix=None): """ Returns a matrix of the distance between all conformations computed according to a substitution matrix of the protein blocks. If no substitution matrix is specified, use the matrix from the PBxplore package. """ dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0])) if matrix == None: matrix = pd.read_table("data/PBs_substitution_matrix.dat", index_col=False, sep ='\t') matrix = matrix/100 # because in this file weight where multiplied # by 100. matrix.index = matrix.columns ncol = self.df.shape[1] nrow = self.df.shape[0] it1 = self.df.iterrows() for i in range(1,nrow): # compute each pairwise distance once only for j in range(i): for k in range(ncol): dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]] dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix return self.dist_from_dissimilarity(dissimilarity) def dist_from_dissimilarity(self, diss_matrix): """ Using the substitution matrix from the PBxplore package, the obtained dissimilarity matrix has both positive and negative values. Low value represent strong differences as identical PB substitution are high positive value. This function returns a distance matrix from the dissimilarity matrix. Arguments : - diss_matrix : ndarray obtained from Configurations.dissimilarity """ diss_matrix = -diss_matrix diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix) dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1)) self.dissimilarity_dist = dist return dist def small_kmedoids(self, matrix, ncluster): """ Returns clusters and medoids computed with kmedoids on a distance matrix Arguments : - matrix : str, ('identity' or 'dissimilarity') corresponding to the desired distance matrix to be computed - ncluster number of clusters to be computed """ if matrix == 'identity': matrix = self.identity_dist if matrix.all() == None: print("Error : distance matrix from identity hasn't been " \ "computed yet") return elif matrix == 'dissimilarity': matrix = self.dissimilarity_dist if matrix.all() == None: print("Error : distance matrix from dissimilarity hasn't " \ "been computed yet") return if ncluster > matrix.shape[0]: print("Error : number of desired clusters > number of objects") return initial_medoids = np.random.randint(matrix.shape[0], size=ncluster) kmed1 = kmedoids(matrix, initial_medoids, data_type='distance_matrix') kmed1.process() clusters = kmed1.get_clusters() medoids = kmed1.get_medoids() return (clusters, medoids) def visualise(clusters, output_name=None): """ Generate an image to visualise clusters. Can currently display up to seven different colors Arguments : - clusters : list of lists output of the small_kmedoids method - output_name : str desired filename for the image output, if none don't save file """ nb_confs = sum([len(x) for x in clusters]) x = np.arange(nb_confs) y = np.zeros(nb_confs) for i in range(len(clusters_diss)): for j in clusters_diss[i]: y[j] = i+1 color = [] for i in y: if i == 1: color.append('b') if i == 2: color.append('c') if i == 3: color.append('g') if i == 4: color.append('y') if i == 5: color.append('r') if i == 6: color.append('m') else: color.append('k') plt.bar(x, y, width=1.0, color=color) if output_name != None: plt.savefig(output_name) if __name__ == "__main__": if len(sys.argv) < 2: sys.exit("Error : usage '$ python3 projet8 md.pdb ncluster(int)'") # Initialization of the class, meaning loading the pdb and writing pd.df confs = Conformations(sys.argv[1]) nclusters = int(sys.argv[2]) # Computation of the distance matrixes confs.identity() confs.dissimilarity() # Running the kmedoids algorithm on the identity distance matrix clusters_idt = confs.small_kmedoids('identity', nclusters) medoids_idt = clusters_idt[1] clusters_idt = clusters_idt[0] # Running the kmedoids algorithm on the dissimilarity distance matrix clusters_diss = confs.small_kmedoids('dissimilarity', nclusters) medoids_diss = clusters_diss[1] clusters_diss = clusters_diss[0] print(confs.df)