123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151 |
- #!/bin/python3
-
- import pandas as pd
- import numpy as np
- import pbxplore as pbx
- import sys
- from scipy.spatial.distance import squareform, pdist, jaccard
- from pyclustering.cluster.kmedoids import kmedoids
- from pyclustering.cluster import cluster_visualizer
- from pyclustering.utils import read_sample
- from pyclustering.samples.definitions import FCPS_SAMPLES
-
-
- class Conformations:
- """
- An instance of the class conformations contains differents conformations of
- the same protein, encoded as 1D sequences of protein bloc, in a pandas
- dataframe.
- """
- pass
-
- def __init__(self, filename):
- """
- filename : a .pdb file or several pdb file containing the conformations
-
- Attribute :
- - df : pd.DataFrame object
- - filename : a list of str, paths to the structure files
- - simple_dist : matrix distance computed with the identity method
- Initiality empty, must be computed
- - dissimilarity_dist : matrix distance computed with the
- dissimilarity method, initialy empty because
- the computation time is long
- Each row of the dataframe is a conformation and each column a position
- of the sequence.
- """
- self.df = pd.DataFrame()
- self.filename = filename
- self.identity_dist = np.array(None)
- self.dissimilarity_dist = np.array(None)
- for chain_name, chain in pbx.chains_from_files([filename]):
- dihedrals = chain.get_phi_psi_angles()
- pb_seq = pbx.assign(dihedrals)
- self.df = self.df.append(pd.Series(list(pb_seq)),
- ignore_index=True)
-
- def identity(self):
- """
- Computes a distance matrix between all conformations based on
- wether the PB at each position is identical or not.
- Returns the matrix as a numpy.ndarray object.
- """
- dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
- 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
- 'o':15,'p':16} # dict to transform PB to int,
- # necessary for the pdsit function
- dfnum = self.df.replace(dict_str_to_float)
- dist = squareform(pdist(dfnum, metric ='jaccard'))
- self.identity_dist = dist
-
- return dist
-
- def dissimilarity(self, matrix=None):
- """
- Returns a matrix of the distance between all conformations computed
- according to a substitution matrix of the protein blocks.
- If no substitution matrix is specified, use the matrix from the
- PBxplore package.
- """
- dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0]))
- if matrix == None:
- matrix = pd.read_table("data/PBs_substitution_matrix.dat",
- index_col=False, sep ='\t')
- matrix = matrix/100 # because in this file weight where multiplied
- # by 100.
- matrix.index = matrix.columns
- ncol = self.df.shape[1]
- nrow = self.df.shape[0]
- it1 = self.df.iterrows()
- for i in range(1,nrow): # compute each pairwise distance once only
- for j in range(i):
- for k in range(ncol):
- dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
- dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
-
- return self.dist_from_dissimilarity(dissimilarity)
-
- def dist_from_dissimilarity(self, diss_matrix):
- """
- Using the substitution matrix from the PBxplore package, the obtained
- dissimilarity matrix has both positive and negative values. Low value
- represent strong differences as identical PB substitution are high
- positive value.
- This function returns a distance matrix from the dissimilarity matrix.
-
- Arguments :
- - diss_matrix : ndarray obtained from Configurations.dissimilarity
- """
- diss_matrix = -diss_matrix
- diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
- dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1))
-
- self.dissimilarity_dist = dist
-
- return dist
-
- def small_kmedoids(self, matrix, ncluster):
- """
- Returns clusters and medoids computed with kmedoids on a distance matrix
- Arguments :
- - matrix : str, ('identity' or 'dissimilarity')
- corresponding to the desired distance matrix to
- computed
- - ncluster number of clusters to be computed
- """
- if matrix == 'identity':
-
- matrix = self.identity_dist
- if matrix.all() == None:
- print("Error : distance matrix from identity hasn't been " \
- "computed yet")
- return
- elif matrix == 'dissimilarity':
- matrix = self.dissimilarity_dist
- if matrix.all() == None:
- print("Error : distance matrix from dissimilarity hasn't " \
- "been computed yet")
- return
-
- if ncluster > matrix.shape[0]:
- print("Error : number of desired clusters > number of objects")
- return
-
- initial_medoids = np.random.randint(matrix.shape[0], size=ncluster)
- kmed1 = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
- kmed1.process()
-
- clusters = kmed1.get_clusters()
- medoids = kmed1.get_medoids()
-
- return (clusters, medoids)
-
- if __name__ == "__main__":
- if len(sys.argv) < 2:
- sys.exit("Error : usage '$ python3 projet8 md.pdb'")
-
- confs = Conformations(sys.argv[1])
-
-
- print(confs.df)
|