123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- #!/bin/python3
-
- import pandas as pd
- import numpy as np
- import pbxplore as pbx
- import sys
- from scipy.spatial.distance import squareform, pdist, jaccard
- from pyclustering.cluster.kmedoids import kmedoids
- from pyclustering.cluster import cluster_visualizer
- from pyclustering.utils import read_sample
- from pyclustering.samples.definitions import FCPS_SAMPLES
-
-
class Conformations:
    """
    Set of conformations of the same protein, each one encoded as a 1D
    sequence of protein blocks (PB) and stored in a pandas dataframe.

    Attributes :
        - df : pd.DataFrame object
            Each row of the dataframe is a conformation and each column a
            position of the sequence.
    """

    # Integer code of each protein block, needed because pdist only works
    # on numeric data. 'Z' is coded 0, 'a'..'p' are coded 1..16.
    _PB_CODES = {pb: code for code, pb in enumerate("Zabcdefghijklmnop")}

    def __init__(self, filename):
        """
        Reads every chain of `filename` with PBxplore and stores its PB
        sequence as one row of self.df.

        Arguments :
            - filename : path of the structure file, passed to
              pbx.chains_from_files
        """
        # Rows are collected first and the dataframe is built once:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # dataframe at each call.
        rows = []
        for _chain_name, chain in pbx.chains_from_files([filename]):
            dihedrals = chain.get_phi_psi_angles()
            pb_seq = pbx.assign(dihedrals)
            rows.append(pd.Series(list(pb_seq)))
        self.df = pd.DataFrame(rows).reset_index(drop=True)

    def identity(self):
        """
        Returns a matrix of the distance between all conformations based on
        whether the PB at each position is identical or not.
        The distance is SciPy's 'jaccard' metric applied to the integer
        coded sequences.
        Returns the matrix as a square, symmetric numpy.ndarray object.

        NOTE(review): since 'Z' is coded 0, positions where both
        conformations are 'Z' are presumably ignored by the metric; the
        handling of non boolean input may depend on the SciPy version,
        to be confirmed.
        """
        dfnum = self.df.replace(self._PB_CODES)

        return squareform(pdist(dfnum, metric='jaccard'))

    def dissimilarity(self, matrix=None):
        """
        Returns a matrix of the distance between all conformations computed
        according to a substitution matrix of the protein blocks.
        If no substitution matrix is specified, use the matrix from the
        PBxplore package.

        Arguments :
            - matrix : pd.DataFrame whose index and columns are both
              labelled with the PB letters (optional). By default the file
              "data/PBs_substitution_matrix.dat" is read, relative to the
              current working directory.

        Returns a square, symmetric numpy.ndarray object. Cell (i, j) is the
        sum over all positions of the substitution weights between
        conformations i and j; the diagonal is left at 0.
        """
        # 'is None' and not '== None': comparing a dataframe with == is done
        # element by element and cannot be used as a condition.
        if matrix is None:
            matrix = pd.read_table("data/PBs_substitution_matrix.dat",
                                   index_col=False, sep='\t')
            matrix = matrix / 100  # because in this file weights were
                                   # multiplied by 100.
            matrix.index = matrix.columns

        nrow, ncol = self.df.shape
        blocks = self.df.to_numpy()
        scores = np.zeros((nrow, nrow))
        for i in range(1, nrow):  # compute each pairwise distance once only
            for j in range(i):
                # column = PB of conformation i, row = PB of conformation j
                scores[i, j] = sum(matrix.at[blocks[j, k], blocks[i, k]]
                                   for k in range(ncol))

        return scores + scores.T  # fills the whole matrix
-
def dist_from_dissimilarity(diss_matrix):
    """
    Using the substitution matrix from the PBxplore package, the obtained
    dissimilarity matrix has both positive and negative values. Low values
    represent strong differences, as identical PB substitutions have high
    positive values.
    This function returns a distance matrix from the dissimilarity matrix:
    the sign is flipped, the values are rescaled to [0, 1] and the diagonal
    is set to 0.

    Arguments :
        - diss_matrix : square ndarray obtained from
          Conformations.dissimilarity

    Returns the distance matrix as a numpy.ndarray object of the same shape.
    If all the values of diss_matrix are equal, a matrix of zeros is
    returned.
    """
    flipped = -np.asarray(diss_matrix, dtype=float)
    spread = np.ptp(flipped)
    if spread == 0:
        # Constant matrix (e.g. a single conformation): rescaling would
        # divide by zero and fill the result with NaN.
        return np.zeros_like(flipped)
    dist = (flipped - np.min(flipped)) / spread
    # The size comes from the argument itself, not from a global object.
    np.fill_diagonal(dist, 0.0)

    return dist
-
if __name__ == "__main__":
    # The script expects exactly one argument: the structure file to read.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        sys.exit("Error : usage '$ python3 projet8 md.pdb'")

    # Encode every conformation of the file as a PB sequence and display
    # the resulting table.
    confs = Conformations(cli_args[0])
    print(confs.df)
|