123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195 |
- """
- This module handles protein conformations and allows manipulating them.
-
- Conformations are loaded from a PDB file and are then converted to protein
- block (PB) sequences using the pbxplore package.
-
- Distance matrices can be produced using two methods: identity between blocks
- for the simplest, or dissimilarity using a substitution matrix.
-
- Both matrices can be clustered with the k-medoids algorithm.
-
- Generated clusters can be visualised with the visualise method.
- """
-
- import pandas as pd
- import numpy as np
- import pbxplore as pbx
- from scipy.spatial.distance import squareform, pdist, jaccard
- from pyclustering.cluster.kmedoids import kmedoids
- import matplotlib.pyplot as plt
-
class Conformations:
    """
    Several conformations of the same protein, each encoded as a 1D sequence
    of protein blocks (PBs) and stored as one row of a pandas dataframe.

    Attributes :
        - df : pd.DataFrame
            one row per chain read from the input, one column per sequence
            position, each cell holding one PB letter
        - filename : the value given to the constructor
        - identity_dist : distance matrix stored by identity();
            holds np.array(None) until it has been computed
        - dissimilarity_dist : distance matrix stored by
            dist_from_dissimilarity(); holds np.array(None) until it has
            been computed
    """

    # Integer code of each PB letter ('Z' is 0, 'a' to 'p' are 1 to 16);
    # needed because pdist only works on numbers.
    _PB_CODES = {letter: code
                 for code, letter in enumerate('Zabcdefghijklmnop')}

    # Bar colours of cluster labels 1 to 6; every other label is black.
    _CLUSTER_COLORS = ('b', 'g', 'y', 'r', 'c', 'm')

    def __init__(self, filename):
        """
        Read every chain of the given structure file(s) and encode it as a
        PB sequence with pbxplore.

        Arguments :
            - filename : path to a .pdb file, or a list/tuple of such paths
        """
        self.filename = filename
        self.identity_dist = np.array(None)
        self.dissimilarity_dist = np.array(None)

        if isinstance(filename, (list, tuple)):
            paths = list(filename)
        else:
            paths = [filename]

        # Rows are collected first and the dataframe is built once:
        # DataFrame.append no longer exists in pandas >= 2.0 and copied the
        # whole frame at every call.
        rows = []
        for _chain_name, chain in pbx.chains_from_files(paths):
            dihedrals = chain.get_phi_psi_angles()
            rows.append(pd.Series(list(pbx.assign(dihedrals))))
        self.df = pd.DataFrame(rows).reset_index(drop=True)

    def identity(self):
        """
        Computes a distance matrix between all conformations based on
        whether the PB at each position is identical or not.

        The distance between two conformations is the fraction of positions
        whose PB differs (Hamming distance). The matrix is stored in
        self.identity_dist and returned as a numpy.ndarray object.
        """
        codes = self.df.apply(lambda column: column.map(self._PB_CODES))
        # Hamming, not Jaccard: Jaccard is defined on boolean vectors and
        # does not express "same block or not" on integer-coded labels.
        dist = squareform(pdist(codes.to_numpy(dtype=float),
                                metric='hamming'))
        self.identity_dist = dist

        return dist

    def dissimilarity(self, matrix=None):
        """
        Returns a matrix of the distance between all conformations computed
        according to a substitution matrix of the protein blocks.

        For each pair of conformations the substitution weights of the two
        PBs found at every position are summed; the resulting score matrix
        is turned into distances by dist_from_dissimilarity, which also
        stores it in self.dissimilarity_dist.

        Arguments :
            - matrix : pd.DataFrame or None
                substitution weights whose columns (and rows) are labelled
                with PB letters. If the rows carry a default RangeIndex they
                are labelled like the columns. If None, the file
                "data/PBs_substitution_matrix.dat" is read.

        Raises KeyError if a PB of the conformations is not in the matrix.
        """
        if matrix is None:
            matrix = pd.read_table("data/PBs_substitution_matrix.dat",
                                   index_col=False, sep='\t')
            # In this file the weights were multiplied by 100.
            matrix = matrix / 100
            matrix.index = matrix.columns
        elif (isinstance(matrix.index, pd.RangeIndex)
              and not isinstance(matrix.columns, pd.RangeIndex)):
            # Unlabelled rows: label them like the columns, on a copy so the
            # caller's matrix is left untouched.
            matrix = matrix.copy()
            matrix.index = matrix.columns

        nrow, ncol = self.df.shape
        scores = np.zeros((nrow, nrow))
        if nrow > 1 and ncol > 0:
            blocks = self.df.to_numpy().ravel()
            col_pos = matrix.columns.get_indexer(blocks)
            row_pos = matrix.index.get_indexer(blocks)
            missing = (col_pos < 0) | (row_pos < 0)
            if missing.any():
                unknown = sorted({str(block) for block in blocks[missing]})
                raise KeyError("protein block(s) absent from the "
                               "substitution matrix: " + ", ".join(unknown))
            col_pos = col_pos.reshape(nrow, ncol)
            row_pos = row_pos.reshape(nrow, ncol)
            weights = matrix.to_numpy(dtype=float)
            # Each pair is computed once (j < i). The weight is read at
            # column = PB of conformation i, row = PB of conformation j.
            for i in range(1, nrow):
                scores[i, :i] = weights[row_pos[:i], col_pos[i]].sum(axis=1)
        scores = scores + scores.T  # fills the whole matrix

        return self.dist_from_dissimilarity(scores)

    def dist_from_dissimilarity(self, diss_matrix):
        """
        Returns a distance matrix from a matrix of substitution scores, and
        stores it in self.dissimilarity_dist.

        With the PBxplore substitution matrix the scores are both positive
        and negative, and low values represent strong differences. The
        scores are therefore negated, rescaled to [0, 1] using the minimum
        and the range of all entries, and the diagonal is set to 0.
        An empty or constant matrix gives a matrix of zeros.

        Arguments :
            - diss_matrix : ndarray
                obtained from Conformations.dissimilarity
        """
        scores = -np.asarray(diss_matrix, dtype=float)
        if scores.size == 0:
            dist = scores
        else:
            spread = np.ptp(scores)
            if spread == 0:
                # Nothing to tell conformations apart; avoids dividing by 0.
                dist = np.zeros_like(scores)
            else:
                dist = (scores - np.min(scores)) / spread
                np.fill_diagonal(dist, 0.0)

        self.dissimilarity_dist = dist

        return dist

    def small_kmedoids(self, matrix, ncluster):
        """
        Returns clusters and medoids computed with kmedoids on a distance
        matrix, as a tuple (clusters, medoids).

        Arguments :
            - matrix : str ('identity' or 'dissimilarity') naming the stored
                distance matrix to use, or directly a square distance matrix
            - ncluster : number of clusters to be computed

        Prints an error and returns None if the requested matrix has not
        been computed yet or if ncluster exceeds the number of objects.
        Raises ValueError if matrix is a str other than the two names above.
        """
        if isinstance(matrix, str):
            stored = {'identity': self.identity_dist,
                      'dissimilarity': self.dissimilarity_dist}
            if matrix not in stored:
                raise ValueError("matrix must be 'identity' or "
                                 "'dissimilarity', got {!r}".format(matrix))
            distances = stored[matrix]
            # The placeholder np.array(None) has 0 dimensions.
            if np.ndim(distances) != 2:
                print("Error : distance matrix from {} hasn't been "
                      "computed yet".format(matrix))
                return None
        else:
            distances = np.asarray(matrix)

        nb_objects = distances.shape[0]
        if ncluster > nb_objects:
            print("Error : number of desired clusters > number of objects")
            return None

        # Drawn without replacement so that no medoid is picked twice.
        initial_medoids = np.random.choice(nb_objects, size=ncluster,
                                           replace=False).tolist()
        kmed1 = kmedoids(distances, initial_medoids,
                         data_type='distance_matrix')
        kmed1.process()

        clusters = kmed1.get_clusters()
        medoids = kmed1.get_medoids()

        return (clusters, medoids)

    @staticmethod
    def _cluster_colors(group):
        """Returns one bar colour per cluster label of `group`."""
        palette = Conformations._CLUSTER_COLORS
        return [palette[int(label) - 1] if 1 <= label <= len(palette)
                else 'k'
                for label in group]

    @staticmethod
    def visualise(clusters, output_name=None):
        """
        Generate an image to visualise clusters: one bar per conformation,
        whose height and colour depend on its cluster. Can currently display
        up to seven different colors.

        Arguments :
            - clusters : list of lists
                output of the small_kmedoids method

            - output_name : str
                desired filename for the image output, written inside the
                "results/" directory; if None the file is not saved
        """
        nb_confs = sum(len(members) for members in clusters)
        confs = np.arange(nb_confs)
        group = np.zeros(nb_confs)

        for label, members in enumerate(clusters, start=1):
            for conformation in members:
                group[conformation] = label

        plt.bar(confs, group, width=1.0,
                color=Conformations._cluster_colors(group))

        if output_name is not None:
            plt.savefig("results/" + output_name)

        plt.close()


# Also exposed at module level so it can be called as a plain function.
visualise = Conformations.visualise
|