Projet de classification de conformations de protéines par k-medoids

projet8.py 7.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. #!/bin/python3
  2. import pandas as pd
  3. import numpy as np
  4. import pbxplore as pbx
  5. import sys
  6. from scipy.spatial.distance import squareform, pdist, jaccard
  7. from pyclustering.cluster.kmedoids import kmedoids
  8. from pyclustering.cluster import cluster_visualizer
  9. from pyclustering.utils import read_sample
  10. from pyclustering.samples.definitions import FCPS_SAMPLES
  11. class Conformations:
  12. """
  13. An instance of the class conformations contains differents conformations of
  14. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  15. dataframe.
  16. """
  17. pass
  18. def __init__(self, filename):
  19. """
  20. filename : a .pdb file or several pdb file containing the conformations
  21. Attribute :
  22. - df : pd.DataFrame object
  23. - filename : a list of str, paths to the structure files
  24. - simple_dist : matrix distance computed with the identity method
  25. Initiality empty, must be computed
  26. - dissimilarity_dist : matrix distance computed with the
  27. dissimilarity method, initialy empty because
  28. the computation time is long
  29. Each row of the dataframe is a conformation and each column a position
  30. of the sequence.
  31. """
  32. self.df = pd.DataFrame()
  33. self.filename = filename
  34. self.identity_dist = np.array(None)
  35. self.dissimilarity_dist = np.array(None)
  36. for chain_name, chain in pbx.chains_from_files([filename]):
  37. dihedrals = chain.get_phi_psi_angles()
  38. pb_seq = pbx.assign(dihedrals)
  39. self.df = self.df.append(pd.Series(list(pb_seq)),
  40. ignore_index=True)
  41. def identity(self):
  42. """
  43. Computes a distance matrix between all conformations based on
  44. wether the PB at each position is identical or not.
  45. Returns the matrix as a numpy.ndarray object.
  46. """
  47. dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
  48. 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
  49. 'o':15,'p':16} # dict to transform PB to int,
  50. # necessary for the pdsit function
  51. dfnum = self.df.replace(dict_str_to_float)
  52. dist = squareform(pdist(dfnum, metric ='jaccard'))
  53. self.identity_dist = dist
  54. return dist
  55. def dissimilarity(self, matrix=None):
  56. """
  57. Returns a matrix of the distance between all conformations computed
  58. according to a substitution matrix of the protein blocks.
  59. If no substitution matrix is specified, use the matrix from the
  60. PBxplore package.
  61. """
  62. dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0]))
  63. if matrix == None:
  64. matrix = pd.read_table("data/PBs_substitution_matrix.dat",
  65. index_col=False, sep ='\t')
  66. matrix = matrix/100 # because in this file weight where multiplied
  67. # by 100.
  68. matrix.index = matrix.columns
  69. ncol = self.df.shape[1]
  70. nrow = self.df.shape[0]
  71. it1 = self.df.iterrows()
  72. for i in range(1,nrow): # compute each pairwise distance once only
  73. for j in range(i):
  74. for k in range(ncol):
  75. dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
  76. dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
  77. return self.dist_from_dissimilarity(dissimilarity)
  78. def dist_from_dissimilarity(self, diss_matrix):
  79. """
  80. Using the substitution matrix from the PBxplore package, the obtained
  81. dissimilarity matrix has both positive and negative values. Low value
  82. represent strong differences as identical PB substitution are high
  83. positive value.
  84. This function returns a distance matrix from the dissimilarity matrix.
  85. Arguments :
  86. - diss_matrix : ndarray
  87. obtained from Configurations.dissimilarity
  88. """
  89. diss_matrix = -diss_matrix
  90. diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
  91. dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1))
  92. self.dissimilarity_dist = dist
  93. return dist
  94. def small_kmedoids(self, matrix, ncluster):
  95. """
  96. Returns clusters and medoids computed with kmedoids on a distance matrix
  97. Arguments :
  98. - matrix : str, ('identity' or 'dissimilarity')
  99. corresponding to the desired distance matrix to be
  100. computed
  101. - ncluster number of clusters to be computed
  102. """
  103. if matrix == 'identity':
  104. matrix = self.identity_dist
  105. if matrix.all() == None:
  106. print("Error : distance matrix from identity hasn't been " \
  107. "computed yet")
  108. return
  109. elif matrix == 'dissimilarity':
  110. matrix = self.dissimilarity_dist
  111. if matrix.all() == None:
  112. print("Error : distance matrix from dissimilarity hasn't " \
  113. "been computed yet")
  114. return
  115. if ncluster > matrix.shape[0]:
  116. print("Error : number of desired clusters > number of objects")
  117. return
  118. initial_medoids = np.random.randint(matrix.shape[0], size=ncluster)
  119. kmed1 = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
  120. kmed1.process()
  121. clusters = kmed1.get_clusters()
  122. medoids = kmed1.get_medoids()
  123. return (clusters, medoids)
  124. def visualise(clusters, output_name=None):
  125. """
  126. Generate an image to visualise clusters. Can currently display up to
  127. seven different colors
  128. Arguments :
  129. - clusters : list of lists
  130. output of the small_kmedoids method
  131. - output_name : str
  132. desired filename for the image output, if none don't save file
  133. """
  134. nb_confs = sum([len(x) for x in clusters])
  135. x = np.arange(nb_confs)
  136. y = np.zeros(nb_confs)
  137. for i in range(len(clusters_diss)):
  138. for j in clusters_diss[i]:
  139. y[j] = i+1
  140. color = []
  141. for i in y:
  142. if i == 1:
  143. color.append('b')
  144. if i == 2:
  145. color.append('c')
  146. if i == 3:
  147. color.append('g')
  148. if i == 4:
  149. color.append('y')
  150. if i == 5:
  151. color.append('r')
  152. if i == 6:
  153. color.append('m')
  154. else:
  155. color.append('k')
  156. plt.bar(x, y, width=1.0, color=color)
  157. if output_name != None:
  158. plt.savefig(output_name)
  159. if __name__ == "__main__":
  160. if len(sys.argv) < 2:
  161. sys.exit("Error : usage '$ python3 projet8 md.pdb ncluster(int)'")
  162. # Initialization of the class, meaning loading the pdb and writing pd.df
  163. confs = Conformations(sys.argv[1])
  164. nclusters = int(sys.argv[2])
  165. # Computation of the distance matrixes
  166. confs.identity()
  167. confs.dissimilarity()
  168. # Running the kmedoids algorithm on the identity distance matrix
  169. clusters_idt = confs.small_kmedoids('identity', nclusters)
  170. medoids_idt = clusters_idt[1]
  171. clusters_idt = clusters_idt[0]
  172. # Running the kmedoids algorithm on the dissimilarity distance matrix
  173. clusters_diss = confs.small_kmedoids('dissimilarity', nclusters)
  174. medoids_diss = clusters_diss[1]
  175. clusters_diss = clusters_diss[0]
  176. print(confs.df)