Projet de classification de conformations de protéines par k-medoids

projet8.py 7.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. #!/bin/python3
  2. import pandas as pd
  3. import numpy as np
  4. import pbxplore as pbx
  5. import sys
  6. from scipy.spatial.distance import squareform, pdist, jaccard
  7. from pyclustering.cluster.kmedoids import kmedoids
  8. from pyclustering.cluster import cluster_visualizer
  9. from pyclustering.utils import read_sample
  10. from pyclustering.samples.definitions import FCPS_SAMPLES
  11. import matplotlib.pyplot as plt
  12. class Conformations:
  13. """
  14. An instance of the class conformations contains differents conformations of
  15. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  16. dataframe.
  17. """
  18. pass
  19. def __init__(self, filename):
  20. """
  21. filename : a .pdb file or several pdb file containing the conformations
  22. Attribute :
  23. - df : pd.DataFrame object
  24. - filename : a list of str, paths to the structure files
  25. - simple_dist : matrix distance computed with the identity method
  26. Initiality empty, must be computed
  27. - dissimilarity_dist : matrix distance computed with the
  28. dissimilarity method, initialy empty because
  29. the computation time is long
  30. Each row of the dataframe is a conformation and each column a position
  31. of the sequence.
  32. """
  33. self.df = pd.DataFrame()
  34. self.filename = filename
  35. self.identity_dist = np.array(None)
  36. self.dissimilarity_dist = np.array(None)
  37. for chain_name, chain in pbx.chains_from_files([filename]):
  38. dihedrals = chain.get_phi_psi_angles()
  39. pb_seq = pbx.assign(dihedrals)
  40. self.df = self.df.append(pd.Series(list(pb_seq)),
  41. ignore_index=True)
  42. def identity(self):
  43. """
  44. Computes a distance matrix between all conformations based on
  45. wether the PB at each position is identical or not.
  46. Returns the matrix as a numpy.ndarray object.
  47. """
  48. dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
  49. 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
  50. 'o':15,'p':16} # dict to transform PB to int,
  51. # necessary for the pdsit function
  52. dfnum = self.df.replace(dict_str_to_float)
  53. dist = squareform(pdist(dfnum, metric ='jaccard'))
  54. self.identity_dist = dist
  55. return dist
  56. def dissimilarity(self, matrix=None):
  57. """
  58. Returns a matrix of the distance between all conformations computed
  59. according to a substitution matrix of the protein blocks.
  60. If no substitution matrix is specified, use the matrix from the
  61. PBxplore package.
  62. """
  63. dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0]))
  64. if matrix == None:
  65. matrix = pd.read_table("data/PBs_substitution_matrix.dat",
  66. index_col=False, sep ='\t')
  67. matrix = matrix/100 # because in this file weight where multiplied
  68. # by 100.
  69. matrix.index = matrix.columns
  70. ncol = self.df.shape[1]
  71. nrow = self.df.shape[0]
  72. it1 = self.df.iterrows()
  73. for i in range(1,nrow): # compute each pairwise distance once only
  74. for j in range(i):
  75. for k in range(ncol):
  76. dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
  77. dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
  78. return self.dist_from_dissimilarity(dissimilarity)
  79. def dist_from_dissimilarity(self, diss_matrix):
  80. """
  81. Using the substitution matrix from the PBxplore package, the obtained
  82. dissimilarity matrix has both positive and negative values. Low value
  83. represent strong differences as identical PB substitution are high
  84. positive value.
  85. This function returns a distance matrix from the dissimilarity matrix.
  86. Arguments :
  87. - diss_matrix : ndarray
  88. obtained from Configurations.dissimilarity
  89. """
  90. diss_matrix = -diss_matrix
  91. diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
  92. dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1))
  93. self.dissimilarity_dist = dist
  94. return dist
  95. def small_kmedoids(self, matrix, ncluster):
  96. """
  97. Returns clusters and medoids computed with kmedoids on a distance matrix
  98. Arguments :
  99. - matrix : str, ('identity' or 'dissimilarity')
  100. corresponding to the desired distance matrix to be
  101. computed
  102. - ncluster number of clusters to be computed
  103. """
  104. if matrix == 'identity':
  105. matrix = self.identity_dist
  106. if matrix.all() == None:
  107. print("Error : distance matrix from identity hasn't been " \
  108. "computed yet")
  109. return
  110. elif matrix == 'dissimilarity':
  111. matrix = self.dissimilarity_dist
  112. if matrix.all() == None:
  113. print("Error : distance matrix from dissimilarity hasn't " \
  114. "been computed yet")
  115. return
  116. if ncluster > matrix.shape[0]:
  117. print("Error : number of desired clusters > number of objects")
  118. return
  119. initial_medoids = np.random.randint(matrix.shape[0], size=ncluster)
  120. kmed1 = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
  121. kmed1.process()
  122. clusters = kmed1.get_clusters()
  123. medoids = kmed1.get_medoids()
  124. return (clusters, medoids)
  125. def visualise(clusters, output_name=None):
  126. """
  127. Generate an image to visualise clusters. Can currently display up to
  128. seven different colors
  129. Arguments :
  130. - clusters : list of lists
  131. output of the small_kmedoids method
  132. - output_name : str
  133. desired filename for the image output, if none don't save file
  134. """
  135. nb_confs = sum([len(x) for x in clusters])
  136. x = np.arange(nb_confs)
  137. y = np.zeros(nb_confs)
  138. for i in range(len(clusters)):
  139. for j in clusters[i]:
  140. y[j] = i+1
  141. color = []
  142. #dict_col = {1:'b',2:'c',3:'',4:'',5:'',6:''}
  143. for i in y:
  144. if i == 1:
  145. color.append('b')
  146. if i == 2:
  147. color.append('g')
  148. if i == 3:
  149. color.append('y')
  150. if i == 4:
  151. color.append('r')
  152. if i == 5:
  153. color.append('c')
  154. if i == 6:
  155. color.append('m')
  156. elif i > 6:
  157. color.append('k')
  158. plt.bar(x, y, width=1.0, color=color)
  159. if output_name != None:
  160. plt.savefig("results/"+output_name)
  161. plt.close()
  162. if __name__ == "__main__":
  163. if len(sys.argv) < 2:
  164. sys.exit("Error : usage '$ python3 projet8 md.pdb ncluster(int)'")
  165. # Initialization of the class, meaning loading the pdb and writing pd.df
  166. confs = Conformations(sys.argv[1])
  167. nclusters = int(sys.argv[2])
  168. # Computation of the distance matrixes
  169. confs.identity()
  170. confs.dissimilarity()
  171. # Running the kmedoids algorithm on the identity distance matrix
  172. clusters_idt = confs.small_kmedoids('identity', nclusters)
  173. medoids_idt = clusters_idt[1]
  174. clusters_idt = clusters_idt[0]
  175. # Running the kmedoids algorithm on the dissimilarity distance matrix
  176. clusters_diss = confs.small_kmedoids('dissimilarity', nclusters)
  177. medoids_diss = clusters_diss[1]
  178. clusters_diss = clusters_diss[0]
  179. print(confs.df)