Projet de classification de conformations de protéines par k-medoids

conformations.py 7.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
"""
This module handles protein conformations and allows manipulating them.
Conformations are loaded from a pdb file and are then converted to PB
(Protein Block) sequences using the pbxplore package.
Distance matrices can be produced using 2 methods: identity between blocks
for the simplest, or dissimilarity using a substitution matrix.
Both matrices can be clustered with the k-medoids algorithm.
Generated clusters can be visualised with `visualise`.
"""
  10. import pandas as pd
  11. import numpy as np
  12. import pbxplore as pbx
  13. from scipy.spatial.distance import squareform, pdist, jaccard
  14. from pyclustering.cluster.kmedoids import kmedoids
  15. import matplotlib.pyplot as plt
  16. class Conformations:
  17. """
  18. An instance of the class conformations contains differents conformations of
  19. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  20. dataframe.
  21. """
  22. def __init__(self, filename):
  23. """
  24. filename : a .pdb file or several pdb file containing the conformations
  25. Attribute :
  26. - df : pd.DataFrame object
  27. - filename : a list of str, paths to the structure files
  28. - simple_dist : matrix distance computed with the identity method
  29. Initiality empty, must be computed
  30. - dissimilarity_dist : matrix distance computed with the
  31. dissimilarity method, initialy empty because
  32. the computation time is long
  33. Each row of the dataframe is a conformation and each column a position
  34. of the sequence.
  35. """
  36. self.df = pd.DataFrame()
  37. self.filename = filename
  38. self.identity_dist = np.array(None)
  39. self.dissimilarity_dist = np.array(None)
  40. for chain_name, chain in pbx.chains_from_files([filename]):
  41. dihedrals = chain.get_phi_psi_angles()
  42. pb_seq = pbx.assign(dihedrals)
  43. self.df = self.df.append(pd.Series(list(pb_seq)),
  44. ignore_index=True)
  45. def identity(self):
  46. """
  47. Computes a distance matrix between all conformations based on
  48. wether the PB at each position is identical or not.
  49. Returns the matrix as a numpy.ndarray object.
  50. """
  51. dict_str_to_float = {'Z':0, 'a':1, 'b':2, 'c':3, 'd':4, 'e':5, 'f':6,
  52. 'g':7, 'h':8, 'i':9, 'j':10, 'k':11, 'l':12,
  53. 'm':13, 'n':14, 'o':15, 'p':16}
  54. # dict to transform PB to
  55. # int, necessary for the pdist function
  56. dfnum = self.df.replace(dict_str_to_float)
  57. dist = squareform(pdist(dfnum, metric='jaccard'))
  58. self.identity_dist = dist
  59. return dist
  60. def dissimilarity(self, matrix=None):
  61. """
  62. Returns a matrix of the distance between all conformations computed
  63. according to a substitution matrix of the protein blocks.
  64. If no substitution matrix is specified, use the matrix from the
  65. PBxplore package.
  66. """
  67. dissimilarity = np.zeros((self.df.shape[0], self.df.shape[0]))
  68. if matrix == None:
  69. matrix = pd.read_table("data/PBs_substitution_matrix.dat",
  70. index_col=False, sep='\t')
  71. matrix = matrix/100 # because in this file weight where multiplied
  72. # by 100.
  73. matrix.index = matrix.columns
  74. ncol = self.df.shape[1]
  75. nrow = self.df.shape[0]
  76. for i in range(1, nrow): # compute each pairwise distance once only
  77. for j in range(i):
  78. for k in range(ncol):
  79. dissimilarity[i][j] += \
  80. matrix[self.df.loc[i, k]][self.df.loc[j, k]]
  81. dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
  82. return self.dist_from_dissimilarity(dissimilarity)
  83. def dist_from_dissimilarity(self, diss_matrix):
  84. """
  85. Using the substitution matrix from the PBxplore package, the obtained
  86. dissimilarity matrix has both positive and negative values. Low value
  87. represent strong differences as identical PB substitution are high
  88. positive value.
  89. This function returns a distance matrix from the dissimilarity matrix.
  90. Arguments :
  91. - diss_matrix : ndarray
  92. obtained from Configurations.dissimilarity
  93. """
  94. diss_matrix = -diss_matrix
  95. diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
  96. dist = diss_matrix * abs((np.identity(self.df.shape[0])-1))
  97. self.dissimilarity_dist = dist
  98. return dist
  99. def small_kmedoids(self, matrix, ncluster):
  100. """
  101. Returns clusters and medoids computed with kmedoids on a distance matrix
  102. Arguments :
  103. - matrix : str, ('identity' or 'dissimilarity')
  104. corresponding to the desired distance matrix to be
  105. computed
  106. - ncluster number of clusters to be computed
  107. """
  108. if matrix == 'identity':
  109. matrix = self.identity_dist
  110. if matrix.all() == None:
  111. print("Error : distance matrix from identity hasn't been " \
  112. "computed yet")
  113. return
  114. elif matrix == 'dissimilarity':
  115. matrix = self.dissimilarity_dist
  116. if matrix.all() == None:
  117. print("Error : distance matrix from dissimilarity hasn't " \
  118. "been computed yet")
  119. return
  120. if ncluster > matrix.shape[0]:
  121. print("Error : number of desired clusters > number of objects")
  122. return
  123. initial_medoids = np.random.randint(matrix.shape[0], size=ncluster)
  124. kmed1 = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
  125. kmed1.process()
  126. clusters = kmed1.get_clusters()
  127. medoids = kmed1.get_medoids()
  128. return (clusters, medoids)
  129. def visualise(clusters, output_name=None):
  130. """
  131. Generate an image to visualise clusters. Can currently display up to
  132. seven different colors
  133. Arguments :
  134. - clusters : list of lists
  135. output of the small_kmedoids method
  136. - output_name : str
  137. desired filename for the image output, if none don't save file
  138. """
  139. nb_confs = sum([len(x) for x in clusters])
  140. confs = np.arange(nb_confs)
  141. group = np.zeros(nb_confs)
  142. for i in range(len(clusters)):
  143. for j in clusters[i]:
  144. group[j] = i+1
  145. color = []
  146. #dict_col = {1:'b',2:'c',3:'',4:'',5:'',6:''}
  147. for i in group:
  148. if i == 1:
  149. color.append('b')
  150. if i == 2:
  151. color.append('g')
  152. if i == 3:
  153. color.append('y')
  154. if i == 4:
  155. color.append('r')
  156. if i == 5:
  157. color.append('c')
  158. if i == 6:
  159. color.append('m')
  160. elif i > 6:
  161. color.append('k')
  162. plt.bar(confs, group, width=1.0, color=color)
  163. if output_name != None:
  164. plt.savefig("results/"+output_name)
  165. plt.close()