Projet de classification de conformations de protéines par k-medoids

projet8.py 3.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. #!/bin/python3
  2. import pandas as pd
  3. import numpy as np
  4. import pbxplore as pbx
  5. import sys
  6. from scipy.spatial.distance import squareform, pdist, jaccard
  7. from pyclustering.cluster.kmedoids import kmedoids
  8. from pyclustering.cluster import cluster_visualizer
  9. from pyclustering.utils import read_sample
  10. from pyclustering.samples.definitions import FCPS_SAMPLES
  11. class Conformations:
  12. """
  13. An instance of the class conformations contains differents conformations of
  14. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  15. dataframe.
  16. """
  17. pass
  18. def __init__(self, filename):
  19. """
  20. df : pd.DataFrame object
  21. Each row of the dataframe is a conformation and each column a position
  22. of the sequence.
  23. """
  24. self.df = pd.DataFrame()
  25. for chain_name, chain in pbx.chains_from_files([filename]):
  26. dihedrals = chain.get_phi_psi_angles()
  27. pb_seq = pbx.assign(dihedrals)
  28. self.df = self.df.append(pd.Series(list(pb_seq)),
  29. ignore_index=True)
  30. def identity(self):
  31. """
  32. Returns a matrix of the distance between all conformations based on
  33. wether the PB at each position is identical or not.
  34. Returns the matrix as a numpy.ndarray object.
  35. """
  36. dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
  37. 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
  38. 'o':15,'p':16} # dict to transform PB to int,
  39. # necessary for the pdsit function
  40. dfnum = self.df.replace(dict_str_to_float)
  41. return squareform(pdist(dfnum, metric ='jaccard'))
  42. def dissimilarity(self, matrix=None):
  43. """
  44. Returns a matrix of the distance between all conformations computed
  45. according to a substitution matrix of the protein blocks.
  46. If no substitution matrix is specified, use the matrix from the
  47. PBxplore package.
  48. """
  49. dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0]))
  50. if matrix == None:
  51. matrix = pd.read_table("data/PBs_substitution_matrix.dat",
  52. index_col=False, sep ='\t')
  53. matrix = matrix/100 # because in this file weight where multiplied
  54. # by 100.
  55. matrix.index = matrix.columns
  56. ncol = self.df.shape[1]
  57. nrow = self.df.shape[0]
  58. it1 = self.df.iterrows()
  59. for i in range(1,nrow): # compute each pairwise distance once only
  60. for j in range(i):
  61. for k in range(ncol):
  62. dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
  63. dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
  64. return dissimilarity
  65. def dist_from_dissimilarity(diss_matrix):
  66. """
  67. Using the substitution matrix from the PBxplore package, the obtained
  68. dissimilarity matrix has both positive and negative values. Low value
  69. represent strong differences as identical PB substitution are high
  70. positive value.
  71. This function returns a distance matrix from the dissimilarity matrix.
  72. Arguments :
  73. - diss_matrix : ndarray obtained from Configurations.dissimilarity
  74. """
  75. diss_matrix = -diss_matrix
  76. diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
  77. dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1))
  78. return dist
  79. if __name__ == "__main__":
  80. if len(sys.argv) != 2:
  81. sys.exit("Error : usage '$ python3 projet8 md.pdb'")
  82. confs = Conformations(sys.argv[1])
  83. print(confs.df)