Projet de classification de conformations de protéines par k-medoids

projet8.py 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #!/bin/python3
  2. import pandas as pd
  3. import numpy as np
  4. import pbxplore as pbx
  5. import sys
  6. from scipy.spatial.distance import squareform, pdist, jaccard
  7. from pyclustering.cluster.kmedoids import kmedoids # for kmedoids method
  8. class Conformations:
  9. """
  10. An instance of the class conformations contains differents conformations of
  11. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  12. dataframe.
  13. """
  14. pass
  15. def __init__(self, filename):
  16. """
  17. df : pd.DataFrame object
  18. Each row of the dataframe is a conformation and each column a position
  19. of the sequence.
  20. dfnum : to compute dissimilarity with the function pdist, elements of
  21. the DataFrame must be float
  22. """
  23. self.df = pd.DataFrame()
  24. for chain_name, chain in pbx.chains_from_files([filename]):
  25. dihedrals = chain.get_phi_psi_angles()
  26. pb_seq = pbx.assign(dihedrals)
  27. self.df = self.df.append(pd.Series(list(pb_seq)),
  28. ignore_index=True)
  29. def identity(self):
  30. """
  31. Returns a matrix of the distance between all conformations based on
  32. wether the PB at each position is identical or not.
  33. Returns the matrix as a numpy.ndarray object.
  34. """
  35. dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
  36. 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
  37. 'o':15,'p':16} # dict to transform PB to int,
  38. # necessary for the pdsit function
  39. dfnum = self.df.replace(dict_str_to_float)
  40. return squareform(pdist(dfnum, metric ='jaccard'))
  41. def dissimilarity(self, matrix=None):
  42. """
  43. Returns a matrix of the distance between all conformations computed
  44. according to a substitution matrix of the protein blocks.
  45. If no substitution matrix is specified, use the matrix from the
  46. PBxplore package.
  47. """
  48. dissimilarity = np.ndarray((self.df.shape[0],self.df.shape[0]))
  49. if matrix == None:
  50. matrix = pd.read_table("data/PBs_substitution_matrix.dat",
  51. index_col=False, sep ='\t')
  52. matrix = matrix/100 # because in this file weight where multiplied
  53. # by 100.
  54. matrix.index = matrix.columns
  55. ncol = self.df.shape[1]
  56. nrow = self.df.shape[0]
  57. it1 = self.df.iterrows()
  58. for i in range(1,nrow):
  59. for j in range(i):
  60. for k in range(ncol):
  61. dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
  62. dissimilarity = dissimilarity + dissimilarity.T
  63. return dissimilarity
  64. if __name__ == "__main__":
  65. if len(sys.argv) != 2:
  66. sys.exit("Error : usage '$ python3 projet8 md.pdb'")
  67. confs = Conformations(sys.argv[1])
  68. print(confs.df)