Projet de classification de conformations de protéines par k-medoids

projet8.py 2.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. #!/bin/python3
  2. import pandas as pd
  3. import numpy as np
  4. import pbxplore as pbx
  5. import sys
  6. from scipy.spatial.distance import squareform, pdist, jaccard
  7. class Conformations:
  8. """
  9. An instance of the class conformations contains differents conformations of
  10. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  11. dataframe.
  12. """
  13. pass
  14. def __init__(self, filename):
  15. """
  16. df : pd.DataFrame object
  17. Each row of the dataframe is a conformation and each column a position
  18. of the sequence.
  19. dfnum : to compute dissimilarity with the function pdist, elements of
  20. the DataFrame must be float
  21. """
  22. self.df = pd.DataFrame()
  23. for chain_name, chain in pbx.chains_from_files([filename]):
  24. dihedrals = chain.get_phi_psi_angles()
  25. pb_seq = pbx.assign(dihedrals)
  26. self.df = self.df.append(pd.Series(list(pb_seq)),
  27. ignore_index=True)
  28. def identity(self):
  29. """
  30. Returns a matrix of the distance between all conformations based on
  31. wether the PB at each position is identical or not.
  32. Returns the matrix as a numpy.ndarray object.
  33. """
  34. dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
  35. 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
  36. 'o':15,'p':16} # dict to transform PB to int,
  37. # necessary for the pdsit function
  38. dfnum = self.df.replace(dict_str_to_float)
  39. return squareform(pdist(dfnum, metric ='jaccard'))
  40. def dissimilarity(self, matrix):
  41. """
  42. Returns a matrix of the distance between all conformations computed
  43. according to a substitution matrix of the protein blocks.
  44. If no substitution matrix is specified, use the matrix from the
  45. PBxplore package.
  46. """
  47. dissimilarity = np.ndarray((self.df.shape[0],self.df.shape[0]))
  48. if matrix == None:
  49. matrix = pd.read_table("data/PBs_substitution_matrix.dat",
  50. index_col=False, sep ='\t')
  51. matrix.index = matrix.columns
  52. if __name__ == "__main__":
  53. if len(sys.argv) != 2:
  54. print("Error : usage $ python3 projet8 md.pdb")
  55. exit()
  56. confs = Conformations(sys.argv[1])
  57. print(confs.df)