Projet de classification de conformations de protéines par k-medoids

projet8.py 1.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. #!/bin/python3
  2. import pandas as pd
  3. import numpy as np
  4. import pbxplore as pbx
  5. import sys
  6. from scipy.spatial.distance import squareform, pdist, jaccard
  7. class Conformations:
  8. """
  9. An instance of the class conformations contains differents conformations of
  10. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  11. dataframe.
  12. """
  13. pass
  14. def __init__(self, filename):
  15. """
  16. df : pd.DataFrame object
  17. Each row of the dataframe is a conformation and each column a position
  18. of the sequence.
  19. dfnum : to compute dissimilarity with the function pdist, elements of
  20. the DataFrame must be float
  21. """
  22. self.df = pd.DataFrame()
  23. for chain_name, chain in pbx.chains_from_files([filename]):
  24. dihedrals = chain.get_phi_psi_angles()
  25. pb_seq = pbx.assign(dihedrals)
  26. self.df = self.df.append(pd.Series(list(pb_seq)),
  27. ignore_index=True)
  28. def identity(self):
  29. """
  30. Returns a matrix of the distance between all conformations based on
  31. wether the PB at each position is identical or not.
  32. Returns the matrix as a numpy.ndarray object.
  33. """
  34. dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
  35. 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
  36. 'o':15,'p':16} # dict to transform PB to int,
  37. # necessary for the pdsit function
  38. dfnum = self.df.replace(dict_str_to_float)
  39. return squareform(pdist(dfnum, metric ='jaccard'))
  40. if __name__ == "__main__":
  41. if len(sys.argv) != 2:
  42. print("Error : usage $ python3 projet8 md.pdb")
  43. exit()
  44. confs = Conformations(sys.argv[1])
  45. print(confs.df)