Projet de classification de conformations de protéines par k-medoids

projet8.py 2.9KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. #!/bin/python3
  2. import pandas as pd
  3. import numpy as np
  4. import pbxplore as pbx
  5. import sys
  6. from scipy.spatial.distance import squareform, pdist, jaccard
  7. class Conformations:
  8. """
  9. An instance of the class conformations contains differents conformations of
  10. the same protein, encoded as 1D sequences of protein bloc, in a pandas
  11. dataframe.
  12. """
  13. pass
  14. def __init__(self, filename):
  15. """
  16. df : pd.DataFrame object
  17. Each row of the dataframe is a conformation and each column a position
  18. of the sequence.
  19. dfnum : to compute dissimilarity with the function pdist, elements of
  20. the DataFrame must be float
  21. """
  22. self.df = pd.DataFrame()
  23. for chain_name, chain in pbx.chains_from_files([filename]):
  24. dihedrals = chain.get_phi_psi_angles()
  25. pb_seq = pbx.assign(dihedrals)
  26. self.df = self.df.append(pd.Series(list(pb_seq)),
  27. ignore_index=True)
  28. def identity(self):
  29. """
  30. Returns a matrix of the distance between all conformations based on
  31. wether the PB at each position is identical or not.
  32. Returns the matrix as a numpy.ndarray object.
  33. """
  34. dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
  35. 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
  36. 'o':15,'p':16} # dict to transform PB to int,
  37. # necessary for the pdsit function
  38. dfnum = self.df.replace(dict_str_to_float)
  39. return squareform(pdist(dfnum, metric ='jaccard'))
  40. def dissimilarity(self, matrix=None):
  41. """
  42. Returns a matrix of the distance between all conformations computed
  43. according to a substitution matrix of the protein blocks.
  44. If no substitution matrix is specified, use the matrix from the
  45. PBxplore package.
  46. """
  47. dissimilarity = np.ndarray((self.df.shape[0],self.df.shape[0]))
  48. if matrix == None:
  49. matrix = pd.read_table("data/PBs_substitution_matrix.dat",
  50. index_col=False, sep ='\t')
  51. matrix = matrix/100 # because in this file weight where multiplied
  52. # by 100.
  53. matrix.index = matrix.columns
  54. ncol = self.df.shape[1]
  55. nrow = self.df.shape[0]
  56. it1 = self.df.iterrows()
  57. for i in range(1,nrow):
  58. for j in range(i):
  59. for k in range(ncol):
  60. dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
  61. dissimilarity = dissimilarity + dissimilarity.T
  62. return dissimilarity
  63. if __name__ == "__main__":
  64. if len(sys.argv) != 2:
  65. sys.exit("Error : usage '$ python3 projet8 md.pdb'")
  66. confs = Conformations(sys.argv[1])
  67. print(confs.df)