浏览代码

changed 'Conformation.dissimilarity' method to 'Conformation.identity' method

Nicolas Zimmermann 5 年前
父节点
当前提交
4144a2a093
共有 1 个文件被更改,包括 19 次插入8 次删除
  1. 19 8
      src/projet8.py

+ 19 - 8
src/projet8.py 查看文件

@@ -4,6 +4,7 @@ import pandas as pd
4 4
 import numpy as np
5 5
 import pbxplore as pbx
6 6
 import sys
7
+from scipy.spatial.distance import squareform, pdist, jaccard 
7 8
 
8 9
 class Conformations:
9 10
     """
@@ -18,21 +19,31 @@ class Conformations:
18 19
         df : pd.DataFrame object
19 20
         Each row of the dataframe is a conformation and each column a position
20 21
         of the sequence.
22
+        dfnum : to compute dissimilarity with the function pdist, elements of
23
+        the DataFrame must be float
21 24
         """
22 25
         self.df = pd.DataFrame()
23 26
         for chain_name, chain in pbx.chains_from_files([filename]): 
24 27
                 dihedrals = chain.get_phi_psi_angles() 
25 28
                 pb_seq = pbx.assign(dihedrals) 
26
-                self.df = self.df.append(pd.Series(list(pb_seq)), ignore_index=True)
27
-    
28
-    def dissimilarity(self):
29
+                self.df = self.df.append(pd.Series(list(pb_seq)),
30
+                                         ignore_index=True)
31
+ 
32
+    def identity(self):
29 33
         """
30
-        computes the dissimilarity matrix of the intance's df
34
+        Returns a matrix of the distance between all conformations based on
35
+        whether the PB at each position is identical or not.
36
+        Returns the matrix as a numpy.ndarray object.
31 37
         """
32
-        matrix = pd.DataFrame(index=np.arange(self.df.shape[0]),
33
-                              columns=np.arange(self.df.shape[0]))
34
-        matrix = matrix.fillna(0)
35
-        return matrix
38
+        dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
39
+                             'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
40
+                             'o':15,'p':16} # dict to transform PB to int,
41
+                                            # necessary for the pdist function
42
+        dfnum = self.df.replace(dict_str_to_float)
43
+
44
+        return squareform(pdist(dfnum, metric ='jaccard'))
45
+
46
+
36 47
 
37 48
 if __name__ == "__main__":
38 49
     if len(sys.argv) != 2: