|
@@ -4,6 +4,7 @@ import pandas as pd
|
4
|
4
|
import numpy as np
|
5
|
5
|
import pbxplore as pbx
|
6
|
6
|
import sys
|
|
7
|
+from scipy.spatial.distance import squareform, pdist, jaccard
|
7
|
8
|
|
8
|
9
|
class Conformations:
|
9
|
10
|
"""
|
|
@@ -18,21 +19,31 @@ class Conformations:
|
18
|
19
|
df : pd.DataFrame object
|
19
|
20
|
Each row of the dataframe is a conformation and each column a position
|
20
|
21
|
of the sequence.
|
|
22
|
+ dfnum : to compute dissimilarity with the function pdist, elements of
|
|
23
|
+ the DataFrame must be float
|
21
|
24
|
"""
|
22
|
25
|
self.df = pd.DataFrame()
|
23
|
26
|
for chain_name, chain in pbx.chains_from_files([filename]):
|
24
|
27
|
dihedrals = chain.get_phi_psi_angles()
|
25
|
28
|
pb_seq = pbx.assign(dihedrals)
|
26
|
|
- self.df = self.df.append(pd.Series(list(pb_seq)), ignore_index=True)
|
27
|
|
-
|
28
|
|
- def dissimilarity(self):
|
|
29
|
+ self.df = self.df.append(pd.Series(list(pb_seq)),
|
|
30
|
+ ignore_index=True)
|
|
31
|
+
|
|
32
|
+ def identity(self):
|
29
|
33
|
"""
|
30
|
|
- computes the dissimilarity matrix of the intance's df
|
|
34
|
+ Returns a matrix of the distance between all conformations based on
|
|
35
|
+ whether the PB at each position is identical or not.
|
|
36
|
+ Returns the matrix as a numpy.ndarray object.
|
31
|
37
|
"""
|
32
|
|
- matrix = pd.DataFrame(index=np.arange(self.df.shape[0]),
|
33
|
|
- columns=np.arange(self.df.shape[0]))
|
34
|
|
- matrix = matrix.fillna(0)
|
35
|
|
- return matrix
|
|
38
|
+ dict_str_to_float = {'Z':0,'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7,
|
|
39
|
+ 'h':8,'i':9,'j':10,'k':11,'l':12,'m':13,'n':14,
|
|
40
|
+ 'o':15,'p':16} # dict to transform PB to int,
|
|
41
|
+ # necessary for the pdist function
|
|
42
|
+ dfnum = self.df.replace(dict_str_to_float)
|
|
43
|
+
|
|
44
|
+ return squareform(pdist(dfnum, metric ='jaccard'))
|
|
45
|
+
|
|
46
|
+
|
36
|
47
|
|
37
|
48
|
if __name__ == "__main__":
|
38
|
49
|
if len(sys.argv) != 2:
|