|
@@ -5,7 +5,11 @@ import numpy as np
|
5
|
5
|
import pbxplore as pbx
|
6
|
6
|
import sys
|
7
|
7
|
from scipy.spatial.distance import squareform, pdist, jaccard
|
8
|
|
-from pyclustering.cluster.kmedoids import kmedoids # for kmedoids method
|
|
8
|
+from pyclustering.cluster.kmedoids import kmedoids
|
|
9
|
+from pyclustering.cluster import cluster_visualizer
|
|
10
|
+from pyclustering.utils import read_sample
|
|
11
|
+from pyclustering.samples.definitions import FCPS_SAMPLES
|
|
12
|
+
|
9
|
13
|
|
10
|
14
|
class Conformations:
|
11
|
15
|
"""
|
|
@@ -20,8 +24,6 @@ class Conformations:
|
20
|
24
|
df : pd.DataFrame object
|
21
|
25
|
Each row of the dataframe is a conformation and each column a position
|
22
|
26
|
of the sequence.
|
23
|
|
- dfnum : to compute dissimilarity with the function pdist, elements of
|
24
|
|
- the DataFrame must be float
|
25
|
27
|
"""
|
26
|
28
|
self.df = pd.DataFrame()
|
27
|
29
|
for chain_name, chain in pbx.chains_from_files([filename]):
|
|
@@ -51,7 +53,7 @@ class Conformations:
|
51
|
53
|
If no substitution matrix is specified, use the matrix from the
|
52
|
54
|
PBxplore package.
|
53
|
55
|
"""
|
54
|
|
- dissimilarity = np.ndarray((self.df.shape[0],self.df.shape[0]))
|
|
56
|
+ dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0]))
|
55
|
57
|
if matrix == None:
|
56
|
58
|
matrix = pd.read_table("data/PBs_substitution_matrix.dat",
|
57
|
59
|
index_col=False, sep ='\t')
|
|
@@ -61,14 +63,30 @@ class Conformations:
|
61
|
63
|
ncol = self.df.shape[1]
|
62
|
64
|
nrow = self.df.shape[0]
|
63
|
65
|
it1 = self.df.iterrows()
|
64
|
|
- for i in range(1,nrow):
|
|
66
|
+ for i in range(1,nrow): # compute each pairwise distance once only
|
65
|
67
|
for j in range(i):
|
66
|
68
|
for k in range(ncol):
|
67
|
69
|
dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
|
68
|
|
- dissimilarity = dissimilarity + dissimilarity.T
|
69
|
|
-
|
|
70
|
+ dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
|
|
71
|
+
|
70
|
72
|
return dissimilarity
|
71
|
73
|
|
|
74
|
def dist_from_dissimilarity(diss_matrix):
    """
    Convert a substitution-score matrix into a normalised distance matrix.

    With the substitution matrix from the PBxplore package, the matrix
    returned by Conformations.dissimilarity holds both positive and negative
    values: low values represent strong differences, because identical PB
    substitutions score as high positive values.  The scores are therefore
    negated, rescaled linearly to the [0, 1] range and the diagonal is set
    to zero so that every conformation is at distance 0 from itself.

    Arguments :
        - diss_matrix : square 2-D ndarray obtained from
          Conformations.dissimilarity.  It is not modified.

    Returns :
        - dist : float ndarray with the same shape as diss_matrix, values in
          [0, 1] and a zero diagonal.

    Raises :
        - ValueError if diss_matrix is not a square 2-D array.
    """
    scores = np.asarray(diss_matrix, dtype=float)
    if scores.ndim != 2 or scores.shape[0] != scores.shape[1]:
        raise ValueError("diss_matrix must be a square 2-D array")
    if scores.size == 0:
        # Nothing to rescale; min/max are undefined on an empty array.
        return scores.copy()

    # High score means "similar", so flip the sign to get "large = far".
    flipped = -scores
    lowest = flipped.min()
    spread = flipped.max() - lowest
    if spread == 0:
        # Every entry is identical: dividing by the range would give NaN,
        # and no pair is further apart than any other.
        return np.zeros_like(flipped)

    dist = (flipped - lowest) / spread
    # The size comes from the matrix itself (not from a global object), so
    # the function works on any dissimilarity matrix.
    np.fill_diagonal(dist, 0.0)
    return dist
|
72
|
90
|
|
73
|
91
|
if __name__ == "__main__":
|
74
|
92
|
if len(sys.argv) != 2:
|