Browse Source

added Conformations.dist_from_dissimilarity function

nicolas-zimmermann 4 years ago
parent
commit
d77136b7f4
1 changed files with 25 additions and 7 deletions
  1. 25 7
      src/projet8.py

+ 25 - 7
src/projet8.py View File

5
 import pbxplore as pbx
5
 import pbxplore as pbx
6
 import sys
6
 import sys
7
 from scipy.spatial.distance import squareform, pdist, jaccard 
7
 from scipy.spatial.distance import squareform, pdist, jaccard 
8
-from pyclustering.cluster.kmedoids import kmedoids # for kmedoids method
8
+from pyclustering.cluster.kmedoids import kmedoids
9
+from pyclustering.cluster import cluster_visualizer
10
+from pyclustering.utils import read_sample
11
+from pyclustering.samples.definitions import FCPS_SAMPLES
12
+
9
 
13
 
10
 class Conformations:
14
 class Conformations:
11
     """
15
     """
20
         df : pd.DataFrame object
24
         df : pd.DataFrame object
21
         Each row of the dataframe is a conformation and each column a position
25
         Each row of the dataframe is a conformation and each column a position
22
         of the sequence.
26
         of the sequence.
23
-        dfnum : to compute dissimilarity with the function pdist, elements of
24
-        the DataFrame must be float
25
         """
27
         """
26
         self.df = pd.DataFrame()
28
         self.df = pd.DataFrame()
27
         for chain_name, chain in pbx.chains_from_files([filename]): 
29
         for chain_name, chain in pbx.chains_from_files([filename]): 
51
         If no substitution matrix is specified, use the matrix from the
53
         If no substitution matrix is specified, use the matrix from the
52
         PBxplore package.
54
         PBxplore package.
53
         """
55
         """
54
-        dissimilarity = np.ndarray((self.df.shape[0],self.df.shape[0])) 
56
+        dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0])) 
55
         if matrix == None:
57
         if matrix == None:
56
             matrix = pd.read_table("data/PBs_substitution_matrix.dat",
58
             matrix = pd.read_table("data/PBs_substitution_matrix.dat",
57
                                    index_col=False, sep ='\t')
59
                                    index_col=False, sep ='\t')
61
         ncol = self.df.shape[1]
63
         ncol = self.df.shape[1]
62
         nrow = self.df.shape[0]
64
         nrow = self.df.shape[0]
63
         it1 = self.df.iterrows()
65
         it1 = self.df.iterrows()
64
-        for i in range(1,nrow): 
66
+        for i in range(1,nrow): # compute each pairwise distance once only
65
             for j in range(i): 
67
             for j in range(i): 
66
                 for k in range(ncol): 
68
                 for k in range(ncol): 
67
                     dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
69
                     dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
68
-        dissimilarity = dissimilarity + dissimilarity.T
69
-
70
+        dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
71
+        
70
         return dissimilarity
72
         return dissimilarity
71
 
73
 
74
+    def dist_from_dissimilarity(diss_matrix):
75
+        """
76
+        Using the substitution matrix from the PBxplore package, the obtained
77
+        dissimilarity matrix has both positive and negative values. Low values
78
+        represent strong differences, since identical PB substitutions have high
79
+        positive values.
80
+        This function returns a distance matrix from the dissimilarity matrix.
81
+        
82
+        Arguments :
83
+            - diss_matrix : ndarray obtained from Conformations.dissimilarity
84
+        """
85
+        diss_matrix = -diss_matrix
86
+        diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
87
+        dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1))
88
+        
89
+        return dist
72
 
90
 
73
 if __name__ == "__main__":
91
 if __name__ == "__main__":
74
     if len(sys.argv) != 2:
92
     if len(sys.argv) != 2: