Browse Source

added Conformations.dist_from_dissimilarity function

nicolas-zimmermann 4 years ago
parent
commit
d77136b7f4
1 changed file with 25 additions and 7 deletions
  1. 25 7
      src/projet8.py

+ 25 - 7
src/projet8.py View File

@@ -5,7 +5,11 @@ import numpy as np
5 5
 import pbxplore as pbx
6 6
 import sys
7 7
 from scipy.spatial.distance import squareform, pdist, jaccard 
8
-from pyclustering.cluster.kmedoids import kmedoids # for kmedoids method
8
+from pyclustering.cluster.kmedoids import kmedoids
9
+from pyclustering.cluster import cluster_visualizer
10
+from pyclustering.utils import read_sample
11
+from pyclustering.samples.definitions import FCPS_SAMPLES
12
+
9 13
 
10 14
 class Conformations:
11 15
     """
@@ -20,8 +24,6 @@ class Conformations:
20 24
         df : pd.DataFrame object
21 25
         Each row of the dataframe is a conformation and each column a position
22 26
         of the sequence.
23
-        dfnum : to compute dissimilarity with the function pdist, elements of
24
-        the DataFrame must be float
25 27
         """
26 28
         self.df = pd.DataFrame()
27 29
         for chain_name, chain in pbx.chains_from_files([filename]): 
@@ -51,7 +53,7 @@ class Conformations:
51 53
         If no substitution matrix is specified, use the matrix from the
52 54
         PBxplore package.
53 55
         """
54
-        dissimilarity = np.ndarray((self.df.shape[0],self.df.shape[0])) 
56
+        dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0])) 
55 57
         if matrix == None:
56 58
             matrix = pd.read_table("data/PBs_substitution_matrix.dat",
57 59
                                    index_col=False, sep ='\t')
@@ -61,14 +63,30 @@ class Conformations:
61 63
         ncol = self.df.shape[1]
62 64
         nrow = self.df.shape[0]
63 65
         it1 = self.df.iterrows()
64
-        for i in range(1,nrow): 
66
+        for i in range(1,nrow): # compute each pairwise distance once only
65 67
             for j in range(i): 
66 68
                 for k in range(ncol): 
67 69
                     dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
68
-        dissimilarity = dissimilarity + dissimilarity.T
69
-
70
+        dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
71
+        
70 72
         return dissimilarity
71 73
 
74
+    def dist_from_dissimilarity(diss_matrix):
75
+        """
76
+        Using the substitution matrix from the PBxplore package, the obtained
77
+        dissimilarity matrix has both positive and negative values. Low values
78
+        represent strong differences, since identical PB substitutions have high
79
+        positive values.
80
+        This function negates the matrix, rescales it to [0, 1] and zeroes the diagonal.
81
+        
82
+        Arguments :
83
+            - diss_matrix : ndarray obtained from Conformations.dissimilarity
84
+        """
85
+        diss_matrix = -diss_matrix
86
+        diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
87
+        dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1))
88
+        
89
+        return dist
72 90
 
73 91
 if __name__ == "__main__":
74 92
     if len(sys.argv) != 2: