il y a 6 ans · d77136b7f4
--- a/src/projet8.py
+++ b/src/projet8.py
 
															 import pbxplore as pbx
														
 
															 import sys
														
 
															 from scipy.spatial.distance import squareform, pdist, jaccard 
														
 
															-from pyclustering.cluster.kmedoids import kmedoids # for kmedoids method
														
 
															+from pyclustering.cluster.kmedoids import kmedoids
														
 
															+from pyclustering.cluster import cluster_visualizer
														
 
															+from pyclustering.utils import read_sample
														
 
															+from pyclustering.samples.definitions import FCPS_SAMPLES
														
 
															+
														
 
															 class Conformations:
														
 
															     """
														
 
															         df : pd.DataFrame object
														
 
															         Each row of the dataframe is a conformation and each column a position
														
 
															         of the sequence.
														
 
															-        dfnum : to compute dissimilarity with the function pdist, elements of
														
 
															-        the DataFrame must be float
														
 
															         """
														
 
															         self.df = pd.DataFrame()
														
 
															         for chain_name, chain in pbx.chains_from_files([filename]): 
														
 
															         If no substitution matrix is specified, use the matrix from the
														
 
															         PBxplore package.
														
 
															         """
														
 
															-        dissimilarity = np.ndarray((self.df.shape[0],self.df.shape[0])) 
														
 
															+        dissimilarity = np.zeros((self.df.shape[0],self.df.shape[0])) 
														
 
															         if matrix == None:
														
 
															             matrix = pd.read_table("data/PBs_substitution_matrix.dat",
														
 
															                                    index_col=False, sep ='\t')
														
 
															         ncol = self.df.shape[1]
														
 
															         nrow = self.df.shape[0]
														
 
															         it1 = self.df.iterrows()
														
 
															-        for i in range(1,nrow): 
														
 
															+        for i in range(1,nrow): # compute each pairwise distance once only
														
 
															             for j in range(i): 
														
 
															                 for k in range(ncol): 
														
 
															                     dissimilarity[i][j] += matrix[confs.df.loc[i,k]][confs.df.loc[j,k]]
														
 
															-        dissimilarity = dissimilarity + dissimilarity.T
														
 
															-
														
 
															+        dissimilarity = dissimilarity + dissimilarity.T # fills the whole matrix
														
 
															+        
														
 
															         return dissimilarity
														
 
															+    def dist_from_dissimilarity(diss_matrix):
														
 
															+        """
														
 
															+        Using the substitution matrix from the PBxplore package, the obtained
														
 
															+        dissimilarity matrix has both positive and negative values. Low value
														
 
															+        represent strong differences as identical PB substitution are high
														
 
															+        positive value.
														
 
															+        This function returns a distance matrix from the dissimilarity matrix.
														
 
															+        
														
 
															+        Arguments :
														
 
															+            - diss_matrix : ndarray obtained from Configurations.dissimilarity
														
 
															+        """
														
 
															+        diss_matrix = -diss_matrix
														
 
															+        diss_matrix = (diss_matrix - np.min(diss_matrix))/np.ptp(diss_matrix)
														
 
															+        dist = diss_matrix * abs((np.identity(confs.df.shape[0])-1))
														
 
															+        
														
 
															+        return dist
														
 
															 if __name__ == "__main__":
														
 
															     if len(sys.argv) != 2: