Browse Source

documentation for the kmedoid function from pyclustering

Nicolas Zimmermann 3 years ago
parent
commit
6188386979
1 changed files with 327 additions and 0 deletions
  1. 327 0
      doc/doc_kmedoids.py

+ 327 - 0
doc/doc_kmedoids.py View File

@@ -0,0 +1,327 @@
1
+"""!
2
+
3
+@brief Cluster analysis algorithm: K-Medoids.
4
+@details Implementation based on papers @cite book::algorithms_for_clustering_data, @cite book::finding_groups_in_data.
5
+
6
+@authors Andrei Novikov (pyclustering@yandex.ru)
7
+@date 2014-2019
8
+@copyright GNU Public License
9
+
10
+@cond GNU_PUBLIC_LICENSE
11
+    PyClustering is free software: you can redistribute it and/or modify
12
+    it under the terms of the GNU General Public License as published by
13
+    the Free Software Foundation, either version 3 of the License, or
14
+    (at your option) any later version.
15
+    
16
+    PyClustering is distributed in the hope that it will be useful,
17
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
+    GNU General Public License for more details.
20
+    
21
+    You should have received a copy of the GNU General Public License
22
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
23
+@endcond
24
+
25
+"""
26
+
27
+
28
+import numpy
29
+
30
+from pyclustering.cluster.encoder import type_encoding
31
+
32
+from pyclustering.utils import medoid
33
+from pyclustering.utils.metric import distance_metric, type_metric
34
+
35
+import pyclustering.core.kmedoids_wrapper as wrapper
36
+
37
+from pyclustering.core.wrapper import ccore_library
38
+from pyclustering.core.metric_wrapper import metric_wrapper
39
+
40
+
41
class kmedoids:
    """!
    @brief Class represents clustering algorithm K-Medoids.
    @details The algorithm is less sensitive to outliers than K-Means. The principal difference between K-Medoids and K-Medians is that
             K-Medoids uses existing points from input data space as medoids, but median in K-Medians can be unreal object (not from
             input data space).

    Clustering example:
    @code
        from pyclustering.cluster.kmedoids import kmedoids
        from pyclustering.cluster import cluster_visualizer
        from pyclustering.utils import read_sample
        from pyclustering.samples.definitions import FCPS_SAMPLES

        # Load list of points for cluster analysis.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)

        # Set random initial medoids.
        initial_medoids = [1, 500]

        # Create instance of K-Medoids algorithm.
        kmedoids_instance = kmedoids(sample, initial_medoids)

        # Run cluster analysis and obtain results.
        kmedoids_instance.process()
        clusters = kmedoids_instance.get_clusters()

        # Show allocated clusters.
        print(clusters)

        # Display clusters.
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        visualizer.show()
    @endcode

    Metric for calculation of distance between points can be specified by additional parameter 'metric':
    @code
        # create Minkowski distance metric with degree equals to '2'
        metric = distance_metric(type_metric.MINKOWSKI, degree=2)

        # create K-Medoids algorithm with specific distance metric
        kmedoids_instance = kmedoids(sample, initial_medoids, metric=metric)

        # run cluster analysis and obtain results
        kmedoids_instance.process()
        clusters = kmedoids_instance.get_clusters()
    @endcode

    Distance matrix can be used instead of sequence of points to increase performance and for that purpose parameter 'data_type' should be used:
    @code
        # calculate distance matrix for sample
        sample = read_sample(path_to_sample)
        matrix = calculate_distance_matrix(sample)

        # create K-Medoids algorithm for processing distance matrix instead of points
        kmedoids_instance = kmedoids(matrix, initial_medoids, data_type='distance_matrix')

        # run cluster analysis and obtain results
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()
    @endcode

    """


    def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, **kwargs):
        """!
        @brief Constructor of clustering algorithm K-Medoids.

        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_index_medoids (list): Indexes of initial medoids (indexes of points in input data).
        @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance then algorithm will stop processing.
        @param[in] ccore (bool): If specified then CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
                    CCORE is not used when the metric type is USER_DEFINED or when the library reports that it is not workable.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').

        <b>Keyword Args:</b><br>
            - metric (distance_metric): Metric that is used for distance calculation between two points (by default squared Euclidean).
            - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix'), by default 'points'.
            - itermax (uint): Maximum number of iteration for cluster analysis (by default 200).

        @exception TypeError if 'data_type' is neither 'points' nor 'distance_matrix'.
        @exception ValueError if input data or initial medoids are empty.

        """
        self.__pointer_data = data
        self.__clusters = []
        self.__medoid_indexes = initial_index_medoids
        self.__tolerance = tolerance

        # Build the default metric only when the caller did not provide one.
        metric = kwargs.get('metric')
        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        self.__metric = metric
        self.__data_type = kwargs.get('data_type', 'points')
        self.__itermax = kwargs.get('itermax', 200)

        self.__distance_calculator = self.__create_distance_calculator()

        # A user-defined metric is a Python callable, so the Python implementation is used for it.
        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
        if self.__ccore:
            self.__ccore = ccore_library.workable()

        self.__verify_instance()


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of K-Medoids algorithm.

        @return (kmedoids) Returns itself (K-Medoids instance).

        @remark Results of clustering can be obtained using corresponding get methods.

        @see get_clusters()
        @see get_medoids()

        """

        if self.__ccore is True:
            ccore_metric = metric_wrapper.create_instance(self.__metric)
            self.__clusters, self.__medoid_indexes = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, self.__itermax, ccore_metric.get_pointer(), self.__data_type)

        else:
            changes = float('inf')
            iterations = 0

            while changes > self.__tolerance and iterations < self.__itermax:
                self.__clusters = self.__update_clusters()
                update_medoid_indexes = self.__update_medoids()

                # The largest displacement between an old medoid and its replacement drives the stop condition.
                changes = max(self.__distance_calculator(previous, current)
                              for previous, current in zip(self.__medoid_indexes, update_medoid_indexes))

                self.__medoid_indexes = update_medoid_indexes

                iterations += 1

        return self


    def predict(self, points):
        """!
        @brief Calculates the closest cluster to each point.

        @param[in] points (array_like): Points for which closest clusters are calculated.

        @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
                 collection if 'process()' method was not called.

        An example how to calculate (or predict) the closest cluster to specified points.
        @code
            from pyclustering.cluster.kmedoids import kmedoids
            from pyclustering.samples.definitions import SIMPLE_SAMPLES
            from pyclustering.utils import read_sample

            # Load list of points for cluster analysis.
            sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

            # Initial medoids for sample 'Simple3'.
            initial_medoids = [4, 12, 25, 37]

            # Create instance of K-Medoids algorithm with prepared centers.
            kmedoids_instance = kmedoids(sample, initial_medoids)

            # Run cluster analysis.
            kmedoids_instance.process()

            # Calculate the closest cluster to following two points.
            points = [[0.35, 0.5], [2.5, 2.0]]
            closest_clusters = kmedoids_instance.predict(points)
            print(closest_clusters)
        @endcode

        """

        if len(self.__clusters) == 0:
            return []

        # NOTE(review): medoids are taken as entries of the input data, which looks meaningful only for
        # data_type 'points' - confirm the intended behaviour for 'distance_matrix' input.
        medoids = [self.__pointer_data[index] for index in self.__medoid_indexes]

        # One row per requested point, one column per medoid.
        differences = numpy.zeros((len(points), len(medoids)))
        for index_point, point in enumerate(points):
            differences[index_point] = [self.__metric(point, center) for center in medoids]

        return numpy.argmin(differences, axis=1)


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

        @see process()
        @see get_medoids()

        """

        return self.__clusters


    def get_medoids(self):
        """!
        @brief Returns list of medoids of allocated clusters represented by indexes from the input data.

        @see process()
        @see get_clusters()

        """

        return self.__medoid_indexes


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.

        @return (type_encoding) Clustering result representation.

        @see get_clusters()

        """

        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def __verify_instance(self):
        """!
        @brief Verifies constructor arguments and raises an exception when clustering cannot be performed with them.

        @exception ValueError if input data or initial medoids are empty.

        """
        # 'len(...) == 0' is used instead of truthiness because the data may be a numpy container.
        if len(self.__pointer_data) == 0:
            raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))

        if len(self.__medoid_indexes) == 0:
            raise ValueError("Initial medoids are empty (size: '%d')." % len(self.__medoid_indexes))


    def __create_distance_calculator(self):
        """!
        @brief Creates distance calculator in line with algorithms parameters.

        @return (callable) Distance calculator that takes two indexes of objects from the input data.

        @exception TypeError if data type is neither 'points' nor 'distance_matrix'.

        """
        if self.__data_type == 'points':
            return lambda index1, index2: self.__metric(self.__pointer_data[index1], self.__pointer_data[index2])

        elif self.__data_type == 'distance_matrix':
            # numpy.matrix is read through item() with an (row, column) tuple.
            if isinstance(self.__pointer_data, numpy.matrix):
                return lambda index1, index2: self.__pointer_data.item((index1, index2))

            return lambda index1, index2: self.__pointer_data[index1][index2]

        else:
            raise TypeError("Unknown type of data is specified '%s'" % self.__data_type)


    def __update_clusters(self):
        """!
        @brief Calculate distance to each point from the each cluster.
        @details Nearest points are captured by according clusters and as a result clusters are updated.

        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.

        """

        medoid_indexes = self.__medoid_indexes

        # Membership is tested once per point, so a set avoids scanning the medoid list every time.
        medoid_lookup = set(medoid_indexes)

        # Every cluster starts with its own medoid as the first member.
        clusters = [[medoid_index] for medoid_index in medoid_indexes]

        for index_point in range(len(self.__pointer_data)):
            if index_point in medoid_lookup:
                continue

            # If no distance compares below infinity the index stays -1 and the point goes to the last cluster.
            index_optim = -1
            dist_optim = float('Inf')

            for index, medoid_index in enumerate(medoid_indexes):
                dist = self.__distance_calculator(index_point, medoid_index)

                if dist < dist_optim:
                    index_optim = index
                    dist_optim = dist

            clusters[index_optim].append(index_point)

        return clusters


    def __update_medoids(self):
        """!
        @brief Find medoids of clusters in line with contained objects.

        @return (list) list of medoids for current number of clusters.

        """

        return [medoid(self.__pointer_data, cluster, metric=self.__metric, data_type=self.__data_type)
                for cluster in self.__clusters]