|
@@ -0,0 +1,327 @@
|
|
1
|
+"""!
|
|
2
|
+
|
|
3
|
+@brief Cluster analysis algorithm: K-Medoids.
|
|
4
|
+@details Implementation based on papers @cite book::algorithms_for_clustering_data, @cite book::finding_groups_in_data.
|
|
5
|
+
|
|
6
|
+@authors Andrei Novikov (pyclustering@yandex.ru)
|
|
7
|
+@date 2014-2019
|
|
8
|
+@copyright GNU Public License
|
|
9
|
+
|
|
10
|
+@cond GNU_PUBLIC_LICENSE
|
|
11
|
+ PyClustering is free software: you can redistribute it and/or modify
|
|
12
|
+ it under the terms of the GNU General Public License as published by
|
|
13
|
+ the Free Software Foundation, either version 3 of the License, or
|
|
14
|
+ (at your option) any later version.
|
|
15
|
+
|
|
16
|
+ PyClustering is distributed in the hope that it will be useful,
|
|
17
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19
|
+ GNU General Public License for more details.
|
|
20
|
+
|
|
21
|
+ You should have received a copy of the GNU General Public License
|
|
22
|
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
23
|
+@endcond
|
|
24
|
+
|
|
25
|
+"""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+import numpy
|
|
29
|
+
|
|
30
|
+from pyclustering.cluster.encoder import type_encoding
|
|
31
|
+
|
|
32
|
+from pyclustering.utils import medoid
|
|
33
|
+from pyclustering.utils.metric import distance_metric, type_metric
|
|
34
|
+
|
|
35
|
+import pyclustering.core.kmedoids_wrapper as wrapper
|
|
36
|
+
|
|
37
|
+from pyclustering.core.wrapper import ccore_library
|
|
38
|
+from pyclustering.core.metric_wrapper import metric_wrapper
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+class kmedoids:
|
|
42
|
+ """!
|
|
43
|
+ @brief Class represents clustering algorithm K-Medoids.
|
|
44
|
+ @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
|
|
45
|
+ K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
|
|
46
|
+ input data space).
|
|
47
|
+
|
|
48
|
+ Clustering example:
|
|
49
|
+ @code
|
|
50
|
+ from pyclustering.cluster.kmedoids import kmedoids
|
|
51
|
+ from pyclustering.cluster import cluster_visualizer
|
|
52
|
+ from pyclustering.utils import read_sample
|
|
53
|
+ from pyclustering.samples.definitions import FCPS_SAMPLES
|
|
54
|
+
|
|
55
|
+ # Load list of points for cluster analysis.
|
|
56
|
+ sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
|
|
57
|
+
|
|
58
|
+ # Set random initial medoids.
|
|
59
|
+ initial_medoids = [1, 500]
|
|
60
|
+
|
|
61
|
+ # Create instance of K-Medoids algorithm.
|
|
62
|
+ kmedoids_instance = kmedoids(sample, initial_medoids)
|
|
63
|
+
|
|
64
|
+ # Run cluster analysis and obtain results.
|
|
65
|
+ kmedoids_instance.process()
|
|
66
|
+ clusters = kmedoids_instance.get_clusters()
|
|
67
|
+
|
|
68
|
+ # Show allocated clusters.
|
|
69
|
+ print(clusters)
|
|
70
|
+
|
|
71
|
+ # Display clusters.
|
|
72
|
+ visualizer = cluster_visualizer()
|
|
73
|
+ visualizer.append_clusters(clusters, sample)
|
|
74
|
+ visualizer.show()
|
|
75
|
+ @endcode
|
|
76
|
+
|
|
77
|
+ Metric for calculation distance between points can be specified by parameter additional 'metric':
|
|
78
|
+ @code
|
|
79
|
+ # create Minkowski distance metric with degree equals to '2'
|
|
80
|
+ metric = distance_metric(type_metric.MINKOWSKI, degree=2)
|
|
81
|
+
|
|
82
|
+ # create K-Medoids algorithm with specific distance metric
|
|
83
|
+ kmedoids_instance = kmedoids(sample, initial_medoids, metric=metric)
|
|
84
|
+
|
|
85
|
+ # run cluster analysis and obtain results
|
|
86
|
+ kmedoids_instance.process()
|
|
87
|
+ clusters = kmedoids_instance.get_clusters()
|
|
88
|
+ @endcode
|
|
89
|
+
|
|
90
|
+ Distance matrix can be used instead of sequence of points to increase performance and for that purpose parameter 'data_type' should be used:
|
|
91
|
+ @code
|
|
92
|
+ # calculate distance matrix for sample
|
|
93
|
+ sample = read_sample(path_to_sample)
|
|
94
|
+ matrix = calculate_distance_matrix(sample)
|
|
95
|
+
|
|
96
|
+ # create K-Medoids algorithm for processing distance matrix instead of points
|
|
97
|
+ kmedoids_instance = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
|
|
98
|
+
|
|
99
|
+ # run cluster analysis and obtain results
|
|
100
|
+ kmedoids_instance.process()
|
|
101
|
+
|
|
102
|
+ clusters = kmedoids_instance.get_clusters()
|
|
103
|
+ medoids = kmedoids_instance.get_medoids()
|
|
104
|
+ @endcode
|
|
105
|
+
|
|
106
|
+ """
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+ def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, **kwargs):
|
|
110
|
+ """!
|
|
111
|
+ @brief Constructor of clustering algorithm K-Medoids.
|
|
112
|
+
|
|
113
|
+ @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
|
|
114
|
+ @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
|
|
115
|
+ @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
|
|
116
|
+ @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
|
|
117
|
+ @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').
|
|
118
|
+
|
|
119
|
+ <b>Keyword Args:</b><br>
|
|
120
|
+ - metric (distance_metric): Metric that is used for distance calculation between two points.
|
|
121
|
+ - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
|
|
122
|
+ - itermax (uint): Maximum number of iteration for cluster analysis.
|
|
123
|
+
|
|
124
|
+ """
|
|
125
|
+ self.__pointer_data = data
|
|
126
|
+ self.__clusters = []
|
|
127
|
+ self.__medoid_indexes = initial_index_medoids
|
|
128
|
+ self.__tolerance = tolerance
|
|
129
|
+
|
|
130
|
+ self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
|
|
131
|
+ self.__data_type = kwargs.get('data_type', 'points')
|
|
132
|
+ self.__itermax = kwargs.get('itermax', 200)
|
|
133
|
+
|
|
134
|
+ self.__distance_calculator = self.__create_distance_calculator()
|
|
135
|
+
|
|
136
|
+ self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
|
|
137
|
+ if self.__ccore:
|
|
138
|
+ self.__ccore = ccore_library.workable()
|
|
139
|
+
|
|
140
|
+ #self.__verify_instance()
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+ def process(self):
|
|
144
|
+ """!
|
|
145
|
+ @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
|
|
146
|
+
|
|
147
|
+ @return (kmedoids) Returns itself (K-Medoids instance).
|
|
148
|
+
|
|
149
|
+ @remark Results of clustering can be obtained using corresponding get methods.
|
|
150
|
+
|
|
151
|
+ @see get_clusters()
|
|
152
|
+ @see get_medoids()
|
|
153
|
+
|
|
154
|
+ """
|
|
155
|
+
|
|
156
|
+ if self.__ccore is True:
|
|
157
|
+ ccore_metric = metric_wrapper.create_instance(self.__metric)
|
|
158
|
+ self.__clusters, self.__medoid_indexes = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, self.__itermax, ccore_metric.get_pointer(), self.__data_type)
|
|
159
|
+
|
|
160
|
+ else:
|
|
161
|
+ changes = float('inf')
|
|
162
|
+ iterations = 0
|
|
163
|
+
|
|
164
|
+ while changes > self.__tolerance and iterations < self.__itermax:
|
|
165
|
+ self.__clusters = self.__update_clusters()
|
|
166
|
+ update_medoid_indexes = self.__update_medoids()
|
|
167
|
+
|
|
168
|
+ changes = max([self.__distance_calculator(self.__medoid_indexes[index], update_medoid_indexes[index]) for index in range(len(update_medoid_indexes))])
|
|
169
|
+
|
|
170
|
+ self.__medoid_indexes = update_medoid_indexes
|
|
171
|
+
|
|
172
|
+ iterations += 1
|
|
173
|
+
|
|
174
|
+ return self
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+ def predict(self, points):
|
|
178
|
+ """!
|
|
179
|
+ @brief Calculates the closest cluster to each point.
|
|
180
|
+
|
|
181
|
+ @param[in] points (array_like): Points for which closest clusters are calculated.
|
|
182
|
+
|
|
183
|
+ @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
|
|
184
|
+ collection if 'process()' method was not called.
|
|
185
|
+
|
|
186
|
+ An example how to calculate (or predict) the closest cluster to specified points.
|
|
187
|
+ @code
|
|
188
|
+ from pyclustering.cluster.kmedoids import kmedoids
|
|
189
|
+ from pyclustering.samples.definitions import SIMPLE_SAMPLES
|
|
190
|
+ from pyclustering.utils import read_sample
|
|
191
|
+
|
|
192
|
+ # Load list of points for cluster analysis.
|
|
193
|
+ sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
|
|
194
|
+
|
|
195
|
+ # Initial medoids for sample 'Simple3'.
|
|
196
|
+ initial_medoids = [4, 12, 25, 37]
|
|
197
|
+
|
|
198
|
+ # Create instance of K-Medoids algorithm with prepared centers.
|
|
199
|
+ kmedoids_instance = kmedoids(sample, initial_medoids)
|
|
200
|
+
|
|
201
|
+ # Run cluster analysis.
|
|
202
|
+ kmedoids_instance.process()
|
|
203
|
+
|
|
204
|
+ # Calculate the closest cluster to following two points.
|
|
205
|
+ points = [[0.35, 0.5], [2.5, 2.0]]
|
|
206
|
+ closest_clusters = kmedoids_instance.predict(points)
|
|
207
|
+ print(closest_clusters)
|
|
208
|
+ @endcode
|
|
209
|
+
|
|
210
|
+ """
|
|
211
|
+
|
|
212
|
+ if len(self.__clusters) == 0:
|
|
213
|
+ return []
|
|
214
|
+
|
|
215
|
+ medoids = [ self.__pointer_data[index] for index in self.__medoid_indexes ]
|
|
216
|
+ differences = numpy.zeros((len(points), len(medoids)))
|
|
217
|
+ for index_point in range(len(points)):
|
|
218
|
+ differences[index_point] = [ self.__metric(points[index_point], center) for center in medoids ]
|
|
219
|
+
|
|
220
|
+ return numpy.argmin(differences, axis=1)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+ def get_clusters(self):
|
|
224
|
+ """!
|
|
225
|
+ @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
|
|
226
|
+
|
|
227
|
+ @see process()
|
|
228
|
+ @see get_medoids()
|
|
229
|
+
|
|
230
|
+ """
|
|
231
|
+
|
|
232
|
+ return self.__clusters
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+ def get_medoids(self):
|
|
236
|
+ """!
|
|
237
|
+ @brief Returns list of medoids of allocated clusters represented by indexes from the input data.
|
|
238
|
+
|
|
239
|
+ @see process()
|
|
240
|
+ @see get_clusters()
|
|
241
|
+
|
|
242
|
+ """
|
|
243
|
+
|
|
244
|
+ return self.__medoid_indexes
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+ def get_cluster_encoding(self):
|
|
248
|
+ """!
|
|
249
|
+ @brief Returns clustering result representation type that indicate how clusters are encoded.
|
|
250
|
+
|
|
251
|
+ @return (type_encoding) Clustering result representation.
|
|
252
|
+
|
|
253
|
+ @see get_clusters()
|
|
254
|
+
|
|
255
|
+ """
|
|
256
|
+
|
|
257
|
+ return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+ def __verify_instance(self):
|
|
261
|
+ pass
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+ def __create_distance_calculator(self):
|
|
265
|
+ """!
|
|
266
|
+ @brief Creates distance calculator in line with algorithms parameters.
|
|
267
|
+
|
|
268
|
+ @return (callable) Distance calculator.
|
|
269
|
+
|
|
270
|
+ """
|
|
271
|
+ if self.__data_type == 'points':
|
|
272
|
+ return lambda index1, index2: self.__metric(self.__pointer_data[index1], self.__pointer_data[index2])
|
|
273
|
+
|
|
274
|
+ elif self.__data_type == 'distance_matrix':
|
|
275
|
+ if isinstance(self.__pointer_data, numpy.matrix):
|
|
276
|
+ return lambda index1, index2: self.__pointer_data.item((index1, index2))
|
|
277
|
+
|
|
278
|
+ return lambda index1, index2: self.__pointer_data[index1][index2]
|
|
279
|
+
|
|
280
|
+ else:
|
|
281
|
+ raise TypeError("Unknown type of data is specified '%s'" % self.__data_type)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+ def __update_clusters(self):
|
|
285
|
+ """!
|
|
286
|
+ @brief Calculate distance to each point from the each cluster.
|
|
287
|
+ @details Nearest points are captured by according clusters and as a result clusters are updated.
|
|
288
|
+
|
|
289
|
+ @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
|
|
290
|
+
|
|
291
|
+ """
|
|
292
|
+
|
|
293
|
+ clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoid_indexes))]
|
|
294
|
+ for index_point in range(len(self.__pointer_data)):
|
|
295
|
+ if index_point in self.__medoid_indexes:
|
|
296
|
+ continue
|
|
297
|
+
|
|
298
|
+ index_optim = -1
|
|
299
|
+ dist_optim = float('Inf')
|
|
300
|
+
|
|
301
|
+ for index in range(len(self.__medoid_indexes)):
|
|
302
|
+ dist = self.__distance_calculator(index_point, self.__medoid_indexes[index])
|
|
303
|
+
|
|
304
|
+ if dist < dist_optim:
|
|
305
|
+ index_optim = index
|
|
306
|
+ dist_optim = dist
|
|
307
|
+
|
|
308
|
+ clusters[index_optim].append(index_point)
|
|
309
|
+
|
|
310
|
+ return clusters
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+ def __update_medoids(self):
|
|
314
|
+ """!
|
|
315
|
+ @brief Find medoids of clusters in line with contained objects.
|
|
316
|
+
|
|
317
|
+ @return (list) list of medoids for current number of clusters.
|
|
318
|
+
|
|
319
|
+ """
|
|
320
|
+
|
|
321
|
+ medoid_indexes = [-1] * len(self.__clusters)
|
|
322
|
+
|
|
323
|
+ for index in range(len(self.__clusters)):
|
|
324
|
+ medoid_index = medoid(self.__pointer_data, self.__clusters[index], metric=self.__metric, data_type=self.__data_type)
|
|
325
|
+ medoid_indexes[index] = medoid_index
|
|
326
|
+
|
|
327
|
+ return medoid_indexes
|