Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Implémente la classe @see cl ConstraintKMeans.
5"""
6import numpy
7from scipy.spatial import Delaunay # pylint: disable=E0611
8from sklearn.cluster import KMeans
9from sklearn.metrics.pairwise import euclidean_distances
10from ._kmeans_constraint_ import constraint_kmeans, constraint_predictions
13class ConstraintKMeans(KMeans):
14 """
15 Defines a constraint :epkg:`k-means`.
16 Clusters are modified to have an equal size.
17 The algorithm is initialized with a regular :epkg:`KMeans`
18 and continues with a modified version of it.
20 Computing the predictions offer a choice.
21 The first one is to keep the predictions
22 from the regular :epkg:`k-means`
23 algorithm but with the balanced clusters.
24 The second is to compute balanced predictions
25 over the test set. That implies that the predictions
26 for the same observations might change depending
27 on the set it belongs to.
29 The parameter *strategy* determines how
30 obseervations should be assigned to a cluster.
31 The value can be:
33 * ``'distance'``: observations are ranked by distance to a cluster,
34 the algorithm assigns first point to the closest center unless it reached
35 the maximum size, it deals first with the further point and maps it to
36 the closest center
37 * ``'gain'``: follows the algorithm described at
38 see `Same-size k-Means Variation
39 <https://elki-project.github.io/tutorial/same-size_k_means>`_,
40 * ``'weights'``: estimates weights attached to each cluster,
41 it weights the distance to each cluster in order
42 to balance the number of points mapped to every cluster,
43 the strategy uses a learning rate.
45 The first two strategies cannot reach a good compromise
46 without using function @see fn _switch_clusters which
47 tries every switch between clusters: two points
48 change clusters. It keeps the number of points and checks
49 that the inertia is reduced.
50 """
52 _strategy_value = {'distance', 'gain', 'weights'}
54 def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=500,
55 tol=0.0001, verbose=0,
56 random_state=None, copy_x=True, algorithm='auto',
57 balanced_predictions=False, strategy='gain', kmeans0=True,
58 learning_rate=1., history=False):
59 """
60 @param n_clusters number of clusters
61 @param init used by :epkg:`k-means`
62 @param n_init used by :epkg:`k-means`
63 @param max_iter used by :epkg:`k-means`
64 @param tol used by :epkg:`k-means`
65 @param verbose used by :epkg:`k-means`
66 @param random_state used by :epkg:`k-means`
67 @param copy_x used by :epkg:`k-means`
68 @param algorithm used by :epkg:`k-means`
69 @param balanced_predictions produced balanced prediction
70 or the regular ones
71 @param strategy strategy or algorithm used to abide
72 by the constraint
73 @param kmeans0 if True, applies *k-means* algorithm first
74 @param history keeps centers accress iterations
75 @param learning_rate learning rate, used by strategy `'weights'`
76 """
77 self._n_threads = 1
78 KMeans.__init__(self, n_clusters=n_clusters, init=init, n_init=n_init,
79 max_iter=max_iter, tol=tol,
80 verbose=verbose, random_state=random_state, copy_x=copy_x,
81 algorithm=algorithm)
82 self.balanced_predictions = balanced_predictions
83 self.strategy = strategy
84 self.kmeans0 = kmeans0
85 self.history = history
86 self.learning_rate = learning_rate
87 if strategy not in ConstraintKMeans._strategy_value:
88 raise ValueError('strategy must be in {0}'.format(
89 ConstraintKMeans._strategy_value))
91 def fit(self, X, y=None, sample_weight=None, fLOG=None):
92 """
93 Compute k-means clustering.
95 :param X: array-like or sparse matrix, shape=(n_samples, n_features)
96 Training instances to cluster. It must be noted that the data
97 will be converted to C ordering, which will cause a memory
98 copy if the given data is not C-contiguous.
99 :param y: Ignored
100 :param sample_weight: sample weight
101 :param fLOG: logging function
102 """
103 max_iter = self.max_iter
104 self.max_iter //= 2
105 if self.kmeans0:
106 KMeans.fit(self, X, y, sample_weight=sample_weight)
107 state = None
108 else:
109 state = numpy.random.RandomState( # pylint: disable=E1101
110 self.random_state)
111 labels = state.randint(
112 0, self.n_clusters, X.shape[0], dtype=numpy.int32)
113 centers = numpy.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
114 choice = state.randint(0, self.n_clusters, self.n_clusters)
115 for i, c in enumerate(choice):
116 centers[i, :] = X[c, :]
117 self.labels_ = labels
118 self.cluster_centers_ = centers
119 self.inertia_ = float(X.shape[0])
120 self.n_iter_ = 0
122 self.max_iter = max_iter
123 return self.constraint_kmeans(
124 X, sample_weight=sample_weight, state=state,
125 learning_rate=self.learning_rate,
126 history=self.history, fLOG=fLOG)
128 def constraint_kmeans(self, X, sample_weight=None, state=None,
129 learning_rate=1., history=False, fLOG=None):
130 """
131 Completes the constraint k-means.
133 @param X features
134 @param sample_weight sample weight
135 @param state state
136 @param history keeps evolution of centers
137 @param fLOG logging function
138 """
139 labels, centers, inertia, weights, iter_, all_centers = constraint_kmeans(
140 X, self.labels_, sample_weight, self.cluster_centers_,
141 inertia=self.inertia_, iter=self.n_iter_,
142 max_iter=self.max_iter, verbose=self.verbose,
143 strategy=self.strategy, state=state,
144 learning_rate=learning_rate, history=history,
145 fLOG=fLOG)
146 self.labels_ = labels
147 self.cluster_centers_ = centers
148 self.cluster_centers_iter_ = (
149 None if len(all_centers) == 0 else numpy.dstack(all_centers))
150 self.inertia_ = inertia
151 self.n_iter_ = iter_
152 self.weights_ = weights
153 return self
155 def predict(self, X, sample_weight=None):
156 """
157 Computes the predictions.
159 @param X features.
160 @return prediction
161 """
162 if self.weights_ is None:
163 if self.balanced_predictions:
164 labels, _, __ = constraint_predictions(
165 X, self.cluster_centers_, strategy=self.strategy + '_p')
166 return labels
167 return KMeans.predict(self, X, sample_weight=sample_weight)
168 else:
169 if self.balanced_predictions:
170 raise RuntimeError( # pragma: no cover
171 "balanced_predictions and weights_ cannot be used together.")
172 return KMeans.predict(self, X, sample_weight=sample_weight)
174 def transform(self, X):
175 """
176 Computes the predictions.
178 @param X features.
179 @return prediction
180 """
181 if self.weights_ is None:
182 if self.balanced_predictions:
183 labels, distances, __ = constraint_predictions(
184 X, self.cluster_centers_, strategy=self.strategy)
185 # We remove small distances than the chosen clusters
186 # due to the constraint, we choose max*2 instead.
187 mx = distances.max() * 2
188 for i, l in enumerate(labels):
189 mi = distances[i, l]
190 mmi = distances[i, :].min()
191 if mi > mmi:
192 # numpy.nan would be best
193 distances[i, distances[i, :] < mi] = mx
194 return distances
195 return KMeans.transform(self, X)
196 else:
197 if self.balanced_predictions:
198 raise RuntimeError( # pragma: no cover
199 "balanced_predictions and weights_ cannot be used together.")
200 res = KMeans.transform(self, X)
201 res *= self.weights_.reshape((1, -1))
202 return res
204 def score(self, X, y=None, sample_weight=None):
205 """
206 Returns the distances to all clusters.
208 @param X features
209 @param y unused
210 @param sample_weight sample weight
211 @return distances
212 """
213 if self.weights_ is None:
214 if self.balanced_predictions:
215 _, __, dist_close = constraint_predictions(
216 X, self.cluster_centers_, strategy=self.strategy)
217 return dist_close
218 res = euclidean_distances(self.cluster_centers_, X, squared=True)
219 else:
220 if self.balanced_predictions:
221 raise RuntimeError( # pragma: no cover
222 "balanced_predictions and weights_ cannot be used together.")
223 res = euclidean_distances(X, self.cluster_centers_, squared=True)
224 res *= self.weights_.reshape((1, -1))
225 return res.max(axis=1)
227 def cluster_edges(self):
228 """
229 Computes edges between clusters based on a
230 `Delaunay <https://docs.scipy.org/doc/scipy/reference/
231 generated/scipy.spatial.Delaunay.html>`_
232 graph.
233 """
234 tri = Delaunay(self.cluster_centers_)
235 triangles = tri.simplices # pylint: disable=E1101
236 edges = set()
237 for row in triangles:
238 for j in range(1, row.shape[-1]):
239 a, b = row[j - 1:j + 1]
240 if a < b:
241 edges.add((a, b))
242 else:
243 edges.add((b, a))
244 a, b = row[0], row[-1]
245 if a < b:
246 edges.add((a, b))
247 else:
248 edges.add((b, a))
249 return edges