Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Implémente la classe @see cl ConstraintKMeans.
5"""
6import bisect
7from collections import Counter
8from pandas import DataFrame
9import numpy
10import scipy.sparse
11from scipy.spatial.distance import cdist
12from sklearn.metrics.pairwise import euclidean_distances
13from sklearn.utils.extmath import row_norms
14from ._kmeans_022 import (
15 _centers_dense, _centers_sparse, _labels_inertia_skl)
18def linearize_matrix(mat, *adds):
19 """
20 Linearizes a matrix into a new one
21 with 3 columns value, row, column.
22 The output format is similar to
23 :epkg:`csr_matrix` but null values are kept.
25 @param mat matrix
26 @param adds additional square matrices
27 @return new matrix
29 *adds* defines additional matrices, it adds
30 columns on the right side and fill them with
31 the corresponding value taken into the additional
32 matrices.
33 """
34 if scipy.sparse.issparse(mat):
35 if isinstance(mat, scipy.sparse.csr_matrix):
36 max_row = mat.shape[0]
37 res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)
38 row = 0
39 for i, v in enumerate(mat.data):
40 while row < max_row and i >= mat.indptr[row]:
41 row += 1
42 res[i, 0] = v
43 a, b = row - 1, mat.indices[i]
44 res[i, 1] = a
45 res[i, 2] = b
46 for k, am in enumerate(adds):
47 res[i, k + 3] = am[a, b]
48 return res
49 raise NotImplementedError( # pragma: no cover
50 "This kind of sparse matrix is not handled: {0}".format(type(mat)))
51 else:
52 n = mat.shape[0]
53 c = mat.shape[1]
54 ic = numpy.arange(mat.shape[1])
55 res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)
56 for i in range(0, n):
57 a = i * c
58 b = (i + 1) * c
59 res[a:b, 1] = i
60 res[a:b, 2] = ic
61 res[:, 0] = mat.ravel()
62 for k, am in enumerate(adds):
63 res[:, 3 + k] = am.ravel()
64 return res
67def constraint_kmeans(X, labels, sample_weight, centers, inertia,
68 iter, max_iter, # pylint: disable=W0622
69 strategy='gain', verbose=0, state=None,
70 learning_rate=1., history=False, fLOG=None):
71 """
72 Completes the constraint :epkg:`k-means`.
74 @param X features
75 @param labels initialized labels (unused)
76 @param sample_weight sample weight
77 @param centers initialized centers
78 @param inertia initialized inertia (unused)
79 @param iter number of iteration already done
80 @param max_iter maximum of number of iteration
81 @param strategy strategy used to sort observations before
82 mapping them to clusters
83 @param verbose verbose
84 @param state random state
85 @param learning_rate used by strategy `'weights'`
86 @param history return list of centers accross iterations
87 @param fLOG logging function (needs to be specified otherwise
88 verbose has no effects)
89 @return tuple (best_labels, best_centers, best_inertia,
90 iter, all_centers)
91 """
92 if labels.dtype != numpy.int32:
93 raise TypeError( # pragma: no cover
94 "Labels must be an array of int not '{0}'".format(labels.dtype))
96 if strategy == 'weights':
97 return _constraint_kmeans_weights(
98 X, labels, sample_weight, centers, inertia, iter,
99 max_iter, verbose=verbose, state=state,
100 learning_rate=learning_rate, history=history, fLOG=fLOG)
101 else:
102 if isinstance(X, DataFrame):
103 X = X.values
104 x_squared_norms = row_norms(X, squared=True)
105 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
106 limit = X.shape[0] // centers.shape[0]
107 leftover = X.shape[0] - limit * centers.shape[0]
108 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
109 n_clusters = centers.shape[0]
110 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
111 best_inertia = None
112 best_iter = None
113 all_centers = []
115 # association
116 _constraint_association(leftover, counters, labels, leftclose, distances_close,
117 centers, X, x_squared_norms, limit, strategy, state=state)
119 if sample_weight is None:
120 sw = numpy.ones((X.shape[0],))
121 else:
122 sw = sample_weight
124 if scipy.sparse.issparse(X):
125 _centers_fct = _centers_sparse
126 else:
127 _centers_fct = _centers_dense
129 while iter < max_iter:
130 # compute new clusters
131 centers = _centers_fct(
132 X, sw, labels, n_clusters, distances_close)
134 if history:
135 all_centers.append(centers)
137 # association
138 _constraint_association(
139 leftover, counters, labels, leftclose, distances_close,
140 centers, X, x_squared_norms, limit, strategy, state=state)
142 # inertia
143 _, inertia = _labels_inertia_skl(
144 X=X, sample_weight=sw, x_squared_norms=x_squared_norms,
145 centers=centers, distances=distances_close)
147 iter += 1
148 if verbose and fLOG: # pragma: no cover
149 fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))
151 # best option so far?
152 if best_inertia is None or inertia < best_inertia:
153 best_inertia = inertia
154 best_centers = centers.copy()
155 best_labels = labels.copy()
156 best_iter = iter
158 # early stop
159 if (best_inertia is not None and inertia >= best_inertia and
160 iter > best_iter + 5):
161 break
163 return (best_labels, best_centers, best_inertia, None,
164 iter, all_centers)
167def constraint_predictions(X, centers, strategy, state=None):
168 """
169 Computes the predictions but tries
170 to associates the same numbers of points
171 in each cluster.
173 @param X features
174 @param centers centers of each clusters
175 @param strategy strategy used to sort point before
176 mapping them to a cluster
177 @param state random state
178 @return labels, distances, distances_close
179 """
180 if isinstance(X, DataFrame):
181 X = X.values
182 x_squared_norms = row_norms(X, squared=True)
183 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
184 limit = X.shape[0] // centers.shape[0]
185 leftover = X.shape[0] - limit * centers.shape[0]
186 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
187 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
188 labels = numpy.empty((X.shape[0],), dtype=numpy.int32)
190 distances = _constraint_association(
191 leftover, counters, labels, leftclose,
192 distances_close, centers, X, x_squared_norms,
193 limit, strategy, state=state)
195 return labels, distances, distances_close
198def _constraint_association(leftover, counters, labels, leftclose, distances_close,
199 centers, X, x_squared_norms, limit, strategy, state=None):
200 """
201 Completes the constraint :epkg:`k-means`.
203 @param X features
204 @param labels initialized labels (unused)
205 @param centers initialized centers
206 @param x_squared_norms norm of *X*
207 @param limit number of point to associate per cluster
208 @param leftover number of points to associate at the end
209 @param counters allocated array
210 @param leftclose allocated array
211 @param labels allocated array
212 @param distances_close allocated array
213 @param strategy strategy used to sort point before
214 mapping them to a cluster
215 @param state random state
216 """
217 if strategy in ('distance', 'distance_p'):
218 return _constraint_association_distance(
219 leftover, counters, labels, leftclose, distances_close,
220 centers, X, x_squared_norms, limit, strategy, state=state)
221 if strategy in ('gain', 'gain_p'):
222 return _constraint_association_gain(
223 leftover, counters, labels, leftclose, distances_close,
224 centers, X, x_squared_norms, limit, strategy, state=state)
225 raise ValueError("Unknwon strategy '{0}'.".format(
226 strategy)) # pragma: no cover
229def _compute_strategy_coefficient(distances, strategy, labels):
230 """
231 Creates a matrix
232 """
233 if strategy in ('gain', 'gain_p'):
234 ar = numpy.arange(distances.shape[0])
235 dist = distances[ar, labels]
236 return distances - dist[:, numpy.newaxis]
237 raise ValueError( # pragma: no cover
238 "Unknwon strategy '{0}'.".format(strategy))
241def _randomize_index(index, weights):
242 """
243 Randomizes index depending on the value.
244 Swap indexes.
245 Modifies *index*.
246 """
247 maxi = weights.max()
248 mini = weights.min()
249 diff = max(maxi - mini, 1e-5)
250 rand = numpy.random.rand(weights.shape[0])
251 for i in range(1, index.shape[0]):
252 ind1 = index[i - 1]
253 ind2 = index[i]
254 w1 = weights[ind1]
255 w2 = weights[ind2]
256 ratio = abs(w2 - w1) / diff * 0.5
257 if rand[i] >= ratio + 0.5:
258 index[i - 1], index[i] = ind2, ind1
259 weights[i - 1], weights[i] = w2, w1
262def _switch_clusters(labels, distances):
263 """
264 Tries to switch clusters.
265 Modifies *labels* inplace.
267 @param labels labels
268 @param distances distances
269 """
270 perm = numpy.random.permutation(numpy.arange(0, labels.shape[0]))
271 niter = 0
272 modif = 1
273 while modif > 0 and niter < 10:
274 modif = 0
275 niter += 1
276 for i_ in range(labels.shape[0]):
277 for j_ in range(i_ + 1, labels.shape[0]):
278 i = perm[i_]
279 j = perm[j_]
280 c1 = labels[i]
281 c2 = labels[j]
282 if c1 == c2:
283 continue
284 d11 = distances[i, c1]
285 d12 = distances[i, c2]
286 d21 = distances[j, c1]
287 d22 = distances[j, c2]
288 if d11**2 + d22**2 > d21**2 + d12**2:
289 labels[i], labels[j] = c2, c1
290 modif += 1
293def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close,
294 centers, X, x_squared_norms, limit, strategy, state=None):
295 """
296 Completes the constraint *k-means*,
297 the function sorts points by distance to the closest
298 cluster and associates them into that order.
299 It deals first with the further point and maps it to
300 the closest center.
302 @param X features
303 @param labels initialized labels (unused)
304 @param centers initialized centers
305 @param x_squared_norms norm of *X*
306 @param limit number of point to associate per cluster
307 @param leftover number of points to associate at the end
308 @param counters allocated array
309 @param leftclose allocated array
310 @param labels allocated array
311 @param distances_close allocated array
312 @param strategy strategy used to sort point before
313 mapping them to a cluster
314 @param state random state (unused)
315 """
317 # initialisation
318 counters[:] = 0
319 leftclose[:] = -1
320 labels[:] = -1
322 # distances
323 distances = euclidean_distances(
324 centers, X, Y_norm_squared=x_squared_norms, squared=True)
325 distances = distances.T
326 distances0 = distances.copy()
327 maxi = distances.ravel().max() * 2
328 centers_index = numpy.argsort(distances, axis=1)
330 while labels.min() == -1:
331 mini = numpy.min(distances, axis=1)
332 sorted_index = numpy.argsort(mini)
333 _randomize_index(sorted_index, mini)
335 nover = leftover
336 for ind in sorted_index:
337 if labels[ind] >= 0:
338 continue
339 for c in centers_index[ind, :]:
340 if counters[c] < limit:
341 # The cluster still accepts new points.
342 counters[c] += 1
343 labels[ind] = c
344 distances[ind, c] = maxi
345 break
346 if nover > 0 and leftclose[c] == -1:
347 # The cluster may accept one point if the number
348 # of clusters does not divide the number of points in X.
349 counters[c] += 1
350 labels[ind] = c
351 nover -= 1
352 leftclose[c] = 0
353 distances[ind, c] = maxi
354 break
356 _switch_clusters(labels, distances0)
357 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]
358 return distances0
361def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close,
362 centers, X, x_squared_norms, limit, strategy, state=None):
363 """
364 Completes the constraint *k-means*.
365 Follows the method described in `Same-size k-Means Variation
366 <https://elki-project.github.io/tutorial/same-size_k_means>`_.
368 @param X features
369 @param labels initialized labels (unused)
370 @param centers initialized centers
371 @param x_squared_norms norm of *X*
372 @param limit number of points to associate per cluster
373 @param leftover number of points to associate at the end
374 @param counters allocated array
375 @param leftclose allocated array
376 @param labels allocated array
377 @param distances_close allocated array
378 @param strategy strategy used to sort point before
379 mapping them to a cluster
380 @param state random state
382 See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_.
383 """
384 # distances
385 distances = euclidean_distances(
386 centers, X, Y_norm_squared=x_squared_norms, squared=True)
387 distances = distances.T
389 if strategy == 'gain_p':
390 labels[:] = numpy.argmin(distances, axis=1)
391 else:
392 # We assume labels comes from a previous iteration.
393 pass
395 strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
396 distance_linear = linearize_matrix(distances, strategy_coef)
397 sorted_distances = distance_linear[distance_linear[:, 3].argsort()]
398 distances_close[:] = 0
400 # counters
401 ave = limit
402 counters[:] = 0
403 for i in labels:
404 counters[i] += 1
405 leftclose[:] = counters[:] - ave
406 leftclose[leftclose < 0] = 0
407 nover = X.shape[0] - ave * counters.shape[0]
408 sumi = nover - leftclose.sum()
409 if sumi != 0:
410 if state is None:
411 state = numpy.random.RandomState() # pylint: disable=E1101
413 def loopf(h, sumi):
414 if sumi < 0 and leftclose[h] > 0: # pylint: disable=R1716
415 sumi -= leftclose[h]
416 leftclose[h] = 0
417 elif sumi > 0 and leftclose[h] == 0:
418 leftclose[h] = 1
419 sumi += 1
420 return sumi
422 it = 0
423 while sumi != 0:
424 h = state.randint(0, counters.shape[0])
425 sumi = loopf(h, sumi)
426 it += 1
427 if it > counters.shape[0] * 2:
428 break
429 for h in range(counters.shape[0]):
430 if sumi == 0:
431 break
432 sumi = loopf(h, sumi)
434 transfer = {}
436 for i in range(0, sorted_distances.shape[0]):
437 gain = sorted_distances[i, 3]
438 ind = int(sorted_distances[i, 1])
439 dest = int(sorted_distances[i, 2])
440 cur = labels[ind]
441 if distances_close[ind]:
442 continue
443 if cur == dest:
444 continue
445 if ((counters[dest] < ave + leftclose[dest]) and
446 (counters[cur] > ave + leftclose[cur])):
447 labels[ind] = dest
448 counters[cur] -= 1
449 counters[dest] += 1
450 distances_close[ind] = 1 # moved
451 else:
452 cp = transfer.get((dest, cur), [])
453 while len(cp) > 0:
454 g, destind = cp[0]
455 if distances_close[destind]:
456 del cp[0]
457 else:
458 break
459 if len(cp) > 0:
460 g, destind = cp[0]
461 if g + gain < 0:
462 del cp[0]
463 labels[ind] = dest
464 labels[destind] = cur
465 add = False
466 distances_close[ind] = 1 # moved
467 distances_close[destind] = 1 # moved
468 else:
469 add = True
470 else:
471 add = True
472 if add:
473 # We add the point to the list of points willing to transfer.
474 if (cur, dest) not in transfer:
475 transfer[cur, dest] = []
476 gain = sorted_distances[i, 3]
477 bisect.insort(transfer[cur, dest], (gain, ind))
479 neg = (counters < ave).sum()
480 if neg > 0:
481 raise RuntimeError( # pragma: no cover
482 "The algorithm failed, counters={0}".format(counters))
484 _switch_clusters(labels, distances)
485 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]
487 return distances
490def _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia, it,
491 max_iter, verbose=0, state=None, learning_rate=1.,
492 history=False, fLOG=None):
493 """
494 Runs KMeans iterator but weights cluster among them.
496 @param X features
497 @param labels initialized labels (unused)
498 @param sample_weight sample weight
499 @param centers initialized centers
500 @param inertia initialized inertia (unused)
501 @param it number of iteration already done
502 @param max_iter maximum of number of iteration
503 @param verbose verbose
504 @param state random state
505 @param learning_rate learning rate
506 @param history keeps all centers accross iterations
507 @param fLOG logging function (needs to be specified otherwise
508 verbose has no effects)
509 @return tuple (best_labels, best_centers, best_inertia, weights, it)
510 """
511 if isinstance(X, DataFrame):
512 X = X.values
513 n_clusters = centers.shape[0]
514 best_inertia = None
515 best_iter = None
516 weights = numpy.ones(centers.shape[0])
517 if sample_weight is None:
518 sw = numpy.ones((X.shape[0],))
519 else:
520 sw = sample_weight
522 if scipy.sparse.issparse(X):
523 _centers_fct = _centers_sparse
524 else:
525 _centers_fct = _centers_dense
527 total_inertia = _inertia(X, sw)
528 all_centers = []
530 while it < max_iter:
532 # compute new clusters
533 centers = _centers_fct(
534 X, sw, labels, n_clusters, None)
535 if history:
536 all_centers.append(centers)
538 # association
539 labels = _constraint_association_weights(X, centers, sw, weights)
540 if len(set(labels)) != centers.shape[0]:
541 if verbose and fLOG: # pragma: no cover
542 if isinstance(verbose, int) and verbose >= 10:
543 fLOG("CKMeans new weights: w=%r" % weights)
544 else:
545 fLOG("CKMeans new weights")
546 weights[:] = 1
547 labels = _constraint_association_weights(X, centers, sw, weights)
549 # inertia
550 inertia, diff = _labels_inertia_weights(
551 X, centers, sw, weights, labels, total_inertia)
552 if numpy.isnan(inertia):
553 raise RuntimeError( # pragma: no cover
554 "nanNobs={} Nclus={}\ninertia={}\nweights={}\ndiff={}\nlabels={}".format(
555 X.shape[0], centers.shape[0], inertia, weights, diff,
556 set(labels)))
558 # best option so far?
559 if best_inertia is None or inertia < best_inertia:
560 best_inertia = inertia
561 best_centers = centers.copy()
562 best_labels = labels.copy()
563 best_weights = weights.copy()
564 best_iter = it
566 # moves weights
567 weights, _ = _adjust_weights(X, sw, weights, labels,
568 learning_rate / (it + 10))
570 it += 1
571 if verbose and fLOG:
572 if isinstance(verbose, int) and verbose >= 10:
573 fLOG("CKMeans %d/%d inertia=%f (%f T=%f) dw=%r w=%r" % (
574 it, max_iter, inertia, best_inertia, total_inertia,
575 diff, weights))
576 elif isinstance(verbose, int) and verbose >= 5:
577 hist = Counter(labels)
578 fLOG("CKMeans %d/%d inertia=%f (%f) hist=%r" % (
579 it, max_iter, inertia, best_inertia, hist))
580 else:
581 fLOG("CKMeans %d/%d inertia=%f (%f T=%f)" % ( # pragma: no cover
582 it, max_iter, inertia, best_inertia, total_inertia))
584 # early stop
585 if (best_inertia is not None and inertia >= best_inertia and
586 it > best_iter + 5 and numpy.abs(diff).sum() <= weights.shape[0] / 2):
587 break
589 return (best_labels, best_centers, best_inertia, best_weights,
590 it, all_centers)
593def _constraint_association_weights(X, centers, sw, weights):
594 """
595 Associates points to clusters.
597 @param X features
598 @param centers centers
599 @param sw sample weights
600 @param weights cluster weights
601 @return labels
602 """
603 dist = cdist(X, centers) * weights.reshape((1, -1))
604 index = numpy.argmin(dist, axis=1)
605 return index
608def _inertia(X, sw):
609 """
610 Computes total weighted inertia.
612 @param X features
613 @param sw sample weights
614 @return inertia
615 """
616 bary = numpy.mean(X, axis=0)
617 diff = X - bary
618 norm = numpy.linalg.norm(diff, axis=1)
619 if sw is not None:
620 norm *= sw
621 return sw.sum()
624def _labels_inertia_weights(X, centers, sw, weights, labels, total_inertia):
625 """
626 Computes weighted inertia. It also adds a fraction
627 of the whole inertia depending on how balanced the
628 clusters are.
630 @param X features
631 @param centers centers
632 @param sw sample weights
633 @param weights cluster weights
634 @param labels labels
635 @param total_inertia total inertia
636 @return inertia
637 """
638 www, exp, _ = _compute_balance(X, sw, labels, centers.shape[0])
639 www -= exp
640 wwwa = numpy.square(www)
641 ratio = wwwa.sum() ** 0.5 / X.shape[0]
642 dist = cdist(X, centers) * weights.reshape((1, -1)) * sw.reshape((-1, 1))
643 return dist.sum() + ratio * total_inertia, www
646def _compute_balance(X, sw, labels, nbc=None):
647 """
648 Computes weights difference.
650 @param X features
651 @param sw sample weights
652 @param labels known labels
653 @param nbc number of clusters
654 @return (weights per cluster, expected weight,
655 total weight)
656 """
657 if nbc is None:
658 nbc = labels.max() + 1
659 N = numpy.float64(nbc)
660 www = numpy.zeros((nbc, ), dtype=numpy.float64)
661 if sw is None:
662 for la in labels:
663 www[la] += 1
664 else:
665 for w, la in zip(sw, labels):
666 www[la] += w
667 exp = www.sum() / N
668 return www, exp, N
671def _adjust_weights(X, sw, weights, labels, lr):
672 """
673 Changes *weights* mapped to every cluster.
674 *weights < 1* are used for big clusters,
675 *weights > 1* are used for small clusters.
677 @param X features
678 @param centers centers
679 @param sw sample weights
680 @param weights cluster weights
681 @param lr learning rate
682 @param labels known labels
683 @return labels
684 """
685 www, exp, N = _compute_balance(X, sw, labels, weights.shape[0])
687 for i in range(0, weights.shape[0]):
688 nw = (www[i] - exp) / exp
689 delta = nw * lr
690 weights[i] += delta
691 N += delta
693 return weights / N * weights.shape[0], www