Coverage for mlinsights/mlmodel/_kmeans_constraint_.py: 96%
329 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-28 08:46 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-28 08:46 +0100
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Implémente la classe @see cl ConstraintKMeans.
5"""
6import bisect
7from collections import Counter
8from pandas import DataFrame
9import numpy
10import scipy.sparse
11from scipy.spatial.distance import cdist
12from sklearn.metrics.pairwise import euclidean_distances
13from sklearn.utils.extmath import row_norms
14from ._kmeans_022 import (
15 _centers_dense, _centers_sparse, _labels_inertia_skl)
18def linearize_matrix(mat, *adds):
19 """
20 Linearizes a matrix into a new one
21 with 3 columns value, row, column.
22 The output format is similar to
23 :epkg:`csr_matrix` but null values are kept.
25 @param mat matrix
26 @param adds additional square matrices
27 @return new matrix
29 *adds* defines additional matrices, it adds
30 columns on the right side and fill them with
31 the corresponding value taken into the additional
32 matrices.
33 """
34 if scipy.sparse.issparse(mat):
35 if isinstance(mat, scipy.sparse.csr_matrix):
36 max_row = mat.shape[0]
37 res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)
38 row = 0
39 for i, v in enumerate(mat.data):
40 while row < max_row and i >= mat.indptr[row]:
41 row += 1
42 res[i, 0] = v
43 a, b = row - 1, mat.indices[i]
44 res[i, 1] = a
45 res[i, 2] = b
46 for k, am in enumerate(adds):
47 res[i, k + 3] = am[a, b]
48 return res
49 raise NotImplementedError( # pragma: no cover
50 f"This kind of sparse matrix is not handled: {type(mat)}")
51 else:
52 n = mat.shape[0]
53 c = mat.shape[1]
54 ic = numpy.arange(mat.shape[1])
55 res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)
56 for i in range(0, n):
57 a = i * c
58 b = (i + 1) * c
59 res[a:b, 1] = i
60 res[a:b, 2] = ic
61 res[:, 0] = mat.ravel()
62 for k, am in enumerate(adds):
63 res[:, 3 + k] = am.ravel()
64 return res
67def constraint_kmeans(X, labels, sample_weight, centers, inertia,
68 iter, max_iter, # pylint: disable=W0622
69 strategy='gain', verbose=0, state=None,
70 learning_rate=1., history=False, fLOG=None):
71 """
72 Completes the constraint :epkg:`k-means`.
74 @param X features
75 @param labels initialized labels (unused)
76 @param sample_weight sample weight
77 @param centers initialized centers
78 @param inertia initialized inertia (unused)
79 @param iter number of iteration already done
80 @param max_iter maximum of number of iteration
81 @param strategy strategy used to sort observations before
82 mapping them to clusters
83 @param verbose verbose
84 @param state random state
85 @param learning_rate used by strategy `'weights'`
86 @param history return list of centers accross iterations
87 @param fLOG logging function (needs to be specified otherwise
88 verbose has no effects)
89 @return tuple (best_labels, best_centers, best_inertia,
90 iter, all_centers)
91 """
92 if labels.dtype != numpy.int32:
93 raise TypeError( # pragma: no cover
94 f"Labels must be an array of int not '{labels.dtype}'")
96 if strategy == 'weights':
97 return _constraint_kmeans_weights(
98 X, labels, sample_weight, centers, inertia, iter,
99 max_iter, verbose=verbose, state=state,
100 learning_rate=learning_rate, history=history, fLOG=fLOG)
101 else:
102 if isinstance(X, DataFrame):
103 X = X.values
104 x_squared_norms = row_norms(X, squared=True)
105 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
106 limit = X.shape[0] // centers.shape[0]
107 leftover = X.shape[0] - limit * centers.shape[0]
108 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
109 n_clusters = centers.shape[0]
110 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
111 best_inertia = None
112 best_iter = None
113 all_centers = []
115 # association
116 _constraint_association(leftover, counters, labels, leftclose, distances_close,
117 centers, X, x_squared_norms, limit, strategy, state=state)
119 if sample_weight is None:
120 sw = numpy.ones((X.shape[0],))
121 else:
122 sw = sample_weight
124 if scipy.sparse.issparse(X):
125 _centers_fct = _centers_sparse
126 else:
127 _centers_fct = _centers_dense
129 while iter < max_iter:
130 # compute new clusters
131 centers = _centers_fct(
132 X, sw, labels, n_clusters, distances_close)
134 if history:
135 all_centers.append(centers)
137 # association
138 _constraint_association(
139 leftover, counters, labels, leftclose, distances_close,
140 centers, X, x_squared_norms, limit, strategy, state=state)
142 # inertia
143 _, inertia = _labels_inertia_skl(
144 X=X, sample_weight=sw, x_squared_norms=x_squared_norms,
145 centers=centers, distances=distances_close)
147 iter += 1
148 if verbose and fLOG: # pragma: no cover
149 fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))
151 # best option so far?
152 if best_inertia is None or inertia < best_inertia:
153 best_inertia = inertia
154 best_centers = centers.copy()
155 best_labels = labels.copy()
156 best_iter = iter
158 # early stop
159 if (best_inertia is not None and inertia >= best_inertia and
160 iter > best_iter + 5):
161 break
163 return (best_labels, best_centers, best_inertia, None,
164 iter, all_centers)
167def constraint_predictions(X, centers, strategy, state=None):
168 """
169 Computes the predictions but tries
170 to associates the same numbers of points
171 in each cluster.
173 @param X features
174 @param centers centers of each clusters
175 @param strategy strategy used to sort point before
176 mapping them to a cluster
177 @param state random state
178 @return labels, distances, distances_close
179 """
180 if isinstance(X, DataFrame):
181 X = X.values
182 x_squared_norms = row_norms(X, squared=True)
183 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
184 limit = X.shape[0] // centers.shape[0]
185 leftover = X.shape[0] - limit * centers.shape[0]
186 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
187 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
188 labels = numpy.empty((X.shape[0],), dtype=numpy.int32)
190 distances = _constraint_association(
191 leftover, counters, labels, leftclose,
192 distances_close, centers, X, x_squared_norms,
193 limit, strategy, state=state)
195 return labels, distances, distances_close
198def _constraint_association(leftover, counters, labels, leftclose, distances_close,
199 centers, X, x_squared_norms, limit, strategy, state=None):
200 """
201 Completes the constraint :epkg:`k-means`.
203 @param X features
204 @param labels initialized labels (unused)
205 @param centers initialized centers
206 @param x_squared_norms norm of *X*
207 @param limit number of point to associate per cluster
208 @param leftover number of points to associate at the end
209 @param counters allocated array
210 @param leftclose allocated array
211 @param labels allocated array
212 @param distances_close allocated array
213 @param strategy strategy used to sort point before
214 mapping them to a cluster
215 @param state random state
216 """
217 if strategy in ('distance', 'distance_p'):
218 return _constraint_association_distance(
219 leftover, counters, labels, leftclose, distances_close,
220 centers, X, x_squared_norms, limit, strategy, state=state)
221 if strategy in ('gain', 'gain_p'):
222 return _constraint_association_gain(
223 leftover, counters, labels, leftclose, distances_close,
224 centers, X, x_squared_norms, limit, strategy, state=state)
225 raise ValueError(f"Unknwon strategy '{strategy}'.") # pragma: no cover
228def _compute_strategy_coefficient(distances, strategy, labels):
229 """
230 Creates a matrix
231 """
232 if strategy in ('gain', 'gain_p'):
233 ar = numpy.arange(distances.shape[0])
234 dist = distances[ar, labels]
235 return distances - dist[:, numpy.newaxis]
236 raise ValueError( # pragma: no cover
237 f"Unknwon strategy '{strategy}'.")
240def _randomize_index(index, weights):
241 """
242 Randomizes index depending on the value.
243 Swap indexes.
244 Modifies *index*.
245 """
246 maxi = weights.max()
247 mini = weights.min()
248 diff = max(maxi - mini, 1e-5)
249 rand = numpy.random.rand(weights.shape[0])
250 for i in range(1, index.shape[0]):
251 ind1 = index[i - 1]
252 ind2 = index[i]
253 w1 = weights[ind1]
254 w2 = weights[ind2]
255 ratio = abs(w2 - w1) / diff * 0.5
256 if rand[i] >= ratio + 0.5:
257 index[i - 1], index[i] = ind2, ind1
258 weights[i - 1], weights[i] = w2, w1
261def _switch_clusters(labels, distances):
262 """
263 Tries to switch clusters.
264 Modifies *labels* inplace.
266 @param labels labels
267 @param distances distances
268 """
269 perm = numpy.random.permutation(numpy.arange(0, labels.shape[0]))
270 niter = 0
271 modif = 1
272 while modif > 0 and niter < 10:
273 modif = 0
274 niter += 1
275 for i_ in range(labels.shape[0]):
276 for j_ in range(i_ + 1, labels.shape[0]):
277 i = perm[i_]
278 j = perm[j_]
279 c1 = labels[i]
280 c2 = labels[j]
281 if c1 == c2:
282 continue
283 d11 = distances[i, c1]
284 d12 = distances[i, c2]
285 d21 = distances[j, c1]
286 d22 = distances[j, c2]
287 if d11**2 + d22**2 > d21**2 + d12**2:
288 labels[i], labels[j] = c2, c1
289 modif += 1
292def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close,
293 centers, X, x_squared_norms, limit, strategy, state=None):
294 """
295 Completes the constraint *k-means*,
296 the function sorts points by distance to the closest
297 cluster and associates them into that order.
298 It deals first with the further point and maps it to
299 the closest center.
301 @param X features
302 @param labels initialized labels (unused)
303 @param centers initialized centers
304 @param x_squared_norms norm of *X*
305 @param limit number of point to associate per cluster
306 @param leftover number of points to associate at the end
307 @param counters allocated array
308 @param leftclose allocated array
309 @param labels allocated array
310 @param distances_close allocated array
311 @param strategy strategy used to sort point before
312 mapping them to a cluster
313 @param state random state (unused)
314 """
316 # initialisation
317 counters[:] = 0
318 leftclose[:] = -1
319 labels[:] = -1
321 # distances
322 distances = euclidean_distances(
323 centers, X, Y_norm_squared=x_squared_norms, squared=True)
324 distances = distances.T
325 distances0 = distances.copy()
326 maxi = distances.ravel().max() * 2
327 centers_index = numpy.argsort(distances, axis=1)
329 while labels.min() == -1:
330 mini = numpy.min(distances, axis=1)
331 sorted_index = numpy.argsort(mini)
332 _randomize_index(sorted_index, mini)
334 nover = leftover
335 for ind in sorted_index:
336 if labels[ind] >= 0:
337 continue
338 for c in centers_index[ind, :]:
339 if counters[c] < limit:
340 # The cluster still accepts new points.
341 counters[c] += 1
342 labels[ind] = c
343 distances[ind, c] = maxi
344 break
345 if nover > 0 and leftclose[c] == -1:
346 # The cluster may accept one point if the number
347 # of clusters does not divide the number of points in X.
348 counters[c] += 1
349 labels[ind] = c
350 nover -= 1
351 leftclose[c] = 0
352 distances[ind, c] = maxi
353 break
355 _switch_clusters(labels, distances0)
356 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]
357 return distances0
360def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close,
361 centers, X, x_squared_norms, limit, strategy, state=None):
362 """
363 Completes the constraint *k-means*.
364 Follows the method described in `Same-size k-Means Variation
365 <https://elki-project.github.io/tutorial/same-size_k_means>`_.
367 @param X features
368 @param labels initialized labels (unused)
369 @param centers initialized centers
370 @param x_squared_norms norm of *X*
371 @param limit number of points to associate per cluster
372 @param leftover number of points to associate at the end
373 @param counters allocated array
374 @param leftclose allocated array
375 @param labels allocated array
376 @param distances_close allocated array
377 @param strategy strategy used to sort point before
378 mapping them to a cluster
379 @param state random state
381 See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_.
382 """
383 # distances
384 distances = euclidean_distances(
385 centers, X, Y_norm_squared=x_squared_norms, squared=True)
386 distances = distances.T
388 if strategy == 'gain_p':
389 labels[:] = numpy.argmin(distances, axis=1)
390 else:
391 # We assume labels comes from a previous iteration.
392 pass
394 strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
395 distance_linear = linearize_matrix(distances, strategy_coef)
396 sorted_distances = distance_linear[distance_linear[:, 3].argsort()]
397 distances_close[:] = 0
399 # counters
400 ave = limit
401 counters[:] = 0
402 for i in labels:
403 counters[i] += 1
404 leftclose[:] = counters[:] - ave
405 leftclose[leftclose < 0] = 0
406 nover = X.shape[0] - ave * counters.shape[0]
407 sumi = nover - leftclose.sum()
408 if sumi != 0:
409 if state is None:
410 state = numpy.random.RandomState() # pylint: disable=E1101
412 def loopf(h, sumi):
413 if sumi < 0 and leftclose[h] > 0: # pylint: disable=R1716
414 sumi -= leftclose[h]
415 leftclose[h] = 0
416 elif sumi > 0 and leftclose[h] == 0:
417 leftclose[h] = 1
418 sumi += 1
419 return sumi
421 it = 0
422 while sumi != 0:
423 h = state.randint(0, counters.shape[0])
424 sumi = loopf(h, sumi)
425 it += 1
426 if it > counters.shape[0] * 2:
427 break
428 for h in range(counters.shape[0]):
429 if sumi == 0:
430 break
431 sumi = loopf(h, sumi)
433 transfer = {}
435 for i in range(0, sorted_distances.shape[0]):
436 gain = sorted_distances[i, 3]
437 ind = int(sorted_distances[i, 1])
438 dest = int(sorted_distances[i, 2])
439 cur = labels[ind]
440 if distances_close[ind]:
441 continue
442 if cur == dest:
443 continue
444 if ((counters[dest] < ave + leftclose[dest]) and
445 (counters[cur] > ave + leftclose[cur])):
446 labels[ind] = dest
447 counters[cur] -= 1
448 counters[dest] += 1
449 distances_close[ind] = 1 # moved
450 else:
451 cp = transfer.get((dest, cur), [])
452 while len(cp) > 0:
453 g, destind = cp[0]
454 if distances_close[destind]:
455 del cp[0]
456 else:
457 break
458 if len(cp) > 0:
459 g, destind = cp[0]
460 if g + gain < 0:
461 del cp[0]
462 labels[ind] = dest
463 labels[destind] = cur
464 add = False
465 distances_close[ind] = 1 # moved
466 distances_close[destind] = 1 # moved
467 else:
468 add = True
469 else:
470 add = True
471 if add:
472 # We add the point to the list of points willing to transfer.
473 if (cur, dest) not in transfer:
474 transfer[cur, dest] = []
475 gain = sorted_distances[i, 3]
476 bisect.insort(transfer[cur, dest], (gain, ind))
478 neg = (counters < ave).sum()
479 if neg > 0:
480 raise RuntimeError( # pragma: no cover
481 f"The algorithm failed, counters={counters}")
483 _switch_clusters(labels, distances)
484 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]
486 return distances
489def _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia, it,
490 max_iter, verbose=0, state=None, learning_rate=1.,
491 history=False, fLOG=None):
492 """
493 Runs KMeans iterator but weights cluster among them.
495 @param X features
496 @param labels initialized labels (unused)
497 @param sample_weight sample weight
498 @param centers initialized centers
499 @param inertia initialized inertia (unused)
500 @param it number of iteration already done
501 @param max_iter maximum of number of iteration
502 @param verbose verbose
503 @param state random state
504 @param learning_rate learning rate
505 @param history keeps all centers accross iterations
506 @param fLOG logging function (needs to be specified otherwise
507 verbose has no effects)
508 @return tuple (best_labels, best_centers, best_inertia, weights, it)
509 """
510 if isinstance(X, DataFrame):
511 X = X.values
512 n_clusters = centers.shape[0]
513 best_inertia = None
514 best_iter = None
515 weights = numpy.ones(centers.shape[0])
516 if sample_weight is None:
517 sw = numpy.ones((X.shape[0],))
518 else:
519 sw = sample_weight
521 if scipy.sparse.issparse(X):
522 _centers_fct = _centers_sparse
523 else:
524 _centers_fct = _centers_dense
526 total_inertia = _inertia(X, sw)
527 all_centers = []
529 while it < max_iter:
531 # compute new clusters
532 centers = _centers_fct(
533 X, sw, labels, n_clusters, None)
534 if history:
535 all_centers.append(centers)
537 # association
538 labels = _constraint_association_weights(X, centers, sw, weights)
539 if len(set(labels)) != centers.shape[0]:
540 if verbose and fLOG: # pragma: no cover
541 if isinstance(verbose, int) and verbose >= 10:
542 fLOG(f"CKMeans new weights: w={weights!r}")
543 else:
544 fLOG("CKMeans new weights")
545 weights[:] = 1
546 labels = _constraint_association_weights(X, centers, sw, weights)
548 # inertia
549 inertia, diff = _labels_inertia_weights(
550 X, centers, sw, weights, labels, total_inertia)
551 if numpy.isnan(inertia):
552 raise RuntimeError( # pragma: no cover
553 f"nanNobs={X.shape[0]} Nclus={centers.shape[0]}\n"
554 f"inertia={inertia}\nweights={weights}\ndiff={diff}\n"
555 f"labels={set(labels)}")
557 # best option so far?
558 if best_inertia is None or inertia < best_inertia:
559 best_inertia = inertia
560 best_centers = centers.copy()
561 best_labels = labels.copy()
562 best_weights = weights.copy()
563 best_iter = it
565 # moves weights
566 weights, _ = _adjust_weights(X, sw, weights, labels,
567 learning_rate / (it + 10))
569 it += 1
570 if verbose and fLOG:
571 if isinstance(verbose, int) and verbose >= 10:
572 fLOG("CKMeans %d/%d inertia=%f (%f T=%f) dw=%r w=%r" % (
573 it, max_iter, inertia, best_inertia, total_inertia,
574 diff, weights))
575 elif isinstance(verbose, int) and verbose >= 5:
576 hist = Counter(labels)
577 fLOG("CKMeans %d/%d inertia=%f (%f) hist=%r" % (
578 it, max_iter, inertia, best_inertia, hist))
579 else:
580 fLOG("CKMeans %d/%d inertia=%f (%f T=%f)" % ( # pragma: no cover
581 it, max_iter, inertia, best_inertia, total_inertia))
583 # early stop
584 if (best_inertia is not None and inertia >= best_inertia and
585 it > best_iter + 5 and numpy.abs(diff).sum() <= weights.shape[0] / 2):
586 break
588 return (best_labels, best_centers, best_inertia, best_weights,
589 it, all_centers)
592def _constraint_association_weights(X, centers, sw, weights):
593 """
594 Associates points to clusters.
596 @param X features
597 @param centers centers
598 @param sw sample weights
599 @param weights cluster weights
600 @return labels
601 """
602 dist = cdist(X, centers) * weights.reshape((1, -1))
603 index = numpy.argmin(dist, axis=1)
604 return index
607def _inertia(X, sw):
608 """
609 Computes total weighted inertia.
611 @param X features
612 @param sw sample weights
613 @return inertia
614 """
615 bary = numpy.mean(X, axis=0)
616 diff = X - bary
617 norm = numpy.linalg.norm(diff, axis=1)
618 if sw is not None:
619 norm *= sw
620 return sw.sum()
623def _labels_inertia_weights(X, centers, sw, weights, labels, total_inertia):
624 """
625 Computes weighted inertia. It also adds a fraction
626 of the whole inertia depending on how balanced the
627 clusters are.
629 @param X features
630 @param centers centers
631 @param sw sample weights
632 @param weights cluster weights
633 @param labels labels
634 @param total_inertia total inertia
635 @return inertia
636 """
637 www, exp, _ = _compute_balance(X, sw, labels, centers.shape[0])
638 www -= exp
639 wwwa = numpy.square(www)
640 ratio = wwwa.sum() ** 0.5 / X.shape[0]
641 dist = cdist(X, centers) * weights.reshape((1, -1)) * sw.reshape((-1, 1))
642 return dist.sum() + ratio * total_inertia, www
645def _compute_balance(X, sw, labels, nbc=None):
646 """
647 Computes weights difference.
649 @param X features
650 @param sw sample weights
651 @param labels known labels
652 @param nbc number of clusters
653 @return (weights per cluster, expected weight,
654 total weight)
655 """
656 if nbc is None:
657 nbc = labels.max() + 1
658 N = numpy.float64(nbc)
659 www = numpy.zeros((nbc, ), dtype=numpy.float64)
660 if sw is None:
661 for la in labels:
662 www[la] += 1
663 else:
664 for w, la in zip(sw, labels):
665 www[la] += w
666 exp = www.sum() / N
667 return www, exp, N
670def _adjust_weights(X, sw, weights, labels, lr):
671 """
672 Changes *weights* mapped to every cluster.
673 *weights < 1* are used for big clusters,
674 *weights > 1* are used for small clusters.
676 @param X features
677 @param centers centers
678 @param sw sample weights
679 @param weights cluster weights
680 @param lr learning rate
681 @param labels known labels
682 @return labels
683 """
684 www, exp, N = _compute_balance(X, sw, labels, weights.shape[0])
686 for i in range(0, weights.shape[0]):
687 nw = (www[i] - exp) / exp
688 delta = nw * lr
689 weights[i] += delta
690 N += delta
692 return weights / N * weights.shape[0], www