Source code for mlinsights.mlmodel._kmeans_constraint_

# -*- coding: utf-8 -*-
"""
Implements the class :class:`ConstraintKMeans
<mlinsights.mlmodel.kmeans_constraint.ConstraintKMeans>`.

:githublink:`%|py|6`
"""
import bisect
from collections import Counter
from pandas import DataFrame
import numpy
import scipy.sparse
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms
from ._kmeans_022 import (
    _centers_dense, _centers_sparse, _labels_inertia_skl)


def linearize_matrix(mat, *adds):
    """
    Linearizes a matrix into a new one
    with 3 columns *value, row, column*.
    The output format is similar to
    :epkg:`csr_matrix` but null values are kept.

    :param mat: matrix
    :param adds: additional square matrices
    :return: new matrix

    *adds* defines additional matrices, it adds
    columns on the right side and fills them with
    the corresponding values taken from the additional matrices.

    :githublink:`%|py|33`
    """
    if scipy.sparse.issparse(mat):
        if isinstance(mat, scipy.sparse.csr_matrix):
            max_row = mat.shape[0]
            res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)
            row = 0
            for i, v in enumerate(mat.data):
                while row < max_row and i >= mat.indptr[row]:
                    row += 1
                res[i, 0] = v
                a, b = row - 1, mat.indices[i]
                res[i, 1] = a
                res[i, 2] = b
                for k, am in enumerate(adds):
                    res[i, k + 3] = am[a, b]
            return res
        raise NotImplementedError(  # pragma: no cover
            "This kind of sparse matrix is not handled: {0}".format(type(mat)))
    else:
        n = mat.shape[0]
        c = mat.shape[1]
        ic = numpy.arange(mat.shape[1])
        res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)
        for i in range(0, n):
            a = i * c
            b = (i + 1) * c
            res[a:b, 1] = i
            res[a:b, 2] = ic
        res[:, 0] = mat.ravel()
        for k, am in enumerate(adds):
            res[:, 3 + k] = am.ravel()
        return res

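# A minimal usage sketch for ``linearize_matrix`` (illustrative, not part of
# the library): a dense 2x2 matrix becomes one row per cell with the columns
# *value, row, column*.
#
#   >>> import numpy
#   >>> linearize_matrix(numpy.array([[10., 11.], [12., 13.]]))
#   array([[10.,  0.,  0.],
#          [11.,  0.,  1.],
#          [12.,  1.,  0.],
#          [13.,  1.,  1.]])
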
def constraint_kmeans(X, labels, sample_weight, centers, inertia,
                      iter, max_iter,  # pylint: disable=W0622
                      strategy='gain', verbose=0, state=None,
                      learning_rate=1., history=False, fLOG=None):
    """
    Completes the constraint :epkg:`k-means`.

    :param X: features
    :param labels: initialized labels (unused)
    :param sample_weight: sample weight
    :param centers: initialized centers
    :param inertia: initialized inertia (unused)
    :param iter: number of iterations already done
    :param max_iter: maximum number of iterations
    :param strategy: strategy used to sort observations before
        mapping them to clusters
    :param verbose: verbose
    :param state: random state
    :param learning_rate: used by strategy `'weights'`
    :param history: return the list of centers across iterations
    :param fLOG: logging function (needs to be specified otherwise
        verbose has no effect)
    :return: tuple (best_labels, best_centers, best_inertia,
        weights (None unless strategy is `'weights'`), iter, all_centers)

    :githublink:`%|py|91`
    """
    if labels.dtype != numpy.int32:
        raise TypeError(  # pragma: no cover
            "Labels must be an array of int not '{0}'".format(labels.dtype))

    if strategy == 'weights':
        return _constraint_kmeans_weights(
            X, labels, sample_weight, centers, inertia, iter, max_iter,
            verbose=verbose, state=state, learning_rate=learning_rate,
            history=history, fLOG=fLOG)
    else:
        if isinstance(X, DataFrame):
            X = X.values
        x_squared_norms = row_norms(X, squared=True)
        counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
        limit = X.shape[0] // centers.shape[0]
        leftover = X.shape[0] - limit * centers.shape[0]
        leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
        n_clusters = centers.shape[0]
        distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
        best_inertia = None
        best_iter = None
        all_centers = []

        # association
        _constraint_association(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)

        if sample_weight is None:
            sw = numpy.ones((X.shape[0],))
        else:
            sw = sample_weight

        if scipy.sparse.issparse(X):
            _centers_fct = _centers_sparse
        else:
            _centers_fct = _centers_dense

        while iter < max_iter:
            # compute new clusters
            centers = _centers_fct(
                X, sw, labels, n_clusters, distances_close)

            if history:
                all_centers.append(centers)

            # association
            _constraint_association(
                leftover, counters, labels, leftclose, distances_close,
                centers, X, x_squared_norms, limit, strategy, state=state)

            # inertia
            _, inertia = _labels_inertia_skl(
                X=X, sample_weight=sw, x_squared_norms=x_squared_norms,
                centers=centers, distances=distances_close)

            iter += 1
            if verbose and fLOG:  # pragma: no cover
                fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

            # best option so far?
            if best_inertia is None or inertia < best_inertia:
                best_inertia = inertia
                best_centers = centers.copy()
                best_labels = labels.copy()
                best_iter = iter

            # early stop
            if (best_inertia is not None and inertia >= best_inertia and
                    iter > best_iter + 5):
                break

        return (best_labels, best_centers, best_inertia, None,
                iter, all_centers)

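# A hedged sketch of how ``constraint_kmeans`` can be driven; in the library
# this is done by the estimator ``ConstraintKMeans``, so the initialization
# below (a plain ``KMeans`` fit) is only an assumption for illustration.
#
#   import numpy
#   from sklearn.cluster import KMeans
#   X = numpy.random.randn(100, 2)
#   km = KMeans(n_clusters=4).fit(X)
#   labels = km.labels_.astype(numpy.int32)   # int32 labels are required
#   best_labels, best_centers, best_inertia, _, n_iter, _ = constraint_kmeans(
#       X, labels, None, km.cluster_centers_, km.inertia_, 0, max_iter=300,
#       strategy='gain')
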
def constraint_predictions(X, centers, strategy, state=None):
    """
    Computes the predictions but tries to associate the same
    number of points to each cluster.

    :param X: features
    :param centers: centers of each cluster
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state
    :return: labels, distances, distances_close

    :githublink:`%|py|179`
    """
    if isinstance(X, DataFrame):
        X = X.values
    x_squared_norms = row_norms(X, squared=True)
    counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    limit = X.shape[0] // centers.shape[0]
    leftover = X.shape[0] - limit * centers.shape[0]
    leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
    labels = numpy.empty((X.shape[0],), dtype=numpy.int32)

    distances = _constraint_association(
        leftover, counters, labels, leftclose, distances_close,
        centers, X, x_squared_norms, limit, strategy, state=state)

    return labels, distances, distances_close

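# Sketch (assuming ``X`` and ``best_centers`` from the previous example):
# the balanced assignment gives every cluster ``X.shape[0] // n_clusters``
# points, plus at most one leftover point each.
#
#   labels, distances, distances_close = constraint_predictions(
#       X, best_centers, strategy='gain')
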
def _constraint_association(leftover, counters, labels, leftclose,
                            distances_close, centers, X, x_squared_norms,
                            limit, strategy, state=None):
    """
    Completes the constraint :epkg:`k-means`.

    :param X: features
    :param centers: initialized centers
    :param x_squared_norms: norm of *X*
    :param limit: number of points to associate per cluster
    :param leftover: number of points to associate at the end
    :param counters: allocated array
    :param leftclose: allocated array
    :param labels: allocated array
    :param distances_close: allocated array
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state

    :githublink:`%|py|216`
    """
    if strategy in ('distance', 'distance_p'):
        return _constraint_association_distance(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)
    if strategy in ('gain', 'gain_p'):
        return _constraint_association_gain(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)
    raise ValueError("Unknown strategy '{0}'.".format(
        strategy))  # pragma: no cover

def _compute_strategy_coefficient(distances, strategy, labels):
    """
    Creates a matrix of moving costs: for every point, the distance
    to each candidate cluster minus the distance to the cluster
    the point currently belongs to.

    :githublink:`%|py|232`
    """
    if strategy in ('gain', 'gain_p'):
        ar = numpy.arange(distances.shape[0])
        dist = distances[ar, labels]
        return distances - dist[:, numpy.newaxis]
    raise ValueError("Unknown strategy '{0}'.".format(strategy))

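# Worked example (illustrative): with two clusters and squared distances
# ``[4.0, 9.0]`` for a point currently labelled 0, the coefficient row is
# ``[4.0 - 4.0, 9.0 - 4.0] = [0.0, 5.0]``: moving the point to cluster 1
# would cost 5.0; a negative entry would mean the move is a gain.
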
def _randomize_index(index, weights):
    """
    Randomizes index depending on the value.
    Swap indexes.
    Modifies *index*.

    :githublink:`%|py|245`
    """
    maxi = weights.max()
    mini = weights.min()
    diff = max(maxi - mini, 1e-5)
    rand = numpy.random.rand(weights.shape[0])
    for i in range(1, index.shape[0]):
        ind1 = index[i - 1]
        ind2 = index[i]
        w1 = weights[ind1]
        w2 = weights[ind2]
        ratio = abs(w2 - w1) / diff * 0.5
        if rand[i] >= ratio + 0.5:
            index[i - 1], index[i] = ind2, ind1
            weights[i - 1], weights[i] = w2, w1

def _switch_clusters(labels, distances):
    """
    Tries to switch clusters.
    Modifies *labels* inplace.

    :param labels: labels
    :param distances: distances

    :githublink:`%|py|268`
    """
    perm = numpy.random.permutation(numpy.arange(0, labels.shape[0]))
    niter = 0
    modif = 1
    while modif > 0 and niter < 10:
        modif = 0
        niter += 1
        for i_ in range(labels.shape[0]):
            for j_ in range(i_ + 1, labels.shape[0]):
                i = perm[i_]
                j = perm[j_]
                c1 = labels[i]
                c2 = labels[j]
                if c1 == c2:
                    continue
                d11 = distances[i, c1]
                d12 = distances[i, c2]
                d21 = distances[j, c1]
                d22 = distances[j, c2]
                # exchange the labels of the two points if that
                # reduces the total assignment cost
                if d11 ** 2 + d22 ** 2 > d21 ** 2 + d12 ** 2:
                    labels[i], labels[j] = c2, c1
                    modif += 1

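# Worked swap (illustrative): for two points with assignment costs
# ``d11 = d22 = 3`` and crossed costs ``d21 = d12 = 1``, the labels are
# exchanged because ``3**2 + 3**2 > 1**2 + 1**2``: both points end up
# closer to their new centers and the cluster sizes are preserved.
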
def _constraint_association_distance(leftover, counters, labels, leftclose,
                                     distances_close, centers, X,
                                     x_squared_norms, limit, strategy,
                                     state=None):
    """
    Completes the constraint *k-means*. The function sorts points
    by distance to the closest cluster and associates them in that
    order, mapping each one to the closest center still available.

    :param X: features
    :param centers: initialized centers
    :param x_squared_norms: norm of *X*
    :param limit: number of points to associate per cluster
    :param leftover: number of points to associate at the end
    :param counters: allocated array
    :param leftclose: allocated array
    :param labels: allocated array
    :param distances_close: allocated array
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state (unused)

    :githublink:`%|py|314`
    """
    # initialisation
    counters[:] = 0
    leftclose[:] = -1
    labels[:] = -1

    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T
    distances0 = distances.copy()
    maxi = distances.ravel().max() * 2
    centers_index = numpy.argsort(distances, axis=1)

    while labels.min() == -1:
        mini = numpy.min(distances, axis=1)
        sorted_index = numpy.argsort(mini)
        _randomize_index(sorted_index, mini)

        nover = leftover
        for ind in sorted_index:
            if labels[ind] >= 0:
                continue
            for c in centers_index[ind, :]:
                if counters[c] < limit:
                    # The cluster still accepts new points.
                    counters[c] += 1
                    labels[ind] = c
                    distances[ind, c] = maxi
                    break
                if nover > 0 and leftclose[c] == -1:
                    # The cluster may accept one more point if the number
                    # of clusters does not divide the number of points in X.
                    counters[c] += 1
                    labels[ind] = c
                    nover -= 1
                    leftclose[c] = 0
                    distances[ind, c] = maxi
                    break

    _switch_clusters(labels, distances0)
    # read from the untouched copy: *distances* was overwritten
    # with the sentinel *maxi* during the assignment
    distances_close[:] = distances0[numpy.arange(X.shape[0]), labels]

    return distances0

def _constraint_association_gain(leftover, counters, labels, leftclose,
                                 distances_close, centers, X,
                                 x_squared_norms, limit, strategy,
                                 state=None):
    """
    Completes the constraint *k-means*.
    Follows the method described in `Same-size k-Means Variation
    <https://elki-project.github.io/tutorial/same-size_k_means>`_.

    :param X: features
    :param centers: initialized centers
    :param x_squared_norms: norm of *X*
    :param limit: number of points to associate per cluster
    :param leftover: number of points to associate at the end
    :param counters: allocated array
    :param leftclose: allocated array
    :param labels: allocated array
    :param distances_close: allocated array
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state

    :githublink:`%|py|382`
    """
    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    if strategy == 'gain_p':
        labels[:] = numpy.argmin(distances, axis=1)
    else:
        # We assume labels comes from a previous iteration.
        pass

    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]
    # reused below as a 'moved' flag for every point
    distances_close[:] = 0

    # counters
    ave = limit
    counters[:] = 0
    for i in labels:
        counters[i] += 1
    leftclose[:] = counters[:] - ave
    leftclose[leftclose < 0] = 0
    nover = X.shape[0] - ave * counters.shape[0]
    sumi = nover - leftclose.sum()
    if sumi != 0:
        if state is None:
            state = numpy.random.RandomState()  # pylint: disable=E1101

        def loopf(h, sumi):
            # keeps the invariant sumi == nover - leftclose.sum():
            # removing allowances increases sumi, adding one decreases it
            if sumi < 0 and leftclose[h] > 0:  # pylint: disable=R1716
                sumi += leftclose[h]
                leftclose[h] = 0
            elif sumi > 0 and leftclose[h] == 0:
                leftclose[h] = 1
                sumi -= 1
            return sumi

        it = 0
        while sumi != 0:
            h = state.randint(0, counters.shape[0])
            sumi = loopf(h, sumi)
            it += 1
            if it > counters.shape[0] * 2:
                break
        for h in range(counters.shape[0]):
            if sumi == 0:
                break
            sumi = loopf(h, sumi)

    transfer = {}

    for i in range(0, sorted_distances.shape[0]):
        gain = sorted_distances[i, 3]
        ind = int(sorted_distances[i, 1])
        dest = int(sorted_distances[i, 2])
        cur = labels[ind]
        if distances_close[ind]:
            continue
        if cur == dest:
            continue

        if ((counters[dest] < ave + leftclose[dest]) and
                (counters[cur] > ave + leftclose[cur])):
            labels[ind] = dest
            counters[cur] -= 1
            counters[dest] += 1
            distances_close[ind] = 1  # moved
        else:
            cp = transfer.get((dest, cur), [])
            while len(cp) > 0:
                g, destind = cp[0]
                if distances_close[destind]:
                    del cp[0]
                else:
                    break
            if len(cp) > 0:
                g, destind = cp[0]
                if g + gain < 0:
                    del cp[0]
                    labels[ind] = dest
                    labels[destind] = cur
                    add = False
                    distances_close[ind] = 1  # moved
                    distances_close[destind] = 1  # moved
                else:
                    add = True
            else:
                add = True

            if add:
                # We add the point to the list of points willing to transfer.
                if (cur, dest) not in transfer:
                    transfer[cur, dest] = []
                bisect.insort(transfer[cur, dest], (gain, ind))

    neg = (counters < ave).sum()
    if neg > 0:
        raise RuntimeError(  # pragma: no cover
            "The algorithm failed, counters={0}".format(counters))

    _switch_clusters(labels, distances)
    distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

    return distances

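# Illustrative note on the transfer list above: if a point *a* in cluster 0
# wants to join cluster 1 with gain ``g_a = 2.0`` (a loss, so it must wait),
# it is stored under ``transfer[0, 1]``. When a later point *b* in cluster 1
# wants cluster 0 with gain ``g_b = -3.0``, the pair is swapped because
# ``g_a + g_b = -1.0 < 0``: the exchange lowers the total cost while leaving
# both cluster sizes unchanged.
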
def _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia,
                               it, max_iter, verbose=0, state=None,
                               learning_rate=1., history=False, fLOG=None):
    """
    Runs the k-means iterations but weights clusters among themselves.

    :param X: features
    :param labels: initialized labels (unused)
    :param sample_weight: sample weight
    :param centers: initialized centers
    :param inertia: initialized inertia (unused)
    :param it: number of iterations already done
    :param max_iter: maximum number of iterations
    :param verbose: verbose
    :param state: random state
    :param learning_rate: learning rate
    :param history: keeps all centers across iterations
    :param fLOG: logging function (needs to be specified otherwise
        verbose has no effect)
    :return: tuple (best_labels, best_centers, best_inertia,
        best_weights, it, all_centers)

    :githublink:`%|py|509`
    """
    if isinstance(X, DataFrame):
        X = X.values
    n_clusters = centers.shape[0]
    best_inertia = None
    best_iter = None
    weights = numpy.ones(centers.shape[0])

    if sample_weight is None:
        sw = numpy.ones((X.shape[0],))
    else:
        sw = sample_weight

    if scipy.sparse.issparse(X):
        _centers_fct = _centers_sparse
    else:
        _centers_fct = _centers_dense

    total_inertia = _inertia(X, sw)
    all_centers = []

    while it < max_iter:
        # compute new clusters
        centers = _centers_fct(
            X, sw, labels, n_clusters, None)
        if history:
            all_centers.append(centers)

        # association
        labels = _constraint_association_weights(X, centers, sw, weights)
        if len(set(labels)) != centers.shape[0]:
            if verbose and fLOG:  # pragma: no cover
                if isinstance(verbose, int) and verbose >= 10:
                    fLOG("CKMeans new weights: w=%r" % weights)
                else:
                    fLOG("CKMeans new weights")
            weights[:] = 1
            labels = _constraint_association_weights(X, centers, sw, weights)

        # inertia
        inertia, diff = _labels_inertia_weights(
            X, centers, sw, weights, labels, total_inertia)
        if numpy.isnan(inertia):
            raise RuntimeError(  # pragma: no cover
                "nanNobs={} Nclus={}\ninertia={}\nweights={}\ndiff={}"
                "\nlabels={}".format(
                    X.shape[0], centers.shape[0], inertia, weights,
                    diff, set(labels)))

        # best option so far?
        if best_inertia is None or inertia < best_inertia:
            best_inertia = inertia
            best_centers = centers.copy()
            best_labels = labels.copy()
            best_weights = weights.copy()
            best_iter = it

        # moves weights
        weights, _ = _adjust_weights(X, sw, weights, labels,
                                     learning_rate / (it + 10))

        it += 1
        if verbose and fLOG:
            if isinstance(verbose, int) and verbose >= 10:
                fLOG("CKMeans %d/%d inertia=%f (%f T=%f) dw=%r w=%r" % (
                    it, max_iter, inertia, best_inertia, total_inertia,
                    diff, weights))
            elif isinstance(verbose, int) and verbose >= 5:
                hist = Counter(labels)
                fLOG("CKMeans %d/%d inertia=%f (%f) hist=%r" % (
                    it, max_iter, inertia, best_inertia, hist))
            else:
                fLOG("CKMeans %d/%d inertia=%f (%f T=%f)" % (  # pragma: no cover
                    it, max_iter, inertia, best_inertia, total_inertia))

        # early stop
        if (best_inertia is not None and inertia >= best_inertia and
                it > best_iter + 5 and
                numpy.abs(diff).sum() <= weights.shape[0] / 2):
            break

    return (best_labels, best_centers, best_inertia, best_weights,
            it, all_centers)

def _constraint_association_weights(X, centers, sw, weights):
    """
    Associates points to clusters.

    :param X: features
    :param centers: centers
    :param sw: sample weights
    :param weights: cluster weights
    :return: labels

    :githublink:`%|py|601`
    """
    dist = cdist(X, centers) * weights.reshape((1, -1))
    index = numpy.argmin(dist, axis=1)
    return index

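# Sketch: the cluster weight scales the distance, so a weight above 1 makes
# a cluster look farther away and attract fewer points (``sw`` is unused by
# this helper).
#
#   import numpy
#   X = numpy.array([[0.0], [1.0], [2.0]])
#   centers = numpy.array([[0.0], [2.0]])
#   weights = numpy.array([3.0, 1.0])   # cluster 0 is penalized
#   _constraint_association_weights(X, centers, None, weights)
#   # -> array([0, 1, 1]): the middle point moves to cluster 1
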
def _inertia(X, sw):
    """
    Computes the total weighted inertia.

    :param X: features
    :param sw: sample weights
    :return: inertia

    :githublink:`%|py|614`
    """
    bary = numpy.mean(X, axis=0)
    diff = X - bary
    norm = numpy.linalg.norm(diff, axis=1)
    if sw is not None:
        norm *= sw
    # sum of (weighted) distances to the barycentre
    return norm.sum()

def _labels_inertia_weights(X, centers, sw, weights, labels, total_inertia):
    """
    Computes the weighted inertia. It also adds a fraction of the
    whole inertia depending on how balanced the clusters are.

    :param X: features
    :param centers: centers
    :param sw: sample weights
    :param weights: cluster weights
    :param labels: labels
    :param total_inertia: total inertia
    :return: (inertia, weight differences per cluster)

    :githublink:`%|py|636`
    """
    www, exp, _ = _compute_balance(X, sw, labels, centers.shape[0])
    www -= exp
    wwwa = numpy.square(www)
    ratio = wwwa.sum() ** 0.5 / X.shape[0]
    dist = cdist(X, centers) * weights.reshape((1, -1)) * sw.reshape((-1, 1))
    return dist.sum() + ratio * total_inertia, www

def _compute_balance(X, sw, labels, nbc=None):
    """
    Computes weights difference.

    :param X: features
    :param sw: sample weights
    :param labels: known labels
    :param nbc: number of clusters
    :return: (weights per cluster, expected weight per cluster,
        number of clusters)

    :githublink:`%|py|655`
    """
    if nbc is None:
        nbc = labels.max() + 1
    N = numpy.float64(nbc)
    www = numpy.zeros((nbc, ), dtype=numpy.float64)
    if sw is None:
        for la in labels:
            www[la] += 1
    else:
        for w, la in zip(sw, labels):
            www[la] += w
    exp = www.sum() / N
    return www, exp, N

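# Worked example (illustrative): 5 unweighted points with labels
# ``[0, 0, 0, 1, 1]`` give ``www = [3., 2.]``, an expected weight per
# cluster of ``exp = 5 / 2 = 2.5`` and ``N = 2``.
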
def _adjust_weights(X, sw, weights, labels, lr):
    """
    Changes *weights* mapped to every cluster: clusters bigger
    than average get *weights > 1* (which pushes points away),
    smaller clusters get *weights < 1*.

    :param X: features
    :param sw: sample weights
    :param weights: cluster weights
    :param labels: known labels
    :param lr: learning rate
    :return: (normalized weights, weights per cluster)

    :githublink:`%|py|683`
    """
    www, exp, N = _compute_balance(X, sw, labels, weights.shape[0])
    for i in range(0, weights.shape[0]):
        nw = (www[i] - exp) / exp
        delta = nw * lr
        weights[i] += delta
        N += delta
    return weights / N * weights.shape[0], www

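# Worked update (illustrative): with ``www = [3., 2.]`` and ``exp = 2.5``
# as above and ``lr = 0.1``, the relative imbalances are
# ``(3 - 2.5) / 2.5 = 0.2`` and ``(2 - 2.5) / 2.5 = -0.2``, so the weights
# move from ``[1., 1.]`` to ``[1.02, 0.98]``: the bigger cluster gets the
# larger weight and becomes less attractive in
# ``_constraint_association_weights``.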