Source code for mlinsights.mlmodel._kmeans_constraint_

# -*- coding: utf-8 -*-
"""
Implements the class :class:`ConstraintKMeans
<mlinsights.mlmodel.kmeans_constraint.ConstraintKMeans>`.

:githublink:`%|py|6`
"""
import bisect
from collections import Counter
from pandas import DataFrame
import numpy
import scipy.sparse
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms
from ._kmeans_022 import (
    _centers_dense, _centers_sparse, _labels_inertia_skl)


def linearize_matrix(mat, *adds):
    """
    Linearizes a matrix into a new one
    with 3 columns *value, row, column*.
    The output format is similar to
    :epkg:`csr_matrix` but null values are kept.

    :param mat: matrix
    :param adds: additional square matrices
    :return: new matrix

    *adds* defines additional matrices, it adds
    columns on the right side and fills them with
    the corresponding values taken from the additional matrices.

    :githublink:`%|py|33`
    """
    if scipy.sparse.issparse(mat):
        if isinstance(mat, scipy.sparse.csr_matrix):
            max_row = mat.shape[0]
            res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)
            row = 0
            for i, v in enumerate(mat.data):
                while row < max_row and i >= mat.indptr[row]:
                    row += 1
                res[i, 0] = v
                a, b = row - 1, mat.indices[i]
                res[i, 1] = a
                res[i, 2] = b
                for k, am in enumerate(adds):
                    res[i, k + 3] = am[a, b]
            return res
        raise NotImplementedError(  # pragma: no cover
            "This kind of sparse matrix is not handled: {0}".format(type(mat)))
    else:
        n = mat.shape[0]
        c = mat.shape[1]
        ic = numpy.arange(mat.shape[1])
        res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)
        for i in range(0, n):
            a = i * c
            b = (i + 1) * c
            res[a:b, 1] = i
            res[a:b, 2] = ic
        res[:, 0] = mat.ravel()
        for k, am in enumerate(adds):
            res[:, 3 + k] = am.ravel()
        return res

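# A minimal usage sketch for ``linearize_matrix`` (illustrative, not part of
# the library): a dense 2x2 matrix becomes one row per cell with the columns
# *value, row, column*.
#
#   >>> import numpy
#   >>> linearize_matrix(numpy.array([[10., 11.], [12., 13.]]))
#   array([[10.,  0.,  0.],
#          [11.,  0.,  1.],
#          [12.,  1.,  0.],
#          [13.,  1.,  1.]])
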
def constraint_kmeans(X, labels, sample_weight, centers, inertia,
                      iter, max_iter,  # pylint: disable=W0622
                      strategy='gain', verbose=0, state=None,
                      learning_rate=1., history=False, fLOG=None):
    """
    Completes the constraint :epkg:`k-means`.

    :param X: features
    :param labels: initialized labels (unused)
    :param sample_weight: sample weight
    :param centers: initialized centers
    :param inertia: initialized inertia (unused)
    :param iter: number of iterations already done
    :param max_iter: maximum number of iterations
    :param strategy: strategy used to sort observations before
        mapping them to clusters
    :param verbose: verbose
    :param state: random state
    :param learning_rate: used by strategy `'weights'`
    :param history: return the list of centers across iterations
    :param fLOG: logging function (needs to be specified otherwise
        verbose has no effect)
    :return: tuple (best_labels, best_centers, best_inertia,
        weights (None unless strategy is `'weights'`), iter, all_centers)

    :githublink:`%|py|91`
    """
    if labels.dtype != numpy.int32:
        raise TypeError(  # pragma: no cover
            "Labels must be an array of int not '{0}'".format(labels.dtype))

    if strategy == 'weights':
        return _constraint_kmeans_weights(
            X, labels, sample_weight, centers, inertia, iter, max_iter,
            verbose=verbose, state=state, learning_rate=learning_rate,
            history=history, fLOG=fLOG)
    else:
        if isinstance(X, DataFrame):
            X = X.values
        x_squared_norms = row_norms(X, squared=True)
        counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
        limit = X.shape[0] // centers.shape[0]
        leftover = X.shape[0] - limit * centers.shape[0]
        leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
        n_clusters = centers.shape[0]
        distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
        best_inertia = None
        best_iter = None
        all_centers = []

        # association
        _constraint_association(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)

        if sample_weight is None:
            sw = numpy.ones((X.shape[0],))
        else:
            sw = sample_weight

        if scipy.sparse.issparse(X):
            _centers_fct = _centers_sparse
        else:
            _centers_fct = _centers_dense

        while iter < max_iter:
            # compute new clusters
            centers = _centers_fct(
                X, sw, labels, n_clusters, distances_close)

            if history:
                all_centers.append(centers)

            # association
            _constraint_association(
                leftover, counters, labels, leftclose, distances_close,
                centers, X, x_squared_norms, limit, strategy, state=state)

            # inertia
            _, inertia = _labels_inertia_skl(
                X=X, sample_weight=sw, x_squared_norms=x_squared_norms,
                centers=centers, distances=distances_close)

            iter += 1
            if verbose and fLOG:  # pragma: no cover
                fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

            # best option so far?
            if best_inertia is None or inertia < best_inertia:
                best_inertia = inertia
                best_centers = centers.copy()
                best_labels = labels.copy()
                best_iter = iter

            # early stop
            if (best_inertia is not None and inertia >= best_inertia and
                    iter > best_iter + 5):
                break

        return (best_labels, best_centers, best_inertia, None,
                iter, all_centers)

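# A hedged sketch of how ``constraint_kmeans`` can be driven; in the library
# this is done by the estimator ``ConstraintKMeans``, so the initialization
# below (a plain ``KMeans`` fit) is only an assumption for illustration.
#
#   import numpy
#   from sklearn.cluster import KMeans
#   X = numpy.random.randn(100, 2)
#   km = KMeans(n_clusters=4).fit(X)
#   labels = km.labels_.astype(numpy.int32)   # int32 labels are required
#   best_labels, best_centers, best_inertia, _, n_iter, _ = constraint_kmeans(
#       X, labels, None, km.cluster_centers_, km.inertia_, 0, max_iter=300,
#       strategy='gain')
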
def constraint_predictions(X, centers, strategy, state=None):
    """
    Computes the predictions but tries to associate the same
    number of points to each cluster.

    :param X: features
    :param centers: centers of each cluster
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state
    :return: labels, distances, distances_close

    :githublink:`%|py|179`
    """
    if isinstance(X, DataFrame):
        X = X.values
    x_squared_norms = row_norms(X, squared=True)
    counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    limit = X.shape[0] // centers.shape[0]
    leftover = X.shape[0] - limit * centers.shape[0]
    leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
    labels = numpy.empty((X.shape[0],), dtype=numpy.int32)

    distances = _constraint_association(
        leftover, counters, labels, leftclose, distances_close,
        centers, X, x_squared_norms, limit, strategy, state=state)

    return labels, distances, distances_close

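# Sketch (assuming ``X`` and ``best_centers`` from the previous example):
# the balanced assignment gives every cluster ``X.shape[0] // n_clusters``
# points, plus at most one leftover point each.
#
#   labels, distances, distances_close = constraint_predictions(
#       X, best_centers, strategy='gain')
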
def _constraint_association(leftover, counters, labels, leftclose,
                            distances_close, centers, X, x_squared_norms,
                            limit, strategy, state=None):
    """
    Completes the constraint :epkg:`k-means`.

    :param X: features
    :param centers: initialized centers
    :param x_squared_norms: norm of *X*
    :param limit: number of points to associate per cluster
    :param leftover: number of points to associate at the end
    :param counters: allocated array
    :param leftclose: allocated array
    :param labels: allocated array
    :param distances_close: allocated array
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state

    :githublink:`%|py|216`
    """
    if strategy in ('distance', 'distance_p'):
        return _constraint_association_distance(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)
    if strategy in ('gain', 'gain_p'):
        return _constraint_association_gain(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)
    raise ValueError("Unknown strategy '{0}'.".format(
        strategy))  # pragma: no cover

def _compute_strategy_coefficient(distances, strategy, labels):
    """
    Creates a matrix of moving costs: for every point, the distance
    to each candidate cluster minus the distance to the cluster
    the point currently belongs to.

    :githublink:`%|py|232`
    """
    if strategy in ('gain', 'gain_p'):
        ar = numpy.arange(distances.shape[0])
        dist = distances[ar, labels]
        return distances - dist[:, numpy.newaxis]
    raise ValueError("Unknown strategy '{0}'.".format(strategy))

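# Worked example (illustrative): with two clusters and squared distances
# ``[4.0, 9.0]`` for a point currently labelled 0, the coefficient row is
# ``[4.0 - 4.0, 9.0 - 4.0] = [0.0, 5.0]``: moving the point to cluster 1
# would cost 5.0; a negative entry would mean the move is a gain.
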
def _randomize_index(index, weights):
    """
    Randomizes index depending on the value.
    Swap indexes.
    Modifies *index*.

    :githublink:`%|py|245`
    """
    maxi = weights.max()
    mini = weights.min()
    diff = max(maxi - mini, 1e-5)
    rand = numpy.random.rand(weights.shape[0])
    for i in range(1, index.shape[0]):
        ind1 = index[i - 1]
        ind2 = index[i]
        w1 = weights[ind1]
        w2 = weights[ind2]
        ratio = abs(w2 - w1) / diff * 0.5
        if rand[i] >= ratio + 0.5:
            index[i - 1], index[i] = ind2, ind1
            weights[i - 1], weights[i] = w2, w1

def _switch_clusters(labels, distances):
    """
    Tries to switch clusters.
    Modifies *labels* inplace.

    :param labels: labels
    :param distances: distances

    :githublink:`%|py|268`
    """
    perm = numpy.random.permutation(numpy.arange(0, labels.shape[0]))
    niter = 0
    modif = 1
    while modif > 0 and niter < 10:
        modif = 0
        niter += 1
        for i_ in range(labels.shape[0]):
            for j_ in range(i_ + 1, labels.shape[0]):
                i = perm[i_]
                j = perm[j_]
                c1 = labels[i]
                c2 = labels[j]
                if c1 == c2:
                    continue
                d11 = distances[i, c1]
                d12 = distances[i, c2]
                d21 = distances[j, c1]
                d22 = distances[j, c2]
                # exchange the labels of the two points if that
                # reduces the total assignment cost
                if d11 ** 2 + d22 ** 2 > d21 ** 2 + d12 ** 2:
                    labels[i], labels[j] = c2, c1
                    modif += 1

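# Worked swap (illustrative): for two points with assignment costs
# ``d11 = d22 = 3`` and crossed costs ``d21 = d12 = 1``, the labels are
# exchanged because ``3**2 + 3**2 > 1**2 + 1**2``: both points end up
# closer to their new centers and the cluster sizes are preserved.
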
def _constraint_association_distance(leftover, counters, labels, leftclose,
                                     distances_close, centers, X,
                                     x_squared_norms, limit, strategy,
                                     state=None):
    """
    Completes the constraint *k-means*. The function sorts points
    by distance to the closest cluster and associates them in that
    order, mapping each one to the closest center still available.

    :param X: features
    :param centers: initialized centers
    :param x_squared_norms: norm of *X*
    :param limit: number of points to associate per cluster
    :param leftover: number of points to associate at the end
    :param counters: allocated array
    :param leftclose: allocated array
    :param labels: allocated array
    :param distances_close: allocated array
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state (unused)

    :githublink:`%|py|314`
    """
    # initialisation
    counters[:] = 0
    leftclose[:] = -1
    labels[:] = -1

    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T
    distances0 = distances.copy()
    maxi = distances.ravel().max() * 2
    centers_index = numpy.argsort(distances, axis=1)

    while labels.min() == -1:
        mini = numpy.min(distances, axis=1)
        sorted_index = numpy.argsort(mini)
        _randomize_index(sorted_index, mini)

        nover = leftover
        for ind in sorted_index:
            if labels[ind] >= 0:
                continue
            for c in centers_index[ind, :]:
                if counters[c] < limit:
                    # The cluster still accepts new points.
                    counters[c] += 1
                    labels[ind] = c
                    distances[ind, c] = maxi
                    break
                if nover > 0 and leftclose[c] == -1:
                    # The cluster may accept one more point if the number
                    # of clusters does not divide the number of points in X.
                    counters[c] += 1
                    labels[ind] = c
                    nover -= 1
                    leftclose[c] = 0
                    distances[ind, c] = maxi
                    break

    _switch_clusters(labels, distances0)
    # read from the untouched copy: *distances* was overwritten
    # with the sentinel *maxi* during the assignment
    distances_close[:] = distances0[numpy.arange(X.shape[0]), labels]

    return distances0

def _constraint_association_gain(leftover, counters, labels, leftclose,
                                 distances_close, centers, X,
                                 x_squared_norms, limit, strategy,
                                 state=None):
    """
    Completes the constraint *k-means*.
    Follows the method described in `Same-size k-Means Variation
    <https://elki-project.github.io/tutorial/same-size_k_means>`_.

    :param X: features
    :param centers: initialized centers
    :param x_squared_norms: norm of *X*
    :param limit: number of points to associate per cluster
    :param leftover: number of points to associate at the end
    :param counters: allocated array
    :param leftclose: allocated array
    :param labels: allocated array
    :param distances_close: allocated array
    :param strategy: strategy used to sort points before
        mapping them to a cluster
    :param state: random state

    :githublink:`%|py|382`
    """
    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    if strategy == 'gain_p':
        labels[:] = numpy.argmin(distances, axis=1)
    else:
        # We assume labels comes from a previous iteration.
        pass

    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]
    # reused below as a 'moved' flag for every point
    distances_close[:] = 0

    # counters
    ave = limit
    counters[:] = 0
    for i in labels:
        counters[i] += 1
    leftclose[:] = counters[:] - ave
    leftclose[leftclose < 0] = 0
    nover = X.shape[0] - ave * counters.shape[0]
    sumi = nover - leftclose.sum()
    if sumi != 0:
        if state is None:
            state = numpy.random.RandomState()  # pylint: disable=E1101

        def loopf(h, sumi):
            # keeps the invariant sumi == nover - leftclose.sum():
            # removing allowances increases sumi, adding one decreases it
            if sumi < 0 and leftclose[h] > 0:  # pylint: disable=R1716
                sumi += leftclose[h]
                leftclose[h] = 0
            elif sumi > 0 and leftclose[h] == 0:
                leftclose[h] = 1
                sumi -= 1
            return sumi

        it = 0
        while sumi != 0:
            h = state.randint(0, counters.shape[0])
            sumi = loopf(h, sumi)
            it += 1
            if it > counters.shape[0] * 2:
                break
        for h in range(counters.shape[0]):
            if sumi == 0:
                break
            sumi = loopf(h, sumi)

    transfer = {}

    for i in range(0, sorted_distances.shape[0]):
        gain = sorted_distances[i, 3]
        ind = int(sorted_distances[i, 1])
        dest = int(sorted_distances[i, 2])
        cur = labels[ind]
        if distances_close[ind]:
            continue
        if cur == dest:
            continue

        if ((counters[dest] < ave + leftclose[dest]) and
                (counters[cur] > ave + leftclose[cur])):
            labels[ind] = dest
            counters[cur] -= 1
            counters[dest] += 1
            distances_close[ind] = 1  # moved
        else:
            cp = transfer.get((dest, cur), [])
            while len(cp) > 0:
                g, destind = cp[0]
                if distances_close[destind]:
                    del cp[0]
                else:
                    break
            if len(cp) > 0:
                g, destind = cp[0]
                if g + gain < 0:
                    del cp[0]
                    labels[ind] = dest
                    labels[destind] = cur
                    add = False
                    distances_close[ind] = 1  # moved
                    distances_close[destind] = 1  # moved
                else:
                    add = True
            else:
                add = True

            if add:
                # We add the point to the list of points willing to transfer.
                if (cur, dest) not in transfer:
                    transfer[cur, dest] = []
                bisect.insort(transfer[cur, dest], (gain, ind))

    neg = (counters < ave).sum()
    if neg > 0:
        raise RuntimeError(  # pragma: no cover
            "The algorithm failed, counters={0}".format(counters))

    _switch_clusters(labels, distances)
    distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

    return distances

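# Illustrative note on the transfer list above: if a point *a* in cluster 0
# wants to join cluster 1 with gain ``g_a = 2.0`` (a loss, so it must wait),
# it is stored under ``transfer[0, 1]``. When a later point *b* in cluster 1
# wants cluster 0 with gain ``g_b = -3.0``, the pair is swapped because
# ``g_a + g_b = -1.0 < 0``: the exchange lowers the total cost while leaving
# both cluster sizes unchanged.
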
def _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia,
                               it, max_iter, verbose=0, state=None,
                               learning_rate=1., history=False, fLOG=None):
    """
    Runs the k-means iterations but weights clusters among themselves.

    :param X: features
    :param labels: initialized labels (unused)
    :param sample_weight: sample weight
    :param centers: initialized centers
    :param inertia: initialized inertia (unused)
    :param it: number of iterations already done
    :param max_iter: maximum number of iterations
    :param verbose: verbose
    :param state: random state
    :param learning_rate: learning rate
    :param history: keeps all centers across iterations
    :param fLOG: logging function (needs to be specified otherwise
        verbose has no effect)
    :return: tuple (best_labels, best_centers, best_inertia,
        best_weights, it, all_centers)

    :githublink:`%|py|509`
    """
    if isinstance(X, DataFrame):
        X = X.values
    n_clusters = centers.shape[0]
    best_inertia = None
    best_iter = None
    weights = numpy.ones(centers.shape[0])

    if sample_weight is None:
        sw = numpy.ones((X.shape[0],))
    else:
        sw = sample_weight

    if scipy.sparse.issparse(X):
        _centers_fct = _centers_sparse
    else:
        _centers_fct = _centers_dense

    total_inertia = _inertia(X, sw)
    all_centers = []

    while it < max_iter:
        # compute new clusters
        centers = _centers_fct(
            X, sw, labels, n_clusters, None)
        if history:
            all_centers.append(centers)

        # association
        labels = _constraint_association_weights(X, centers, sw, weights)
        if len(set(labels)) != centers.shape[0]:
            if verbose and fLOG:  # pragma: no cover
                if isinstance(verbose, int) and verbose >= 10:
                    fLOG("CKMeans new weights: w=%r" % weights)
                else:
                    fLOG("CKMeans new weights")
            weights[:] = 1
            labels = _constraint_association_weights(X, centers, sw, weights)

        # inertia
        inertia, diff = _labels_inertia_weights(
            X, centers, sw, weights, labels, total_inertia)
        if numpy.isnan(inertia):
            raise RuntimeError(  # pragma: no cover
                "nanNobs={} Nclus={}\ninertia={}\nweights={}\ndiff={}"
                "\nlabels={}".format(
                    X.shape[0], centers.shape[0], inertia, weights,
                    diff, set(labels)))

        # best option so far?
        if best_inertia is None or inertia < best_inertia:
            best_inertia = inertia
            best_centers = centers.copy()
            best_labels = labels.copy()
            best_weights = weights.copy()
            best_iter = it

        # moves weights
        weights, _ = _adjust_weights(X, sw, weights, labels,
                                     learning_rate / (it + 10))

        it += 1
        if verbose and fLOG:
            if isinstance(verbose, int) and verbose >= 10:
                fLOG("CKMeans %d/%d inertia=%f (%f T=%f) dw=%r w=%r" % (
                    it, max_iter, inertia, best_inertia, total_inertia,
                    diff, weights))
            elif isinstance(verbose, int) and verbose >= 5:
                hist = Counter(labels)
                fLOG("CKMeans %d/%d inertia=%f (%f) hist=%r" % (
                    it, max_iter, inertia, best_inertia, hist))
            else:
                fLOG("CKMeans %d/%d inertia=%f (%f T=%f)" % (  # pragma: no cover
                    it, max_iter, inertia, best_inertia, total_inertia))

        # early stop
        if (best_inertia is not None and inertia >= best_inertia and
                it > best_iter + 5 and
                numpy.abs(diff).sum() <= weights.shape[0] / 2):
            break

    return (best_labels, best_centers, best_inertia, best_weights,
            it, all_centers)

def _constraint_association_weights(X, centers, sw, weights):
    """
    Associates points to clusters.

    :param X: features
    :param centers: centers
    :param sw: sample weights
    :param weights: cluster weights
    :return: labels

    :githublink:`%|py|601`
    """
    dist = cdist(X, centers) * weights.reshape((1, -1))
    index = numpy.argmin(dist, axis=1)
    return index

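# Sketch: the cluster weight scales the distance, so a weight above 1 makes
# a cluster look farther away and attract fewer points (``sw`` is unused by
# this helper).
#
#   import numpy
#   X = numpy.array([[0.0], [1.0], [2.0]])
#   centers = numpy.array([[0.0], [2.0]])
#   weights = numpy.array([3.0, 1.0])   # cluster 0 is penalized
#   _constraint_association_weights(X, centers, None, weights)
#   # -> array([0, 1, 1]): the middle point moves to cluster 1
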
def _inertia(X, sw):
    """
    Computes the total weighted inertia.

    :param X: features
    :param sw: sample weights
    :return: inertia

    :githublink:`%|py|614`
    """
    bary = numpy.mean(X, axis=0)
    diff = X - bary
    norm = numpy.linalg.norm(diff, axis=1)
    if sw is not None:
        norm *= sw
    # sum of (weighted) distances to the barycentre
    return norm.sum()

def _labels_inertia_weights(X, centers, sw, weights, labels, total_inertia):
    """
    Computes the weighted inertia. It also adds a fraction of the
    whole inertia depending on how balanced the clusters are.

    :param X: features
    :param centers: centers
    :param sw: sample weights
    :param weights: cluster weights
    :param labels: labels
    :param total_inertia: total inertia
    :return: (inertia, weight differences per cluster)

    :githublink:`%|py|636`
    """
    www, exp, _ = _compute_balance(X, sw, labels, centers.shape[0])
    www -= exp
    wwwa = numpy.square(www)
    ratio = wwwa.sum() ** 0.5 / X.shape[0]
    dist = cdist(X, centers) * weights.reshape((1, -1)) * sw.reshape((-1, 1))
    return dist.sum() + ratio * total_inertia, www

def _compute_balance(X, sw, labels, nbc=None):
    """
    Computes weights difference.

    :param X: features
    :param sw: sample weights
    :param labels: known labels
    :param nbc: number of clusters
    :return: (weights per cluster, expected weight per cluster,
        number of clusters)

    :githublink:`%|py|655`
    """
    if nbc is None:
        nbc = labels.max() + 1
    N = numpy.float64(nbc)
    www = numpy.zeros((nbc, ), dtype=numpy.float64)
    if sw is None:
        for la in labels:
            www[la] += 1
    else:
        for w, la in zip(sw, labels):
            www[la] += w
    exp = www.sum() / N
    return www, exp, N

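# Worked example (illustrative): 5 unweighted points with labels
# ``[0, 0, 0, 1, 1]`` give ``www = [3., 2.]``, an expected weight per
# cluster of ``exp = 5 / 2 = 2.5`` and ``N = 2``.
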
def _adjust_weights(X, sw, weights, labels, lr):
    """
    Changes *weights* mapped to every cluster: clusters bigger
    than average get *weights > 1* (which pushes points away),
    smaller clusters get *weights < 1*.

    :param X: features
    :param sw: sample weights
    :param weights: cluster weights
    :param labels: known labels
    :param lr: learning rate
    :return: (normalized weights, weights per cluster)

    :githublink:`%|py|683`
    """
    www, exp, N = _compute_balance(X, sw, labels, weights.shape[0])
    for i in range(0, weights.shape[0]):
        nw = (www[i] - exp) / exp
        delta = nw * lr
        weights[i] += delta
        N += delta
    return weights / N * weights.shape[0], www

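# Worked update (illustrative): with ``www = [3., 2.]`` and ``exp = 2.5``
# as above and ``lr = 0.1``, the relative imbalances are
# ``(3 - 2.5) / 2.5 = 0.2`` and ``(2 - 2.5) / 2.5 = -0.2``, so the weights
# move from ``[1., 1.]`` to ``[1.02, 0.98]``: the bigger cluster gets the
# larger weight and becomes less attractive in
# ``_constraint_association_weights``.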