Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# pylint: disable=C0302
2"""
3@file
4@brief Implements k-means with norms L1 and L2.
5"""
6import numpy
7from scipy.sparse import issparse
8# Source: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/cluster/_k_means_fast.pyx
9from sklearn.utils.sparsefuncs_fast import assign_rows_csr # pylint: disable=W0611,E0611
10try:
11 from sklearn.cluster._kmeans import _check_sample_weight
12except ImportError: # pragma: no cover
13 from sklearn.cluster._kmeans import (
14 _check_normalize_sample_weight as _check_sample_weight)
15from sklearn.metrics.pairwise import pairwise_distances_argmin_min
18def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances):
19 """
20 Computes labels and inertia using a full distance matrix.
22 This will overwrite the 'distances' array in-place.
24 :param norm: 'l1' or 'l2'
25 :param X: numpy array, shape (n_sample, n_features)
26 Input data.
27 :param sample_weight: array-like, shape (n_samples,)
28 The weights for each observation in X.
29 :param centers: numpy array, shape (n_clusters, n_features)
30 Cluster centers which data is assigned to.
31 :param distances: numpy array, shape (n_samples,)
32 Pre-allocated array in which distances are stored.
33 :return: labels : numpy array, dtype=numpy.int, shape (n_samples,)
34 Indices of clusters that samples are assigned to.
35 :return: inertia : float
36 Sum of squared distances of samples to their closest
37 cluster center.
38 """
39 n_samples = X.shape[0]
40 if norm == 'l2':
41 labels, mindist = pairwise_distances_argmin_min(
42 X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
43 elif norm == 'l1':
44 labels, mindist = pairwise_distances_argmin_min(
45 X=X, Y=centers, metric='manhattan')
46 else: # pragma no cover
47 raise NotImplementedError(
48 "Not implemented for norm '{}'.".format(norm))
49 # cython k-means code assumes int32 inputs
50 labels = labels.astype(numpy.int32, copy=False)
51 if n_samples == distances.shape[0]:
52 # distances will be changed in-place
53 distances[:] = mindist
54 inertia = (mindist * sample_weight).sum()
55 return labels, inertia
58def _assign_labels_csr(X, sample_weight, x_squared_norms, centers,
59 labels, distances):
60 """Compute label assignment and inertia for a CSR input
61 Return the inertia (sum of squared distances to the centers).
62 """
63 if (distances is not None and
64 distances.shape != (X.shape[0], )):
65 raise ValueError( # pragma: no cover
66 "Dimension mismatch for distance got {}, expecting {}."
67 "".format(distances.shape, (X.shape[0], centers.shape[0])))
68 n_clusters = centers.shape[0]
69 n_samples = X.shape[0]
70 store_distances = 0
71 inertia = 0.0
73 if centers.dtype == numpy.float32:
74 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float32)
75 else:
76 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float64)
78 if n_samples == distances.shape[0]:
79 store_distances = 1
81 for center_idx in range(n_clusters):
82 center_squared_norms[center_idx] = numpy.dot(
83 centers[center_idx, :], centers[center_idx, :])
85 for sample_idx in range(n_samples):
86 min_dist = -1
87 for center_idx in range(n_clusters):
88 dist = 0.0
89 # hardcoded: minimize euclidean distance to cluster center:
90 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
91 dist += X[sample_idx, :] @ centers[center_idx, :].reshape((-1, 1))
92 dist *= -2
93 dist += center_squared_norms[center_idx]
94 dist += x_squared_norms[sample_idx]
95 dist *= sample_weight[sample_idx]
96 if min_dist == -1 or dist < min_dist:
97 min_dist = dist
98 labels[sample_idx] = center_idx
99 if store_distances:
100 distances[sample_idx] = dist
101 inertia += min_dist
103 return inertia
106def _assign_labels_array(X, sample_weight, x_squared_norms, centers,
107 labels, distances):
108 """Compute label assignment and inertia for a dense array
109 Return the inertia (sum of squared distances to the centers).
110 """
111 n_clusters = centers.shape[0]
112 n_samples = X.shape[0]
113 store_distances = 0
114 inertia = 0.0
116 dtype = numpy.float32 if centers.dtype == numpy.float32 else numpy.float64
117 center_squared_norms = numpy.zeros(n_clusters, dtype=dtype)
119 if n_samples == distances.shape[0]:
120 store_distances = 1
122 for center_idx in range(n_clusters):
123 center_squared_norms[center_idx] = numpy.dot(
124 centers[center_idx, :], centers[center_idx, :])
126 for sample_idx in range(n_samples):
127 min_dist = -1
128 for center_idx in range(n_clusters):
129 dist = 0.0
130 # hardcoded: minimize euclidean distance to cluster center:
131 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
132 dist += numpy.dot(X[sample_idx, :], centers[center_idx, :])
133 dist *= -2
134 dist += center_squared_norms[center_idx]
135 dist += x_squared_norms[sample_idx]
136 dist *= sample_weight[sample_idx]
137 if min_dist == -1 or dist < min_dist:
138 min_dist = dist
139 labels[sample_idx] = center_idx
141 if store_distances:
142 distances[sample_idx] = min_dist
143 inertia += min_dist
145 return inertia
148def _labels_inertia_skl(X, sample_weight, x_squared_norms, centers,
149 distances=None):
150 """E step of the K-means EM algorithm.
151 Compute the labels and the inertia of the given samples and centers.
152 This will compute the distances in-place.
154 :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
155 The input samples to assign to the labels.
156 :param sample_weight: array-like, shape (n_samples,)
157 The weights for each observation in X.
158 :param x_squared_norms: array, shape (n_samples,)
159 Precomputed squared euclidean norm of each data point, to speed up
160 computations.
161 :param centers: float array, shape (k, n_features)
162 The cluster centers.
163 :param distances: float array, shape (n_samples,)
164 Pre-allocated array to be filled in with each sample's distance
165 to the closest center.
166 :return: labels, int array of shape(n)
167 The resulting assignment
168 :return: inertia, float
169 Sum of squared distances of samples to their closest cluster center.
170 """
171 n_samples = X.shape[0]
172 sample_weight = _check_sample_weight(sample_weight, X)
173 # set the default value of centers to -1 to be able to detect any anomaly
174 # easily
175 labels = numpy.full(n_samples, -1, numpy.int32)
176 if distances is None:
177 distances = numpy.zeros(shape=(0,), dtype=X.dtype)
178 # distances will be changed in-place
179 if issparse(X):
180 inertia = _assign_labels_csr(
181 X, sample_weight, x_squared_norms, centers, labels,
182 distances=distances)
183 else:
184 inertia = _assign_labels_array(
185 X, sample_weight, x_squared_norms, centers, labels,
186 distances=distances)
187 return labels, inertia
190def _centers_dense(X, sample_weight, labels, n_clusters, distances):
191 """
192 M step of the K-means EM algorithm
193 Computation of cluster centers / means.
195 :param X: array-like, shape (n_samples, n_features)
196 :param sample_weight: array-like, shape (n_samples,)
197 The weights for each observation in X.
198 :param labels: array of integers, shape (n_samples)
199 Current label assignment
200 :param n_clusters: int
201 Number of desired clusters
202 :param distances: array-like, shape (n_samples)
203 Distance to closest cluster for each sample.
204 :return: centers : array, shape (n_clusters, n_features)
205 The resulting centers
206 """
207 n_samples = X.shape[0]
208 n_features = X.shape[1]
210 dtype = X.dtype
211 centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
212 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)
214 for i in range(n_samples):
215 c = labels[i]
216 weight_in_cluster[c] += sample_weight[i]
217 empty_clusters = numpy.where(weight_in_cluster == 0)[0]
218 # maybe also relocate small clusters?
220 if distances is not None and len(empty_clusters):
221 # find points to reassign empty clusters to
222 far_from_centers = distances.argsort()[::-1]
224 for i, cluster_id in enumerate(empty_clusters):
225 far_index = far_from_centers[i]
226 new_center = X[far_index] * sample_weight[far_index]
227 centers[cluster_id] = new_center
228 weight_in_cluster[cluster_id] = sample_weight[far_index]
230 for i in range(n_samples):
231 for j in range(n_features):
232 centers[labels[i], j] += X[i, j] * sample_weight[i]
234 centers /= weight_in_cluster[:, numpy.newaxis]
236 return centers
239def _centers_sparse(X, sample_weight, labels, n_clusters, distances):
240 """
241 M step of the K-means EM algorithm
242 Computation of cluster centers / means.
244 :param X: scipy.sparse.csr_matrix, shape (n_samples, n_features)
245 :param sample_weight: array-like, shape (n_samples,)
246 The weights for each observation in X.
247 :param labels: array of integers, shape (n_samples)
248 Current label assignment
249 :param n_clusters: int
250 Number of desired clusters
251 :param distances: array-like, shape (n_samples)
252 Distance to closest cluster for each sample.
253 :return: centers, array, shape (n_clusters, n_features)
254 The resulting centers
255 """
256 n_samples = X.shape[0]
257 n_features = X.shape[1]
259 data = X.data
260 indices = X.indices
261 indptr = X.indptr
263 dtype = X.dtype
264 centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
265 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)
266 for i in range(n_samples):
267 c = labels[i]
268 weight_in_cluster[c] += sample_weight[i]
269 empty_clusters = numpy.where(weight_in_cluster == 0)[0]
270 n_empty_clusters = empty_clusters.shape[0]
272 # maybe also relocate small clusters?
274 if n_empty_clusters > 0:
275 # find points to reassign empty clusters to
276 far_from_centers = distances.argsort()[::-1][:n_empty_clusters]
277 assign_rows_csr(X, far_from_centers, empty_clusters, centers)
279 for i in range(n_empty_clusters):
280 weight_in_cluster[empty_clusters[i]] = 1
282 for i in range(labels.shape[0]):
283 curr_label = labels[i]
284 for ind in range(indptr[i], indptr[i + 1]):
285 j = indices[ind]
286 centers[curr_label, j] += data[ind] * sample_weight[i]
288 centers /= weight_in_cluster[:, numpy.newaxis]
290 return centers