Coverage for mlinsights/mlmodel/_kmeans_constraint

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Implémente la classe @see cl ConstraintKMeans.

5"""

6import bisect

7from collections import Counter

8from pandas import DataFrame

9import numpy

10import scipy.sparse

11from scipy.spatial.distance import cdist

12from sklearn.metrics.pairwise import euclidean_distances

13from sklearn.utils.extmath import row_norms

14from ._kmeans_022 import (

15 _centers_dense, _centers_sparse, _labels_inertia_skl)

18def linearize_matrix(mat, *adds):

19 """

20 Linearizes a matrix into a new one

21 with 3 columns value, row, column.

22 The output format is similar to

23 :epkg:`csr_matrix` but null values are kept.

25 @param mat matrix

26 @param adds additional square matrices

27 @return new matrix

29 *adds* defines additional matrices, it adds

30 columns on the right side and fill them with

31 the corresponding value taken into the additional

32 matrices.

33 """

34 if scipy.sparse.issparse(mat):

35 if isinstance(mat, scipy.sparse.csr_matrix):

36 max_row = mat.shape[0]

37 res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)

38 row = 0

39 for i, v in enumerate(mat.data):

40 while row < max_row and i >= mat.indptr[row]:

41 row += 1

42 res[i, 0] = v

43 a, b = row - 1, mat.indices[i]

44 res[i, 1] = a

45 res[i, 2] = b

46 for k, am in enumerate(adds):

47 res[i, k + 3] = am[a, b]

48 return res

49 raise NotImplementedError( # pragma: no cover

50 f"This kind of sparse matrix is not handled: {type(mat)}")

51 else:

52 n = mat.shape[0]

53 c = mat.shape[1]

54 ic = numpy.arange(mat.shape[1])

55 res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)

56 for i in range(0, n):

57 a = i * c

58 b = (i + 1) * c

59 res[a:b, 1] = i

60 res[a:b, 2] = ic

61 res[:, 0] = mat.ravel()

62 for k, am in enumerate(adds):

63 res[:, 3 + k] = am.ravel()

64 return res

67def constraint_kmeans(X, labels, sample_weight, centers, inertia,

68 iter, max_iter, # pylint: disable=W0622

69 strategy='gain', verbose=0, state=None,

70 learning_rate=1., history=False, fLOG=None):

71 """

72 Completes the constraint :epkg:`k-means`.

74 @param X features

75 @param labels initialized labels (unused)

76 @param sample_weight sample weight

77 @param centers initialized centers

78 @param inertia initialized inertia (unused)

79 @param iter number of iteration already done

80 @param max_iter maximum of number of iteration

81 @param strategy strategy used to sort observations before

82 mapping them to clusters

83 @param verbose verbose

84 @param state random state

85 @param learning_rate used by strategy `'weights'`

86 @param history return list of centers accross iterations

87 @param fLOG logging function (needs to be specified otherwise

88 verbose has no effects)

89 @return tuple (best_labels, best_centers, best_inertia,

90 iter, all_centers)

91 """

92 if labels.dtype != numpy.int32:

93 raise TypeError( # pragma: no cover

94 f"Labels must be an array of int not '{labels.dtype}'")

96 if strategy == 'weights':

97 return _constraint_kmeans_weights(

98 X, labels, sample_weight, centers, inertia, iter,

99 max_iter, verbose=verbose, state=state,

100 learning_rate=learning_rate, history=history, fLOG=fLOG)

101 else:

102 if isinstance(X, DataFrame):

103 X = X.values

104 x_squared_norms = row_norms(X, squared=True)

105 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)

106 limit = X.shape[0] // centers.shape[0]

107 leftover = X.shape[0] - limit * centers.shape[0]

108 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)

109 n_clusters = centers.shape[0]

110 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)

111 best_inertia = None

112 best_iter = None

113 all_centers = []

114

115 # association

116 _constraint_association(leftover, counters, labels, leftclose, distances_close,

117 centers, X, x_squared_norms, limit, strategy, state=state)

118

119 if sample_weight is None:

120 sw = numpy.ones((X.shape[0],))

121 else:

122 sw = sample_weight

123

124 if scipy.sparse.issparse(X):

125 _centers_fct = _centers_sparse

126 else:

127 _centers_fct = _centers_dense

128

129 while iter < max_iter:

130 # compute new clusters

131 centers = _centers_fct(

132 X, sw, labels, n_clusters, distances_close)

133

134 if history:

135 all_centers.append(centers)

136

137 # association

138 _constraint_association(

139 leftover, counters, labels, leftclose, distances_close,

140 centers, X, x_squared_norms, limit, strategy, state=state)

141

142 # inertia

143 _, inertia = _labels_inertia_skl(

144 X=X, sample_weight=sw, x_squared_norms=x_squared_norms,

145 centers=centers, distances=distances_close)

146

147 iter += 1

148 if verbose and fLOG: # pragma: no cover

149 fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

150

151 # best option so far?

152 if best_inertia is None or inertia < best_inertia:

153 best_inertia = inertia

154 best_centers = centers.copy()

155 best_labels = labels.copy()

156 best_iter = iter

157

158 # early stop

159 if (best_inertia is not None and inertia >= best_inertia and

160 iter > best_iter + 5):

161 break

162

163 return (best_labels, best_centers, best_inertia, None,

164 iter, all_centers)

165

166

167def constraint_predictions(X, centers, strategy, state=None):

168 """

169 Computes the predictions but tries

170 to associates the same numbers of points

171 in each cluster.

172

173 @param X features

174 @param centers centers of each clusters

175 @param strategy strategy used to sort point before

176 mapping them to a cluster

177 @param state random state

178 @return labels, distances, distances_close

179 """

180 if isinstance(X, DataFrame):

181 X = X.values

182 x_squared_norms = row_norms(X, squared=True)

183 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)

184 limit = X.shape[0] // centers.shape[0]

185 leftover = X.shape[0] - limit * centers.shape[0]

186 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)

187 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)

188 labels = numpy.empty((X.shape[0],), dtype=numpy.int32)

189

190 distances = _constraint_association(

191 leftover, counters, labels, leftclose,

192 distances_close, centers, X, x_squared_norms,

193 limit, strategy, state=state)

194

195 return labels, distances, distances_close

196

197

198def _constraint_association(leftover, counters, labels, leftclose, distances_close,

199 centers, X, x_squared_norms, limit, strategy, state=None):

200 """

201 Completes the constraint :epkg:`k-means`.

202

203 @param X features

204 @param labels initialized labels (unused)

205 @param centers initialized centers

206 @param x_squared_norms norm of *X*

207 @param limit number of point to associate per cluster

208 @param leftover number of points to associate at the end

209 @param counters allocated array

210 @param leftclose allocated array

211 @param labels allocated array

212 @param distances_close allocated array

213 @param strategy strategy used to sort point before

214 mapping them to a cluster

215 @param state random state

216 """

217 if strategy in ('distance', 'distance_p'):

218 return _constraint_association_distance(

219 leftover, counters, labels, leftclose, distances_close,

220 centers, X, x_squared_norms, limit, strategy, state=state)

221 if strategy in ('gain', 'gain_p'):

222 return _constraint_association_gain(

223 leftover, counters, labels, leftclose, distances_close,

224 centers, X, x_squared_norms, limit, strategy, state=state)

225 raise ValueError(f"Unknwon strategy '{strategy}'.") # pragma: no cover

226

227

228def _compute_strategy_coefficient(distances, strategy, labels):

229 """

230 Creates a matrix

231 """

232 if strategy in ('gain', 'gain_p'):

233 ar = numpy.arange(distances.shape[0])

234 dist = distances[ar, labels]

235 return distances - dist[:, numpy.newaxis]

236 raise ValueError( # pragma: no cover

237 f"Unknwon strategy '{strategy}'.")

238

239

240def _randomize_index(index, weights):

241 """

242 Randomizes index depending on the value.

243 Swap indexes.

244 Modifies *index*.

245 """

246 maxi = weights.max()

247 mini = weights.min()

248 diff = max(maxi - mini, 1e-5)

249 rand = numpy.random.rand(weights.shape[0])

250 for i in range(1, index.shape[0]):

251 ind1 = index[i - 1]

252 ind2 = index[i]

253 w1 = weights[ind1]

254 w2 = weights[ind2]

255 ratio = abs(w2 - w1) / diff * 0.5

256 if rand[i] >= ratio + 0.5:

257 index[i - 1], index[i] = ind2, ind1

258 weights[i - 1], weights[i] = w2, w1

259

260

261def _switch_clusters(labels, distances):

262 """

263 Tries to switch clusters.

264 Modifies *labels* inplace.

265

266 @param labels labels

267 @param distances distances

268 """

269 perm = numpy.random.permutation(numpy.arange(0, labels.shape[0]))

270 niter = 0

271 modif = 1

272 while modif > 0 and niter < 10:

273 modif = 0

274 niter += 1

275 for i_ in range(labels.shape[0]):

276 for j_ in range(i_ + 1, labels.shape[0]):

277 i = perm[i_]

278 j = perm[j_]

279 c1 = labels[i]

280 c2 = labels[j]

281 if c1 == c2:

282 continue

283 d11 = distances[i, c1]

284 d12 = distances[i, c2]

285 d21 = distances[j, c1]

286 d22 = distances[j, c2]

287 if d11**2 + d22**2 > d21**2 + d12**2:

288 labels[i], labels[j] = c2, c1

289 modif += 1

290

291

292def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close,

293 centers, X, x_squared_norms, limit, strategy, state=None):

294 """

295 Completes the constraint *k-means*,

296 the function sorts points by distance to the closest

297 cluster and associates them into that order.

298 It deals first with the further point and maps it to

299 the closest center.

300

301 @param X features

302 @param labels initialized labels (unused)

303 @param centers initialized centers

304 @param x_squared_norms norm of *X*

305 @param limit number of point to associate per cluster

306 @param leftover number of points to associate at the end

307 @param counters allocated array

308 @param leftclose allocated array

309 @param labels allocated array

310 @param distances_close allocated array

311 @param strategy strategy used to sort point before

312 mapping them to a cluster

313 @param state random state (unused)

314 """

315

316 # initialisation

317 counters[:] = 0

318 leftclose[:] = -1

319 labels[:] = -1

320

321 # distances

322 distances = euclidean_distances(

323 centers, X, Y_norm_squared=x_squared_norms, squared=True)

324 distances = distances.T

325 distances0 = distances.copy()

326 maxi = distances.ravel().max() * 2

327 centers_index = numpy.argsort(distances, axis=1)

328

329 while labels.min() == -1:

330 mini = numpy.min(distances, axis=1)

331 sorted_index = numpy.argsort(mini)

332 _randomize_index(sorted_index, mini)

333

334 nover = leftover

335 for ind in sorted_index:

336 if labels[ind] >= 0:

337 continue

338 for c in centers_index[ind, :]:

339 if counters[c] < limit:

340 # The cluster still accepts new points.

341 counters[c] += 1

342 labels[ind] = c

343 distances[ind, c] = maxi

344 break

345 if nover > 0 and leftclose[c] == -1:

346 # The cluster may accept one point if the number

347 # of clusters does not divide the number of points in X.

348 counters[c] += 1

349 labels[ind] = c

350 nover -= 1

351 leftclose[c] = 0

352 distances[ind, c] = maxi

353 break

354

355 _switch_clusters(labels, distances0)

356 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

357 return distances0

358

359

360def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close,

361 centers, X, x_squared_norms, limit, strategy, state=None):

362 """

363 Completes the constraint *k-means*.

364 Follows the method described in `Same-size k-Means Variation

365 <https://elki-project.github.io/tutorial/same-size_k_means>`_.

366

367 @param X features

368 @param labels initialized labels (unused)

369 @param centers initialized centers

370 @param x_squared_norms norm of *X*

371 @param limit number of points to associate per cluster

372 @param leftover number of points to associate at the end

373 @param counters allocated array

374 @param leftclose allocated array

375 @param labels allocated array

376 @param distances_close allocated array

377 @param strategy strategy used to sort point before

378 mapping them to a cluster

379 @param state random state

380

381 See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_.

382 """

383 # distances

384 distances = euclidean_distances(

385 centers, X, Y_norm_squared=x_squared_norms, squared=True)

386 distances = distances.T

387

388 if strategy == 'gain_p':

389 labels[:] = numpy.argmin(distances, axis=1)

390 else:

391 # We assume labels comes from a previous iteration.

392 pass

393

394 strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)

395 distance_linear = linearize_matrix(distances, strategy_coef)

396 sorted_distances = distance_linear[distance_linear[:, 3].argsort()]

397 distances_close[:] = 0

398

399 # counters

400 ave = limit

401 counters[:] = 0

402 for i in labels:

403 counters[i] += 1

404 leftclose[:] = counters[:] - ave

405 leftclose[leftclose < 0] = 0

406 nover = X.shape[0] - ave * counters.shape[0]

407 sumi = nover - leftclose.sum()

408 if sumi != 0:

409 if state is None:

410 state = numpy.random.RandomState() # pylint: disable=E1101

411

412 def loopf(h, sumi):

413 if sumi < 0 and leftclose[h] > 0: # pylint: disable=R1716

414 sumi -= leftclose[h]

415 leftclose[h] = 0

416 elif sumi > 0 and leftclose[h] == 0:

417 leftclose[h] = 1

418 sumi += 1

419 return sumi

420

421 it = 0

422 while sumi != 0:

423 h = state.randint(0, counters.shape[0])

424 sumi = loopf(h, sumi)

425 it += 1

426 if it > counters.shape[0] * 2:

427 break

428 for h in range(counters.shape[0]):

429 if sumi == 0:

430 break

431 sumi = loopf(h, sumi)

432

433 transfer = {}

434

435 for i in range(0, sorted_distances.shape[0]):

436 gain = sorted_distances[i, 3]

437 ind = int(sorted_distances[i, 1])

438 dest = int(sorted_distances[i, 2])

439 cur = labels[ind]

440 if distances_close[ind]:

441 continue

442 if cur == dest:

443 continue

444 if ((counters[dest] < ave + leftclose[dest]) and

445 (counters[cur] > ave + leftclose[cur])):

446 labels[ind] = dest

447 counters[cur] -= 1

448 counters[dest] += 1

449 distances_close[ind] = 1 # moved

450 else:

451 cp = transfer.get((dest, cur), [])

452 while len(cp) > 0:

453 g, destind = cp[0]

454 if distances_close[destind]:

455 del cp[0]

456 else:

457 break

458 if len(cp) > 0:

459 g, destind = cp[0]

460 if g + gain < 0:

461 del cp[0]

462 labels[ind] = dest

463 labels[destind] = cur

464 add = False

465 distances_close[ind] = 1 # moved

466 distances_close[destind] = 1 # moved

467 else:

468 add = True

469 else:

470 add = True

471 if add:

472 # We add the point to the list of points willing to transfer.

473 if (cur, dest) not in transfer:

474 transfer[cur, dest] = []

475 gain = sorted_distances[i, 3]

476 bisect.insort(transfer[cur, dest], (gain, ind))

477

478 neg = (counters < ave).sum()

479 if neg > 0:

480 raise RuntimeError( # pragma: no cover

481 f"The algorithm failed, counters={counters}")

482

483 _switch_clusters(labels, distances)

484 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

485

486 return distances

487

488

489def _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia, it,

490 max_iter, verbose=0, state=None, learning_rate=1.,

491 history=False, fLOG=None):

492 """

493 Runs KMeans iterator but weights cluster among them.

494

495 @param X features

496 @param labels initialized labels (unused)

497 @param sample_weight sample weight

498 @param centers initialized centers

499 @param inertia initialized inertia (unused)

500 @param it number of iteration already done

501 @param max_iter maximum of number of iteration

502 @param verbose verbose

503 @param state random state

504 @param learning_rate learning rate

505 @param history keeps all centers accross iterations

506 @param fLOG logging function (needs to be specified otherwise

507 verbose has no effects)

508 @return tuple (best_labels, best_centers, best_inertia, weights, it)

509 """

510 if isinstance(X, DataFrame):

511 X = X.values

512 n_clusters = centers.shape[0]

513 best_inertia = None

514 best_iter = None

515 weights = numpy.ones(centers.shape[0])

516 if sample_weight is None:

517 sw = numpy.ones((X.shape[0],))

518 else:

519 sw = sample_weight

520

521 if scipy.sparse.issparse(X):

522 _centers_fct = _centers_sparse

523 else:

524 _centers_fct = _centers_dense

525

526 total_inertia = _inertia(X, sw)

527 all_centers = []

528

529 while it < max_iter:

530

531 # compute new clusters

532 centers = _centers_fct(

533 X, sw, labels, n_clusters, None)

534 if history:

535 all_centers.append(centers)

536

537 # association

538 labels = _constraint_association_weights(X, centers, sw, weights)

539 if len(set(labels)) != centers.shape[0]:

540 if verbose and fLOG: # pragma: no cover

541 if isinstance(verbose, int) and verbose >= 10:

542 fLOG(f"CKMeans new weights: w={weights!r}")

543 else:

544 fLOG("CKMeans new weights")

545 weights[:] = 1

546 labels = _constraint_association_weights(X, centers, sw, weights)

547

548 # inertia

549 inertia, diff = _labels_inertia_weights(

550 X, centers, sw, weights, labels, total_inertia)

551 if numpy.isnan(inertia):

552 raise RuntimeError( # pragma: no cover

553 f"nanNobs={X.shape[0]} Nclus={centers.shape[0]}\n"

554 f"inertia={inertia}\nweights={weights}\ndiff={diff}\n"

555 f"labels={set(labels)}")

556

557 # best option so far?

558 if best_inertia is None or inertia < best_inertia:

559 best_inertia = inertia

560 best_centers = centers.copy()

561 best_labels = labels.copy()

562 best_weights = weights.copy()

563 best_iter = it

564

565 # moves weights

566 weights, _ = _adjust_weights(X, sw, weights, labels,

567 learning_rate / (it + 10))

568

569 it += 1

570 if verbose and fLOG:

571 if isinstance(verbose, int) and verbose >= 10:

572 fLOG("CKMeans %d/%d inertia=%f (%f T=%f) dw=%r w=%r" % (

573 it, max_iter, inertia, best_inertia, total_inertia,

574 diff, weights))

575 elif isinstance(verbose, int) and verbose >= 5:

576 hist = Counter(labels)

577 fLOG("CKMeans %d/%d inertia=%f (%f) hist=%r" % (

578 it, max_iter, inertia, best_inertia, hist))

579 else:

580 fLOG("CKMeans %d/%d inertia=%f (%f T=%f)" % ( # pragma: no cover

581 it, max_iter, inertia, best_inertia, total_inertia))

582

583 # early stop

584 if (best_inertia is not None and inertia >= best_inertia and

585 it > best_iter + 5 and numpy.abs(diff).sum() <= weights.shape[0] / 2):

586 break

587

588 return (best_labels, best_centers, best_inertia, best_weights,

589 it, all_centers)

590

591

592def _constraint_association_weights(X, centers, sw, weights):

593 """

594 Associates points to clusters.

595

596 @param X features

597 @param centers centers

598 @param sw sample weights

599 @param weights cluster weights

600 @return labels

601 """

602 dist = cdist(X, centers) * weights.reshape((1, -1))

603 index = numpy.argmin(dist, axis=1)

604 return index

605

606

607def _inertia(X, sw):

608 """

609 Computes total weighted inertia.

610

611 @param X features

612 @param sw sample weights

613 @return inertia

614 """

615 bary = numpy.mean(X, axis=0)

616 diff = X - bary

617 norm = numpy.linalg.norm(diff, axis=1)

618 if sw is not None:

619 norm *= sw

620 return sw.sum()

621

622

623def _labels_inertia_weights(X, centers, sw, weights, labels, total_inertia):

624 """

625 Computes weighted inertia. It also adds a fraction

626 of the whole inertia depending on how balanced the

627 clusters are.

628

629 @param X features

630 @param centers centers

631 @param sw sample weights

632 @param weights cluster weights

633 @param labels labels

634 @param total_inertia total inertia

635 @return inertia

636 """

637 www, exp, _ = _compute_balance(X, sw, labels, centers.shape[0])

638 www -= exp

639 wwwa = numpy.square(www)

640 ratio = wwwa.sum() ** 0.5 / X.shape[0]

641 dist = cdist(X, centers) * weights.reshape((1, -1)) * sw.reshape((-1, 1))

642 return dist.sum() + ratio * total_inertia, www

643

644

645def _compute_balance(X, sw, labels, nbc=None):

646 """

647 Computes weights difference.

648

649 @param X features

650 @param sw sample weights

651 @param labels known labels

652 @param nbc number of clusters

653 @return (weights per cluster, expected weight,

654 total weight)

655 """

656 if nbc is None:

657 nbc = labels.max() + 1

658 N = numpy.float64(nbc)

659 www = numpy.zeros((nbc, ), dtype=numpy.float64)

660 if sw is None:

661 for la in labels:

662 www[la] += 1

663 else:

664 for w, la in zip(sw, labels):

665 www[la] += w

666 exp = www.sum() / N

667 return www, exp, N

668

669

670def _adjust_weights(X, sw, weights, labels, lr):

671 """

672 Changes *weights* mapped to every cluster.

673 *weights < 1* are used for big clusters,

674 *weights > 1* are used for small clusters.

675

676 @param X features

677 @param centers centers

678 @param sw sample weights

679 @param weights cluster weights

680 @param lr learning rate

681 @param labels known labels

682 @return labels

683 """

684 www, exp, N = _compute_balance(X, sw, labels, weights.shape[0])

685

686 for i in range(0, weights.shape[0]):

687 nw = (www[i] - exp) / exp

688 delta = nw * lr

689 weights[i] += delta

690 N += delta

691

692 return weights / N * weights.shape[0], www

Coverage for mlinsights/mlmodel/_kmeans_constraint_.py: 96%

329 statements