Coverage for mlinsights/mlmodel/_kmeans_constraint

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Implémente la classe @see cl ConstraintKMeans.

5"""

6import bisect

7from collections import Counter

8from pandas import DataFrame

9import numpy

10import scipy.sparse

11from scipy.spatial.distance import cdist

12from sklearn.metrics.pairwise import euclidean_distances

13from sklearn.utils.extmath import row_norms

14from ._kmeans_022 import (

15 _centers_dense, _centers_sparse, _labels_inertia_skl)

18def linearize_matrix(mat, *adds):

19 """

20 Linearizes a matrix into a new one

21 with 3 columns value, row, column.

22 The output format is similar to

23 :epkg:`csr_matrix` but null values are kept.

25 @param mat matrix

26 @param adds additional square matrices

27 @return new matrix

29 *adds* defines additional matrices, it adds

30 columns on the right side and fill them with

31 the corresponding value taken into the additional

32 matrices.

33 """

34 if scipy.sparse.issparse(mat):

35 if isinstance(mat, scipy.sparse.csr_matrix):

36 max_row = mat.shape[0]

37 res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)

38 row = 0

39 for i, v in enumerate(mat.data):

40 while row < max_row and i >= mat.indptr[row]:

41 row += 1

42 res[i, 0] = v

43 a, b = row - 1, mat.indices[i]

44 res[i, 1] = a

45 res[i, 2] = b

46 for k, am in enumerate(adds):

47 res[i, k + 3] = am[a, b]

48 return res

49 raise NotImplementedError( # pragma: no cover

50 "This kind of sparse matrix is not handled: {0}".format(type(mat)))

51 else:

52 n = mat.shape[0]

53 c = mat.shape[1]

54 ic = numpy.arange(mat.shape[1])

55 res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)

56 for i in range(0, n):

57 a = i * c

58 b = (i + 1) * c

59 res[a:b, 1] = i

60 res[a:b, 2] = ic

61 res[:, 0] = mat.ravel()

62 for k, am in enumerate(adds):

63 res[:, 3 + k] = am.ravel()

64 return res

67def constraint_kmeans(X, labels, sample_weight, centers, inertia,

68 iter, max_iter, # pylint: disable=W0622

69 strategy='gain', verbose=0, state=None,

70 learning_rate=1., history=False, fLOG=None):

71 """

72 Completes the constraint :epkg:`k-means`.

74 @param X features

75 @param labels initialized labels (unused)

76 @param sample_weight sample weight

77 @param centers initialized centers

78 @param inertia initialized inertia (unused)

79 @param iter number of iteration already done

80 @param max_iter maximum of number of iteration

81 @param strategy strategy used to sort observations before

82 mapping them to clusters

83 @param verbose verbose

84 @param state random state

85 @param learning_rate used by strategy `'weights'`

86 @param history return list of centers accross iterations

87 @param fLOG logging function (needs to be specified otherwise

88 verbose has no effects)

89 @return tuple (best_labels, best_centers, best_inertia,

90 iter, all_centers)

91 """

92 if labels.dtype != numpy.int32:

93 raise TypeError( # pragma: no cover

94 "Labels must be an array of int not '{0}'".format(labels.dtype))

96 if strategy == 'weights':

97 return _constraint_kmeans_weights(

98 X, labels, sample_weight, centers, inertia, iter,

99 max_iter, verbose=verbose, state=state,

100 learning_rate=learning_rate, history=history, fLOG=fLOG)

101 else:

102 if isinstance(X, DataFrame):

103 X = X.values

104 x_squared_norms = row_norms(X, squared=True)

105 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)

106 limit = X.shape[0] // centers.shape[0]

107 leftover = X.shape[0] - limit * centers.shape[0]

108 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)

109 n_clusters = centers.shape[0]

110 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)

111 best_inertia = None

112 best_iter = None

113 all_centers = []

114

115 # association

116 _constraint_association(leftover, counters, labels, leftclose, distances_close,

117 centers, X, x_squared_norms, limit, strategy, state=state)

118

119 if sample_weight is None:

120 sw = numpy.ones((X.shape[0],))

121 else:

122 sw = sample_weight

123

124 if scipy.sparse.issparse(X):

125 _centers_fct = _centers_sparse

126 else:

127 _centers_fct = _centers_dense

128

129 while iter < max_iter:

130 # compute new clusters

131 centers = _centers_fct(

132 X, sw, labels, n_clusters, distances_close)

133

134 if history:

135 all_centers.append(centers)

136

137 # association

138 _constraint_association(

139 leftover, counters, labels, leftclose, distances_close,

140 centers, X, x_squared_norms, limit, strategy, state=state)

141

142 # inertia

143 _, inertia = _labels_inertia_skl(

144 X=X, sample_weight=sw, x_squared_norms=x_squared_norms,

145 centers=centers, distances=distances_close)

146

147 iter += 1

148 if verbose and fLOG: # pragma: no cover

149 fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

150

151 # best option so far?

152 if best_inertia is None or inertia < best_inertia:

153 best_inertia = inertia

154 best_centers = centers.copy()

155 best_labels = labels.copy()

156 best_iter = iter

157

158 # early stop

159 if (best_inertia is not None and inertia >= best_inertia and

160 iter > best_iter + 5):

161 break

162

163 return (best_labels, best_centers, best_inertia, None,

164 iter, all_centers)

165

166

167def constraint_predictions(X, centers, strategy, state=None):

168 """

169 Computes the predictions but tries

170 to associates the same numbers of points

171 in each cluster.

172

173 @param X features

174 @param centers centers of each clusters

175 @param strategy strategy used to sort point before

176 mapping them to a cluster

177 @param state random state

178 @return labels, distances, distances_close

179 """

180 if isinstance(X, DataFrame):

181 X = X.values

182 x_squared_norms = row_norms(X, squared=True)

183 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)

184 limit = X.shape[0] // centers.shape[0]

185 leftover = X.shape[0] - limit * centers.shape[0]

186 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)

187 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)

188 labels = numpy.empty((X.shape[0],), dtype=numpy.int32)

189

190 distances = _constraint_association(

191 leftover, counters, labels, leftclose,

192 distances_close, centers, X, x_squared_norms,

193 limit, strategy, state=state)

194

195 return labels, distances, distances_close

196

197

198def _constraint_association(leftover, counters, labels, leftclose, distances_close,

199 centers, X, x_squared_norms, limit, strategy, state=None):

200 """

201 Completes the constraint :epkg:`k-means`.

202

203 @param X features

204 @param labels initialized labels (unused)

205 @param centers initialized centers

206 @param x_squared_norms norm of *X*

207 @param limit number of point to associate per cluster

208 @param leftover number of points to associate at the end

209 @param counters allocated array

210 @param leftclose allocated array

211 @param labels allocated array

212 @param distances_close allocated array

213 @param strategy strategy used to sort point before

214 mapping them to a cluster

215 @param state random state

216 """

217 if strategy in ('distance', 'distance_p'):

218 return _constraint_association_distance(

219 leftover, counters, labels, leftclose, distances_close,

220 centers, X, x_squared_norms, limit, strategy, state=state)

221 if strategy in ('gain', 'gain_p'):

222 return _constraint_association_gain(

223 leftover, counters, labels, leftclose, distances_close,

224 centers, X, x_squared_norms, limit, strategy, state=state)

225 raise ValueError("Unknwon strategy '{0}'.".format(

226 strategy)) # pragma: no cover

227

228

229def _compute_strategy_coefficient(distances, strategy, labels):

230 """

231 Creates a matrix

232 """

233 if strategy in ('gain', 'gain_p'):

234 ar = numpy.arange(distances.shape[0])

235 dist = distances[ar, labels]

236 return distances - dist[:, numpy.newaxis]

237 raise ValueError( # pragma: no cover

238 "Unknwon strategy '{0}'.".format(strategy))

239

240

241def _randomize_index(index, weights):

242 """

243 Randomizes index depending on the value.

244 Swap indexes.

245 Modifies *index*.

246 """

247 maxi = weights.max()

248 mini = weights.min()

249 diff = max(maxi - mini, 1e-5)

250 rand = numpy.random.rand(weights.shape[0])

251 for i in range(1, index.shape[0]):

252 ind1 = index[i - 1]

253 ind2 = index[i]

254 w1 = weights[ind1]

255 w2 = weights[ind2]

256 ratio = abs(w2 - w1) / diff * 0.5

257 if rand[i] >= ratio + 0.5:

258 index[i - 1], index[i] = ind2, ind1

259 weights[i - 1], weights[i] = w2, w1

260

261

262def _switch_clusters(labels, distances):

263 """

264 Tries to switch clusters.

265 Modifies *labels* inplace.

266

267 @param labels labels

268 @param distances distances

269 """

270 perm = numpy.random.permutation(numpy.arange(0, labels.shape[0]))

271 niter = 0

272 modif = 1

273 while modif > 0 and niter < 10:

274 modif = 0

275 niter += 1

276 for i_ in range(labels.shape[0]):

277 for j_ in range(i_ + 1, labels.shape[0]):

278 i = perm[i_]

279 j = perm[j_]

280 c1 = labels[i]

281 c2 = labels[j]

282 if c1 == c2:

283 continue

284 d11 = distances[i, c1]

285 d12 = distances[i, c2]

286 d21 = distances[j, c1]

287 d22 = distances[j, c2]

288 if d11**2 + d22**2 > d21**2 + d12**2:

289 labels[i], labels[j] = c2, c1

290 modif += 1

291

292

293def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close,

294 centers, X, x_squared_norms, limit, strategy, state=None):

295 """

296 Completes the constraint *k-means*,

297 the function sorts points by distance to the closest

298 cluster and associates them into that order.

299 It deals first with the further point and maps it to

300 the closest center.

301

302 @param X features

303 @param labels initialized labels (unused)

304 @param centers initialized centers

305 @param x_squared_norms norm of *X*

306 @param limit number of point to associate per cluster

307 @param leftover number of points to associate at the end

308 @param counters allocated array

309 @param leftclose allocated array

310 @param labels allocated array

311 @param distances_close allocated array

312 @param strategy strategy used to sort point before

313 mapping them to a cluster

314 @param state random state (unused)

315 """

316

317 # initialisation

318 counters[:] = 0

319 leftclose[:] = -1

320 labels[:] = -1

321

322 # distances

323 distances = euclidean_distances(

324 centers, X, Y_norm_squared=x_squared_norms, squared=True)

325 distances = distances.T

326 distances0 = distances.copy()

327 maxi = distances.ravel().max() * 2

328 centers_index = numpy.argsort(distances, axis=1)

329

330 while labels.min() == -1:

331 mini = numpy.min(distances, axis=1)

332 sorted_index = numpy.argsort(mini)

333 _randomize_index(sorted_index, mini)

334

335 nover = leftover

336 for ind in sorted_index:

337 if labels[ind] >= 0:

338 continue

339 for c in centers_index[ind, :]:

340 if counters[c] < limit:

341 # The cluster still accepts new points.

342 counters[c] += 1

343 labels[ind] = c

344 distances[ind, c] = maxi

345 break

346 if nover > 0 and leftclose[c] == -1:

347 # The cluster may accept one point if the number

348 # of clusters does not divide the number of points in X.

349 counters[c] += 1

350 labels[ind] = c

351 nover -= 1

352 leftclose[c] = 0

353 distances[ind, c] = maxi

354 break

355

356 _switch_clusters(labels, distances0)

357 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

358 return distances0

359

360

361def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close,

362 centers, X, x_squared_norms, limit, strategy, state=None):

363 """

364 Completes the constraint *k-means*.

365 Follows the method described in `Same-size k-Means Variation

366 <https://elki-project.github.io/tutorial/same-size_k_means>`_.

367

368 @param X features

369 @param labels initialized labels (unused)

370 @param centers initialized centers

371 @param x_squared_norms norm of *X*

372 @param limit number of points to associate per cluster

373 @param leftover number of points to associate at the end

374 @param counters allocated array

375 @param leftclose allocated array

376 @param labels allocated array

377 @param distances_close allocated array

378 @param strategy strategy used to sort point before

379 mapping them to a cluster

380 @param state random state

381

382 See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_.

383 """

384 # distances

385 distances = euclidean_distances(

386 centers, X, Y_norm_squared=x_squared_norms, squared=True)

387 distances = distances.T

388

389 if strategy == 'gain_p':

390 labels[:] = numpy.argmin(distances, axis=1)

391 else:

392 # We assume labels comes from a previous iteration.

393 pass

394

395 strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)

396 distance_linear = linearize_matrix(distances, strategy_coef)

397 sorted_distances = distance_linear[distance_linear[:, 3].argsort()]

398 distances_close[:] = 0

399

400 # counters

401 ave = limit

402 counters[:] = 0

403 for i in labels:

404 counters[i] += 1

405 leftclose[:] = counters[:] - ave

406 leftclose[leftclose < 0] = 0

407 nover = X.shape[0] - ave * counters.shape[0]

408 sumi = nover - leftclose.sum()

409 if sumi != 0:

410 if state is None:

411 state = numpy.random.RandomState() # pylint: disable=E1101

412

413 def loopf(h, sumi):

414 if sumi < 0 and leftclose[h] > 0: # pylint: disable=R1716

415 sumi -= leftclose[h]

416 leftclose[h] = 0

417 elif sumi > 0 and leftclose[h] == 0:

418 leftclose[h] = 1

419 sumi += 1

420 return sumi

421

422 it = 0

423 while sumi != 0:

424 h = state.randint(0, counters.shape[0])

425 sumi = loopf(h, sumi)

426 it += 1

427 if it > counters.shape[0] * 2:

428 break

429 for h in range(counters.shape[0]):

430 if sumi == 0:

431 break

432 sumi = loopf(h, sumi)

433

434 transfer = {}

435

436 for i in range(0, sorted_distances.shape[0]):

437 gain = sorted_distances[i, 3]

438 ind = int(sorted_distances[i, 1])

439 dest = int(sorted_distances[i, 2])

440 cur = labels[ind]

441 if distances_close[ind]:

442 continue

443 if cur == dest:

444 continue

445 if ((counters[dest] < ave + leftclose[dest]) and

446 (counters[cur] > ave + leftclose[cur])):

447 labels[ind] = dest

448 counters[cur] -= 1

449 counters[dest] += 1

450 distances_close[ind] = 1 # moved

451 else:

452 cp = transfer.get((dest, cur), [])

453 while len(cp) > 0:

454 g, destind = cp[0]

455 if distances_close[destind]:

456 del cp[0]

457 else:

458 break

459 if len(cp) > 0:

460 g, destind = cp[0]

461 if g + gain < 0:

462 del cp[0]

463 labels[ind] = dest

464 labels[destind] = cur

465 add = False

466 distances_close[ind] = 1 # moved

467 distances_close[destind] = 1 # moved

468 else:

469 add = True

470 else:

471 add = True

472 if add:

473 # We add the point to the list of points willing to transfer.

474 if (cur, dest) not in transfer:

475 transfer[cur, dest] = []

476 gain = sorted_distances[i, 3]

477 bisect.insort(transfer[cur, dest], (gain, ind))

478

479 neg = (counters < ave).sum()

480 if neg > 0:

481 raise RuntimeError( # pragma: no cover

482 "The algorithm failed, counters={0}".format(counters))

483

484 _switch_clusters(labels, distances)

485 distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

486

487 return distances

488

489

490def _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia, it,

491 max_iter, verbose=0, state=None, learning_rate=1.,

492 history=False, fLOG=None):

493 """

494 Runs KMeans iterator but weights cluster among them.

495

496 @param X features

497 @param labels initialized labels (unused)

498 @param sample_weight sample weight

499 @param centers initialized centers

500 @param inertia initialized inertia (unused)

501 @param it number of iteration already done

502 @param max_iter maximum of number of iteration

503 @param verbose verbose

504 @param state random state

505 @param learning_rate learning rate

506 @param history keeps all centers accross iterations

507 @param fLOG logging function (needs to be specified otherwise

508 verbose has no effects)

509 @return tuple (best_labels, best_centers, best_inertia, weights, it)

510 """

511 if isinstance(X, DataFrame):

512 X = X.values

513 n_clusters = centers.shape[0]

514 best_inertia = None

515 best_iter = None

516 weights = numpy.ones(centers.shape[0])

517 if sample_weight is None:

518 sw = numpy.ones((X.shape[0],))

519 else:

520 sw = sample_weight

521

522 if scipy.sparse.issparse(X):

523 _centers_fct = _centers_sparse

524 else:

525 _centers_fct = _centers_dense

526

527 total_inertia = _inertia(X, sw)

528 all_centers = []

529

530 while it < max_iter:

531

532 # compute new clusters

533 centers = _centers_fct(

534 X, sw, labels, n_clusters, None)

535 if history:

536 all_centers.append(centers)

537

538 # association

539 labels = _constraint_association_weights(X, centers, sw, weights)

540 if len(set(labels)) != centers.shape[0]:

541 if verbose and fLOG: # pragma: no cover

542 if isinstance(verbose, int) and verbose >= 10:

543 fLOG("CKMeans new weights: w=%r" % weights)

544 else:

545 fLOG("CKMeans new weights")

546 weights[:] = 1

547 labels = _constraint_association_weights(X, centers, sw, weights)

548

549 # inertia

550 inertia, diff = _labels_inertia_weights(

551 X, centers, sw, weights, labels, total_inertia)

552 if numpy.isnan(inertia):

553 raise RuntimeError( # pragma: no cover

554 "nanNobs={} Nclus={}\ninertia={}\nweights={}\ndiff={}\nlabels={}".format(

555 X.shape[0], centers.shape[0], inertia, weights, diff,

556 set(labels)))

557

558 # best option so far?

559 if best_inertia is None or inertia < best_inertia:

560 best_inertia = inertia

561 best_centers = centers.copy()

562 best_labels = labels.copy()

563 best_weights = weights.copy()

564 best_iter = it

565

566 # moves weights

567 weights, _ = _adjust_weights(X, sw, weights, labels,

568 learning_rate / (it + 10))

569

570 it += 1

571 if verbose and fLOG:

572 if isinstance(verbose, int) and verbose >= 10:

573 fLOG("CKMeans %d/%d inertia=%f (%f T=%f) dw=%r w=%r" % (

574 it, max_iter, inertia, best_inertia, total_inertia,

575 diff, weights))

576 elif isinstance(verbose, int) and verbose >= 5:

577 hist = Counter(labels)

578 fLOG("CKMeans %d/%d inertia=%f (%f) hist=%r" % (

579 it, max_iter, inertia, best_inertia, hist))

580 else:

581 fLOG("CKMeans %d/%d inertia=%f (%f T=%f)" % ( # pragma: no cover

582 it, max_iter, inertia, best_inertia, total_inertia))

583

584 # early stop

585 if (best_inertia is not None and inertia >= best_inertia and

586 it > best_iter + 5 and numpy.abs(diff).sum() <= weights.shape[0] / 2):

587 break

588

589 return (best_labels, best_centers, best_inertia, best_weights,

590 it, all_centers)

591

592

593def _constraint_association_weights(X, centers, sw, weights):

594 """

595 Associates points to clusters.

596

597 @param X features

598 @param centers centers

599 @param sw sample weights

600 @param weights cluster weights

601 @return labels

602 """

603 dist = cdist(X, centers) * weights.reshape((1, -1))

604 index = numpy.argmin(dist, axis=1)

605 return index

606

607

608def _inertia(X, sw):

609 """

610 Computes total weighted inertia.

611

612 @param X features

613 @param sw sample weights

614 @return inertia

615 """

616 bary = numpy.mean(X, axis=0)

617 diff = X - bary

618 norm = numpy.linalg.norm(diff, axis=1)

619 if sw is not None:

620 norm *= sw

621 return sw.sum()

622

623

624def _labels_inertia_weights(X, centers, sw, weights, labels, total_inertia):

625 """

626 Computes weighted inertia. It also adds a fraction

627 of the whole inertia depending on how balanced the

628 clusters are.

629

630 @param X features

631 @param centers centers

632 @param sw sample weights

633 @param weights cluster weights

634 @param labels labels

635 @param total_inertia total inertia

636 @return inertia

637 """

638 www, exp, _ = _compute_balance(X, sw, labels, centers.shape[0])

639 www -= exp

640 wwwa = numpy.square(www)

641 ratio = wwwa.sum() ** 0.5 / X.shape[0]

642 dist = cdist(X, centers) * weights.reshape((1, -1)) * sw.reshape((-1, 1))

643 return dist.sum() + ratio * total_inertia, www

644

645

646def _compute_balance(X, sw, labels, nbc=None):

647 """

648 Computes weights difference.

649

650 @param X features

651 @param sw sample weights

652 @param labels known labels

653 @param nbc number of clusters

654 @return (weights per cluster, expected weight,

655 total weight)

656 """

657 if nbc is None:

658 nbc = labels.max() + 1

659 N = numpy.float64(nbc)

660 www = numpy.zeros((nbc, ), dtype=numpy.float64)

661 if sw is None:

662 for la in labels:

663 www[la] += 1

664 else:

665 for w, la in zip(sw, labels):

666 www[la] += w

667 exp = www.sum() / N

668 return www, exp, N

669

670

671def _adjust_weights(X, sw, weights, labels, lr):

672 """

673 Changes *weights* mapped to every cluster.

674 *weights < 1* are used for big clusters,

675 *weights > 1* are used for small clusters.

676

677 @param X features

678 @param centers centers

679 @param sw sample weights

680 @param weights cluster weights

681 @param lr learning rate

682 @param labels known labels

683 @return labels

684 """

685 www, exp, N = _compute_balance(X, sw, labels, weights.shape[0])

686

687 for i in range(0, weights.shape[0]):

688 nw = (www[i] - exp) / exp

689 delta = nw * lr

690 weights[i] += delta

691 N += delta

692

693 return weights / N * weights.shape[0], www

Coverage for mlinsights/mlmodel/_kmeans_constraint_.py : 98%

329 statements

Coverage for mlinsights/mlmodel/_kmeans_constraint_.py : 98%

329 statements 323 run 6 missing 14 excluded

329 statements