# -*- coding: utf-8 -*-
"""
@file
@brief Implements the class @see cl ConstraintKMeans.
"""
import bisect
from pandas import DataFrame
import numpy
import scipy.sparse
try:
    from sklearn.cluster._kmeans import _labels_inertia  # pylint: disable=E0611
except ImportError:
    from sklearn.cluster.k_means_ import _labels_inertia  # pylint: disable=E0611
try:
    from sklearn.cluster._k_means_fast import _centers_sparse, _centers_dense
except ImportError:
    from sklearn.cluster._k_means import _centers_sparse, _centers_dense
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms


def linearize_matrix(mat, *adds):
    """
    Linearizes a matrix into a new one
    with 3 columns: value, row, column.
    The output format is similar to
    :epkg:`csr_matrix` but null values are kept.

    @param      mat     matrix
    @param      adds    additional matrices (same shape as *mat*)
    @return             new matrix

    *adds* defines additional matrices: they add
    columns on the right side, filled with
    the corresponding values taken from the additional
    matrices.
    """
    if scipy.sparse.issparse(mat):
        if isinstance(mat, scipy.sparse.csr_matrix):
            max_row = mat.shape[0]
            res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)
            row = 0
            for i, v in enumerate(mat.data):
                while row < max_row and i >= mat.indptr[row]:
                    row += 1
                res[i, 0] = v
                a, b = row - 1, mat.indices[i]
                res[i, 1] = a
                res[i, 2] = b
                for k, am in enumerate(adds):
                    res[i, k + 3] = am[a, b]
            return res
        else:
            raise NotImplementedError(
                "This kind of sparse matrix is not handled: {0}".format(type(mat)))
    else:
        n = mat.shape[0]
        c = mat.shape[1]
        ic = numpy.arange(mat.shape[1])
        res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)
        for i in range(0, n):
            a = i * c
            b = (i + 1) * c
            res[a:b, 1] = i
            res[a:b, 2] = ic
        res[:, 0] = mat.ravel()
        for k, am in enumerate(adds):
            res[:, 3 + k] = am.ravel()
        return res
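

# The next function is an editorial illustration, not part of the original
# module: it shows the 4-column output of ``linearize_matrix`` on a small
# dense matrix with one additional matrix. The name
# ``_example_linearize_matrix`` is hypothetical.
def _example_linearize_matrix():
    mat = numpy.array([[1.0, 0.0],
                       [0.0, 2.0]])
    add = numpy.array([[10.0, 20.0],
                       [30.0, 40.0]])
    res = linearize_matrix(mat, add)
    # Each row of res is (value, row index, column index, value in *add*):
    # [[1., 0., 0., 10.],
    #  [0., 0., 1., 20.],
    #  [0., 1., 0., 30.],
    #  [2., 1., 1., 40.]]
    return res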



def constraint_kmeans(X, labels, sample_weight, centers, inertia,
                      precompute_distances, iter, max_iter,  # pylint: disable=W0622
                      strategy='gain', verbose=0, state=None, fLOG=None):
    """
    Completes the constraint *k-means*.

    @param      X                       features
    @param      labels                  initialized labels (unused)
    @param      sample_weight           sample weight
    @param      centers                 initialized centers
    @param      inertia                 initialized inertia (unused)
    @param      precompute_distances    precompute distances (used in
                                        `_label_inertia <https://github.com/scikit-learn/
                                        scikit-learn/blob/master/sklearn/cluster/k_means_.py#L547>`_)
    @param      iter                    number of iterations already done
    @param      max_iter                maximum number of iterations
    @param      strategy                strategy used to sort observations before
                                        mapping them to clusters
    @param      verbose                 verbose
    @param      state                   random state
    @param      fLOG                    logging function (needs to be specified,
                                        otherwise verbose has no effect)
    @return                             tuple (best_labels, best_centers, best_inertia, iter)
    """
    if isinstance(X, DataFrame):
        X = X.values
    x_squared_norms = row_norms(X, squared=True)
    counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    limit = X.shape[0] // centers.shape[0]
    leftover = X.shape[0] - limit * centers.shape[0]
    leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    n_clusters = centers.shape[0]
    distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
    best_inertia = None
    prev_labels = None
    best_iter = None

    if labels.dtype != numpy.int32:
        raise TypeError(
            "Labels must be an array of int not '{0}'".format(labels.dtype))

    # association
    _constraint_association(leftover, counters, labels, leftclose, distances_close,
                            centers, X, x_squared_norms, limit, strategy, state=state)

    if sample_weight is None:
        sw = numpy.ones((X.shape[0],))
    else:
        sw = sample_weight

    while iter < max_iter:

        # compute new clusters
        if scipy.sparse.issparse(X):
            try:
                # scikit-learn >= 0.20 (takes sample weights)
                centers = _centers_sparse(
                    X, sw, labels, n_clusters, distances_close)
            except TypeError:
                # scikit-learn < 0.20 (no sample weights)
                centers = _centers_sparse(
                    X, labels, n_clusters, distances_close)
        else:
            try:
                # scikit-learn >= 0.20 (takes sample weights)
                centers = _centers_dense(
                    X, sw, labels, n_clusters, distances_close)
            except TypeError:
                # scikit-learn < 0.20 (no sample weights)
                centers = _centers_dense(
                    X, labels, n_clusters, distances_close)

        # association
        _constraint_association(leftover, counters, labels, leftclose, distances_close,
                                centers, X, x_squared_norms, limit, strategy, state=state)

        # inertia
        _, inertia = _labels_inertia(X, sw, x_squared_norms, centers,
                                     precompute_distances=precompute_distances,
                                     distances=distances_close)

        iter += 1
        if verbose and fLOG:
            fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

        # best option so far?
        if best_inertia is None or inertia < best_inertia:
            best_inertia = inertia
            best_centers = centers.copy()
            best_labels = labels.copy()
            best_iter = iter

        # early stop
        if best_inertia is not None and inertia >= best_inertia and iter > best_iter + 5:
            break
        if prev_labels is not None and numpy.array_equal(prev_labels, labels):
            break
        prev_labels = labels.copy()

    return best_labels, best_centers, best_inertia, iter
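

# Editorial illustration, not part of the original module. It sketches a call
# to ``constraint_kmeans`` on random data, assuming a scikit-learn version
# whose private helpers match the imports above (their signatures changed
# across releases). The name ``_example_constraint_kmeans`` is hypothetical.
def _example_constraint_kmeans():
    state = numpy.random.RandomState(0)
    X = state.randn(20, 2)
    # pick 4 observations as initial centers
    centers = X[state.choice(X.shape[0], 4, replace=False)].copy()
    # labels must be int32, their initial content is overwritten
    labels = numpy.zeros(X.shape[0], dtype=numpy.int32)
    best_labels, best_centers, best_inertia, n_iter = constraint_kmeans(
        X, labels, None, centers, None,
        precompute_distances=True, iter=0, max_iter=20,
        strategy='gain', verbose=0, state=state)
    return best_labels, best_centers, best_inertia, n_iter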



def constraint_predictions(X, centers, strategy, state=None):
    """
    Computes the predictions but tries
    to associate the same number of points
    with each cluster.

    @param      X           features
    @param      centers     centers of each cluster
    @param      strategy    strategy used to sort points before
                            mapping them to a cluster
    @param      state       random state
    @return                 labels, distances, distances_close
    """
    if isinstance(X, DataFrame):
        X = X.values
    x_squared_norms = row_norms(X, squared=True)
    counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    limit = X.shape[0] // centers.shape[0]
    leftover = X.shape[0] - limit * centers.shape[0]
    leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
    labels = numpy.empty((X.shape[0],), dtype=numpy.int32)

    distances = _constraint_association(leftover, counters, labels, leftclose,
                                        distances_close, centers, X, x_squared_norms,
                                        limit, strategy, state=state)

    return labels, distances, distances_close
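

# Editorial illustration, not part of the original module: with two fixed
# centers and ten points, the 'distance' strategy assigns five points to each
# cluster even though plain nearest-center assignment would be unbalanced.
# The name ``_example_constraint_predictions`` is hypothetical.
def _example_constraint_predictions():
    state = numpy.random.RandomState(0)
    X = state.randn(10, 2)
    centers = numpy.array([[0.0, 0.0], [5.0, 5.0]])
    labels, distances, distances_close = constraint_predictions(
        X, centers, strategy='distance', state=state)
    counts = numpy.bincount(labels, minlength=centers.shape[0])
    # counts == [5, 5]
    return labels, counts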



def _constraint_association(leftover, counters, labels, leftclose, distances_close,
                            centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*.

    @param      X                   features
    @param      centers             initialized centers
    @param      x_squared_norms     norm of *X*
    @param      limit               number of points to associate per cluster
    @param      leftover            number of points to associate at the end
    @param      counters            allocated array
    @param      leftclose           allocated array
    @param      labels              allocated array, receives the assigned labels
    @param      distances_close     allocated array
    @param      strategy            strategy used to sort points before
                                    mapping them to a cluster
    @param      state               random state
    """
    if strategy in ('distance', 'distance_p'):
        return _constraint_association_distance(leftover, counters, labels, leftclose,
                                                distances_close, centers, X,
                                                x_squared_norms, limit, strategy,
                                                state=state)
    elif strategy in ('gain', 'gain_p'):
        return _constraint_association_gain(leftover, counters, labels, leftclose,
                                            distances_close, centers, X,
                                            x_squared_norms, limit, strategy,
                                            state=state)
    else:
        raise ValueError("Unknown strategy '{0}'.".format(strategy))


def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close,
                                     centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*.
    Points are assigned to the closest cluster
    which still accepts new points.

    @param      X                   features
    @param      centers             initialized centers
    @param      x_squared_norms     norm of *X*
    @param      limit               number of points to associate per cluster
    @param      leftover            number of points to associate at the end
    @param      counters            allocated array
    @param      leftclose           allocated array
    @param      labels              allocated array, receives the assigned labels
    @param      distances_close     allocated array
    @param      strategy            strategy used to sort points before
                                    mapping them to a cluster
    @param      state               random state (unused)
    """

    # initialisation
    counters[:] = 0
    leftclose[:] = -1
    distances_close[:] = numpy.nan
    labels[:] = -1

    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]

    nover = leftover
    for i in range(0, sorted_distances.shape[0]):
        ind = int(sorted_distances[i, 1])
        if labels[ind] >= 0:
            continue
        c = int(sorted_distances[i, 2])
        if counters[c] < limit:
            # The cluster still accepts new points.
            counters[c] += 1
            labels[ind] = c
            distances_close[ind] = sorted_distances[i, 0]
        elif nover > 0 and leftclose[c] == -1:
            # The cluster may accept one point if the number
            # of clusters does not divide the number of points in X.
            counters[c] += 1
            labels[ind] = c
            nover -= 1
            leftclose[c] = 0
            distances_close[ind] = sorted_distances[i, 0]
    return distances
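

# Editorial illustration, not part of the original module: calling the helper
# directly with pre-allocated arrays, as ``constraint_predictions`` does. Each
# cluster accepts ``limit = n // k`` points and ``leftover`` clusters may
# accept one extra point. The name
# ``_example_constraint_association_distance`` is hypothetical.
def _example_constraint_association_distance():
    state = numpy.random.RandomState(0)
    X = state.randn(7, 2)
    centers = numpy.array([[0.0, 0.0], [3.0, 3.0]])
    n, k = X.shape[0], centers.shape[0]
    limit = n // k                   # 3 points guaranteed per cluster
    leftover = n - limit * k         # 1 cluster may receive a fourth point
    counters = numpy.empty((k,), dtype=numpy.int32)
    leftclose = numpy.empty((k,), dtype=numpy.int32)
    labels = numpy.empty((n,), dtype=numpy.int32)
    distances_close = numpy.empty((n,), dtype=X.dtype)
    x_squared_norms = row_norms(X, squared=True)
    _constraint_association_distance(
        leftover, counters, labels, leftclose, distances_close,
        centers, X, x_squared_norms, limit, 'distance', state=state)
    # counters sums to 7 and its two values differ by at most one
    return labels, counters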



def _compute_strategy_coefficient(distances, strategy, labels):
    """
    Creates the matrix of coefficients used to sort
    the points before assigning them to a cluster,
    depending on the strategy.
    """
    if strategy in ('distance', 'distance_p'):
        return distances
    elif strategy in ('gain', 'gain_p'):
        ar = numpy.arange(distances.shape[0])
        dist = distances[ar, labels]
        return distances - dist[:, numpy.newaxis]
    else:
        raise ValueError("Unknown strategy '{0}'.".format(strategy))
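

# Editorial illustration, not part of the original module: for the 'gain'
# strategy, the coefficient of point *i* and cluster *c* is the distance to
# *c* minus the distance to the current cluster of *i*, so negative values
# mark profitable moves. The name ``_example_strategy_coefficient`` is
# hypothetical.
def _example_strategy_coefficient():
    distances = numpy.array([[0.5, 2.0],
                             [3.0, 1.0]])
    labels = numpy.array([0, 1], dtype=numpy.int32)
    coef = _compute_strategy_coefficient(distances, 'gain', labels)
    # coef == [[0.0, 1.5],
    #          [2.0, 0.0]]
    return coef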



def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close,
                                 centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*.
    Points are reassigned or swapped between clusters
    based on the gain of each move.

    @param      X                   features
    @param      centers             initialized centers
    @param      x_squared_norms     norm of *X*
    @param      limit               number of points to associate per cluster
    @param      leftover            number of points to associate at the end
    @param      counters            allocated array
    @param      leftclose           allocated array
    @param      labels              allocated array, receives the assigned labels
    @param      distances_close     allocated array
    @param      strategy            strategy used to sort points before
                                    mapping them to a cluster
    @param      state               random state

    See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_.
    """
    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    if strategy == 'gain_p':
        labels[:] = numpy.argmin(distances, axis=1)
    else:
        # We assume labels come from a previous iteration.
        pass

    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]
    distances_close[:] = 0

    # counters
    ave = limit
    counters[:] = 0
    for i in labels:
        counters[i] += 1
    leftclose[:] = counters[:] - ave
    leftclose[leftclose < 0] = 0
    nover = X.shape[0] - ave * counters.shape[0]
    sumi = nover - leftclose.sum()
    if sumi != 0:
        if state is None:
            state = numpy.random.RandomState()

        def loopf(h, sumi):
            if sumi < 0 and leftclose[h] > 0:  # pylint: disable=R1716
                sumi -= leftclose[h]
                leftclose[h] = 0
            elif sumi > 0 and leftclose[h] == 0:
                leftclose[h] = 1
                sumi -= 1
            return sumi

        it = 0
        while sumi != 0:
            h = state.randint(0, counters.shape[0])
            sumi = loopf(h, sumi)
            it += 1
            if it > counters.shape[0] * 2:
                break
        for h in range(counters.shape[0]):
            if sumi == 0:
                break
            sumi = loopf(h, sumi)

    transfer = {}

    for i in range(0, sorted_distances.shape[0]):
        gain = sorted_distances[i, 3]
        ind = int(sorted_distances[i, 1])
        dest = int(sorted_distances[i, 2])
        cur = labels[ind]
        if distances_close[ind]:
            continue
        if cur == dest:
            continue
        if (counters[dest] < ave + leftclose[dest]) and (counters[cur] > ave + leftclose[cur]):
            labels[ind] = dest
            counters[cur] -= 1
            counters[dest] += 1
            distances_close[ind] = 1  # moved
        else:
            cp = transfer.get((dest, cur), [])
            while len(cp) > 0:
                g, destind = cp[0]
                if distances_close[destind]:
                    del cp[0]
                else:
                    break
            if len(cp) > 0:
                g, destind = cp[0]
                if g + gain < 0:
                    del cp[0]
                    labels[ind] = dest
                    labels[destind] = cur
                    add = False
                    distances_close[ind] = 1  # moved
                    distances_close[destind] = 1  # moved
                else:
                    add = True
            else:
                add = True
            if add:
                # We add the point to the list of points willing to transfer.
                if (cur, dest) not in transfer:
                    transfer[cur, dest] = []
                gain = sorted_distances[i, 3]
                bisect.insort(transfer[cur, dest], (gain, ind))

    distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

    neg = (counters < ave).sum()
    if neg > 0:
        raise RuntimeError(
            "The algorithm failed, counters={0}".format(counters))

    return distances
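

# Editorial illustration, not part of the original module: the 'gain_p'
# strategy first assigns every point to its nearest center, then moves or
# swaps points between clusters (following the same-size k-means idea
# referenced above) until every cluster holds ``limit`` points. The name
# ``_example_constraint_association_gain`` is hypothetical.
def _example_constraint_association_gain():
    state = numpy.random.RandomState(0)
    # 8 points around (0, 0) and 4 points around (5, 5): nearest-center
    # assignment would give clusters of sizes 8 and 4.
    X = numpy.vstack([state.randn(8, 2), state.randn(4, 2) + 5.0])
    centers = numpy.array([[0.0, 0.0], [5.0, 5.0]])
    n, k = X.shape[0], centers.shape[0]
    limit = n // k
    leftover = n - limit * k
    counters = numpy.empty((k,), dtype=numpy.int32)
    leftclose = numpy.empty((k,), dtype=numpy.int32)
    labels = numpy.empty((n,), dtype=numpy.int32)
    distances_close = numpy.empty((n,), dtype=X.dtype)
    x_squared_norms = row_norms(X, squared=True)
    _constraint_association_gain(
        leftover, counters, labels, leftclose, distances_close,
        centers, X, x_squared_norms, limit, 'gain_p', state=state)
    # counters == [6, 6]: the unbalanced split has been rebalanced.
    return labels, counters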