# -*- coding: utf-8 -*-
"""
@file
@brief Implements the class @see cl ConstraintKMeans.
"""
import bisect
from pandas import DataFrame
import numpy
import scipy.sparse
try:
    from sklearn.cluster._kmeans import _labels_inertia  # pylint: disable=E0611
except ImportError:
    from sklearn.cluster.k_means_ import _labels_inertia  # pylint: disable=E0611
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms
from ._kmeans_constraint_ import _centers_dense, _centers_sparse


def linearize_matrix(mat, *adds):
    """
    Linearizes a matrix into a new one
    with 3 columns: value, row, column.
    The output format is similar to
    :epkg:`csr_matrix` but null values are kept.

    @param      mat     matrix
    @param      adds    additional square matrices
    @return             new matrix

    *adds* defines additional matrices, it adds
    columns on the right side and fills them with
    the corresponding values taken from the additional
    matrices.
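
    A minimal sketch of the dense case (illustrative values)::

        import numpy
        mat = numpy.array([[1., 2.],
                           [3., 4.]])
        lin = linearize_matrix(mat)
        # lin holds one row per cell: value, row, column
        # [[1. 0. 0.]
        #  [2. 0. 1.]
        #  [3. 1. 0.]
        #  [4. 1. 1.]]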

    """
    if scipy.sparse.issparse(mat):
        if isinstance(mat, scipy.sparse.csr_matrix):
            max_row = mat.shape[0]
            res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)
            row = 0
            for i, v in enumerate(mat.data):
                while row < max_row and i >= mat.indptr[row]:
                    row += 1
                res[i, 0] = v
                a, b = row - 1, mat.indices[i]
                res[i, 1] = a
                res[i, 2] = b
                for k, am in enumerate(adds):
                    res[i, k + 3] = am[a, b]
            return res
        else:
            raise NotImplementedError(
                "This kind of sparse matrix is not handled: {0}".format(type(mat)))
    else:
        n = mat.shape[0]
        c = mat.shape[1]
        ic = numpy.arange(mat.shape[1])
        res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype)
        for i in range(0, n):
            a = i * c
            b = (i + 1) * c
            res[a:b, 1] = i
            res[a:b, 2] = ic
        res[:, 0] = mat.ravel()
        for k, am in enumerate(adds):
            res[:, 3 + k] = am.ravel()
        return res


def constraint_kmeans(X, labels, sample_weight, centers, inertia,
                      iter, max_iter,  # pylint: disable=W0622
                      strategy='gain', verbose=0, state=None, fLOG=None):
    """
    Completes the constraint *k-means*.

    @param      X               features
    @param      labels          initialized labels (unused)
    @param      sample_weight   sample weight
    @param      centers         initialized centers
    @param      inertia         initialized inertia (unused)
    @param      iter            number of iterations already done
    @param      max_iter        maximum number of iterations
    @param      strategy        strategy used to sort observations before
                                mapping them to clusters
    @param      verbose         verbose
    @param      state           random state
    @param      fLOG            logging function (needs to be specified,
                                otherwise verbose has no effect)
    @return                     tuple (best_labels, best_centers, best_inertia, iter)
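
    A minimal usage sketch on synthetic data, assuming the compiled helpers
    imported at the top of the module are available::

        import numpy
        state = numpy.random.RandomState(0)
        X = state.randn(20, 2)
        centers = X[state.choice(X.shape[0], 4, replace=False)]
        labels = numpy.zeros((X.shape[0],), dtype=numpy.int32)
        best_labels, best_centers, best_inertia, it = constraint_kmeans(
            X, labels, None, centers, None, 0, 50, state=state)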

    """
    if isinstance(X, DataFrame):
        X = X.values
    x_squared_norms = row_norms(X, squared=True)
    counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    limit = X.shape[0] // centers.shape[0]
    leftover = X.shape[0] - limit * centers.shape[0]
    leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    n_clusters = centers.shape[0]
    distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
    best_inertia = None
    prev_labels = None
    best_iter = None

    if labels.dtype != numpy.int32:
        raise TypeError(
            "Labels must be an array of int not '{0}'".format(labels.dtype))

    # association
    _constraint_association(leftover, counters, labels, leftclose, distances_close,
                            centers, X, x_squared_norms, limit, strategy, state=state)

    if sample_weight is None:
        sw = numpy.ones((X.shape[0],))
    else:
        sw = sample_weight

    # selects the implementation computing the centers
    if scipy.sparse.issparse(X):
        _centers_fct = _centers_sparse
    else:
        _centers_fct = _centers_dense

    while iter < max_iter:

        # compute new clusters
        centers = _centers_fct(
            X, sw, labels, n_clusters, distances_close)

        # association
        _constraint_association(leftover, counters, labels, leftclose, distances_close,
                                centers, X, x_squared_norms, limit, strategy, state=state)

        # inertia
        _, inertia = _labels_inertia(X, sw, x_squared_norms, centers,
                                     distances=distances_close)

        iter += 1
        if verbose and fLOG:
            fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

        # best option so far?
        if best_inertia is None or inertia < best_inertia:
            best_inertia = inertia
            best_centers = centers.copy()
            best_labels = labels.copy()
            best_iter = iter

        # early stop
        if best_inertia is not None and inertia >= best_inertia and iter > best_iter + 5:
            break
        if prev_labels is not None and numpy.array_equal(prev_labels, labels):
            break
        prev_labels = labels.copy()

    return best_labels, best_centers, best_inertia, iter


def constraint_predictions(X, centers, strategy, state=None):
    """
    Computes the predictions but tries
    to associate the same number of points
    with each cluster.

    @param      X           features
    @param      centers     centers of each cluster
    @param      strategy    strategy used to sort points before
                            mapping them to a cluster
    @param      state       random state
    @return                 labels, distances, distances_close
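
    A minimal usage sketch, assuming *X* and *centers* come from a model
    trained with *constraint_kmeans* (with ``'gain_p'``, the labels are
    initialized from the closest center instead of being reused from a
    previous iteration)::

        labels, distances, distances_close = constraint_predictions(
            X, centers, strategy='gain_p')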

    """
    if isinstance(X, DataFrame):
        X = X.values
    x_squared_norms = row_norms(X, squared=True)
    counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    limit = X.shape[0] // centers.shape[0]
    leftover = X.shape[0] - limit * centers.shape[0]
    leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
    labels = numpy.empty((X.shape[0],), dtype=numpy.int32)

    distances = _constraint_association(leftover, counters, labels, leftclose,
                                        distances_close, centers, X, x_squared_norms,
                                        limit, strategy, state=state)

    return labels, distances, distances_close


def _constraint_association(leftover, counters, labels, leftclose, distances_close,
                            centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*.

    @param      X                   features
    @param      centers             initialized centers
    @param      x_squared_norms     norm of *X*
    @param      limit               number of points to associate per cluster
    @param      leftover            number of points to associate at the end
    @param      counters            allocated array
    @param      leftclose           allocated array
    @param      labels              allocated array
    @param      distances_close     allocated array
    @param      strategy            strategy used to sort points before
                                    mapping them to a cluster
    @param      state               random state
    """
    if strategy in ('distance', 'distance_p'):
        return _constraint_association_distance(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)
    elif strategy in ('gain', 'gain_p'):
        return _constraint_association_gain(
            leftover, counters, labels, leftclose, distances_close,
            centers, X, x_squared_norms, limit, strategy, state=state)
    else:
        raise ValueError("Unknown strategy '{0}'.".format(strategy))


def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close,
                                     centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*.

    @param      X                   features
    @param      centers             initialized centers
    @param      x_squared_norms     norm of *X*
    @param      limit               number of points to associate per cluster
    @param      leftover            number of points to associate at the end
    @param      counters            allocated array
    @param      leftclose           allocated array
    @param      labels              allocated array
    @param      distances_close     allocated array
    @param      strategy            strategy used to sort points before
                                    mapping them to a cluster
    @param      state               random state (unused)
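
    A small sketch of the capacity bookkeeping, assuming 7 points and
    3 clusters::

        limit    = 7 // 3      # 2 points accepted per cluster at first
        leftover = 7 - 2 * 3   # 1 extra point, given to at most one
                               # cluster (tracked through leftclose)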

    """

    # initialisation
    counters[:] = 0
    leftclose[:] = -1
    distances_close[:] = numpy.nan
    labels[:] = -1

    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]

    nover = leftover
    for i in range(0, sorted_distances.shape[0]):
        ind = int(sorted_distances[i, 1])
        if labels[ind] >= 0:
            continue
        c = int(sorted_distances[i, 2])
        if counters[c] < limit:
            # The cluster still accepts new points.
            counters[c] += 1
            labels[ind] = c
            distances_close[ind] = sorted_distances[i, 0]
        elif nover > 0 and leftclose[c] == -1:
            # The cluster may accept one extra point if the number
            # of clusters does not divide the number of points in X.
            counters[c] += 1
            labels[ind] = c
            nover -= 1
            leftclose[c] = 0
            distances_close[ind] = sorted_distances[i, 0]
    return distances


def _compute_strategy_coefficient(distances, strategy, labels):
    """
    Creates the matrix of coefficients used to sort the points
    before assigning them to a cluster.
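
    A small sketch for the gain strategies, assuming two points and two
    clusters (illustrative values)::

        distances = numpy.array([[1., 4.],
                                 [3., 2.]])
        labels = numpy.array([0, 1])
        coef = _compute_strategy_coefficient(distances, 'gain', labels)
        # coef[i, c] = distances[i, c] - distances[i, labels[i]]
        # [[0. 3.]
        #  [1. 0.]]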

    """
    if strategy in ('distance', 'distance_p'):
        return distances
    elif strategy in ('gain', 'gain_p'):
        ar = numpy.arange(distances.shape[0])
        dist = distances[ar, labels]
        return distances - dist[:, numpy.newaxis]
    else:
        raise ValueError("Unknown strategy '{0}'.".format(strategy))


def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close,
                                 centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*.

    @param      X                   features
    @param      centers             initialized centers
    @param      x_squared_norms     norm of *X*
    @param      limit               number of points to associate per cluster
    @param      leftover            number of points to associate at the end
    @param      counters            allocated array
    @param      leftclose           allocated array
    @param      labels              allocated array
    @param      distances_close     allocated array
    @param      strategy            strategy used to sort points before
                                    mapping them to a cluster
    @param      state               random state

    See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_.
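
    A short sketch of the swap criterion used below: a point *p* in cluster
    *a* whose direct move to cluster *b* is not allowed (because *b* is full
    or *a* is not overfull) is queued in *transfer* with its gain
    ``d(p, b) - d(p, a)``; it is later exchanged with a point of *b* willing
    to move to *a* as soon as the sum of both gains is negative, i.e. as soon
    as the swap decreases the total inertia.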

    """
    # distances
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    if strategy == 'gain_p':
        labels[:] = numpy.argmin(distances, axis=1)
    else:
        # We assume labels come from a previous iteration.
        pass

    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]
    distances_close[:] = 0

    # counters
    ave = limit
    counters[:] = 0
    for i in labels:
        counters[i] += 1
    leftclose[:] = counters[:] - ave
    leftclose[leftclose < 0] = 0
    nover = X.shape[0] - ave * counters.shape[0]
    sumi = nover - leftclose.sum()
    if sumi != 0:
        if state is None:
            state = numpy.random.RandomState()

        def loopf(h, sumi):
            if sumi < 0 and leftclose[h] > 0:  # pylint: disable=R1716
                sumi -= leftclose[h]
                leftclose[h] = 0
            elif sumi > 0 and leftclose[h] == 0:
                leftclose[h] = 1
                sumi += 1
            return sumi

        it = 0
        while sumi != 0:
            h = state.randint(0, counters.shape[0])
            sumi = loopf(h, sumi)
            it += 1
            if it > counters.shape[0] * 2:
                break
        for h in range(counters.shape[0]):
            if sumi == 0:
                break
            sumi = loopf(h, sumi)

    transfer = {}

    for i in range(0, sorted_distances.shape[0]):
        gain = sorted_distances[i, 3]
        ind = int(sorted_distances[i, 1])
        dest = int(sorted_distances[i, 2])
        cur = labels[ind]
        if distances_close[ind]:
            continue
        if cur == dest:
            continue
        if (counters[dest] < ave + leftclose[dest]) and (counters[cur] > ave + leftclose[cur]):
            labels[ind] = dest
            counters[cur] -= 1
            counters[dest] += 1
            distances_close[ind] = 1  # moved
        else:
            cp = transfer.get((dest, cur), [])
            while len(cp) > 0:
                g, destind = cp[0]
                if distances_close[destind]:
                    del cp[0]
                else:
                    break
            if len(cp) > 0:
                g, destind = cp[0]
                if g + gain < 0:
                    del cp[0]
                    labels[ind] = dest
                    labels[destind] = cur
                    add = False
                    distances_close[ind] = 1  # moved
                    distances_close[destind] = 1  # moved
                else:
                    add = True
            else:
                add = True
            if add:
                # We add the point to the list of points willing to transfer.
                if (cur, dest) not in transfer:
                    transfer[cur, dest] = []
                gain = sorted_distances[i, 3]
                bisect.insort(transfer[cur, dest], (gain, ind))

    distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

    neg = (counters < ave).sum()
    if neg > 0:
        raise RuntimeError(
            "The algorithm failed, counters={0}".format(counters))

    return distances