Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

# -*- coding: utf-8 -*- 

""" 

@file 

@brief Implémente la classe @see cl ConstraintKMeans. 

""" 

import bisect 

from pandas import DataFrame 

import numpy 

import scipy.sparse 

from sklearn.cluster.k_means_ import _labels_inertia 

from sklearn.cluster._k_means import _centers_sparse, _centers_dense 

from sklearn.metrics.pairwise import euclidean_distances 

from sklearn.utils.extmath import row_norms 

 

 

def linearize_matrix(mat, *adds):
    """
    Linearizes a matrix into a new one with three columns:
    value, row, column. The output format is similar to
    :epkg:`csr_matrix` except that null values are kept.

    @param mat matrix
    @param adds additional square matrices
    @return new matrix

    *adds* defines additional matrices: for each of them, one
    column is appended on the right side and filled with the
    value found at the same coordinates in that matrix.
    """
    if scipy.sparse.issparse(mat):
        if not isinstance(mat, scipy.sparse.csr_matrix):
            raise NotImplementedError(
                "This kind of sparse matrix is not handled: {0}".format(type(mat)))
        nb_rows = mat.shape[0]
        out = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype)
        cur_row = 0
        for pos, value in enumerate(mat.data):
            # Advance cur_row until indptr[cur_row] passes pos:
            # cur_row - 1 is then the row owning mat.data[pos].
            while cur_row < nb_rows and pos >= mat.indptr[cur_row]:
                cur_row += 1
            r, c = cur_row - 1, mat.indices[pos]
            out[pos, 0] = value
            out[pos, 1] = r
            out[pos, 2] = c
            for extra_pos, extra in enumerate(adds):
                out[pos, extra_pos + 3] = extra[r, c]
        return out

    # Dense path: one output row per cell, nulls included.
    nb_rows, nb_cols = mat.shape
    col_indices = numpy.arange(nb_cols)
    out = numpy.empty((nb_rows * nb_cols, 3 + len(adds)), dtype=mat.dtype)
    for r in range(nb_rows):
        first = r * nb_cols
        last = first + nb_cols
        out[first:last, 1] = r
        out[first:last, 2] = col_indices
    out[:, 0] = mat.ravel()
    for extra_pos, extra in enumerate(adds):
        out[:, 3 + extra_pos] = extra.ravel()
    return out

 

 

def constraint_kmeans(X, labels, sample_weight, centers, inertia, precompute_distances, iter, max_iter,
                      strategy='gain', verbose=0, state=None, fLOG=None):
    """
    Completes the constraint *k-means*.

    @param X features
    @param labels initialized labels (unused)
    @param sample_weight sample weight
    @param centers initialized centers
    @param inertia initialized inertia (unused)
    @param precompute_distances precompute distances (used in
        `_label_inertia <https://github.com/scikit-learn/
        scikit-learn/blob/master/sklearn/cluster/k_means_.py#L547>`_)
    @param iter number of iteration already done
    @param max_iter maximum of number of iteration
    @param strategy strategy used to sort observations before
        mapping them to clusters
    @param verbose verbose
    @param state random state
    @param fLOG logging function (needs to be specified otherwise
        verbose has no effects)
    @return tuple (best_labels, best_centers, best_inertia, iter)
    """
    if isinstance(X, DataFrame):
        X = X.values
    x_squared_norms = row_norms(X, squared=True)
    counters = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    # limit: quota of points per cluster; leftover: remainder when
    # the number of clusters does not divide the number of points.
    limit = X.shape[0] // centers.shape[0]
    leftover = X.shape[0] - limit * centers.shape[0]
    leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32)
    n_clusters = centers.shape[0]
    distances_close = numpy.empty((X.shape[0],), dtype=X.dtype)
    best_inertia = None
    prev_labels = None
    best_iter = None

    if labels.dtype != numpy.int32:
        raise TypeError(
            "Labels must be an array of int not '{0}'".format(labels.dtype))

    # initial association of points to clusters
    _constraint_association(leftover, counters, labels, leftclose, distances_close,
                            centers, X, x_squared_norms, limit, strategy, state=state)

    if sample_weight is None:
        sw = numpy.ones((X.shape[0],))
    else:
        sw = sample_weight

    while iter < max_iter:

        # compute new clusters
        if scipy.sparse.issparse(X):
            try:
                # scikit-learn >= 0.20: signature takes sample weights
                centers = _centers_sparse(
                    X, sw, labels, n_clusters, distances_close)
            except TypeError:
                # scikit-learn < 0.20: no sample_weight parameter
                # (the original code repeated the >= 0.20 call here,
                # which made the fallback dead code)
                centers = _centers_sparse(
                    X, labels, n_clusters, distances_close)
        else:
            try:
                # scikit-learn >= 0.20: signature takes sample weights
                centers = _centers_dense(
                    X, sw, labels, n_clusters, distances_close)
            except TypeError:
                # scikit-learn < 0.20: no sample_weight parameter
                centers = _centers_dense(
                    X, labels, n_clusters, distances_close)

        # re-associate points to the updated centers
        _constraint_association(leftover, counters, labels, leftclose, distances_close,
                                centers, X, x_squared_norms, limit, strategy, state=state)

        # inertia of the current assignment
        _, inertia = _labels_inertia(X, sw, x_squared_norms, centers,
                                     precompute_distances=precompute_distances,
                                     distances=distances_close)

        iter += 1
        if verbose and fLOG:
            fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia))

        # best option so far?
        if best_inertia is None or inertia < best_inertia:
            best_inertia = inertia
            best_centers = centers.copy()
            best_labels = labels.copy()
            best_iter = iter

        # early stop: no improvement for 5 iterations, or assignment unchanged
        if best_inertia is not None and inertia >= best_inertia and iter > best_iter + 5:
            break
        if prev_labels is not None and numpy.array_equal(prev_labels, labels):
            break
        prev_labels = labels.copy()

    return best_labels, best_centers, best_inertia, iter

 

 

def constraint_predictions(X, centers, strategy, state=None):
    """
    Computes the predictions but tries to associate
    the same number of points in each cluster.

    @param X features
    @param centers centers of each clusters
    @param strategy strategy used to sort point before
        mapping them to a cluster
    @param state random state
    @return labels, distances, distances_close
    """
    if isinstance(X, DataFrame):
        X = X.values
    n_obs = X.shape[0]
    n_clusters = centers.shape[0]
    x_squared_norms = row_norms(X, squared=True)

    # per-cluster quota and remainder when n_clusters does not divide n_obs
    limit = n_obs // n_clusters
    leftover = n_obs - limit * n_clusters

    # scratch arrays filled by _constraint_association
    counters = numpy.empty((n_clusters,), dtype=numpy.int32)
    leftclose = numpy.empty((n_clusters,), dtype=numpy.int32)
    distances_close = numpy.empty((n_obs,), dtype=X.dtype)
    labels = numpy.empty((n_obs,), dtype=numpy.int32)

    distances = _constraint_association(
        leftover, counters, labels, leftclose, distances_close,
        centers, X, x_squared_norms, limit, strategy, state=state)

    return labels, distances, distances_close

 

 

def _constraint_association(leftover, counters, labels, leftclose, distances_close, 

centers, X, x_squared_norms, limit, strategy, state=None): 

""" 

Completes the constraint *k-means*. 

 

@param X features 

@param labels initialized labels (unsued) 

@param centers initialized centers 

@param x_squared_norms norm of *X* 

@param limit number of point to associate per cluster 

@param leftover number of points to associate at the end 

@param counters allocated array 

@param leftclose allocated array 

@param labels allocated array 

@param distances_close allocated array 

@param strategy strategy used to sort point before 

mapping them to a cluster 

@param state random state 

""" 

if strategy in ('distance', 'distance_p'): 

return _constraint_association_distance(leftover, counters, labels, leftclose, distances_close, 

centers, X, x_squared_norms, limit, strategy, state=state) 

elif strategy in ('gain', 'gain_p'): 

return _constraint_association_gain(leftover, counters, labels, leftclose, distances_close, 

centers, X, x_squared_norms, limit, strategy, state=state) 

else: 

raise ValueError("Unknwon strategy '{0}'.".format(strategy)) 

 

 

def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close,
                                     centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*: assigns every point to a
    cluster while capping each cluster at *limit* points
    (plus at most one extra point per cluster to absorb *leftover*).

    @param X features
    @param labels allocated array, filled with the final cluster of each point
    @param centers initialized centers
    @param x_squared_norms norm of *X*
    @param limit number of point to associate per cluster
    @param leftover number of points to associate at the end
    @param counters allocated array, number of points per cluster
    @param leftclose allocated array, -1 until a cluster takes its extra point
    @param distances_close allocated array, squared distance of each
        point to its assigned center
    @param strategy strategy used to sort point before
        mapping them to a cluster
    @param state random state (unused)
    @return matrix of squared distances, shape (n_points, n_clusters)
    """

    # initialisation of the output arrays (mutated in place)
    counters[:] = 0
    leftclose[:] = -1
    distances_close[:] = numpy.nan
    labels[:] = -1

    # squared euclidean distances; transposed so rows index points
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    # linearize to rows (value, point, cluster, coefficient) and
    # process pairs in increasing order of the strategy coefficient
    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]

    nover = leftover
    for i in range(0, sorted_distances.shape[0]):
        ind = int(sorted_distances[i, 1])
        if labels[ind] >= 0:
            # point already assigned to a cluster
            continue
        c = int(sorted_distances[i, 2])
        if counters[c] < limit:
            # The cluster still accepts new points.
            counters[c] += 1
            labels[ind] = c
            distances_close[ind] = sorted_distances[i, 0]
        elif nover > 0 and leftclose[c] == -1:
            # The cluster may accept one point if the number
            # of clusters does not divide the number of points in X.
            counters[c] += 1
            labels[ind] = c
            nover -= 1
            leftclose[c] = 0
            distances_close[ind] = sorted_distances[i, 0]
    return distances

 

 

def _compute_strategy_coefficient(distances, strategy, labels): 

""" 

Creates a matrix 

""" 

if strategy in ('distance', 'distance_p'): 

return distances 

elif strategy in ('gain', 'gain_p'): 

ar = numpy.arange(distances.shape[0]) 

dist = distances[ar, labels] 

return distances - dist[:, numpy.newaxis] 

else: 

raise ValueError("Unknwon strategy '{0}'.".format(strategy)) 

 

 

def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close,
                                 centers, X, x_squared_norms, limit, strategy, state=None):
    """
    Completes the constraint *k-means*: rebalances an existing
    assignment by moving or swapping points between clusters when
    the move improves the gain, keeping cluster sizes close to *limit*.

    @param X features
    @param labels current labels, updated in place
    @param centers initialized centers
    @param x_squared_norms norm of *X*
    @param limit number of points to associate per cluster
    @param leftover number of points to associate at the end
    @param counters allocated array, number of points per cluster
    @param leftclose allocated array, extra allowance per cluster
    @param distances_close allocated array
    @param strategy strategy used to sort point before
        mapping them to a cluster
    @param state random state
    @return matrix of squared distances, shape (n_points, n_clusters)
    @raise RuntimeError if a cluster ends below the quota

    See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_.
    """
    # squared euclidean distances; transposed so rows index points
    distances = euclidean_distances(
        centers, X, Y_norm_squared=x_squared_norms, squared=True)
    distances = distances.T

    if strategy == 'gain_p':
        # restart from the unconstrained nearest-center assignment
        labels[:] = numpy.argmin(distances, axis=1)
    else:
        # We assume labels comes from a previous iteration.
        pass

    # linearize to rows (value, point, cluster, gain) sorted by gain;
    # distances_close doubles as a "point already moved" flag below
    strategy_coef = _compute_strategy_coefficient(distances, strategy, labels)
    distance_linear = linearize_matrix(distances, strategy_coef)
    sorted_distances = distance_linear[distance_linear[:, 3].argsort()]
    distances_close[:] = 0

    # counters: current cluster sizes; leftclose: per-cluster surplus
    # allowance so that allowances sum to the leftover points
    ave = limit
    counters[:] = 0
    for i in labels:
        counters[i] += 1
    leftclose[:] = counters[:] - ave
    leftclose[leftclose < 0] = 0
    nover = X.shape[0] - ave * counters.shape[0]
    sumi = nover - leftclose.sum()
    if sumi != 0:
        if state is None:
            state = numpy.random.RandomState()

        def loopf(h, sumi):
            # nudge cluster h's allowance to push sumi towards 0
            if sumi < 0 and leftclose[h] > 0:  # pylint: disable=R1716
                sumi -= leftclose[h]
                leftclose[h] = 0
            elif sumi > 0 and leftclose[h] == 0:
                leftclose[h] = 1
                # NOTE(review): granting an allowance while sumi > 0
                # increases sumi further; `sumi -= 1` looks intended —
                # confirm before changing (the iteration cap below
                # prevents an infinite loop either way).
                sumi += 1
            return sumi

        # random passes first, then a deterministic sweep as fallback
        it = 0
        while sumi != 0:
            h = state.randint(0, counters.shape[0])
            sumi = loopf(h, sumi)
            it += 1
            if it > counters.shape[0] * 2:
                break
        for h in range(counters.shape[0]):
            if sumi == 0:
                break
            sumi = loopf(h, sumi)

    # transfer[(a, b)]: points in cluster a willing to move to b,
    # kept sorted by gain (bisect.insort below)
    transfer = {}

    for i in range(0, sorted_distances.shape[0]):
        gain = sorted_distances[i, 3]
        ind = int(sorted_distances[i, 1])
        dest = int(sorted_distances[i, 2])
        cur = labels[ind]
        if distances_close[ind]:
            # point was already moved or swapped in this pass
            continue
        if cur == dest:
            continue
        if (counters[dest] < ave + leftclose[dest]) and (counters[cur] > ave + leftclose[cur]):
            # destination has room and source has surplus: plain move
            labels[ind] = dest
            counters[cur] -= 1
            counters[dest] += 1
            distances_close[ind] = 1  # moved
        else:
            # look for a point waiting to move the opposite way (dest -> cur)
            cp = transfer.get((dest, cur), [])
            while len(cp) > 0:
                g, destind = cp[0]
                if distances_close[destind]:
                    # stale entry: that point has since been moved
                    del cp[0]
                else:
                    break
            if len(cp) > 0:
                g, destind = cp[0]
                if g + gain < 0:
                    # combined gain is negative: swap the two points
                    del cp[0]
                    labels[ind] = dest
                    labels[destind] = cur
                    add = False
                    distances_close[ind] = 1  # moved
                    distances_close[destind] = 1  # moved
                else:
                    add = True
            else:
                add = True
            if add:
                # We add the point to the list of points willing to transfer.
                if (cur, dest) not in transfer:
                    transfer[cur, dest] = []
                gain = sorted_distances[i, 3]
                bisect.insort(transfer[cur, dest], (gain, ind))

    # final squared distance of each point to its assigned center
    distances_close[:] = distances[numpy.arange(X.shape[0]), labels]

    # every cluster must have reached the quota
    neg = (counters < ave).sum()
    if neg > 0:
        raise RuntimeError(
            "The algorithm failed, counters={0}".format(counters))

    return distances