Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Implémente la classe @see cl ConstraintKMeans. 

5""" 

6import numpy 

7from sklearn.cluster import KMeans 

8from sklearn.metrics.pairwise import euclidean_distances 

9from .kmeans_constraint_ import constraint_kmeans, constraint_predictions 

10 

11 

class ConstraintKMeans(KMeans):
    """
    Defines a constraint :epkg:`k-means`.
    Clusters are modified to have an equal size.
    The algorithm is initialized with a regular :epkg:`k-means`
    and continues with a modified version of it.

    Computing the predictions offers a choice.
    The first one is to keep the predictions
    from the regular *k-means* algorithm
    but with the balanced clusters.
    The second is to compute balanced predictions
    over the test set. That implies that the predictions
    for the same observations might change depending
    on the set it belongs to.

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('digressions', 'constraint_kmeans')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))
    """

    # Accepted values for the *strategy* parameter.
    _strategy_value = {'distance', 'gain'}

    def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
                 tol=0.0001, precompute_distances='auto', verbose=0,
                 random_state=None, copy_x=True, n_jobs=1, algorithm='auto',
                 balanced_predictions=False, strategy='gain', kmeans0=True):
        """
        @param      n_clusters              number of clusters
        @param      init                    used by :epkg:`k-means`
        @param      n_init                  used by :epkg:`k-means`
        @param      max_iter                used by :epkg:`k-means`
        @param      tol                     used by :epkg:`k-means`
        @param      precompute_distances    used by :epkg:`k-means`
        @param      verbose                 used by :epkg:`k-means`
        @param      random_state            used by :epkg:`k-means`
        @param      copy_x                  used by :epkg:`k-means`
        @param      n_jobs                  used by :epkg:`k-means`
        @param      algorithm               used by :epkg:`k-means`
        @param      balanced_predictions    produces balanced predictions
                                            or the regular ones
        @param      strategy                strategy or algorithm used to abide
                                            by the constraint
        @param      kmeans0                 if True, applies *k-means* algorithm first

        The parameter *strategy* determines how
        observations should be assigned to a cluster.
        The value can be:

        * ``'distance'``: observations are ranked by distance to a cluster,
          the algorithm assigns first point to the closest center unless it reached
          the maximum size
        * ``'gain'``: follows the algorithm described at
          `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_

        Raises *ValueError* if *strategy* is not one of the accepted values.
        """
        KMeans.__init__(self, n_clusters=n_clusters, init=init, n_init=n_init,
                        max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
                        verbose=verbose, random_state=random_state, copy_x=copy_x,
                        n_jobs=n_jobs, algorithm=algorithm)
        self.balanced_predictions = balanced_predictions
        self.strategy = strategy
        self.kmeans0 = kmeans0
        if strategy not in ConstraintKMeans._strategy_value:
            raise ValueError('strategy must be in {0}'.format(
                ConstraintKMeans._strategy_value))

    def fit(self, X, y=None, sample_weight=None, fLOG=None):
        """
        Computes the constraint k-means clustering.

        @param      X               features, indexed as ``X[i, :]`` and
                                    exposing ``shape`` and ``dtype``
        @param      y               ignored by this method, forwarded
                                    to ``KMeans.fit``
        @param      sample_weight   sample weight
        @param      fLOG            logging function
        @return                     self

        If *kmeans0* is True, a regular *k-means* runs first with half of
        *max_iter*; otherwise labels and centers are drawn at random.
        Method @see me constraint_kmeans then completes the training.
        """
        if self.kmeans0:
            # The plain k-means run is limited to half of max_iter.
            # try/finally guarantees the user-supplied value is restored
            # even if KMeans.fit raises, so the estimator is never left
            # with a silently halved max_iter.
            full_max_iter = self.max_iter
            self.max_iter = full_max_iter // 2
            try:
                KMeans.fit(self, X, y, sample_weight=sample_weight)
            finally:
                self.max_iter = full_max_iter
            state = None
        else:
            # random_state may already be a RandomState instance
            # (KMeans accepts one); RandomState(instance) would fail.
            seed = self.random_state
            if isinstance(seed, numpy.random.RandomState):
                state = seed
            else:
                state = numpy.random.RandomState(seed)

            n_samples = X.shape[0]
            labels = state.randint(
                0, self.n_clusters, n_samples, dtype=numpy.int32)
            # Initial centers are observations picked among *all* rows of X
            # (the bound is the number of samples, not the number of clusters).
            picked = state.randint(0, n_samples, self.n_clusters)
            centers = numpy.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
            centers[:, :] = X[picked, :]

            self.labels_ = labels
            self.cluster_centers_ = centers
            self.inertia_ = float(n_samples)
            self.n_iter_ = 0

        return self.constraint_kmeans(X, sample_weight=sample_weight,
                                      state=state, fLOG=fLOG)

    def constraint_kmeans(self, X, sample_weight=None, state=None, fLOG=None):
        """
        Completes the constraint k-means.

        @param      X               features
        @param      sample_weight   sample weight
        @param      state           random state forwarded to the function
                                    ``constraint_kmeans`` (None after a
                                    regular *k-means* initialization)
        @param      fLOG            logging function
        @return                     self

        The method reads ``labels_``, ``cluster_centers_``, ``inertia_``
        and ``n_iter_`` and overwrites them with the values returned
        by the function ``constraint_kmeans``.
        """
        # The name below resolves to the module-level function,
        # not to this method.
        results = constraint_kmeans(X, self.labels_, sample_weight, self.cluster_centers_,
                                    self.inertia_, self.precompute_distances, self.n_iter_,
                                    self.max_iter, verbose=self.verbose, strategy=self.strategy,
                                    state=state, fLOG=fLOG)
        self.labels_, self.cluster_centers_, self.inertia_, self.n_iter_ = results
        return self

    def predict(self, X, sample_weight=None):
        """
        Computes the predictions.

        @param      X               features
        @param      sample_weight   sample weight, only used by
                                    the regular predictions
        @return                     predicted labels

        If *balanced_predictions* is True, the labels come from
        ``constraint_predictions`` called with strategy
        ``self.strategy + '_p'``, otherwise from ``KMeans.predict``.
        """
        if not self.balanced_predictions:
            return KMeans.predict(self, X, sample_weight=sample_weight)
        labels, _, __ = constraint_predictions(
            X, self.cluster_centers_, strategy=self.strategy + '_p')
        return labels

    def transform(self, X):
        """
        Computes the distances between observations and clusters.

        @param      X       features
        @return             distances

        If *balanced_predictions* is False, the result is the one of
        ``KMeans.transform``. Otherwise, the distances come from
        ``constraint_predictions`` and every distance strictly smaller
        than the distance to the assigned cluster is replaced by twice
        the maximum distance, so that no cluster appears closer than
        the assigned one.
        """
        if not self.balanced_predictions:
            return KMeans.transform(self, X)

        labels, distances, __ = constraint_predictions(
            X, self.cluster_centers_, strategy=self.strategy)
        # Due to the constraint, the assigned cluster is not always the
        # closest one. Smaller distances are overwritten in place with
        # max * 2 (numpy.nan would be best).
        mx = distances.max() * 2
        for i, label in enumerate(labels):
            assigned = distances[i, label]
            if assigned > distances[i, :].min():
                distances[i, distances[i, :] < assigned] = mx
        return distances

    def score(self, X, y=None, sample_weight=None):
        """
        Returns distances between observations and clusters.

        @param      X               features
        @param      y               unused
        @param      sample_weight   unused
        @return                     distances

        If *balanced_predictions* is True, the method returns the third
        result of ``constraint_predictions`` (presumably the distance
        to the assigned cluster, to be confirmed against that function).
        Otherwise it returns the squared euclidean distances computed by
        ``euclidean_distances(self.cluster_centers_, X, squared=True)``:
        one row per cluster, one column per observation.
        """
        if not self.balanced_predictions:
            return euclidean_distances(self.cluster_centers_, X, squared=True)
        _, __, dist_close = constraint_predictions(
            X, self.cluster_centers_, strategy=self.strategy)
        return dist_close

186 if self.balanced_predictions: 

187 _, __, dist_close = constraint_predictions( 

188 X, self.cluster_centers_, strategy=self.strategy) 

189 return dist_close 

190 else: 

191 return euclidean_distances(self.cluster_centers_, X, squared=True)