1""" 

2@file 

3@brief Implements a piecewise linear regression. 

4""" 

5import numpy 

6import numpy.random 

7import pandas 

8from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin, clone 

9from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier 

10from sklearn.linear_model import LinearRegression, LogisticRegression 

11from sklearn.preprocessing import KBinsDiscretizer 

12from sklearn.utils._joblib import Parallel, delayed 

13from sklearn.utils.fixes import _joblib_parallel_args 

14try: 

15 from tqdm import tqdm 

16except ImportError: # pragma: no cover 

17 pass 

18 

19 

def _fit_piecewise_estimator(i, model, X, y, sample_weight, association,
                             nb_classes, random_state):
    # Fits *model* on the subset of rows mapped to bucket *i*.
    ind = association == i
    if not numpy.any(ind):
        # No training example for this bucket.
        return None
    Xi = X[ind, :]
    yi = y[ind]
    sw = sample_weight[ind] if sample_weight is not None else None

    if nb_classes is not None and len(set(yi)) != nb_classes:
        # A classifier requires at least one example of each class
        # in every bucket: borrow random examples from the other
        # buckets until every class is represented.
        if random_state is None:
            random_state = numpy.random.RandomState()  # pylint: disable=E1101
        addition = numpy.arange(len(ind))
        random_state.shuffle(addition)
        found = set(yi)
        allcl = set(y)
        res = []
        for ki in addition:
            if len(found) == len(allcl):
                break
            if y[ki] not in found:
                res.append(ki)
                found.add(y[ki])
        ind = ind.copy()
        for ki in res:
            ind[ki] = True

        Xi = X[ind, :]
        yi = y[ind]
        sw = sample_weight[ind] if sample_weight is not None else None

    return model.fit(Xi, yi, sample_weight=sw)


def _predict_piecewise_estimator(i, est, X, association):
    # Predicts with the estimator of bucket *i* on the rows mapped to it.
    ind = association == i
    if not numpy.any(ind):
        return None, None
    return ind, est.predict(X[ind, :])


def _predict_proba_piecewise_estimator(i, est, X, association):
    # Same as above for *predict_proba*.
    ind = association == i
    if not numpy.any(ind):
        return None, None
    return ind, est.predict_proba(X[ind, :])


def _decision_function_piecewise_estimator(i, est, X, association):
    # Same as above for *decision_function*.
    ind = association == i
    if not numpy.any(ind):
        return None, None
    return ind, est.decision_function(X[ind, :])


class PiecewiseEstimator(BaseEstimator):
    """
    Uses a :epkg:`decision tree` to split the space of features
    into buckets and trains a linear regression on each of them.
    The second estimator can be a :epkg:`sklearn:linear_model:LinearRegression`
    for a regression or a :epkg:`sklearn:linear_model:LogisticRegression`
    for a classification. It can also be a :epkg:`sklearn:dummy:DummyRegressor`
    or a :epkg:`sklearn:dummy:DummyClassifier` to just get the average
    on each bucket.
    When the buckets are defined by a decision tree and the
    estimator is linear, @see cl PiecewiseTreeRegressor optimizes
    the buckets based on the results of a linear regression,
    which usually improves the accuracy.
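
    The bucketing step can be reproduced independently, as in this
    minimal sketch of the idea (not the exact implementation, the class
    itself relies on ``decision_path``; the data and the parameter
    values below are illustrative)::

        import numpy
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.linear_model import LinearRegression

        X = numpy.random.randn(100, 2)
        y = X[:, 0] * 2 + numpy.random.randn(100) * 0.1
        tree = DecisionTreeRegressor(max_depth=2).fit(X, y)
        leaves = tree.apply(X)  # bucket identifier for every row
        models = {leaf: LinearRegression().fit(X[leaves == leaf],
                                               y[leaves == leaf])
                  for leaf in set(leaves)}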

88 """ 

    def __init__(self, binner=None, estimator=None, n_jobs=None, verbose=False):
        """
        @param      binner      transformer or predictor which creates the buckets
        @param      estimator   predictor trained on every bucket
        @param      n_jobs      number of parallel jobs (for training and predicting)
        @param      verbose     boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                to fit the estimators

        *binner* allows the following values:

        - ``'bins'``: the model is :epkg:`sklearn:preprocessing:KBinsDiscretizer`
        - any instantiated model

        *estimator* allows the following values:

        - ``None``: not allowed at this level, the subclasses replace it
          with a default model
        - any instantiated model
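
        For example, a construction with explicit models (a minimal
        sketch, the parameter values are illustrative)::

            from sklearn.preprocessing import KBinsDiscretizer
            from sklearn.linear_model import LinearRegression

            model = PiecewiseEstimator(binner=KBinsDiscretizer(n_bins=3),
                                       estimator=LinearRegression())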

107 """ 

108 BaseEstimator.__init__(self) 

109 if estimator is None: 

110 raise ValueError( # pragma: no cover 

111 "estimator cannot be null.") 

112 if binner is None: 

113 raise TypeError( # pragma: no cover 

114 "Unsupported options for binner=='tree' and model {}.".format( 

115 type(estimator))) 

116 elif binner == "bins": 

117 binner = KBinsDiscretizer() 

118 self.binner = binner 

119 self.estimator = estimator 

120 self.n_jobs = n_jobs 

121 self.verbose = verbose 

    @property
    def n_estimators_(self):
        """
        Returns the number of estimators, i.e. the number of buckets
        the data was split into.
        """
        return len(self.estimators_)

    def _mapping_train(self, X, binner):
        # Builds the mapping between rows and buckets on the training data.
        if hasattr(binner, "tree_"):
            # The binner is a decision tree, buckets are its leaves
            # (leaf nodes have both children set to -1).
            tree = binner.tree_
            leaves = [i for i in range(len(tree.children_left))
                      if tree.children_left[i] <= i and tree.children_right[i] <= i]
            dec_path = self.binner_.decision_path(X)
            association = numpy.zeros((X.shape[0],))
            association[:] = -1
            mapping = {}
            ntree = 0
            for j in leaves:
                ind = dec_path[:, j] == 1
                ind = numpy.asarray(ind.todense()).flatten()
                if not numpy.any(ind):
                    # No training example for this bucket.
                    continue
                mapping[j] = ntree
                association[ind] = ntree
                ntree += 1

        elif hasattr(binner, "transform"):
            # The binner is a transform, buckets are its distinct outputs.
            tr = binner.transform(X)
            unique = set()
            for x in tr:
                d = tuple(numpy.asarray(
                    x.todense()).ravel().astype(numpy.int32))
                unique.add(d)
            leaves = list(sorted(unique))
            association = numpy.zeros((X.shape[0],))
            association[:] = -1
            mapping = {}
            for i, le in enumerate(leaves):
                mapping[le] = i
            for i, x in enumerate(tr):
                d = tuple(numpy.asarray(
                    x.todense()).ravel().astype(numpy.int32))
                association[i] = mapping.get(d, -1)
        else:
            raise NotImplementedError(  # pragma: no cover
                "binner is not a decision tree or a transform")

        return association, mapping, leaves

    def transform_bins(self, X):
        """
        Maps every row of *X* to one estimator in *self.estimators_*.
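
        For example, a hedged sketch (``model`` is a fitted piecewise
        estimator, the variable names are illustrative)::

            association = model.transform_bins(X)
            # association[i] is the index of the estimator which handles
            # row i, or -1 when no bucket matches the row.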

178 """ 

179 binner = self.binner_ 

180 if hasattr(binner, "tree_"): 

181 dec_path = self.binner_.decision_path(X) 

182 association = numpy.zeros((X.shape[0],)) 

183 association[:] = -1 

184 for j in self.leaves_: 

185 ind = dec_path[:, j] == 1 

186 ind = numpy.asarray(ind.todense()).flatten() 

187 if not numpy.any(ind): 

188 # No training example for this bucket. 

189 continue 

190 association[ind] = self.mapping_.get(j, -1) 

191 

192 elif hasattr(binner, "transform"): 

193 association = numpy.zeros((X.shape[0],)) 

194 association[:] = -1 

195 tr = binner.transform(X) 

196 for i, x in enumerate(tr): 

197 d = tuple(numpy.asarray( 

198 x.todense()).ravel().astype(numpy.int32)) 

199 association[i] = self.mapping_.get(d, -1) 

200 else: 

201 raise NotImplementedError( # pragma: no cover 

202 "binner is not a decision tree or a transform") 

203 return association 

    def fit(self, X, y, sample_weight=None):
        """
        Trains the binner and then an estimator on every bucket.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :param y: target
        :param sample_weight: sample weights
        :return: self: returns an instance of self

        Fitted attributes:

        * `binner_`: binner
        * `estimators_`: list of estimators, one per bucket,
          each of them mapped to a leaf of the tree
        * `mean_estimator_`: estimator trained on the whole
          dataset in case the binner cannot find a bucket for
          a new observation
        * `dim_`: dimension of the output
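
        For example (a hedged sketch, the attribute values depend on
        the data and on the binner)::

            model.fit(X, y)
            model.n_estimators_      # number of buckets
            model.estimators_[0]     # model trained on the first bucket
            model.mean_estimator_    # fallback model for unmatched rows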

225 """ 

226 if len(y.shape) == 2: 

227 if y.shape[-1] == 1: 

228 y = y.ravel() 

229 else: 

230 raise RuntimeError( 

231 "This regressor only works with single dimension targets.") 

232 if isinstance(X, pandas.DataFrame): 

233 X = X.values 

234 if isinstance(X, list): 

235 raise TypeError( # pragma: no cover 

236 "X cannot be a list.") 

237 binner = clone(self.binner) 

238 if sample_weight is None: 

239 self.binner_ = binner.fit(X, y) 

240 else: 

241 self.binner_ = binner.fit(X, y, sample_weight=sample_weight) 

242 

243 association, self.mapping_, self.leaves_ = self._mapping_train( 

244 X, self.binner_) 

245 

246 estimators = [clone(self.estimator) for i in self.mapping_] 

247 

248 loop = (tqdm(range(len(estimators))) 

249 if self.verbose == 'tqdm' else range(len(estimators))) 

250 verbose = 1 if self.verbose == 'tqdm' else (1 if self.verbose else 0) 

251 

252 self.mean_estimator_ = clone(self.estimator).fit(X, y, sample_weight) 

253 nb_classes = (None if not hasattr(self.mean_estimator_, 'classes_') 

254 else len(set(self.mean_estimator_.classes_))) 

255 

256 if hasattr(self, 'random_state') and self.random_state is not None: # pylint: disable=E1101 

257 rnd = numpy.random.RandomState( # pylint: disable=E1101 

258 self.random_state) # pylint: disable=E1101 

259 else: 

260 rnd = None 

261 

262 self.estimators_ = \ 

263 Parallel(n_jobs=self.n_jobs, verbose=verbose, 

264 **_joblib_parallel_args(prefer='threads'))( 

265 delayed(_fit_piecewise_estimator)( 

266 i, estimators[i], X, y, sample_weight, association, nb_classes, rnd) 

267 for i in loop) 

268 

269 self.dim_ = 1 if len(y.shape) == 1 else y.shape[1] 

270 if hasattr(self.estimators_[0], 'classes_'): 

271 self.classes_ = self.estimators_[0].classes_ 

272 return self 

    def _apply_predict_method(self, X, method, parallelized, dimout):
        """
        Generic *predict* method, works for *predict_proba* and
        *decision_function* as well.
        """
        if len(self.estimators_) == 0:
            raise RuntimeError(  # pragma: no cover
                "Estimator was apparently fitted but contains no estimator.")
        if not hasattr(self.estimators_[0], method):
            raise TypeError(  # pragma: no cover
                "Estimator {} does not have method '{}'.".format(
                    type(self.estimators_[0]), method))
        if isinstance(X, pandas.DataFrame):
            X = X.values

        association = self.transform_bins(X)

        indpred = Parallel(n_jobs=self.n_jobs,
                           **_joblib_parallel_args(prefer='threads'))(
            delayed(parallelized)(i, model, X, association)
            for i, model in enumerate(self.estimators_))

        pred = numpy.zeros((X.shape[0], dimout)
                           if dimout > 1 else (X.shape[0],))
        indall = numpy.zeros((X.shape[0],), dtype=bool)
        for ind, p in indpred:
            if ind is None:
                continue
            pred[ind] = p
            indall = numpy.logical_or(indall, ind)  # pylint: disable=E1111

        # Rows which do not fall into any bucket are handled
        # by the estimator trained on the whole dataset.
        indall = numpy.logical_not(indall)  # pylint: disable=E1111
        Xmissed = X[indall]
        if Xmissed.shape[0] > 0:
            meth = getattr(self.mean_estimator_, method)
            missed = meth(Xmissed)
            pred[indall] = missed
        return pred


class PiecewiseRegressor(PiecewiseEstimator, RegressorMixin):
    """
    Uses a :epkg:`decision tree` to split the space of features
    into buckets and trains a linear regression (default) on each of them.
    The second estimator is usually a :epkg:`sklearn:linear_model:LinearRegression`.
    It can also be a :epkg:`sklearn:dummy:DummyRegressor` to just get
    the average on each bucket.
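
    A minimal usage sketch on synthetic data (the dataset, the
    import path and the parameter values are illustrative)::

        import numpy
        from mlinsights.mlmodel import PiecewiseRegressor

        X = numpy.random.randn(100, 2)
        y = X[:, 0] * 2 + (X[:, 0] > 0) * 3 + numpy.random.randn(100) * 0.1
        model = PiecewiseRegressor().fit(X, y)
        print(model.n_estimators_)   # number of buckets found by the tree
        print(model.predict(X[:5]))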

322 """ 

    def __init__(self, binner=None, estimator=None, n_jobs=None, verbose=False):
        """
        @param      binner      transformer or predictor which creates the buckets
        @param      estimator   predictor trained on every bucket
        @param      n_jobs      number of parallel jobs (for training and predicting)
        @param      verbose     boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                to fit the estimators

        *binner* allows the following values:

        - ``'tree'``: the model is :epkg:`sklearn:tree:DecisionTreeRegressor`
        - ``'bins'``: the model is :epkg:`sklearn:preprocessing:KBinsDiscretizer`
        - any instantiated model

        *estimator* allows the following values:

        - ``None``: the model is :epkg:`sklearn:linear_model:LinearRegression`
        - any instantiated model
        """
        if estimator is None:
            estimator = LinearRegression()
        if binner in ('tree', None):
            binner = DecisionTreeRegressor(min_samples_leaf=2)
        RegressorMixin.__init__(self)
        PiecewiseEstimator.__init__(self, binner=binner, estimator=estimator,
                                    n_jobs=n_jobs, verbose=verbose)

    def predict(self, X):
        """
        Computes the predictions.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: predictions
        """
        return self._apply_predict_method(
            X, "predict", _predict_piecewise_estimator, self.dim_)


class PiecewiseClassifier(PiecewiseEstimator, ClassifierMixin):
    """
    Uses a :epkg:`decision tree` to split the space of features
    into buckets and trains a logistic regression (default) on each of them.
    The second estimator is usually a :epkg:`sklearn:linear_model:LogisticRegression`.
    It can also be a :epkg:`sklearn:dummy:DummyClassifier` to just get
    the average on each bucket.

    The main issue with the *PiecewiseClassifier* is that every bucket
    must contain at least one example of each class, which may not happen.
    To avoid that, the training picks up random examples
    from other buckets to make sure every class is represented.
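
    A minimal usage sketch on synthetic data (the dataset, the
    import path and the parameter values are illustrative)::

        import numpy
        from mlinsights.mlmodel import PiecewiseClassifier

        X = numpy.random.randn(100, 2)
        y = (X[:, 0] + X[:, 1] > 0).astype(numpy.int32)
        model = PiecewiseClassifier(random_state=0).fit(X, y)
        print(model.predict_proba(X[:5]))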

374 """ 

    def __init__(self, binner=None, estimator=None, n_jobs=None,
                 random_state=None, verbose=False):
        """
        @param      binner          transformer or predictor which creates the buckets
        @param      estimator       predictor trained on every bucket
        @param      n_jobs          number of parallel jobs (for training and predicting)
        @param      random_state    used to pick up random examples when buckets do not
                                    contain enough examples of each class
        @param      verbose         boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                    to fit the estimators

        *binner* allows the following values:

        - ``'tree'``: the model is :epkg:`sklearn:tree:DecisionTreeClassifier`
        - ``'bins'``: the model is :epkg:`sklearn:preprocessing:KBinsDiscretizer`
        - any instantiated model

        *estimator* allows the following values:

        - ``None``: the model is :epkg:`sklearn:linear_model:LogisticRegression`
        - any instantiated model
        """
        if estimator is None:
            estimator = LogisticRegression()
        if binner in ('tree', None):
            binner = DecisionTreeClassifier(min_samples_leaf=5)
        ClassifierMixin.__init__(self)
        PiecewiseEstimator.__init__(
            self, binner=binner, estimator=estimator,
            n_jobs=n_jobs, verbose=verbose)
        self.random_state = random_state

    def predict(self, X):
        """
        Computes the predictions.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: predictions
        """
        pred = self._apply_predict_method(
            X, "predict", _predict_piecewise_estimator, 1)
        return pred.astype(numpy.int32)

    def predict_proba(self, X):
        """
        Computes the prediction probabilities.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: prediction probabilities
        """
        return self._apply_predict_method(
            X, "predict_proba", _predict_proba_piecewise_estimator,
            len(self.mean_estimator_.classes_))

    def decision_function(self, X):
        """
        Computes the decision function.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: decision function values
        """
        justone = self.mean_estimator_.decision_function(X[:1])
        return self._apply_predict_method(
            X, "decision_function", _decision_function_piecewise_estimator,
            1 if len(justone.shape) == 1 else justone.shape[1])
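

if __name__ == "__main__":
    # Illustrative demo, not part of the original module: compares the
    # piecewise regression to a plain linear regression on data with a
    # breakpoint (the dataset and parameters below are arbitrary choices).
    rnd = numpy.random.RandomState(0)
    X = rnd.randn(200, 1)
    y = X[:, 0] * 2 + (X[:, 0] > 0) * 3 + rnd.randn(200) * 0.1

    piecewise = PiecewiseRegressor().fit(X, y)
    linear = LinearRegression().fit(X, y)
    print("piecewise R2:", piecewise.score(X, y))
    print("linear R2:", linear.score(X, y))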