1""" 

2@file 

3@brief Implements a piecewise linear regression. 

4""" 

5import numpy 

6import numpy.random 

7import pandas 

8from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin, clone 

9from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier 

10from sklearn.linear_model import LinearRegression, LogisticRegression 

11from sklearn.preprocessing import KBinsDiscretizer 

12from sklearn.utils._joblib import Parallel, delayed 

13from sklearn.utils.fixes import _joblib_parallel_args 

14try: 

15 from tqdm import tqdm 

16except ImportError: # pragma: no cover 

17 pass 

18 

19 

def _fit_piecewise_estimator(i, model, X, y, sample_weight, association,
                             nb_classes, random_state):
    # Fits *model* on the subset of rows mapped to bucket *i*.
    ind = association == i
    if not numpy.any(ind):
        # No training example for this bucket.
        return None
    Xi = X[ind, :]
    yi = y[ind]
    sw = sample_weight[ind] if sample_weight is not None else None

    if nb_classes is not None and len(set(yi)) != nb_classes:
        # A classifier requires at least one example of each class
        # in every bucket: borrow random examples from the other
        # buckets until every class is represented.
        if random_state is None:
            random_state = numpy.random.RandomState()  # pylint: disable=E1101
        addition = numpy.arange(len(ind))
        random_state.shuffle(addition)
        found = set(yi)
        allcl = set(y)
        res = []
        for ki in addition:
            if len(found) == len(allcl):
                break
            if y[ki] not in found:
                res.append(ki)
                found.add(y[ki])
        ind = ind.copy()
        for ki in res:
            ind[ki] = True

        Xi = X[ind, :]
        yi = y[ind]
        sw = sample_weight[ind] if sample_weight is not None else None

    return model.fit(Xi, yi, sample_weight=sw)


def _predict_piecewise_estimator(i, est, X, association):
    # Predicts with the estimator of bucket *i* on the rows mapped to it.
    ind = association == i
    if not numpy.any(ind):
        return None, None
    return ind, est.predict(X[ind, :])


def _predict_proba_piecewise_estimator(i, est, X, association):
    # Same as above for *predict_proba*.
    ind = association == i
    if not numpy.any(ind):
        return None, None
    return ind, est.predict_proba(X[ind, :])


def _decision_function_piecewise_estimator(i, est, X, association):
    # Same as above for *decision_function*.
    ind = association == i
    if not numpy.any(ind):
        return None, None
    return ind, est.decision_function(X[ind, :])


class PiecewiseEstimator(BaseEstimator):
    """
    Uses a :epkg:`decision tree` to split the space of features
    into buckets and trains a linear regression on each of them.
    The second estimator can be a :epkg:`sklearn:linear_model:LinearRegression`
    for a regression or a :epkg:`sklearn:linear_model:LogisticRegression`
    for a classification. It can also be a :epkg:`sklearn:dummy:DummyRegressor`
    or a :epkg:`sklearn:dummy:DummyClassifier` to just get the average
    on each bucket.
    When the buckets are defined by a decision tree and the
    estimator is linear, @see cl PiecewiseTreeRegressor optimizes
    the buckets based on the results of a linear regression,
    which usually improves the accuracy.
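
    The bucketing step can be reproduced independently, as in this
    minimal sketch of the idea (not the exact implementation, the class
    itself relies on ``decision_path``; the data and the parameter
    values below are illustrative)::

        import numpy
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.linear_model import LinearRegression

        X = numpy.random.randn(100, 2)
        y = X[:, 0] * 2 + numpy.random.randn(100) * 0.1
        tree = DecisionTreeRegressor(max_depth=2).fit(X, y)
        leaves = tree.apply(X)  # bucket identifier for every row
        models = {leaf: LinearRegression().fit(X[leaves == leaf],
                                               y[leaves == leaf])
                  for leaf in set(leaves)}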

88 """ 

    def __init__(self, binner=None, estimator=None, n_jobs=None, verbose=False):
        """
        @param      binner      transformer or predictor which creates the buckets
        @param      estimator   predictor trained on every bucket
        @param      n_jobs      number of parallel jobs (for training and predicting)
        @param      verbose     boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                to fit the estimators

        *binner* allows the following values:

        - ``'bins'``: the model is :epkg:`sklearn:preprocessing:KBinsDiscretizer`
        - any instantiated model

        *estimator* allows the following values:

        - ``None``: not allowed at this level, the subclasses replace it
          with a default model
        - any instantiated model
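
        For example, a construction with explicit models (a minimal
        sketch, the parameter values are illustrative)::

            from sklearn.preprocessing import KBinsDiscretizer
            from sklearn.linear_model import LinearRegression

            model = PiecewiseEstimator(binner=KBinsDiscretizer(n_bins=3),
                                       estimator=LinearRegression())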

107 """ 

108 BaseEstimator.__init__(self) 

109 if estimator is None: 

110 raise ValueError( # pragma: no cover 

111 "estimator cannot be null.") 

112 if binner is None: 

113 raise TypeError( # pragma: no cover 

114 "Unsupported options for binner=='tree' and model {}.".format( 

115 type(estimator))) 

116 elif binner == "bins": 

117 binner = KBinsDiscretizer() 

118 self.binner = binner 

119 self.estimator = estimator 

120 self.n_jobs = n_jobs 

121 self.verbose = verbose 

    @property
    def n_estimators_(self):
        """
        Returns the number of estimators, i.e. the number of buckets
        the data was split into.
        """
        return len(self.estimators_)

    def _mapping_train(self, X, binner):
        # Builds the mapping between rows and buckets on the training data.
        if hasattr(binner, "tree_"):
            # The binner is a decision tree, buckets are its leaves
            # (leaf nodes have both children set to -1).
            tree = binner.tree_
            leaves = [i for i in range(len(tree.children_left))
                      if tree.children_left[i] <= i and tree.children_right[i] <= i]
            dec_path = self.binner_.decision_path(X)
            association = numpy.zeros((X.shape[0],))
            association[:] = -1
            mapping = {}
            ntree = 0
            for j in leaves:
                ind = dec_path[:, j] == 1
                ind = numpy.asarray(ind.todense()).flatten()
                if not numpy.any(ind):
                    # No training example for this bucket.
                    continue
                mapping[j] = ntree
                association[ind] = ntree
                ntree += 1

        elif hasattr(binner, "transform"):
            # The binner is a transform, buckets are its distinct outputs.
            tr = binner.transform(X)
            unique = set()
            for x in tr:
                d = tuple(numpy.asarray(
                    x.todense()).ravel().astype(numpy.int32))
                unique.add(d)
            leaves = list(sorted(unique))
            association = numpy.zeros((X.shape[0],))
            association[:] = -1
            mapping = {}
            for i, le in enumerate(leaves):
                mapping[le] = i
            for i, x in enumerate(tr):
                d = tuple(numpy.asarray(
                    x.todense()).ravel().astype(numpy.int32))
                association[i] = mapping.get(d, -1)
        else:
            raise NotImplementedError(  # pragma: no cover
                "binner is not a decision tree or a transform")

        return association, mapping, leaves

    def transform_bins(self, X):
        """
        Maps every row of *X* to one estimator in *self.estimators_*.
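
        For example, a hedged sketch (``model`` is a fitted piecewise
        estimator, the variable names are illustrative)::

            association = model.transform_bins(X)
            # association[i] is the index of the estimator which handles
            # row i, or -1 when no bucket matches the row.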

178 """ 

179 binner = self.binner_ 

180 if hasattr(binner, "tree_"): 

181 dec_path = self.binner_.decision_path(X) 

182 association = numpy.zeros((X.shape[0],)) 

183 association[:] = -1 

184 for j in self.leaves_: 

185 ind = dec_path[:, j] == 1 

186 ind = numpy.asarray(ind.todense()).flatten() 

187 if not numpy.any(ind): 

188 # No training example for this bucket. 

189 continue 

190 association[ind] = self.mapping_.get(j, -1) 

191 

192 elif hasattr(binner, "transform"): 

193 association = numpy.zeros((X.shape[0],)) 

194 association[:] = -1 

195 tr = binner.transform(X) 

196 for i, x in enumerate(tr): 

197 d = tuple(numpy.asarray( 

198 x.todense()).ravel().astype(numpy.int32)) 

199 association[i] = self.mapping_.get(d, -1) 

200 else: 

201 raise NotImplementedError( # pragma: no cover 

202 "binner is not a decision tree or a transform") 

203 return association 

    def fit(self, X, y, sample_weight=None):
        """
        Trains the binner and then an estimator on every bucket.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :param y: target
        :param sample_weight: sample weights
        :return: self: returns an instance of self

        Fitted attributes:

        * `binner_`: binner
        * `estimators_`: list of estimators, one per bucket,
          each of them mapped to a leaf of the tree
        * `mean_estimator_`: estimator trained on the whole
          dataset in case the binner cannot find a bucket for
          a new observation
        * `dim_`: dimension of the output
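
        For example (a hedged sketch, the attribute values depend on
        the data and on the binner)::

            model.fit(X, y)
            model.n_estimators_      # number of buckets
            model.estimators_[0]     # model trained on the first bucket
            model.mean_estimator_    # fallback model for unmatched rows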

225 """ 

226 if len(y.shape) == 2: 

227 if y.shape[-1] == 1: 

228 y = y.ravel() 

229 else: 

230 raise RuntimeError( 

231 "This regressor only works with single dimension targets.") 

232 if isinstance(X, pandas.DataFrame): 

233 X = X.values 

234 if isinstance(X, list): 

235 raise TypeError( # pragma: no cover 

236 "X cannot be a list.") 

237 binner = clone(self.binner) 

238 if sample_weight is None: 

239 self.binner_ = binner.fit(X, y) 

240 else: 

241 self.binner_ = binner.fit(X, y, sample_weight=sample_weight) 

242 

243 association, self.mapping_, self.leaves_ = self._mapping_train( 

244 X, self.binner_) 

245 

246 estimators = [clone(self.estimator) for i in self.mapping_] 

247 

248 loop = (tqdm(range(len(estimators))) 

249 if self.verbose == 'tqdm' else range(len(estimators))) 

250 verbose = 1 if self.verbose == 'tqdm' else (1 if self.verbose else 0) 

251 

252 self.mean_estimator_ = clone(self.estimator).fit(X, y, sample_weight) 

253 nb_classes = (None if not hasattr(self.mean_estimator_, 'classes_') 

254 else len(set(self.mean_estimator_.classes_))) 

255 

256 if hasattr(self, 'random_state') and self.random_state is not None: # pylint: disable=E1101 

257 rnd = numpy.random.RandomState( # pylint: disable=E1101 

258 self.random_state) # pylint: disable=E1101 

259 else: 

260 rnd = None 

261 

262 self.estimators_ = \ 

263 Parallel(n_jobs=self.n_jobs, verbose=verbose, 

264 **_joblib_parallel_args(prefer='threads'))( 

265 delayed(_fit_piecewise_estimator)( 

266 i, estimators[i], X, y, sample_weight, association, nb_classes, rnd) 

267 for i in loop) 

268 

269 self.dim_ = 1 if len(y.shape) == 1 else y.shape[1] 

270 if hasattr(self.estimators_[0], 'classes_'): 

271 self.classes_ = self.estimators_[0].classes_ 

272 return self 

    def _apply_predict_method(self, X, method, parallelized, dimout):
        """
        Generic *predict* method, works for *predict_proba* and
        *decision_function* as well.
        """
        if len(self.estimators_) == 0:
            raise RuntimeError(  # pragma: no cover
                "Estimator was apparently fitted but contains no estimator.")
        if not hasattr(self.estimators_[0], method):
            raise TypeError(  # pragma: no cover
                "Estimator {} does not have method '{}'.".format(
                    type(self.estimators_[0]), method))
        if isinstance(X, pandas.DataFrame):
            X = X.values

        association = self.transform_bins(X)

        indpred = Parallel(n_jobs=self.n_jobs,
                           **_joblib_parallel_args(prefer='threads'))(
            delayed(parallelized)(i, model, X, association)
            for i, model in enumerate(self.estimators_))

        pred = numpy.zeros((X.shape[0], dimout)
                           if dimout > 1 else (X.shape[0],))
        indall = numpy.zeros((X.shape[0],), dtype=bool)
        for ind, p in indpred:
            if ind is None:
                continue
            pred[ind] = p
            indall = numpy.logical_or(indall, ind)  # pylint: disable=E1111

        # Rows which do not fall into any bucket are handled
        # by the estimator trained on the whole dataset.
        indall = numpy.logical_not(indall)  # pylint: disable=E1111
        Xmissed = X[indall]
        if Xmissed.shape[0] > 0:
            meth = getattr(self.mean_estimator_, method)
            missed = meth(Xmissed)
            pred[indall] = missed
        return pred


class PiecewiseRegressor(PiecewiseEstimator, RegressorMixin):
    """
    Uses a :epkg:`decision tree` to split the space of features
    into buckets and trains a linear regression (default) on each of them.
    The second estimator is usually a :epkg:`sklearn:linear_model:LinearRegression`.
    It can also be a :epkg:`sklearn:dummy:DummyRegressor` to just get
    the average on each bucket.
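
    A minimal usage sketch on synthetic data (the dataset, the
    import path and the parameter values are illustrative)::

        import numpy
        from mlinsights.mlmodel import PiecewiseRegressor

        X = numpy.random.randn(100, 2)
        y = X[:, 0] * 2 + (X[:, 0] > 0) * 3 + numpy.random.randn(100) * 0.1
        model = PiecewiseRegressor().fit(X, y)
        print(model.n_estimators_)   # number of buckets found by the tree
        print(model.predict(X[:5]))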

322 """ 

    def __init__(self, binner=None, estimator=None, n_jobs=None, verbose=False):
        """
        @param      binner      transformer or predictor which creates the buckets
        @param      estimator   predictor trained on every bucket
        @param      n_jobs      number of parallel jobs (for training and predicting)
        @param      verbose     boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                to fit the estimators

        *binner* allows the following values:

        - ``'tree'``: the model is :epkg:`sklearn:tree:DecisionTreeRegressor`
        - ``'bins'``: the model is :epkg:`sklearn:preprocessing:KBinsDiscretizer`
        - any instantiated model

        *estimator* allows the following values:

        - ``None``: the model is :epkg:`sklearn:linear_model:LinearRegression`
        - any instantiated model
        """
        if estimator is None:
            estimator = LinearRegression()
        if binner in ('tree', None):
            binner = DecisionTreeRegressor(min_samples_leaf=2)
        RegressorMixin.__init__(self)
        PiecewiseEstimator.__init__(self, binner=binner, estimator=estimator,
                                    n_jobs=n_jobs, verbose=verbose)

    def predict(self, X):
        """
        Computes the predictions.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: predictions
        """
        return self._apply_predict_method(
            X, "predict", _predict_piecewise_estimator, self.dim_)


class PiecewiseClassifier(PiecewiseEstimator, ClassifierMixin):
    """
    Uses a :epkg:`decision tree` to split the space of features
    into buckets and trains a logistic regression (default) on each of them.
    The second estimator is usually a :epkg:`sklearn:linear_model:LogisticRegression`.
    It can also be a :epkg:`sklearn:dummy:DummyClassifier` to just get
    the average on each bucket.

    The main issue with the *PiecewiseClassifier* is that every bucket
    must contain at least one example of each class, which may not happen.
    To avoid that, the training picks up random examples
    from other buckets to make sure every class is represented.
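
    A minimal usage sketch on synthetic data (the dataset, the
    import path and the parameter values are illustrative)::

        import numpy
        from mlinsights.mlmodel import PiecewiseClassifier

        X = numpy.random.randn(100, 2)
        y = (X[:, 0] + X[:, 1] > 0).astype(numpy.int32)
        model = PiecewiseClassifier(random_state=0).fit(X, y)
        print(model.predict_proba(X[:5]))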

374 """ 

    def __init__(self, binner=None, estimator=None, n_jobs=None,
                 random_state=None, verbose=False):
        """
        @param      binner          transformer or predictor which creates the buckets
        @param      estimator       predictor trained on every bucket
        @param      n_jobs          number of parallel jobs (for training and predicting)
        @param      random_state    used to pick up random examples when buckets do not
                                    contain enough examples of each class
        @param      verbose         boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                    to fit the estimators

        *binner* allows the following values:

        - ``'tree'``: the model is :epkg:`sklearn:tree:DecisionTreeClassifier`
        - ``'bins'``: the model is :epkg:`sklearn:preprocessing:KBinsDiscretizer`
        - any instantiated model

        *estimator* allows the following values:

        - ``None``: the model is :epkg:`sklearn:linear_model:LogisticRegression`
        - any instantiated model
        """
        if estimator is None:
            estimator = LogisticRegression()
        if binner in ('tree', None):
            binner = DecisionTreeClassifier(min_samples_leaf=5)
        ClassifierMixin.__init__(self)
        PiecewiseEstimator.__init__(
            self, binner=binner, estimator=estimator,
            n_jobs=n_jobs, verbose=verbose)
        self.random_state = random_state

    def predict(self, X):
        """
        Computes the predictions.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: predictions
        """
        pred = self._apply_predict_method(
            X, "predict", _predict_piecewise_estimator, 1)
        return pred.astype(numpy.int32)

    def predict_proba(self, X):
        """
        Computes the prediction probabilities.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: prediction probabilities
        """
        return self._apply_predict_method(
            X, "predict_proba", _predict_proba_piecewise_estimator,
            len(self.mean_estimator_.classes_))

    def decision_function(self, X):
        """
        Computes the decision function.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: decision function values
        """
        justone = self.mean_estimator_.decision_function(X[:1])
        return self._apply_predict_method(
            X, "decision_function", _decision_function_piecewise_estimator,
            1 if len(justone.shape) == 1 else justone.shape[1])
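

if __name__ == "__main__":
    # Illustrative demo, not part of the original module: compares the
    # piecewise regression to a plain linear regression on data with a
    # breakpoint (the dataset and parameters below are arbitrary choices).
    rnd = numpy.random.RandomState(0)
    X = rnd.randn(200, 1)
    y = X[:, 0] * 2 + (X[:, 0] > 0) * 3 + rnd.randn(200) * 0.1

    piecewise = PiecewiseRegressor().fit(X, y)
    linear = LinearRegression().fit(X, y)
    print("piecewise R2:", piecewise.score(X, y))
    print("linear R2:", linear.score(X, y))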