Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Implements a piecewise linear regression.
4"""
5import numpy
6import numpy.random
7import pandas
8from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin, clone
9from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
10from sklearn.linear_model import LinearRegression, LogisticRegression
11from sklearn.preprocessing import KBinsDiscretizer
12from sklearn.utils._joblib import Parallel, delayed
13from sklearn.utils.fixes import _joblib_parallel_args
14try:
15 from tqdm import tqdm
16except ImportError: # pragma: no cover
17 pass
20def _fit_piecewise_estimator(i, model, X, y, sample_weight, association, nb_classes, random_state):
21 ind = association == i
22 if not numpy.any(ind):
23 # No training example for this bucket.
24 return None
25 Xi = X[ind, :]
26 yi = y[ind]
27 sw = sample_weight[ind] if sample_weight is not None else None
29 if nb_classes is not None and len(set(yi)) != nb_classes:
30 # Issues a classifiers requires to have at least one example
31 # of each class.
32 if random_state is None:
33 random_state = numpy.random.RandomState() # pylint: disable=E1101
34 addition = numpy.arange(len(ind))
35 random_state.shuffle(addition)
36 found = set(yi)
37 allcl = set(y)
38 res = []
39 while len(found) < len(allcl):
40 for ki in addition:
41 if y[ki] not in found:
42 res.append(ki)
43 found.add(y[ki])
44 ind = ind.copy()
45 for ki in res:
46 ind[ki] = True
48 Xi = X[ind, :]
49 yi = y[ind]
50 sw = sample_weight[ind] if sample_weight is not None else None
52 return model.fit(Xi, yi, sample_weight=sw)
55def _predict_piecewise_estimator(i, est, X, association):
56 ind = association == i
57 if not numpy.any(ind):
58 return None, None
59 return ind, est.predict(X[ind, :])
62def _predict_proba_piecewise_estimator(i, est, X, association):
63 ind = association == i
64 if not numpy.any(ind):
65 return None, None
66 return ind, est.predict_proba(X[ind, :])
69def _decision_function_piecewise_estimator(i, est, X, association):
70 ind = association == i
71 if not numpy.any(ind):
72 return None, None
73 return ind, est.decision_function(X[ind, :])
76class PiecewiseEstimator(BaseEstimator):
77 """
78 Uses a :epkg:`decision tree` to split the space of features
79 into buckets and trains a linear regression on each of them.
80 The second estimator can be a :epkg:`sklearn:linear_model:LinearRegression`
81 for a regression or :epkg:`sklearn:linear_model:LogisticRegression`
82 for a classifier. It can also be :epkg:`sklearn:dummy:DummyRegressor`
83 :epkg:`sklearn:dummy:DummyClassifier` to just get the average on each bucket.
84 When the buckets are defined by a decision tree and the
85 estimator is linear, @see cl PiecewiseTreeRegressor optimizes
86 the buckets based on the results of a linear regression.
87 The accuracy is usually better.
88 """
90 def __init__(self, binner=None, estimator=None, n_jobs=None, verbose=False):
91 """
92 @param binner transformer or predictor which creates the buckets
93 @param estimator predictor trained on every bucket
94 @param n_jobs number of parallel jobs (for training and predicting)
95 @param verbose boolean or use ``'tqdm'`` to use :epkg:`tqdm`
96 to fit the estimators
98 *binner* must be filled or must be:
100 - ``'bins'``: the model :epkg:`sklearn:preprocessing:KBinsDiscretizer`
101 - any instanciated model
103 *estimator* allows the following values:
105 - ``None``: the model is :epkg:`sklearn:linear_model:LinearRegression`
106 - any instanciated model
107 """
108 BaseEstimator.__init__(self)
109 if estimator is None:
110 raise ValueError( # pragma: no cover
111 "estimator cannot be null.")
112 if binner is None:
113 raise TypeError( # pragma: no cover
114 "Unsupported options for binner=='tree' and model {}.".format(
115 type(estimator)))
116 elif binner == "bins":
117 binner = KBinsDiscretizer()
118 self.binner = binner
119 self.estimator = estimator
120 self.n_jobs = n_jobs
121 self.verbose = verbose
123 @property
124 def n_estimators_(self):
125 """
126 Returns the number of estimators = the number of buckets
127 the data was split in.
128 """
129 return len(self.estimators_)
131 def _mapping_train(self, X, binner):
132 if hasattr(binner, "tree_"):
133 tree = binner.tree_
134 leaves = [i for i in range(len(tree.children_left))
135 if tree.children_left[i] <= i and tree.children_right[i] <= i]
136 dec_path = self.binner_.decision_path(X)
137 association = numpy.zeros((X.shape[0],))
138 association[:] = -1
139 mapping = {}
140 ntree = 0
141 for j in leaves:
142 ind = dec_path[:, j] == 1
143 ind = numpy.asarray(ind.todense()).flatten()
144 if not numpy.any(ind):
145 # No training example for this bucket.
146 continue
147 mapping[j] = ntree
148 association[ind] = ntree
149 ntree += 1
151 elif hasattr(binner, "transform"):
152 tr = binner.transform(X)
153 unique = set()
154 for x in tr:
155 d = tuple(numpy.asarray(
156 x.todense()).ravel().astype(numpy.int32))
157 unique.add(d)
158 leaves = list(sorted(unique))
159 association = numpy.zeros((X.shape[0],))
160 association[:] = -1
161 ntree = 0
162 mapping = {}
163 for i, le in enumerate(leaves):
164 mapping[le] = i
165 for i, x in enumerate(tr):
166 d = tuple(numpy.asarray(
167 x.todense()).ravel().astype(numpy.int32))
168 association[i] = mapping.get(d, -1)
169 else:
170 raise NotImplementedError( # pragma: no cover
171 "binner is not a decision tree or a transform")
173 return association, mapping, leaves
175 def transform_bins(self, X):
176 """
177 Maps every row to a tree in *self.estimators_*.
178 """
179 binner = self.binner_
180 if hasattr(binner, "tree_"):
181 dec_path = self.binner_.decision_path(X)
182 association = numpy.zeros((X.shape[0],))
183 association[:] = -1
184 for j in self.leaves_:
185 ind = dec_path[:, j] == 1
186 ind = numpy.asarray(ind.todense()).flatten()
187 if not numpy.any(ind):
188 # No training example for this bucket.
189 continue
190 association[ind] = self.mapping_.get(j, -1)
192 elif hasattr(binner, "transform"):
193 association = numpy.zeros((X.shape[0],))
194 association[:] = -1
195 tr = binner.transform(X)
196 for i, x in enumerate(tr):
197 d = tuple(numpy.asarray(
198 x.todense()).ravel().astype(numpy.int32))
199 association[i] = self.mapping_.get(d, -1)
200 else:
201 raise NotImplementedError( # pragma: no cover
202 "binner is not a decision tree or a transform")
203 return association
205 def fit(self, X, y, sample_weight=None):
206 """
207 Trains the binner and an estimator on every
208 bucket.
210 :param X: features, *X* is converted into an array if *X* is a dataframe
211 :param y: target
212 :param sample_weight: sample weights
213 :return: self: returns an instance of self.
215 Fitted attributes:
217 * `binner_`: binner
218 * `estimators_`: dictionary of estimators, each of them
219 mapped to a leave to the tree
220 * `mean_estimator_`: estimator trained on the whole
221 datasets in case the binner can find a bucket for
222 a new observation
223 * `dim_`: dimension of the output
224 * `mean_`: average targets
225 """
226 if len(y.shape) == 2:
227 if y.shape[-1] == 1:
228 y = y.ravel()
229 else:
230 raise RuntimeError(
231 "This regressor only works with single dimension targets.")
232 if isinstance(X, pandas.DataFrame):
233 X = X.values
234 if isinstance(X, list):
235 raise TypeError( # pragma: no cover
236 "X cannot be a list.")
237 binner = clone(self.binner)
238 if sample_weight is None:
239 self.binner_ = binner.fit(X, y)
240 else:
241 self.binner_ = binner.fit(X, y, sample_weight=sample_weight)
243 association, self.mapping_, self.leaves_ = self._mapping_train(
244 X, self.binner_)
246 estimators = [clone(self.estimator) for i in self.mapping_]
248 loop = (tqdm(range(len(estimators)))
249 if self.verbose == 'tqdm' else range(len(estimators)))
250 verbose = 1 if self.verbose == 'tqdm' else (1 if self.verbose else 0)
252 self.mean_estimator_ = clone(self.estimator).fit(X, y, sample_weight)
253 nb_classes = (None if not hasattr(self.mean_estimator_, 'classes_')
254 else len(set(self.mean_estimator_.classes_)))
256 if hasattr(self, 'random_state') and self.random_state is not None: # pylint: disable=E1101
257 rnd = numpy.random.RandomState( # pylint: disable=E1101
258 self.random_state) # pylint: disable=E1101
259 else:
260 rnd = None
262 self.estimators_ = \
263 Parallel(n_jobs=self.n_jobs, verbose=verbose,
264 **_joblib_parallel_args(prefer='threads'))(
265 delayed(_fit_piecewise_estimator)(
266 i, estimators[i], X, y, sample_weight, association, nb_classes, rnd)
267 for i in loop)
269 self.dim_ = 1 if len(y.shape) == 1 else y.shape[1]
270 if hasattr(self.estimators_[0], 'classes_'):
271 self.classes_ = self.estimators_[0].classes_
272 return self
274 def _apply_predict_method(self, X, method, parallelized, dimout):
275 """
276 Generic *predict* method, works for *predict_proba* and
277 *decision_function* as well.
278 """
279 if len(self.estimators_) == 0:
280 raise RuntimeError( # pragma: no cover
281 "Estimator was apparently fitted but contains no estimator.")
282 if not hasattr(self.estimators_[0], method):
283 raise TypeError( # pragma: no cover
284 "Estimator {} does not have method '{}'.".format(
285 type(self.estimators_[0]), method))
286 if isinstance(X, pandas.DataFrame):
287 X = X.values
289 association = self.transform_bins(X)
291 indpred = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))(
292 delayed(parallelized)(i, model, X, association)
293 for i, model in enumerate(self.estimators_))
295 pred = numpy.zeros((X.shape[0], dimout)
296 if dimout > 1 else (X.shape[0],))
297 indall = numpy.empty((X.shape[0],))
298 indall[:] = False
299 for ind, p in indpred:
300 if ind is None:
301 continue
302 pred[ind] = p
303 indall = numpy.logical_or(indall, ind) # pylint: disable=E1111
305 # no in a bucket
306 indall = numpy.logical_not(indall) # pylint: disable=E1111
307 Xmissed = X[indall]
308 if Xmissed.shape[0] > 0:
309 meth = getattr(self.mean_estimator_, method)
310 missed = meth(Xmissed)
311 pred[indall] = missed
312 return pred
315class PiecewiseRegressor(PiecewiseEstimator, RegressorMixin):
316 """
317 Uses a :epkg:`decision tree` to split the space of features
318 into buckets and trains a linear regression (default) on each of them.
319 The second estimator is usually a :epkg:`sklearn:linear_model:LinearRegression`.
320 It can also be :epkg:`sklearn:dummy:DummyRegressor` to just get
321 the average on each bucket.
322 """
324 def __init__(self, binner=None, estimator=None, n_jobs=None, verbose=False):
325 """
326 @param binner transformer or predictor which creates the buckets
327 @param estimator predictor trained on every bucket
328 @param n_jobs number of parallel jobs (for training and predicting)
329 @param verbose boolean or use ``'tqdm'`` to use :epkg:`tqdm`
330 to fit the estimators
332 *binner* allows the following values:
334 - ``tree``: the model is :epkg:`sklearn:tree:DecisionTreeRegressor`
335 - ``'bins'``: the model :epkg:`sklearn:preprocessing:KBinsDiscretizer`
336 - any instanciated model
338 *estimator* allows the following values:
340 - ``None``: the model is :epkg:`sklearn:linear_model:LinearRegression`
341 - any instanciated model
342 """
343 if estimator is None:
344 estimator = LinearRegression()
345 if binner in ('tree', None):
346 binner = DecisionTreeRegressor(min_samples_leaf=2)
347 RegressorMixin.__init__(self)
348 PiecewiseEstimator.__init__(self, binner=binner, estimator=estimator,
349 n_jobs=n_jobs, verbose=verbose)
351 def predict(self, X):
352 """
353 Computes the predictions.
355 :param X: features, *X* is converted into an array if *X* is a dataframe
356 :return: predictions
357 """
358 return self._apply_predict_method(
359 X, "predict", _predict_piecewise_estimator, self.dim_)
362class PiecewiseClassifier(PiecewiseEstimator, ClassifierMixin):
363 """
364 Uses a :epkg:`decision tree` to split the space of features
365 into buckets and trains a logistic regression (default) on each of them.
366 The second estimator is usually a :epkg:`sklearn:linear_model:LogisticRegression`.
367 It can also be :epkg:`sklearn:dummy:DummyClassifier` to just get
368 the average on each bucket.
370 The main issue with the *PiecewiseClassifier* is that each piece requires
371 one example of each class in each bucket which may not happen.
372 To avoid that, the training will pick up random example
373 from other bucket to ensure this case does not happen.
374 """
376 def __init__(self, binner=None, estimator=None, n_jobs=None,
377 random_state=None, verbose=False):
378 """
379 @param binner transformer or predictor which creates the buckets
380 @param estimator predictor trained on every bucket
381 @param n_jobs number of parallel jobs (for training and predicting)
382 @param random_state to pick up random examples when buckets do not
383 contain enough examples of each class
384 @param verbose boolean or use ``'tqdm'`` to use :epkg:`tqdm`
385 to fit the estimators
387 *binner* allows the following values:
389 - ``tree``: the model is :epkg:`sklearn:tree:DecisionTreeClassifier`
390 - ``'bins'``: the model :epkg:`sklearn:preprocessing:KBinsDiscretizer`
391 - any instanciated model
393 *estimator* allows the following values:
395 - ``None``: the model is :epkg:`sklearn:linear_model:LogisticRegression`
396 - any instanciated model
397 """
398 if estimator is None:
399 estimator = LogisticRegression()
400 if binner in ('tree', None):
401 binner = DecisionTreeClassifier(min_samples_leaf=5)
402 ClassifierMixin.__init__(self)
403 PiecewiseEstimator.__init__(
404 self, binner=binner, estimator=estimator,
405 n_jobs=n_jobs, verbose=verbose)
406 self.random_state = random_state
408 def predict(self, X):
409 """
410 Computes the predictions.
412 :param X: features, *X* is converted into an array if *X* is a dataframe
413 :return: predictions
414 """
415 pred = self._apply_predict_method(
416 X, "predict", _predict_piecewise_estimator, 1)
417 return pred.astype(numpy.int32)
419 def predict_proba(self, X):
420 """
421 Computes the predictions probabilities.
423 :param X: features, *X* is converted into an array if *X* is a dataframe
424 :return: predictions probabilities
425 """
426 return self._apply_predict_method(
427 X, "predict_proba", _predict_proba_piecewise_estimator,
428 len(self.mean_estimator_.classes_))
430 def decision_function(self, X):
431 """
432 Computes the predictions probabilities.
434 :param X: features, *X* is converted into an array if *X* is a dataframe
435 :return: predictions probabilities
436 """
437 justone = self.mean_estimator_.decision_function(X[:1])
438 return self._apply_predict_method(
439 X, "decision_function", _decision_function_piecewise_estimator,
440 1 if len(justone.shape) == 1 else justone.shape[1])