# Source code for mlinsights.mlmodel.interval_regressor

"""
Implements a piecewise linear regression.


:githublink:`%|py|5`
"""
import numpy
import numpy.random
from sklearn.base import RegressorMixin, clone, BaseEstimator
from sklearn.utils._joblib import Parallel, delayed
from sklearn.utils.fixes import _joblib_parallel_args
try:
    from tqdm import tqdm
except ImportError:  # pragma: no cover
    pass


class IntervalRegressor(BaseEstimator, RegressorMixin):
    """
    Trains multiple regressors to provide a confidence interval
    on prediction. It only works for single regression.
    Every training is made with a new sample of the training
    data, parameter *alpha* let the user choose the size of this
    sample. A smaller *alpha* increases the variance
    of the predictions. The current implementation draws
    sample by random but keeps the weight associated to each
    of them. Another way could be to draw a weighted sample
    but give them uniform weights.

    :githublink:`%|py|28`
    """

    def __init__(self, estimator=None, n_estimators=10, n_jobs=None,
                 alpha=1., verbose=False):
        """
        :param estimator: predictor trained on every bucket
        :param n_estimators: number of estimators to train
        :param n_jobs: number of parallel jobs (for training and predicting)
        :param alpha: proportion of samples resampled for each training
        :param verbose: boolean or use ``'tqdm'`` to use :epkg:`tqdm`
            to fit the estimators

        :raises ValueError: if *estimator* is None

        :githublink:`%|py|39`
        """
        BaseEstimator.__init__(self)
        RegressorMixin.__init__(self)
        if estimator is None:
            raise ValueError("estimator cannot be null.")  # pragma: no cover
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.alpha = alpha
        self.verbose = verbose
        self.n_estimators = n_estimators

    @property
    def n_estimators_(self):
        """
        Returns the number of estimators = the number
        of buckets the data was split in.

        :githublink:`%|py|55`
        """
        return len(self.estimators_)

    def fit(self, X, y, sample_weight=None):
        """
        Trains *n_estimators* clones of the base estimator, each on a
        bootstrap subsample of size ``int(X.shape[0] * alpha + 0.5)``.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :param y: target
        :param sample_weight: sample weights
        :return: self: returns an instance of self.

        Fitted attributes:

        * `estimators_`: list of fitted clones of *estimator*

        :githublink:`%|py|78`
        """
        self.estimators_ = []
        estimators = [clone(self.estimator) for i in range(self.n_estimators)]
        # 'tqdm' wraps the iteration with a progress bar; otherwise plain range.
        loop = (tqdm(range(len(estimators)))
                if self.verbose == 'tqdm' else range(len(estimators)))
        verbose = 1 if self.verbose else 0

        def _fit_piecewise_estimator(i, est, X, y, sample_weight, alpha):
            # Bootstrap: draw a random subsample (with replacement) of
            # size alpha * n.  randint's upper bound is exclusive, so
            # X.shape[0] (not X.shape[0] - 1) lets every row be drawn.
            new_size = int(X.shape[0] * alpha + 0.5)
            rnd = numpy.random.randint(0, X.shape[0], new_size)
            Xr = X[rnd]
            yr = y[rnd]
            # 'is not None': truth-testing a numpy array raises ValueError.
            sr = sample_weight[rnd] if sample_weight is not None else None
            return est.fit(Xr, yr, sr)

        # NOTE(review): sklearn.utils._joblib and _joblib_parallel_args are
        # private sklearn APIs removed in recent versions; consider importing
        # Parallel/delayed from joblib directly.
        self.estimators_ = \
            Parallel(n_jobs=self.n_jobs, verbose=verbose,
                     **_joblib_parallel_args(prefer='threads'))(
                delayed(_fit_piecewise_estimator)(
                    i, estimators[i], X, y, sample_weight, self.alpha)
                for i in loop)
        return self

    def predict_all(self, X):
        """
        Computes the predictions for all estimators.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: predictions, shape ``(n_samples, n_estimators)``,
            one column per fitted estimator

        :githublink:`%|py|109`
        """
        container = numpy.empty((X.shape[0], len(self.estimators_)))
        for i, est in enumerate(self.estimators_):
            container[:, i] = est.predict(X)
        return container

    def predict(self, X):
        """
        Computes the average predictions.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: predictions, the mean over all estimators for each row

        :githublink:`%|py|122`
        """
        preds = self.predict_all(X)
        return preds.mean(axis=1)

    def predict_sorted(self, X):
        """
        Computes the predictions for all estimators.
        Sorts them for all observations — useful to read off
        empirical quantiles as a confidence interval.

        :param X: features, *X* is converted into an array if *X* is a dataframe
        :return: predictions sorted for each observation

        :githublink:`%|py|133`
        """
        preds = self.predict_all(X)
        # Sort each row in place so column j holds the j-th smallest prediction.
        for i in range(preds.shape[0]):
            preds[i, :] = numpy.sort(preds[i, :])
        return preds