# -*- coding: utf-8 -*-
"""
@file
@brief Implements a quantile non-linear regression.
"""
import inspect
import numpy as np
from sklearn.base import RegressorMixin
from sklearn.utils import check_X_y, column_or_1d
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.neural_network._base import DERIVATIVES, LOSS_FUNCTIONS
try:
    from sklearn.neural_network._multilayer_perceptron import BaseMultilayerPerceptron
except ImportError:  # pragma: no cover
    # scikit-learn < 0.22
    from sklearn.neural_network.multilayer_perceptron import BaseMultilayerPerceptron
from sklearn.metrics import mean_absolute_error


def absolute_loss(y_true, y_pred):
    """
    Computes the absolute loss for regression.

    :param y_true: array-like or label indicator matrix
        Ground truth (correct) values.
    :param y_pred: array-like or label indicator matrix
        Predicted values, as returned by a regression estimator.
    :return: loss, float
        The mean absolute difference between *y_true* and *y_pred*.
    """
    return np.sum(np.abs(y_true - y_pred)) / y_true.shape[0]
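
# A quick check of the formula (sketch): with y_true = [1., 2.] and
# y_pred = [1.5, 1.], absolute_loss returns (|1. - 1.5| + |2. - 1.|) / 2 = 0.75.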


def float_sign(a):
    "Returns 1 if *a > 0*, -1 if *a < 0*, 0 otherwise (within a 1e-8 tolerance)."
    if a > 1e-8:
        return 1.
    if a < -1e-8:
        return -1.
    return 0.
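
# float_sign is the (sub)gradient of x -> abs(x) outside the 1e-8 dead zone:
# float_sign(0.3) == 1., float_sign(-2.) == -1., float_sign(0.) == 0.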


EXTENDED_LOSS_FUNCTIONS = {'absolute_loss': absolute_loss}
DERIVATIVE_LOSS_FUNCTIONS = {'absolute_loss': np.vectorize(float_sign)}
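# The derivative of the absolute loss with respect to the prediction is the
# sign of (y_pred - y_true); np.vectorize applies float_sign elementwise to
# the array of residuals.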


class CustomizedMultilayerPerceptron(BaseMultilayerPerceptron):
    """
    Customized multi-layer perceptron based on
    `BaseMultilayerPerceptron
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/neural_network/multilayer_perceptron.py#L40>`_.
    """

    def __init__(self, hidden_layer_sizes, activation, solver,
                 alpha, batch_size, learning_rate, learning_rate_init, power_t,
                 max_iter, loss, shuffle, random_state, tol, verbose,
                 warm_start, momentum, nesterovs_momentum, early_stopping,
                 validation_fraction, beta_1, beta_2, epsilon,
                 n_iter_no_change, max_fun):
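        # ``max_fun`` only exists in recent scikit-learn versions: it is
        # forwarded (with the library default of 15000) only when the base
        # class signature declares it, otherwise older versions would reject
        # the extra positional argument.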
        if 'max_fun' in inspect.signature(BaseMultilayerPerceptron.__init__).parameters:
            args = [15000]
        else:
            args = []
        BaseMultilayerPerceptron.__init__(  # pylint: disable=E1121
            self, hidden_layer_sizes, activation, solver, alpha, batch_size,
            learning_rate, learning_rate_init, power_t, max_iter, loss,
            shuffle, random_state, tol, verbose, warm_start, momentum,
            nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2,
            epsilon, n_iter_no_change, *args)

    def _get_loss_function(self, loss_func_name):
        """
        Returns the loss function.

        @param      loss_func_name      loss function name, see
                                        :epkg:`sklearn:neural_networks:MLPRegressor`
        """
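        # Standard scikit-learn losses take precedence; otherwise fall back to
        # the losses added by this module (e.g. 'absolute_loss').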
        return LOSS_FUNCTIONS.get(loss_func_name, EXTENDED_LOSS_FUNCTIONS[loss_func_name])

    def _modify_loss_derivatives(self, last_deltas):
        """
        Modifies the loss derivatives.

        @param      last_deltas     difference between the network output and the expected output
        @return                     modified derivatives
        """
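        # For the absolute loss, the gradient of |y_pred - y| with respect to
        # y_pred is sign(y_pred - y): the raw deltas are replaced by their
        # elementwise sign.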
        if self.loss == 'absolute_loss':
            return DERIVATIVE_LOSS_FUNCTIONS['absolute_loss'](last_deltas)
        return last_deltas  # pragma: no cover

    def _backprop(self, X, y, activations, deltas, coef_grads,
                  intercept_grads):
        """
        Computes the MLP loss function and its corresponding derivatives
        with respect to each parameter: weights and bias vectors.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        :param y: array-like, shape (n_samples,)
            The target values.
        :param activations: list, length = n_layers - 1
            The ith element of the list holds the values of the ith layer.
        :param deltas: list, length = n_layers - 1
            The ith element of the list holds the difference between the
            activations of the i + 1 layer and the backpropagated error.
            More specifically, deltas are gradients of loss with respect to z
            in each layer, where z = wx + b is the value of a particular layer
            before passing through the activation function.
        :param coef_grads: list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            coefficient parameters of the ith layer in an iteration.
        :param intercept_grads: list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            intercept parameters of the ith layer in an iteration.
        :return: loss, float
        :return: coef_grads, list, length = n_layers - 1
        :return: intercept_grads, list, length = n_layers - 1
        """
        n_samples = X.shape[0]

        # Forward propagate
        activations = self._forward_pass(activations)

        # Get loss
        loss_func_name = self.loss
        if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic':
            loss_func_name = 'binary_log_loss'
        loss_function = self._get_loss_function(loss_func_name)
        loss = loss_function(y, activations[-1])
        # Add L2 regularization term to loss
        values = np.sum(
            np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_]))
        loss += (0.5 * self.alpha) * values / n_samples

        # Backward propagate
        last = self.n_layers_ - 2

        # The calculation of deltas[last] here works with the following
        # combinations of output activation and loss function:
        # sigmoid and binary cross entropy, softmax and categorical cross
        # entropy, and identity with squared loss.
        deltas[last] = activations[-1] - y

        # The deltas are modified here so that the gradient matches
        # the customized loss function.
        deltas[last] = self._modify_loss_derivatives(deltas[last])

        # Compute gradient for the last layer
        temp = self._compute_loss_grad(  # pylint: disable=E1111
            last, n_samples, activations, deltas, coef_grads, intercept_grads)
        if temp is None:
            # recent versions of scikit-learn (gradients updated in place)
            # Compute gradient for the last layer
            self._compute_loss_grad(
                last, n_samples, activations, deltas, coef_grads, intercept_grads)

            inplace_derivative = DERIVATIVES[self.activation]
            # Iterate over the hidden layers
            for i in range(self.n_layers_ - 2, 0, -1):
                deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
                inplace_derivative(activations[i], deltas[i - 1])

                self._compute_loss_grad(
                    i - 1, n_samples, activations, deltas, coef_grads,
                    intercept_grads)
        else:  # pragma: no cover
            coef_grads, intercept_grads = temp  # pylint: disable=E0633

            # Iterate over the hidden layers
            for i in range(self.n_layers_ - 2, 0, -1):
                deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
                inplace_derivative = DERIVATIVES[self.activation]
                inplace_derivative(activations[i], deltas[i - 1])

                coef_grads, intercept_grads = self._compute_loss_grad(  # pylint: disable=E1111,E0633
                    i - 1, n_samples, activations, deltas, coef_grads,
                    intercept_grads)

        return loss, coef_grads, intercept_grads


class QuantileMLPRegressor(CustomizedMultilayerPerceptron, RegressorMixin):
    """
    Quantile MLP regression, i.e. a neural network regression
    trained with the :epkg:`L1` norm. This class inherits from
    :epkg:`sklearn:neural_networks:MLPRegressor`.
    The model optimizes the absolute loss using LBFGS or stochastic gradient
    descent. See @see cl CustomizedMultilayerPerceptron and
    @see fn absolute_loss.

    :param hidden_layer_sizes: tuple, length = n_layers - 2, default (100,)
        The ith element represents the number of neurons in the ith
        hidden layer.
    :param activation: {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
        Activation function for the hidden layer.
        - 'identity', no-op activation, useful to implement linear bottleneck,
          returns :math:`f(x) = x`.
        - 'logistic', the logistic sigmoid function,
          returns :math:`f(x) = 1 / (1 + exp(-x))`.
        - 'tanh', the hyperbolic tan function,
          returns :math:`f(x) = tanh(x)`.
        - 'relu', the rectified linear unit function,
          returns :math:`f(x) = \\max(0, x)`.
    :param solver: ``{'lbfgs', 'sgd', 'adam'}``, default 'adam'
        The solver for weight optimization.
        - *'lbfgs'* is an optimizer in the family of quasi-Newton methods.
        - *'sgd'* refers to stochastic gradient descent.
        - *'adam'* refers to a stochastic gradient-based optimizer proposed by
          Diederik Kingma and Jimmy Ba.
        Note: the default solver 'adam' works pretty well on relatively
        large datasets (with thousands of training samples or more) in terms of
        both training time and validation score.
        For small datasets, however, 'lbfgs' can converge faster and perform
        better.
    :param alpha: float, optional, default 0.0001
        :epkg:`L2` penalty (regularization term) parameter.
    :param batch_size: int, optional, default 'auto'
        Size of minibatches for stochastic optimizers.
        If the solver is 'lbfgs', the regressor will not use minibatches.
        When set to "auto", ``batch_size=min(200, n_samples)``.
    :param learning_rate: {'constant', 'invscaling', 'adaptive'}, default 'constant'
        Learning rate schedule for weight updates.
        - 'constant' is a constant learning rate given by
          'learning_rate_init'.
        - 'invscaling' gradually decreases the learning rate ``learning_rate_``
          at each time step 't' using an inverse scaling exponent of 'power_t':
          ``effective_learning_rate = learning_rate_init / pow(t, power_t)``.
        - 'adaptive' keeps the learning rate constant to
          'learning_rate_init' as long as training loss keeps decreasing.
          Each time two consecutive epochs fail to decrease training loss by at
          least tol, or fail to increase validation score by at least tol if
          'early_stopping' is on, the current learning rate is divided by 5.
        Only used when solver='sgd'.
    :param learning_rate_init: double, optional, default 0.001
        The initial learning rate used. It controls the step-size
        in updating the weights. Only used when solver='sgd' or 'adam'.
    :param power_t: double, optional, default 0.5
        The exponent for inverse scaling learning rate.
        It is used in updating the effective learning rate when the learning_rate
        is set to 'invscaling'. Only used when solver='sgd'.
    :param max_iter: int, optional, default 200
        Maximum number of iterations. The solver iterates until convergence
        (determined by 'tol') or this number of iterations. For stochastic
        solvers ('sgd', 'adam'), note that this determines the number of epochs
        (how many times each data point will be used), not the number of
        gradient steps.
    :param shuffle: bool, optional, default True
        Whether to shuffle samples in each iteration. Only used when
        solver='sgd' or 'adam'.
    :param random_state: int, RandomState instance or None, optional, default None
        If int, random_state is the seed used by the random number generator;
        if RandomState instance, random_state is the random number generator;
        if None, the random number generator is the RandomState instance used
        by `np.random`.
    :param tol: float, optional, default 1e-4
        Tolerance for the optimization. When the loss or score is not improving
        by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,
        unless ``learning_rate`` is set to 'adaptive', convergence is
        considered to be reached and training stops.
    :param verbose: bool, optional, default False
        Whether to print progress messages to stdout.
    :param warm_start: bool, optional, default False
        When set to True, reuse the solution of the previous
        call to fit as initialization, otherwise, just erase the
        previous solution. See :term:`the Glossary <warm_start>`.
    :param momentum: float, default 0.9
        Momentum for gradient descent update. Should be between 0 and 1. Only
        used when solver='sgd'.
    :param nesterovs_momentum: boolean, default True
        Whether to use Nesterov's momentum. Only used when solver='sgd' and
        momentum > 0.
    :param early_stopping: bool, default False
        Whether to use early stopping to terminate training when validation
        score is not improving. If set to True, it will automatically set
        aside 10% of training data as validation and terminate training when
        validation score is not improving by at least ``tol`` for
        ``n_iter_no_change`` consecutive epochs.
        Only effective when solver='sgd' or 'adam'.
    :param validation_fraction: float, optional, default 0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.
    :param beta_1: float, optional, default 0.9
        Exponential decay rate for estimates of first moment vector in adam,
        should be in [0, 1). Only used when solver='adam'.
    :param beta_2: float, optional, default 0.999
        Exponential decay rate for estimates of second moment vector in adam,
        should be in [0, 1). Only used when solver='adam'.
    :param epsilon: float, optional, default 1e-8
        Value for numerical stability in adam. Only used when solver='adam'.
    :param n_iter_no_change: int, optional, default 10
        Maximum number of epochs to not meet ``tol`` improvement.
        Only effective when solver='sgd' or 'adam'.

    Fitted attributes:

    * `loss_`: float
        The current loss computed with the loss function.
    * `coefs_`: list, length n_layers - 1
        The ith element in the list represents the weight matrix corresponding
        to layer i.
    * `intercepts_`: list, length n_layers - 1
        The ith element in the list represents the bias vector corresponding to
        layer i + 1.
    * `n_iter_`: int
        The number of iterations the solver has run.
    * `n_layers_`: int
        Number of layers.
    * `n_outputs_`: int
        Number of outputs.
    * `out_activation_`: string
        Name of the output activation function.
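
    A minimal usage sketch on synthetic data (the import path below assumes
    the module is ``mlinsights.mlmodel.quantile_mlpregressor``):

    ::

        import numpy
        from mlinsights.mlmodel.quantile_mlpregressor import QuantileMLPRegressor

        X = numpy.random.randn(200, 2)
        y = X[:, 0] * 2. + numpy.random.randn(200) * 0.1
        model = QuantileMLPRegressor(hidden_layer_sizes=(10,), max_iter=500)
        model.fit(X, y)
        print(model.predict(X[:5]))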
    """

    def __init__(self,
                 hidden_layer_sizes=(100,), activation="relu",
                 solver='adam', alpha=0.0001,
                 batch_size='auto', learning_rate="constant",
                 learning_rate_init=0.001,
                 power_t=0.5, max_iter=200, shuffle=True,
                 random_state=None, tol=1e-4,
                 verbose=False, warm_start=False, momentum=0.9,
                 nesterovs_momentum=True, early_stopping=False,
                 validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, n_iter_no_change=10,
                 **kwargs):
        """
        See :epkg:`sklearn:neural_networks:MLPRegressor`.
        """
        sup = super(QuantileMLPRegressor, self)  # pylint: disable=R1725
        if "max_fun" not in kwargs:
            sig = inspect.signature(sup.__init__)
            if "max_fun" in sig.parameters:
                kwargs['max_fun'] = 15000
        sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                     activation=activation, solver=solver, alpha=alpha,
                     batch_size=batch_size, learning_rate=learning_rate,
                     learning_rate_init=learning_rate_init, power_t=power_t,
                     max_iter=max_iter, loss='absolute_loss', shuffle=shuffle,
                     random_state=random_state, tol=tol, verbose=verbose,
                     warm_start=warm_start, momentum=momentum,
                     nesterovs_momentum=nesterovs_momentum,
                     early_stopping=early_stopping,
                     validation_fraction=validation_fraction,
                     beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
                     n_iter_no_change=n_iter_no_change, **kwargs)

    def predict(self, X):
        """
        Predicts using the multi-layer perceptron model.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        :return: y, array-like, shape (n_samples, n_outputs)
            The predicted values.
        """
        check_is_fitted(self)
        if hasattr(self, '_predict'):
            y_pred = self._predict(X)
        else:
            y_pred = self._forward_pass_fast(X)
        if y_pred.shape[1] == 1:
            return y_pred.ravel()
        return y_pred

    def _validate_input(self, X, y, incremental, reset=False):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True, y_numeric=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)
        return X, y

    def score(self, X, y, sample_weight=None):
        """
        Returns the mean absolute error regression loss (lower values mean
        better predictions, unlike the usual R2 score of scikit-learn
        regressors).

        :param X: array-like, shape = (n_samples, n_features)
            Test samples.
        :param y: array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.
        :param sample_weight: array-like, shape = [n_samples], optional
            Sample weights.
        :return: score, float
            mean absolute error regression loss
        """
        pred = self.predict(X)
        return mean_absolute_error(y, pred, sample_weight=sample_weight)