Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Builds a tree of logistic regressions.
4"""
5import numpy
6import scipy.sparse as sparse # pylint: disable=R0402
7from sklearn.linear_model import LogisticRegression
8from sklearn.base import BaseEstimator, ClassifierMixin, clone
9from sklearn.linear_model._base import LinearClassifierMixin
12def logistic(x):
13 """
14 Computes :math:`\\frac{1}{1 + e^{-x}}`.
15 """
16 return 1. / (1. + numpy.exp(-x))
19def likelihood(x, y, theta=1., th=0.):
20 """
21 Computes :math:`\\sum_i y_i f(\\theta (x_i - x_0)) + (1 - y_i) (1 - f(\\theta (x_i - x_0)))`
22 where :math:`f(x_i)` is :math:`\\frac{1}{1 + e^{-x}}`.
23 """
24 lr = logistic((x - th) * theta)
25 return y * lr + (1. - y) * (1 - lr)
28class _DecisionTreeLogisticRegressionNode:
29 """
30 Describes the tree structure hold by class
31 @see cl DecisionTreeLogisticRegression.
32 See also notebook :ref:`decisiontreelogregrst`.
33 """
35 def __init__(self, estimator, threshold=0.5, depth=1, index=0):
36 """
37 constructor
39 @param estimator binary estimator
40 """
41 self.index = index
42 self.estimator = estimator
43 self.above = None
44 self.below = None
45 self.threshold = threshold
46 self.depth = depth
48 def predict(self, X):
49 """
50 Predicts
51 """
52 prob = self.predict_proba(X)
53 return (prob[:, 1] >= 0.5).astype(numpy.int32)
55 def predict_proba(self, X):
56 """
57 Returns the classification probabilities.
59 @param X features
60 @return probabilties
61 """
62 prob = self.estimator.predict_proba(X)
63 above = prob[:, 1] > self.threshold
64 below = ~ above
65 n_above = above.sum()
66 n_below = below.sum()
67 if self.above is not None and n_above > 0:
68 prob_above = self.above.predict_proba(X[above])
69 prob[above] = prob_above
70 if self.below is not None and n_below > 0:
71 prob_below = self.below.predict_proba(X[below])
72 prob[below] = prob_below
73 return prob
75 def decision_path(self, X, mat, indices):
76 """
77 Returns the classification probabilities.
79 @param X features
80 @param mat decision path (allocated matrix)
81 """
82 mat[indices, self.index] = 1
83 prob = self.estimator.predict_proba(X)
84 above = prob[:, 1] > self.threshold
85 below = ~ above
86 n_above = above.sum()
87 n_below = below.sum()
88 indices_above = indices[above]
89 indices_below = indices[below]
90 if self.above is not None and n_above > 0:
91 self.above.decision_path(X[above], mat, indices_above)
92 if self.below is not None and n_below > 0:
93 self.below.decision_path(X[below], mat, indices_below)
95 def fit(self, X, y, sample_weight, dtlr, total_N):
96 """
97 Fits a logistic regression, then splits the sample into
98 positive and negative examples, finally tries to fit
99 logistic regressions on both subsamples. This method only
100 works on a linear classifier.
102 @param X features
103 @param y binary labels
104 @param sample_weight weights of every sample
105 @param dtlr @see cl DecisionTreeLogisticRegression
106 @param total_N total number of observation
107 @return last index
108 """
109 self.estimator.fit(X, y, sample_weight=sample_weight)
110 if dtlr.verbose >= 1:
111 print("[DTLR ] %s trained acc %1.2f N=%d" % ( # pragma: no cover
112 " " * self.depth, self.estimator.score(X, y), X.shape[0]))
113 prob = self.fit_improve(dtlr, total_N, X, y,
114 sample_weight=sample_weight)
116 if self.depth + 1 > dtlr.max_depth:
117 return self.index
118 if X.shape[0] < dtlr.min_samples_split:
119 return self.index
121 above = prob[:, 1] > self.threshold
122 below = ~ above
123 n_above = above.sum()
124 n_below = below.sum()
125 y_above = set(y[above])
126 y_below = set(y[below])
128 def _fit_side(index, y_above_below, above_below, n_above_below, side):
129 if dtlr.verbose >= 1:
130 print("[DTLR*] %s%s: n_class=%d N=%d - %d/%d" % ( # pragma: no cover
131 " " * self.depth, side,
132 len(y_above_below), above_below.shape[0],
133 n_above_below, total_N))
134 if (len(y_above_below) > 1 and
135 above_below.shape[0] > dtlr.min_samples_leaf * 2 and
136 (float(n_above_below) / total_N >=
137 dtlr.min_weight_fraction_leaf * 2) and
138 n_above_below < total_N):
139 estimator = clone(dtlr.estimator)
140 sw = sample_weight[above_below] if sample_weight is not None else None
141 node = _DecisionTreeLogisticRegressionNode(
142 estimator, self.threshold, depth=self.depth + 1, index=index)
143 last_index = node.fit(
144 X[above_below], y[above_below], sw, dtlr, total_N)
145 return node, last_index
146 return None, index
148 self.above, last = _fit_side(
149 self.index + 1, y_above, above, n_above, "above")
150 self.below, last = _fit_side(
151 last + 1, y_below, below, n_below, "below")
152 return last
154 @property
155 def tree_depth_(self):
156 """
157 Returns the maximum depth of the tree.
158 """
159 dt = self.depth
160 if self.above is not None:
161 dt = max(dt, self.above.tree_depth_)
162 if self.below is not None:
163 dt = max(dt, self.below.tree_depth_)
164 return dt
166 def fit_improve(self, dtlr, total_N, X, y, sample_weight):
167 """
168 The method only works on a linear classifier, it changes
169 the intercept in order to be within the constraints
170 imposed by the *min_samples_leaf* and *min_weight_fraction_leaf*.
171 The algorithm has a significant cost as it sorts every observation
172 and chooses the best intercept.
174 @param dtlr @see cl DecisionTreeLogisticRegression
175 @param total_N total number of observations
176 @param X features
177 @param y labels
178 @param sample_weight sample weight
179 @return probabilities
180 """
181 if self.estimator is None:
182 raise RuntimeError(
183 "Estimator was not trained.") # pragma: no cover
184 prob = self.estimator.predict_proba(X)
185 if dtlr.fit_improve_algo in (None, 'none'):
186 return prob
188 if not isinstance(self.estimator, LinearClassifierMixin):
189 # The classifier is not linear and cannot be improved.
190 if dtlr.fit_improve_algo == 'intercept_sort_always': # pragma: no cover
191 raise RuntimeError(
192 "The model is not linear ({}), "
193 "intercept cannot be improved.".format(self.estimator.__class__.__name__))
194 return prob
196 above = prob[:, 1] > self.threshold
197 below = ~ above
198 n_above = above.sum()
199 n_below = below.sum()
200 n_min = min(n_above, n_below)
201 p1p2 = float(n_above * n_below) / X.shape[0] ** 2
202 if dtlr.verbose >= 2:
203 print("[DTLRI] %s imp %d <> %d, p1p2=%1.3f <> %1.3f" % ( # pragma: no cover
204 " " * self.depth, n_min, dtlr.min_samples_leaf,
205 p1p2, dtlr.p1p2))
206 if (n_min >= dtlr.min_samples_leaf and
207 float(n_min) / total_N >= dtlr.min_weight_fraction_leaf and
208 p1p2 > dtlr.p1p2 and
209 dtlr.fit_improve_algo != 'intercept_sort_always'):
210 return prob
212 coef = self.estimator.coef_
213 decision_function = (X @ coef.T).ravel()
214 order = numpy.argsort(decision_function, axis=0)
215 begin = dtlr.min_samples_leaf
217 sorted_df = decision_function[order]
218 sorted_y = decision_function[order]
219 N = sorted_y.shape[0]
221 best = None
222 besti = None
223 beta_best = None
224 for i in range(begin, N - begin):
225 beta = - sorted_df[i]
226 like = numpy.sum(likelihood(decision_function + beta, y)) / N
227 w = float(i * (N - i)) / N**2
228 like += w * dtlr.gamma
229 if besti is None or like > best:
230 best = like
231 besti = i
232 beta_best = beta
234 if beta_best is not None:
235 if dtlr.verbose >= 1:
236 print("[DTLRI] %s change intercept %f --> %f in [%f, %f]" % ( # pragma: no cover
237 " " * self.depth, self.estimator.intercept_, beta_best,
238 - sorted_df[-1], - sorted_df[0]))
239 self.estimator.intercept_ = beta_best
240 prob = self.estimator.predict_proba(X)
241 return prob
243 def enumerate_leaves_index(self):
244 """
245 Returns the leaves index.
246 """
247 if self.above is None or self.below is None:
248 yield self.index
249 if self.above is not None:
250 for index in self.above.enumerate_leaves_index():
251 yield index
252 if self.below is not None:
253 for index in self.below.enumerate_leaves_index():
254 yield index
257class DecisionTreeLogisticRegression(BaseEstimator, ClassifierMixin):
258 """
259 Fits a logistic regression, then fits two other
260 logistic regression for every observation on both sides
261 of the border. It goes one until a tree is built.
262 It only handles a binary classification.
263 The built tree cannot be deeper than the maximum recursion.
265 :param estimator: binary classification estimator,
266 if empty, use a logistic regression, the theoritical
267 model defined with a logistic regression but it could
268 any binary classifier
269 :param max_depth: int, default=None
270 The maximum depth of the tree. If None, then nodes are expanded until
271 all leaves are pure or until all leaves contain less than
272 min_samples_split samples. It must be below the maximum
273 allowed recursion by python.
274 :param min_samples_split: int or float, default=2
275 The minimum number of samples required to split an internal node:
276 - If int, then consider `min_samples_split` as the minimum number.
277 - If float, then `min_samples_split` is a fraction and
278 `ceil(min_samples_split * n_samples)` are the minimum
279 number of samples for each split.
280 :param min_samples_leaf: int or float, default=1
281 The minimum number of samples required to be at a leaf node.
282 A split point at any depth will only be considered if it leaves at
283 least ``min_samples_leaf`` training samples in each of the left and
284 right branches. This may have the effect of smoothing the model,
285 especially in regression.
286 - If int, then consider `min_samples_leaf` as the minimum number.
287 - If float, then `min_samples_leaf` is a fraction and
288 `ceil(min_samples_leaf * n_samples)` are the minimum
289 number of samples for each node.
290 :param min_weight_fraction_leaf: float, default=0.0
291 The minimum weighted fraction of the sum total of weights (of all
292 the input samples) required to be at a leaf node. Samples have
293 equal weight when sample_weight is not provided.
294 :param fit_improve_algo: string, one of the following value:
295 - `'auto'`: chooses the best option below, '`none'` for
296 every non linear model, `'intercept_sort'` for linear models
297 - '`none'`: does not nothing once the binary classifier is fit
298 - `'intercept_sort'`: if one side of the classifier is too small,
299 the method changes the best intercept possible verifying
300 the constraints
301 - `'intercept_sort_always'`: always chooses the best intercept
302 possible
303 :param p1p2: threshold in [0, 1]
304 for every split, we can define probabilities :math:`p_1 p_2`
305 which define the ratio of samples in both splits,
306 if :math:`p_1 p_2` is below the threshold,
307 method *fit_improve* is called
308 :param gamma: weight before the coefficient :math:`p (1-p)`.
309 When the model tries to improve the linear classifier,
310 it looks a better intercept which maximizes the
311 likelihood and verifies the constraints.
312 In order to force the classifier to choose a value
313 which splits the dataset into 2 almost equal folds,
314 the function maximimes :math:`likelihood + \\gamma p (1 - p)`
315 where *p* is the proportion of samples falling in the first
316 fold.
317 :param verbose: prints out information about the training
318 :param strategy: `'parallel'` or `'perpendicular'`,
319 see below
321 Fitted attributes:
323 * `classes_`: ndarray of shape (n_classes,) or list of ndarray
324 The classes labels (single output problem),
325 or a list of arrays of class labels (multi-output problem).
326 * `tree_`: Tree
327 The underlying Tree object.
329 The class implements two strategies to build the tree.
330 The first one `'parallel'` splits the feature space using
331 the hyperplan defined by a logistic regression, the second
332 strategy `'perpendicular'` splis the feature space based on
333 a hyperplan perpendicular to a logistic regression. By doing
334 this, two logistic regression fit on both sub parts must
335 necessary decreases the training error.
336 """
338 _fit_improve_algo_values = (
339 None, 'none', 'auto', 'intercept_sort', 'intercept_sort_always')
341 def __init__(self, estimator=None,
342 max_depth=20, min_samples_split=2,
343 min_samples_leaf=2, min_weight_fraction_leaf=0.0,
344 fit_improve_algo='auto', p1p2=0.09,
345 gamma=1., verbose=0, strategy='parallel'):
346 "constructor"
347 ClassifierMixin.__init__(self)
348 BaseEstimator.__init__(self)
349 # logistic regression
350 if estimator is None:
351 self.estimator = LogisticRegression()
352 else:
353 self.estimator = estimator
354 if max_depth is None:
355 raise ValueError("'max_depth' cannot be None.") # pragma: no cover
356 if max_depth > 1024:
357 raise ValueError(
358 "'max_depth' must be <= 1024.") # pragma: no cover
359 self.max_depth = max_depth
360 self.min_samples_split = min_samples_split
361 self.min_samples_leaf = min_samples_leaf
362 self.min_weight_fraction_leaf = min_weight_fraction_leaf
363 self.fit_improve_algo = fit_improve_algo
364 self.p1p2 = p1p2
365 self.gamma = gamma
366 self.verbose = verbose
367 self.strategy = strategy
369 if self.fit_improve_algo not in DecisionTreeLogisticRegression._fit_improve_algo_values:
370 raise ValueError(
371 "fit_improve_algo='{}' not in {}".format(
372 self.fit_improve_algo, DecisionTreeLogisticRegression._fit_improve_algo_values))
374 def fit(self, X, y, sample_weight=None):
375 """
376 Builds the tree model.
378 :param X: numpy array or sparse matrix of shape [n_samples,n_features]
379 Training data
380 :param y: numpy array of shape [n_samples, n_targets]
381 Target values. Will be cast to X's dtype if necessary
382 :param sample_weight: numpy array of shape [n_samples]
383 Individual weights for each sample
384 :return: self : returns an instance of self.
386 Fitted attributes:
388 * `classes_`: classes
389 * `tree_`: tree structure, see @see cl _DecisionTreeLogisticRegressionNode
390 * `n_nodes_`: number of nodes
391 """
392 if not isinstance(X, numpy.ndarray):
393 if hasattr(X, 'values'):
394 X = X.values
395 if not isinstance(X, numpy.ndarray):
396 raise TypeError("'X' must be an array.")
397 if (sample_weight is not None and
398 not isinstance(sample_weight, numpy.ndarray)):
399 raise TypeError(
400 "'sample_weight' must be an array.") # pragma: no cover
401 self.classes_ = numpy.array(sorted(set(y)))
402 if len(self.classes_) != 2:
403 raise RuntimeError(
404 "The model only supports binary classification but labels are "
405 "{}.".format(self.classes_))
407 if self.strategy == 'parallel':
408 return self._fit_parallel(X, y, sample_weight)
409 if self.strategy == 'perpendicular':
410 return self._fit_perpendicular(X, y, sample_weight)
411 raise ValueError(
412 "Unknown strategy '{}'.".format(self.strategy))
414 def _fit_parallel(self, X, y, sample_weight):
415 "Implements the parallel strategy."
416 cls = (y == self.classes_[1]).astype(numpy.int32)
417 estimator = clone(self.estimator)
418 self.tree_ = _DecisionTreeLogisticRegressionNode(estimator, 0.5)
419 self.n_nodes_ = self.tree_.fit(
420 X, cls, sample_weight, self, X.shape[0]) + 1
421 return self
423 def _fit_perpendicular(self, X, y, sample_weight):
424 "Implements the perpendicular strategy."
425 raise NotImplementedError() # pragma: no cover
427 def predict(self, X):
428 """
429 Runs the predictions.
430 """
431 labels = self.tree_.predict(X)
432 return numpy.take(self.classes_, labels)
434 def predict_proba(self, X):
435 """
436 Converts predictions into probabilities.
437 """
438 return self.tree_.predict_proba(X)
440 def decision_function(self, X):
441 """
442 Calls *decision_function*.
443 """
444 raise NotImplementedError( # pragma: no cover
445 "Decision function is not available for this model.")
447 @property
448 def tree_depth_(self):
449 """
450 Returns the maximum depth of the tree.
451 """
452 return self.tree_.tree_depth_
454 def decision_path(self, X, check_input=True):
455 """
456 Returns the decision path.
458 @param X inputs
459 @param check_input unused
460 @return sparse matrix
461 """
462 mat = sparse.lil_matrix((X.shape[0], self.n_nodes_), dtype=numpy.int32)
463 self.tree_.decision_path(X, mat, numpy.arange(X.shape[0]))
464 return sparse.csr_matrix(mat)
466 def get_leaves_index(self):
467 """
468 Returns the index of every leave.
469 """
470 indices = self.tree_.enumerate_leaves_index()
471 return numpy.array(sorted(indices))