1""" 

2@file 

3@brief Builds a tree of logistic regressions. 

4""" 

5import numpy 

6import scipy.sparse as sparse # pylint: disable=R0402 

7from sklearn.linear_model import LogisticRegression 

8from sklearn.base import BaseEstimator, ClassifierMixin, clone 

9from sklearn.linear_model._base import LinearClassifierMixin 

10 

11 

12def logistic(x): 

13 """ 

14 Computes :math:`\\frac{1}{1 + e^{-x}}`. 

15 """ 

16 return 1. / (1. + numpy.exp(-x)) 

17 

18 

def likelihood(x, y, theta=1., th=0.):
    """
    Computes :math:`\\sum_i y_i f(\\theta (x_i - x_0)) + (1 - y_i) (1 - f(\\theta (x_i - x_0)))`
    where :math:`f(x)` is :math:`\\frac{1}{1 + e^{-x}}`.
    """
    lr = logistic((x - th) * theta)
    return y * lr + (1. - y) * (1 - lr)
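
# Illustrative sketch (not part of the original module): how ``logistic`` and
# ``likelihood`` behave on a tiny example. With th=0 and theta=1,
# ``likelihood`` returns, for every sample, the probability the model assigns
# to the observed label, so values close to 1 mean well-classified samples.
#
#   >>> import numpy
#   >>> x = numpy.array([-2., 0., 2.])
#   >>> y = numpy.array([0., 1., 1.])
#   >>> logistic(x)       # ~[0.12, 0.50, 0.88]
#   >>> likelihood(x, y)  # ~[0.88, 0.50, 0.88]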

class _DecisionTreeLogisticRegressionNode:
    """
    Describes the tree structure held by class
    @see cl DecisionTreeLogisticRegression.
    See also notebook :ref:`decisiontreelogregrst`.
    """

    def __init__(self, estimator, threshold=0.5, depth=1, index=0):
        """
        constructor

        @param estimator binary estimator
        @param threshold probability threshold used to split the samples
        @param depth depth of this node in the tree
        @param index node index in the tree
        """
        self.index = index
        self.estimator = estimator
        self.above = None
        self.below = None
        self.threshold = threshold
        self.depth = depth

    def predict(self, X):
        """
        Predicts the class for every sample in *X*.
        """
        prob = self.predict_proba(X)
        return (prob[:, 1] >= 0.5).astype(numpy.int32)

    def predict_proba(self, X):
        """
        Returns the classification probabilities.

        @param X features
        @return probabilities
        """
        prob = self.estimator.predict_proba(X)
        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        if self.above is not None and n_above > 0:
            prob_above = self.above.predict_proba(X[above])
            prob[above] = prob_above
        if self.below is not None and n_below > 0:
            prob_below = self.below.predict_proba(X[below])
            prob[below] = prob_below
        return prob

    def decision_path(self, X, mat, indices):
        """
        Fills in the decision path for the samples in *X*.

        @param X features
        @param mat decision path (allocated matrix)
        @param indices row indices of *X* in *mat*
        """
        mat[indices, self.index] = 1
        prob = self.estimator.predict_proba(X)
        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        indices_above = indices[above]
        indices_below = indices[below]
        if self.above is not None and n_above > 0:
            self.above.decision_path(X[above], mat, indices_above)
        if self.below is not None and n_below > 0:
            self.below.decision_path(X[below], mat, indices_below)

    def fit(self, X, y, sample_weight, dtlr, total_N):
        """
        Fits a logistic regression, then splits the sample into
        positive and negative examples, finally tries to fit
        logistic regressions on both subsamples. This method only
        works on a linear classifier.

        @param X features
        @param y binary labels
        @param sample_weight weights of every sample
        @param dtlr @see cl DecisionTreeLogisticRegression
        @param total_N total number of observations
        @return last index
        """
        self.estimator.fit(X, y, sample_weight=sample_weight)
        if dtlr.verbose >= 1:
            print("[DTLR ] %s trained acc %1.2f N=%d" % (  # pragma: no cover
                " " * self.depth, self.estimator.score(X, y), X.shape[0]))
        prob = self.fit_improve(dtlr, total_N, X, y,
                                sample_weight=sample_weight)

        if self.depth + 1 > dtlr.max_depth:
            return self.index
        if X.shape[0] < dtlr.min_samples_split:
            return self.index

        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        y_above = set(y[above])
        y_below = set(y[below])

        def _fit_side(index, y_above_below, above_below, n_above_below, side):
            if dtlr.verbose >= 1:
                print("[DTLR*] %s%s: n_class=%d N=%d - %d/%d" % (  # pragma: no cover
                    " " * self.depth, side,
                    len(y_above_below), above_below.shape[0],
                    n_above_below, total_N))
            if (len(y_above_below) > 1 and
                    above_below.shape[0] > dtlr.min_samples_leaf * 2 and
                    (float(n_above_below) / total_N >=
                        dtlr.min_weight_fraction_leaf * 2) and
                    n_above_below < total_N):
                estimator = clone(dtlr.estimator)
                sw = sample_weight[above_below] if sample_weight is not None else None
                node = _DecisionTreeLogisticRegressionNode(
                    estimator, self.threshold, depth=self.depth + 1, index=index)
                last_index = node.fit(
                    X[above_below], y[above_below], sw, dtlr, total_N)
                return node, last_index
            return None, index

        self.above, last = _fit_side(
            self.index + 1, y_above, above, n_above, "above")
        self.below, last = _fit_side(
            last + 1, y_below, below, n_below, "below")
        return last
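    # Note (added for clarity, not in the original source): node indices are
    # assigned depth-first. The root gets index 0, its "above" subtree takes
    # the following indices, and the "below" subtree starts right after the
    # last index used above. For instance, a root with two leaf children is
    # numbered root=0, above=1, below=2, ``fit`` returns 2 and the parent
    # class stores ``n_nodes_ = 3``.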

    @property
    def tree_depth_(self):
        """
        Returns the maximum depth of the tree.
        """
        dt = self.depth
        if self.above is not None:
            dt = max(dt, self.above.tree_depth_)
        if self.below is not None:
            dt = max(dt, self.below.tree_depth_)
        return dt

    def fit_improve(self, dtlr, total_N, X, y, sample_weight):
        """
        This method only works on a linear classifier. It changes
        the intercept so that the split satisfies the constraints
        imposed by *min_samples_leaf* and *min_weight_fraction_leaf*.
        The algorithm has a significant cost as it sorts every observation
        and chooses the best intercept.

        @param dtlr @see cl DecisionTreeLogisticRegression
        @param total_N total number of observations
        @param X features
        @param y labels
        @param sample_weight sample weight
        @return probabilities
        """
        if self.estimator is None:
            raise RuntimeError(
                "Estimator was not trained.")  # pragma: no cover
        prob = self.estimator.predict_proba(X)
        if dtlr.fit_improve_algo in (None, 'none'):
            return prob

        if not isinstance(self.estimator, LinearClassifierMixin):
            # The classifier is not linear and cannot be improved.
            if dtlr.fit_improve_algo == 'intercept_sort_always':  # pragma: no cover
                raise RuntimeError(
                    "The model is not linear ({}), "
                    "intercept cannot be improved.".format(
                        self.estimator.__class__.__name__))
            return prob

        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        n_min = min(n_above, n_below)
        p1p2 = float(n_above * n_below) / X.shape[0] ** 2
        if dtlr.verbose >= 2:
            print("[DTLRI] %s imp %d <> %d, p1p2=%1.3f <> %1.3f" % (  # pragma: no cover
                " " * self.depth, n_min, dtlr.min_samples_leaf,
                p1p2, dtlr.p1p2))
        if (n_min >= dtlr.min_samples_leaf and
                float(n_min) / total_N >= dtlr.min_weight_fraction_leaf and
                p1p2 > dtlr.p1p2 and
                dtlr.fit_improve_algo != 'intercept_sort_always'):
            return prob

        coef = self.estimator.coef_
        decision_function = (X @ coef.T).ravel()
        order = numpy.argsort(decision_function, axis=0)
        begin = dtlr.min_samples_leaf

        sorted_df = decision_function[order]
        sorted_y = y[order]
        N = sorted_y.shape[0]

        best = None
        besti = None
        beta_best = None
        for i in range(begin, N - begin):
            beta = -sorted_df[i]
            like = numpy.sum(likelihood(decision_function + beta, y)) / N
            w = float(i * (N - i)) / N ** 2
            like += w * dtlr.gamma
            if besti is None or like > best:
                best = like
                besti = i
                beta_best = beta

        if beta_best is not None:
            if dtlr.verbose >= 1:
                print("[DTLRI] %s change intercept %f --> %f in [%f, %f]" % (  # pragma: no cover
                    " " * self.depth, self.estimator.intercept_, beta_best,
                    -sorted_df[-1], -sorted_df[0]))
            self.estimator.intercept_ = beta_best
            prob = self.estimator.predict_proba(X)
        return prob
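    # Note (added for clarity, not in the original source): the loop above
    # evaluates every admissible split point i, leaving at least
    # ``min_samples_leaf`` samples on each side. The candidate intercept
    # ``beta = -sorted_df[i]`` places the decision boundary on the i-th
    # sorted sample, and the score being maximized is
    #     mean likelihood(X @ coef.T + beta, y) + gamma * i * (N - i) / N**2,
    # where the second term is largest for a balanced split (i close to N/2),
    # so ``gamma`` controls how strongly balance is preferred over fit.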

    def enumerate_leaves_index(self):
        """
        Enumerates the indices of the leaves.
        """
        if self.above is None or self.below is None:
            yield self.index
        if self.above is not None:
            for index in self.above.enumerate_leaves_index():
                yield index
        if self.below is not None:
            for index in self.below.enumerate_leaves_index():
                yield index

class DecisionTreeLogisticRegression(BaseEstimator, ClassifierMixin):
    """
    Fits a logistic regression, then fits two other
    logistic regressions, one on the observations on each side
    of the border. It goes on until a tree is built.
    It only handles a binary classification.
    The built tree cannot be deeper than the maximum recursion depth.

    :param estimator: binary classification estimator,
        if empty, uses a logistic regression; the theoretical
        model is defined with a logistic regression but it could be
        any binary classifier
    :param max_depth: int, default=20
        The maximum depth of the tree. Nodes are expanded until
        all leaves are pure, until all leaves contain less than
        min_samples_split samples, or until this depth is reached.
        It cannot be None and must be below the maximum
        recursion depth allowed by python.
    :param min_samples_split: int or float, default=2
        The minimum number of samples required to split an internal node:
        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.
    :param min_samples_leaf: int or float, default=2
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.
        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.
    :param min_weight_fraction_leaf: float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.
    :param fit_improve_algo: string, one of the following values:
        - `'auto'`: chooses the best option below, `'none'` for
          every non linear model, `'intercept_sort'` for linear models
        - `'none'`: does nothing once the binary classifier is fit
        - `'intercept_sort'`: if one side of the classifier is too small,
          the method chooses the best intercept possible verifying
          the constraints
        - `'intercept_sort_always'`: always chooses the best intercept
          possible
    :param p1p2: threshold in [0, 1],
        for every split, we can define probabilities :math:`p_1 p_2`
        which define the ratio of samples in both splits;
        if :math:`p_1 p_2` is below the threshold,
        method *fit_improve* is called
    :param gamma: weight before the coefficient :math:`p (1-p)`.
        When the model tries to improve the linear classifier,
        it looks for a better intercept which maximizes the
        likelihood and verifies the constraints.
        In order to force the classifier to choose a value
        which splits the dataset into 2 almost equal folds,
        the function maximizes :math:`likelihood + \\gamma p (1 - p)`
        where *p* is the proportion of samples falling in the first
        fold.
    :param verbose: prints out information about the training
    :param strategy: `'parallel'` or `'perpendicular'`,
        see below

    Fitted attributes:

    * `classes_`: ndarray of shape (n_classes,) or list of ndarray
        The classes labels (single output problem),
        or a list of arrays of class labels (multi-output problem).
    * `tree_`: Tree
        The underlying Tree object.

    The class implements two strategies to build the tree.
    The first one, `'parallel'`, splits the feature space using
    the hyperplane defined by a logistic regression; the second
    strategy, `'perpendicular'`, splits the feature space with
    a hyperplane perpendicular to a logistic regression. By doing
    so, two logistic regressions fit on both subparts must
    necessarily decrease the training error.
    """

    _fit_improve_algo_values = (
        None, 'none', 'auto', 'intercept_sort', 'intercept_sort_always')

    def __init__(self, estimator=None,
                 max_depth=20, min_samples_split=2,
                 min_samples_leaf=2, min_weight_fraction_leaf=0.0,
                 fit_improve_algo='auto', p1p2=0.09,
                 gamma=1., verbose=0, strategy='parallel'):
        "constructor"
        ClassifierMixin.__init__(self)
        BaseEstimator.__init__(self)
        # logistic regression
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if max_depth is None:
            raise ValueError("'max_depth' cannot be None.")  # pragma: no cover
        if max_depth > 1024:
            raise ValueError(
                "'max_depth' must be <= 1024.")  # pragma: no cover
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.fit_improve_algo = fit_improve_algo
        self.p1p2 = p1p2
        self.gamma = gamma
        self.verbose = verbose
        self.strategy = strategy

        if self.fit_improve_algo not in DecisionTreeLogisticRegression._fit_improve_algo_values:
            raise ValueError(
                "fit_improve_algo='{}' not in {}".format(
                    self.fit_improve_algo,
                    DecisionTreeLogisticRegression._fit_improve_algo_values))

    def fit(self, X, y, sample_weight=None):
        """
        Builds the tree model.

        :param X: numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        :param y: numpy array of shape [n_samples]
            Target values. Will be cast to X's dtype if necessary
        :param sample_weight: numpy array of shape [n_samples]
            Individual weights for each sample
        :return: self : returns an instance of self.

        Fitted attributes:

        * `classes_`: classes
        * `tree_`: tree structure, see @see cl _DecisionTreeLogisticRegressionNode
        * `n_nodes_`: number of nodes
        """
        if not isinstance(X, numpy.ndarray):
            if hasattr(X, 'values'):
                X = X.values
            if not isinstance(X, numpy.ndarray):
                raise TypeError("'X' must be an array.")
        if (sample_weight is not None and
                not isinstance(sample_weight, numpy.ndarray)):
            raise TypeError(
                "'sample_weight' must be an array.")  # pragma: no cover
        self.classes_ = numpy.array(sorted(set(y)))
        if len(self.classes_) != 2:
            raise RuntimeError(
                "The model only supports binary classification but labels are "
                "{}.".format(self.classes_))

        if self.strategy == 'parallel':
            return self._fit_parallel(X, y, sample_weight)
        if self.strategy == 'perpendicular':
            return self._fit_perpendicular(X, y, sample_weight)
        raise ValueError(
            "Unknown strategy '{}'.".format(self.strategy))

    def _fit_parallel(self, X, y, sample_weight):
        "Implements the parallel strategy."
        cls = (y == self.classes_[1]).astype(numpy.int32)
        estimator = clone(self.estimator)
        self.tree_ = _DecisionTreeLogisticRegressionNode(estimator, 0.5)
        self.n_nodes_ = self.tree_.fit(
            X, cls, sample_weight, self, X.shape[0]) + 1
        return self

    def _fit_perpendicular(self, X, y, sample_weight):
        "Implements the perpendicular strategy."
        raise NotImplementedError()  # pragma: no cover

    def predict(self, X):
        """
        Runs the predictions.
        """
        labels = self.tree_.predict(X)
        return numpy.take(self.classes_, labels)

    def predict_proba(self, X):
        """
        Converts predictions into probabilities.
        """
        return self.tree_.predict_proba(X)

    def decision_function(self, X):
        """
        Not implemented for this model.
        """
        raise NotImplementedError(  # pragma: no cover
            "Decision function is not available for this model.")

447 @property 

448 def tree_depth_(self): 

449 """ 

450 Returns the maximum depth of the tree. 

451 """ 

452 return self.tree_.tree_depth_ 

453 

454 def decision_path(self, X, check_input=True): 

455 """ 

456 Returns the decision path. 

457 

458 @param X inputs 

459 @param check_input unused 

460 @return sparse matrix 

461 """ 

462 mat = sparse.lil_matrix((X.shape[0], self.n_nodes_), dtype=numpy.int32) 

463 self.tree_.decision_path(X, mat, numpy.arange(X.shape[0])) 

464 return sparse.csr_matrix(mat) 
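    # Note (added for clarity, not in the original source): the returned
    # matrix has shape (n_samples, n_nodes_); entry (i, j) is 1 when sample i
    # passes through node j, so the leaf reached by a sample can be read off
    # by restricting its row to the columns listed by ``get_leaves_index``.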

    def get_leaves_index(self):
        """
        Returns the index of every leaf.
        """
        indices = self.tree_.enumerate_leaves_index()
        return numpy.array(sorted(indices))
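
# Small self-contained demo (added as an illustration, not part of the
# original module): trains the tree on a synthetic dataset and prints a few
# fitted attributes. The dataset and parameter values are arbitrary.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    X_demo, y_demo = make_classification(
        n_samples=300, n_features=4, n_informative=3, n_redundant=0,
        random_state=0)
    model = DecisionTreeLogisticRegression(max_depth=3)
    model.fit(X_demo, y_demo)
    # Accuracy on the training data, tree depth and number of nodes.
    print("train accuracy:", model.score(X_demo, y_demo))
    print("tree depth:", model.tree_depth_)
    print("nodes:", model.n_nodes_)
    # Sparse matrix marking which nodes every sample passes through.
    path = model.decision_path(X_demo)
    print("decision path shape:", path.shape)
    print("leaf indices:", model.get_leaves_index())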