1""" 

2@file 

3@brief Builds a tree of logistic regressions. 

4""" 

5import numpy 

6import scipy.sparse as sparse # pylint: disable=R0402 

7from sklearn.linear_model import LogisticRegression 

8from sklearn.base import BaseEstimator, ClassifierMixin, clone 

9from sklearn.linear_model._base import LinearClassifierMixin 

10 

11 

12def logistic(x): 

13 """ 

14 Computes :math:`\\frac{1}{1 + e^{-x}}`. 

15 """ 

16 return 1. / (1. + numpy.exp(-x)) 

17 

18 

def likelihood(x, y, theta=1., th=0.):
    """
    Computes :math:`\\sum_i y_i f(\\theta (x_i - x_0)) + (1 - y_i) (1 - f(\\theta (x_i - x_0)))`
    where :math:`f(x)` is :math:`\\frac{1}{1 + e^{-x}}`.
    """
    lr = logistic((x - th) * theta)
    return y * lr + (1. - y) * (1 - lr)
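
# Illustrative sketch (not part of the original module): how ``logistic`` and
# ``likelihood`` behave on a tiny example. With th=0 and theta=1,
# ``likelihood`` returns, for every sample, the probability the model assigns
# to the observed label, so values close to 1 mean well-classified samples.
#
#   >>> import numpy
#   >>> x = numpy.array([-2., 0., 2.])
#   >>> y = numpy.array([0., 1., 1.])
#   >>> logistic(x)       # ~[0.12, 0.50, 0.88]
#   >>> likelihood(x, y)  # ~[0.88, 0.50, 0.88]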

class _DecisionTreeLogisticRegressionNode:
    """
    Describes the tree structure held by class
    @see cl DecisionTreeLogisticRegression.
    See also notebook :ref:`decisiontreelogregrst`.
    """

    def __init__(self, estimator, threshold=0.5, depth=1, index=0):
        """
        constructor

        @param estimator binary estimator
        @param threshold probability threshold used to split the samples
        @param depth depth of this node in the tree
        @param index node index in the tree
        """
        self.index = index
        self.estimator = estimator
        self.above = None
        self.below = None
        self.threshold = threshold
        self.depth = depth

    def predict(self, X):
        """
        Predicts the class for every sample in *X*.
        """
        prob = self.predict_proba(X)
        return (prob[:, 1] >= 0.5).astype(numpy.int32)

    def predict_proba(self, X):
        """
        Returns the classification probabilities.

        @param X features
        @return probabilities
        """
        prob = self.estimator.predict_proba(X)
        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        if self.above is not None and n_above > 0:
            prob_above = self.above.predict_proba(X[above])
            prob[above] = prob_above
        if self.below is not None and n_below > 0:
            prob_below = self.below.predict_proba(X[below])
            prob[below] = prob_below
        return prob

    def decision_path(self, X, mat, indices):
        """
        Fills in the decision path for the samples in *X*.

        @param X features
        @param mat decision path (allocated matrix)
        @param indices row indices of *X* in *mat*
        """
        mat[indices, self.index] = 1
        prob = self.estimator.predict_proba(X)
        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        indices_above = indices[above]
        indices_below = indices[below]
        if self.above is not None and n_above > 0:
            self.above.decision_path(X[above], mat, indices_above)
        if self.below is not None and n_below > 0:
            self.below.decision_path(X[below], mat, indices_below)

    def fit(self, X, y, sample_weight, dtlr, total_N):
        """
        Fits a logistic regression, then splits the sample into
        positive and negative examples, finally tries to fit
        logistic regressions on both subsamples. This method only
        works on a linear classifier.

        @param X features
        @param y binary labels
        @param sample_weight weights of every sample
        @param dtlr @see cl DecisionTreeLogisticRegression
        @param total_N total number of observations
        @return last index
        """
        self.estimator.fit(X, y, sample_weight=sample_weight)
        if dtlr.verbose >= 1:
            print("[DTLR ] %s trained acc %1.2f N=%d" % (  # pragma: no cover
                " " * self.depth, self.estimator.score(X, y), X.shape[0]))
        prob = self.fit_improve(dtlr, total_N, X, y,
                                sample_weight=sample_weight)

        if self.depth + 1 > dtlr.max_depth:
            return self.index
        if X.shape[0] < dtlr.min_samples_split:
            return self.index

        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        y_above = set(y[above])
        y_below = set(y[below])

        def _fit_side(index, y_above_below, above_below, n_above_below, side):
            if dtlr.verbose >= 1:
                print("[DTLR*] %s%s: n_class=%d N=%d - %d/%d" % (  # pragma: no cover
                    " " * self.depth, side,
                    len(y_above_below), above_below.shape[0],
                    n_above_below, total_N))
            if (len(y_above_below) > 1 and
                    above_below.shape[0] > dtlr.min_samples_leaf * 2 and
                    (float(n_above_below) / total_N >=
                        dtlr.min_weight_fraction_leaf * 2) and
                    n_above_below < total_N):
                estimator = clone(dtlr.estimator)
                sw = sample_weight[above_below] if sample_weight is not None else None
                node = _DecisionTreeLogisticRegressionNode(
                    estimator, self.threshold, depth=self.depth + 1, index=index)
                last_index = node.fit(
                    X[above_below], y[above_below], sw, dtlr, total_N)
                return node, last_index
            return None, index

        self.above, last = _fit_side(
            self.index + 1, y_above, above, n_above, "above")
        self.below, last = _fit_side(
            last + 1, y_below, below, n_below, "below")
        return last
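    # Note (added for clarity, not in the original source): node indices are
    # assigned depth-first. The root gets index 0, its "above" subtree takes
    # the following indices, and the "below" subtree starts right after the
    # last index used above. For instance, a root with two leaf children is
    # numbered root=0, above=1, below=2, ``fit`` returns 2 and the parent
    # class stores ``n_nodes_ = 3``.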

    @property
    def tree_depth_(self):
        """
        Returns the maximum depth of the tree.
        """
        dt = self.depth
        if self.above is not None:
            dt = max(dt, self.above.tree_depth_)
        if self.below is not None:
            dt = max(dt, self.below.tree_depth_)
        return dt

    def fit_improve(self, dtlr, total_N, X, y, sample_weight):
        """
        This method only works on a linear classifier. It changes
        the intercept so that the split satisfies the constraints
        imposed by *min_samples_leaf* and *min_weight_fraction_leaf*.
        The algorithm has a significant cost as it sorts every observation
        and chooses the best intercept.

        @param dtlr @see cl DecisionTreeLogisticRegression
        @param total_N total number of observations
        @param X features
        @param y labels
        @param sample_weight sample weight
        @return probabilities
        """
        if self.estimator is None:
            raise RuntimeError(
                "Estimator was not trained.")  # pragma: no cover
        prob = self.estimator.predict_proba(X)
        if dtlr.fit_improve_algo in (None, 'none'):
            return prob

        if not isinstance(self.estimator, LinearClassifierMixin):
            # The classifier is not linear and cannot be improved.
            if dtlr.fit_improve_algo == 'intercept_sort_always':  # pragma: no cover
                raise RuntimeError(
                    "The model is not linear ({}), "
                    "intercept cannot be improved.".format(
                        self.estimator.__class__.__name__))
            return prob

        above = prob[:, 1] > self.threshold
        below = ~above
        n_above = above.sum()
        n_below = below.sum()
        n_min = min(n_above, n_below)
        p1p2 = float(n_above * n_below) / X.shape[0] ** 2
        if dtlr.verbose >= 2:
            print("[DTLRI] %s imp %d <> %d, p1p2=%1.3f <> %1.3f" % (  # pragma: no cover
                " " * self.depth, n_min, dtlr.min_samples_leaf,
                p1p2, dtlr.p1p2))
        if (n_min >= dtlr.min_samples_leaf and
                float(n_min) / total_N >= dtlr.min_weight_fraction_leaf and
                p1p2 > dtlr.p1p2 and
                dtlr.fit_improve_algo != 'intercept_sort_always'):
            return prob

        coef = self.estimator.coef_
        decision_function = (X @ coef.T).ravel()
        order = numpy.argsort(decision_function, axis=0)
        begin = dtlr.min_samples_leaf

        sorted_df = decision_function[order]
        sorted_y = y[order]
        N = sorted_y.shape[0]

        best = None
        besti = None
        beta_best = None
        for i in range(begin, N - begin):
            beta = -sorted_df[i]
            like = numpy.sum(likelihood(decision_function + beta, y)) / N
            w = float(i * (N - i)) / N ** 2
            like += w * dtlr.gamma
            if besti is None or like > best:
                best = like
                besti = i
                beta_best = beta

        if beta_best is not None:
            if dtlr.verbose >= 1:
                print("[DTLRI] %s change intercept %f --> %f in [%f, %f]" % (  # pragma: no cover
                    " " * self.depth, self.estimator.intercept_, beta_best,
                    -sorted_df[-1], -sorted_df[0]))
            self.estimator.intercept_ = beta_best
            prob = self.estimator.predict_proba(X)
        return prob
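    # Note (added for clarity, not in the original source): the loop above
    # evaluates every admissible split point i, leaving at least
    # ``min_samples_leaf`` samples on each side. The candidate intercept
    # ``beta = -sorted_df[i]`` places the decision boundary on the i-th
    # sorted sample, and the score being maximized is
    #     mean likelihood(X @ coef.T + beta, y) + gamma * i * (N - i) / N**2,
    # where the second term is largest for a balanced split (i close to N/2),
    # so ``gamma`` controls how strongly balance is preferred over fit.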

    def enumerate_leaves_index(self):
        """
        Enumerates the indices of the leaves.
        """
        if self.above is None or self.below is None:
            yield self.index
        if self.above is not None:
            for index in self.above.enumerate_leaves_index():
                yield index
        if self.below is not None:
            for index in self.below.enumerate_leaves_index():
                yield index

class DecisionTreeLogisticRegression(BaseEstimator, ClassifierMixin):
    """
    Fits a logistic regression, then fits two other
    logistic regressions, one on the observations on each side
    of the border. It goes on until a tree is built.
    It only handles a binary classification.
    The built tree cannot be deeper than the maximum recursion depth.

    :param estimator: binary classification estimator,
        if empty, uses a logistic regression; the theoretical
        model is defined with a logistic regression but it could be
        any binary classifier
    :param max_depth: int, default=20
        The maximum depth of the tree. Nodes are expanded until
        all leaves are pure, until all leaves contain less than
        min_samples_split samples, or until this depth is reached.
        It cannot be None and must be below the maximum
        recursion depth allowed by python.
    :param min_samples_split: int or float, default=2
        The minimum number of samples required to split an internal node:
        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.
    :param min_samples_leaf: int or float, default=2
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.
        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.
    :param min_weight_fraction_leaf: float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.
    :param fit_improve_algo: string, one of the following values:
        - `'auto'`: chooses the best option below, `'none'` for
          every non linear model, `'intercept_sort'` for linear models
        - `'none'`: does nothing once the binary classifier is fit
        - `'intercept_sort'`: if one side of the classifier is too small,
          the method chooses the best intercept possible verifying
          the constraints
        - `'intercept_sort_always'`: always chooses the best intercept
          possible
    :param p1p2: threshold in [0, 1],
        for every split, we can define probabilities :math:`p_1 p_2`
        which define the ratio of samples in both splits;
        if :math:`p_1 p_2` is below the threshold,
        method *fit_improve* is called
    :param gamma: weight before the coefficient :math:`p (1-p)`.
        When the model tries to improve the linear classifier,
        it looks for a better intercept which maximizes the
        likelihood and verifies the constraints.
        In order to force the classifier to choose a value
        which splits the dataset into 2 almost equal folds,
        the function maximizes :math:`likelihood + \\gamma p (1 - p)`
        where *p* is the proportion of samples falling in the first
        fold.
    :param verbose: prints out information about the training
    :param strategy: `'parallel'` or `'perpendicular'`,
        see below

    Fitted attributes:

    * `classes_`: ndarray of shape (n_classes,) or list of ndarray
        The classes labels (single output problem),
        or a list of arrays of class labels (multi-output problem).
    * `tree_`: Tree
        The underlying Tree object.

    The class implements two strategies to build the tree.
    The first one, `'parallel'`, splits the feature space using
    the hyperplane defined by a logistic regression; the second
    strategy, `'perpendicular'`, splits the feature space with
    a hyperplane perpendicular to a logistic regression. By doing
    so, two logistic regressions fit on both subparts must
    necessarily decrease the training error.
    """

    _fit_improve_algo_values = (
        None, 'none', 'auto', 'intercept_sort', 'intercept_sort_always')

    def __init__(self, estimator=None,
                 max_depth=20, min_samples_split=2,
                 min_samples_leaf=2, min_weight_fraction_leaf=0.0,
                 fit_improve_algo='auto', p1p2=0.09,
                 gamma=1., verbose=0, strategy='parallel'):
        "constructor"
        ClassifierMixin.__init__(self)
        BaseEstimator.__init__(self)
        # logistic regression
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if max_depth is None:
            raise ValueError("'max_depth' cannot be None.")  # pragma: no cover
        if max_depth > 1024:
            raise ValueError(
                "'max_depth' must be <= 1024.")  # pragma: no cover
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.fit_improve_algo = fit_improve_algo
        self.p1p2 = p1p2
        self.gamma = gamma
        self.verbose = verbose
        self.strategy = strategy

        if self.fit_improve_algo not in DecisionTreeLogisticRegression._fit_improve_algo_values:
            raise ValueError(
                "fit_improve_algo='{}' not in {}".format(
                    self.fit_improve_algo,
                    DecisionTreeLogisticRegression._fit_improve_algo_values))

    def fit(self, X, y, sample_weight=None):
        """
        Builds the tree model.

        :param X: numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        :param y: numpy array of shape [n_samples]
            Target values. Will be cast to X's dtype if necessary
        :param sample_weight: numpy array of shape [n_samples]
            Individual weights for each sample
        :return: self : returns an instance of self.

        Fitted attributes:

        * `classes_`: classes
        * `tree_`: tree structure, see @see cl _DecisionTreeLogisticRegressionNode
        * `n_nodes_`: number of nodes
        """
        if not isinstance(X, numpy.ndarray):
            if hasattr(X, 'values'):
                X = X.values
            if not isinstance(X, numpy.ndarray):
                raise TypeError("'X' must be an array.")
        if (sample_weight is not None and
                not isinstance(sample_weight, numpy.ndarray)):
            raise TypeError(
                "'sample_weight' must be an array.")  # pragma: no cover
        self.classes_ = numpy.array(sorted(set(y)))
        if len(self.classes_) != 2:
            raise RuntimeError(
                "The model only supports binary classification but labels are "
                "{}.".format(self.classes_))

        if self.strategy == 'parallel':
            return self._fit_parallel(X, y, sample_weight)
        if self.strategy == 'perpendicular':
            return self._fit_perpendicular(X, y, sample_weight)
        raise ValueError(
            "Unknown strategy '{}'.".format(self.strategy))

    def _fit_parallel(self, X, y, sample_weight):
        "Implements the parallel strategy."
        cls = (y == self.classes_[1]).astype(numpy.int32)
        estimator = clone(self.estimator)
        self.tree_ = _DecisionTreeLogisticRegressionNode(estimator, 0.5)
        self.n_nodes_ = self.tree_.fit(
            X, cls, sample_weight, self, X.shape[0]) + 1
        return self

    def _fit_perpendicular(self, X, y, sample_weight):
        "Implements the perpendicular strategy."
        raise NotImplementedError()  # pragma: no cover

    def predict(self, X):
        """
        Runs the predictions.
        """
        labels = self.tree_.predict(X)
        return numpy.take(self.classes_, labels)

    def predict_proba(self, X):
        """
        Converts predictions into probabilities.
        """
        return self.tree_.predict_proba(X)

    def decision_function(self, X):
        """
        Not implemented for this model.
        """
        raise NotImplementedError(  # pragma: no cover
            "Decision function is not available for this model.")

447 @property 

448 def tree_depth_(self): 

449 """ 

450 Returns the maximum depth of the tree. 

451 """ 

452 return self.tree_.tree_depth_ 

453 

454 def decision_path(self, X, check_input=True): 

455 """ 

456 Returns the decision path. 

457 

458 @param X inputs 

459 @param check_input unused 

460 @return sparse matrix 

461 """ 

462 mat = sparse.lil_matrix((X.shape[0], self.n_nodes_), dtype=numpy.int32) 

463 self.tree_.decision_path(X, mat, numpy.arange(X.shape[0])) 

464 return sparse.csr_matrix(mat) 
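    # Note (added for clarity, not in the original source): the returned
    # matrix has shape (n_samples, n_nodes_); entry (i, j) is 1 when sample i
    # passes through node j, so the leaf reached by a sample can be read off
    # by restricting its row to the columns listed by ``get_leaves_index``.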

    def get_leaves_index(self):
        """
        Returns the index of every leaf.
        """
        indices = self.tree_.enumerate_leaves_index()
        return numpy.array(sorted(indices))
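
# Small self-contained demo (added as an illustration, not part of the
# original module): trains the tree on a synthetic dataset and prints a few
# fitted attributes. The dataset and parameter values are arbitrary.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    X_demo, y_demo = make_classification(
        n_samples=300, n_features=4, n_informative=3, n_redundant=0,
        random_state=0)
    model = DecisionTreeLogisticRegression(max_depth=3)
    model.fit(X_demo, y_demo)
    # Accuracy on the training data, tree depth and number of nodes.
    print("train accuracy:", model.score(X_demo, y_demo))
    print("tree depth:", model.tree_depth_)
    print("nodes:", model.n_nodes_)
    # Sparse matrix marking which nodes every sample passes through.
    path = model.decision_path(X_demo)
    print("decision path shape:", path.shape)
    print("leaf indices:", model.get_leaves_index())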