Coverage for src/papierstat/mltricks/sklearn_base_learner_category.py : 82%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Implémente un *learner* qui suit la même API que tout :epkg:`scikit-learn` learner.

5"""

6import numpy

7import pandas

8from sklearn.base import clone

9from mlinsights.sklapi import SkBaseLearner, SkLearnParameters

class SkBaseLearnerCategory(SkBaseLearner):
    """
    Base *learner* which trains one model per modality (distinct value)
    of a category column.

    Notebooks associated with this *learner*:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'wines_color_linear')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))
    """

    def __init__(self, colnameind=None, model=None, **kwargs):
        """
        Stores the parameters in a class
        @see cl SkLearnParameters, it keeps a copy of the
        parameters to easily implement *get_params*
        and therefore clone a model.

        @param      colnameind      index or name of the column which
                                    contains the modalities of the category
        @param      model           model to train on every category
        @param      kwargs          additional parameters forwarded to
                                    ``SkBaseLearner.__init__``

        Raises *TypeError* if *colnameind* is neither *int* nor *str*,
        *ValueError* if *model* is None.
        """
        if not isinstance(colnameind, (int, str)):
            raise TypeError(  # pragma: no cover
                "colnameind must be str or int not {0}".format(type(colnameind)))
        if model is None:
            raise ValueError("model must not be None")  # pragma: no cover
        kwargs['colnameind'] = colnameind
        SkBaseLearner.__init__(self, **kwargs)
        # The model is stored untouched (no copy): the cloning API hands
        # the very same object back through *get_params*.
        self.model = model
        self._estimator_type = getattr(model, '_estimator_type', None)

    @property
    def colnameind(self):
        """
        Returns the name or the index of the category column.
        """
        return self.P.colnameind

    @property
    def Models(self):
        """
        Returns the trained models (dictionary ``{category: model}``),
        raises *RuntimeError* if *fit* was not called.
        """
        if hasattr(self, 'models'):
            return self.models
        raise RuntimeError('No trained models')  # pragma: no cover

    @staticmethod
    def _has_missing(cats):
        """
        Tells if one of the categories is a missing value (NaN, None, ...).
        Strings are never considered as missing.
        """
        return any(not isinstance(c, str) and pandas.isnull(c) for c in cats)

    def _get_cat(self, X):
        """
        Returns the column of categories designated by *colnameind*.

        @param      X       features, a DataFrame if *colnameind* is a name,
                            a DataFrame or an array if it is a position
        @return             the category of every row
        """
        col = self.colnameind
        if isinstance(col, str):
            if not hasattr(X, 'columns'):
                raise TypeError(  # pragma: no cover
                    "colnameind='{0}' and X is not a DataFrame but {1}".format(
                        col, type(X)))
            return X[col]
        if hasattr(X, 'iloc'):
            # An integer is a position, not a label: go through *iloc*.
            return X.iloc[:, col]
        return X[:, col]

    def _filter_cat(self, c, X, y=None, sample_weight=None):
        """
        Returns *X*, *y*, *sample_weight* restricted to category *c*.
        The category column is removed from the returned features.

        @param      c               category to keep
        @param      X               features (including the category column)
        @param      y               targets or None
        @param      sample_weight   weights or None
        @return                     tuple *(ind, x, y, sample_weight)* where
                                    *ind* holds the positions in *X* of the
                                    selected rows
        """
        col = self.colnameind
        indices = numpy.arange(0, X.shape[0])
        if isinstance(col, str):
            if not hasattr(X, 'columns'):
                raise TypeError(  # pragma: no cover
                    "colnameind='{0}' and X is not a DataFrame but {1}".format(
                        col, type(X)))
            # A plain boolean array selects rows by position whatever the
            # index of X, y or sample_weight is.
            mask = numpy.asarray(X[col] == c)
            x = X.drop(col, axis=1)[mask]
        elif hasattr(X, 'iloc'):
            mask = numpy.asarray(X.iloc[:, col] == c)
            drop = col % X.shape[1]  # supports negative positions
            keep = [i for i in range(X.shape[1]) if i != drop]
            x = X.iloc[mask, keep]
        else:
            mask = numpy.asarray(X[:, col] == c)
            x = numpy.delete(X[mask], col, axis=1)
        sa = None if sample_weight is None else sample_weight[mask]
        y = None if y is None else y[mask]
        ind = indices[mask]

        if y is not None and x.shape[0] != y.shape[0]:
            raise RuntimeError(  # pragma: no cover
                "Input arrays have different shapes for value='{0}': {1} != {2} "
                "(expected: {3}) type(X)={4}".format(
                    c, x.shape[0], y.shape[0], ind.shape, type(X)))
        if sa is not None and x.shape[0] != sa.shape[0]:
            raise RuntimeError(  # pragma: no cover
                "Input arrays have different shapes for value='{0}': {1} != {2} "
                "(expected: {3}) type(X)={4}".format(
                    c, x.shape[0], sa.shape[0], ind.shape, type(X)))
        return ind, x, y, sa

    ###################
    # API scikit-learn
    ###################

    def fit(self, X, y=None, sample_weight=None, **kwargs):
        """
        Trains one model for every modality of the category.

        @param      X               features
        @param      y               targets
        @param      sample_weight   weight of every observation
        @param      kwargs          additional parameters sent to the *fit*
                                    method of every model
        @return                     self

        Raises *ValueError* if a row has a missing category.
        The function is not parallelized but it could be.
        """
        cats = set(self._get_cat(X))
        if self._has_missing(cats):
            raise ValueError(  # pragma: no cover
                "One of the row has a missing category.")

        res = {}
        for c in sorted(cats):
            _, xcat, ycat, scat = self._filter_cat(c, X, y, sample_weight)
            mod = clone(self.model)
            fit_params = dict(kwargs)
            if scat is not None:
                fit_params['sample_weight'] = scat
            mod.fit(xcat, ycat, **fit_params)
            res[c] = mod
        self.models = res
        return self

    def _any_predict(self, X, fct, *args):
        """
        Calls method *fct* of the model associated to every category
        and puts the results back in the order of the rows of *X*.

        @param      X       features
        @param      fct     name of the method to call (``'predict'``, ...)
        @param      args    optional *y*, *sample_weight*, filtered the same
                            way as *X*; if *y* is given, it is sent to the
                            method as a second argument
        @return             predictions, a vector if the method returns
                            a single column, a matrix otherwise

        Raises *NotImplementedError* if a row has a missing category,
        *KeyError* if a category was not seen by *fit*, *ValueError*
        if the models do not return the same number of columns.
        The function is not parallelized but it could be.
        """
        cats = set(self._get_cat(X))
        if self._has_missing(cats):
            raise NotImplementedError(  # pragma: no cover
                "No default value is implemented in case of missing value.")

        positions = []
        chunks = []
        for c in sorted(cats):
            ind, xcat, ycat, _ = self._filter_cat(c, X, *args)
            if c not in self.models:
                raise KeyError(
                    "No model was trained for category {0!r}".format(c))
            meth = getattr(self.models[c], fct)
            pred = numpy.asarray(
                meth(xcat) if ycat is None else meth(xcat, ycat))
            if pred.ndim == 1:
                pred = pred[:, numpy.newaxis]
            positions.append(ind)
            chunks.append(pred)

        # Row positions are kept apart from the predictions: stacking them
        # together would convert the positions into strings when labels are
        # strings and the final sort would become lexicographic.
        order = numpy.argsort(numpy.concatenate(positions), kind='stable')
        final = numpy.concatenate(chunks, axis=0)[order]
        if final.ndim == 2 and final.shape[1] == 1:
            final = final.ravel()
        return final

    def predict(self, X):
        """
        Predicts by calling the model associated to every category.

        @param      X       features
        @return             predictions

        The function is not parallelized but it could be.
        """
        return self._any_predict(X, 'predict')

    def decision_function(self, X):
        """
        Calls *decision_function* of the model associated to every category.

        @param      X       samples, shape = (n_samples, n_features)
        @return             array with *n_samples* rows

        Raises *NotImplementedError* if the model has no
        method *decision_function*.
        """
        if hasattr(self.model, 'decision_function'):
            return self._any_predict(X, 'decision_function')
        raise NotImplementedError(
            "No decision_function for {0}".format(self.model))

    def predict_proba(self, X):
        """
        Calls *predict_proba* of the model associated to every category.

        @param      X       samples, shape = (n_samples, n_features)
        @return             array with *n_samples* rows

        Raises *NotImplementedError* if the model has no
        method *predict_proba*.
        """
        if hasattr(self.model, 'predict_proba'):
            return self._any_predict(X, 'predict_proba')
        raise NotImplementedError(  # pragma: no cover
            "No method predict_proba for {0}".format(self.model))

    def score(self, X, y=None, sample_weight=None):
        """
        Returns the accuracy for a classifier, the R2 score for a regressor,
        both computed from ``self.predict(X)``.

        @param      X               features
        @param      y               expected targets
        @param      sample_weight   weight of every observation (optional)
        @return                     score, float

        Raises *RuntimeError* if the estimator is neither a classifier
        nor a regressor.
        """
        if self._estimator_type == 'classifier':
            from sklearn.metrics import accuracy_score
            return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
        if self._estimator_type == 'regressor':
            from sklearn.metrics import r2_score
            return r2_score(y, self.predict(X), sample_weight=sample_weight)
        raise RuntimeError(  # pragma: no cover
            "Unexpected estimator type '{0}', cannot guess default scoring metric.".format(
                self._estimator_type))

    ##############
    # cloning API
    ##############

    def get_params(self, deep=True):
        """
        Returns the parameters which define the object
        (all of those needed to clone it).

        @param      deep        if True, also returns the parameters of
                                the model, prefixed by ``model__``
        @return                 dict
        """
        res = self.P.to_dict()
        res['model'] = self.model
        if deep:
            nested = self.model.get_params(deep)
            res.update({'model__{0}'.format(name): value
                        for name, value in nested.items()})
        return res

    def set_params(self, **values):
        """
        Changes the parameters which define the object
        (all of those needed to clone it).

        @param      values      new values, key ``model`` replaces the model,
                                keys prefixed by ``model__`` are sent to the
                                model, the others update the parameters
                                of this object
        @return                 self

        Raises *KeyError* if no model is available.
        """
        if 'model' in values:
            self.model = values['model']
            # Keeps the estimator type in sync with the new model.
            self._estimator_type = getattr(
                self.model, '_estimator_type', None)
            del values['model']
        elif not hasattr(self, 'model') or self.model is None:
            raise KeyError(  # pragma: no cover
                "Missing key '{0}' in [{1}]".format(
                    'model', ', '.join(sorted(values))))
        prefix = 'model__'
        nested = {k[len(prefix):]: v for k, v in values.items()
                  if k.startswith(prefix)}
        self.model.set_params(**nested)
        own = {k: v for k, v in values.items() if not k.startswith(prefix)}
        if own:
            existing = self.P.to_dict()
            existing.update(own)
            self.P = SkLearnParameters(**existing)
        return self

    #################
    # common methods
    #################

    def __repr__(self):
        """
        Usual representation: class name, model, then parameters.
        """
        return "{0}({2},{1})".format(self.__class__.__name__, repr(self.P), repr(self.model))

Coverage for src/papierstat/mltricks/sklearn_base_learner_category.py : 82%

125 statements

Coverage for src/papierstat/mltricks/sklearn_base_learner_category.py : 82%

125 statements 103 run 22 missing 14 excluded

125 statements