Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# -*- coding: utf-8 -*-
"""
@file
@brief Implements a *learner* which follows the same API as any :epkg:`scikit-learn` learner.
"""
6import numpy
7import pandas
8from sklearn.base import clone
9from mlinsights.sklapi import SkBaseLearner, SkLearnParameters
class SkBaseLearnerCategory(SkBaseLearner):
    """
    Base *learner* which trains one independent copy of a model for
    every distinct value (modality) of a category column.

    The category column is only used to route each row to the model
    trained for its category; it is removed from the features given to
    the per-category models.

    Notebooks associated with this *learner*:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'wines_color_linear')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))
    """

    def __init__(self, colnameind=None, model=None, **kwargs):
        """
        Stores the parameters in a class
        @see cl SkLearnParameters, it keeps a copy of the parameters
        to easily implement *get_params* and therefore clone a model.

        @param      colnameind      index (position) or name of the column
                                    which holds the category
        @param      model           model to train on each category
        @param      kwargs          additional parameters forwarded to the
                                    base class constructor

        Raises *TypeError* if *colnameind* is neither *int* nor *str*,
        *ValueError* if *model* is None.
        """
        if not isinstance(colnameind, (int, str)):
            raise TypeError(  # pragma: no cover
                "colnameind must be str or int not {0}".format(type(colnameind)))
        if model is None:
            raise ValueError("model must not be None")  # pragma: no cover
        kwargs['colnameind'] = colnameind
        SkBaseLearner.__init__(self, **kwargs)
        self.model = model
        # Mirrors the kind of the wrapped model ('classifier', 'regressor');
        # None when the wrapped model does not declare one.
        self._estimator_type = getattr(model, '_estimator_type', None)

    @property
    def colnameind(self):
        """
        Returns the name or the index of the category column.
        """
        return self.P.colnameind

    @property
    def Models(self):
        """
        Returns the trained models, a dictionary ``{category: model}``.
        Raises *RuntimeError* if *fit* was not called.
        """
        if hasattr(self, 'models'):
            return self.models
        raise RuntimeError('No trained models')  # pragma: no cover

    @staticmethod
    def _is_missing(value):
        """
        Tells if a category value is a missing value (None, NaN, ...).
        Values :epkg:`pandas` cannot interpret as a scalar are
        considered as not missing.
        """
        flag = pandas.isna(value)
        return isinstance(flag, (bool, numpy.bool_)) and bool(flag)

    @staticmethod
    def _select_rows(values, mask):
        """
        Returns the rows of *values* selected by the boolean array *mask*,
        None if *values* is None.
        """
        if values is None:
            return None
        if not hasattr(values, 'shape'):
            # plain sequences do not support boolean indexing
            values = numpy.asarray(values)
        return values[mask]

    def _get_cat(self, X):
        """
        Returns the categories stored in column *colnameind*.

        @param      X       features, a DataFrame if *colnameind* is a name,
                            a DataFrame or an array if it is an index
        @return             the category column
        """
        key = self.colnameind
        if isinstance(key, str):
            if not hasattr(X, 'columns'):
                raise TypeError(  # pragma: no cover
                    "colnameind='{0}' and X is not a DataFrame but {1}".format(
                        key, type(X)))
            return X[key]
        if hasattr(X, 'iloc'):
            # an integer is a position, not a label: X[:, key] fails on a DataFrame
            return X.iloc[:, key]
        return X[:, key]

    def _filter_cat(self, c, X, y=None, sample_weight=None):
        """
        Returns *X*, *y*, *sample_weight* restricted to category *c*.

        @param      c               category value
        @param      X               features including the category column
        @param      y               targets or None
        @param      sample_weight   weights or None
        @return                     tuple *(ind, x, y, sa)*: positions of the
                                    selected rows in *X*, the selected rows
                                    without the category column, the selected
                                    targets and weights (None if not given)
        """
        key = self.colnameind
        positions = numpy.arange(0, X.shape[0])
        if isinstance(key, str):
            if not hasattr(X, 'columns'):
                raise TypeError(  # pragma: no cover
                    "colnameind='{0}' and X is not a DataFrame but {1}".format(
                        key, type(X)))
            mask = numpy.asarray(X[key] == c)
            x = X.drop(key, axis=1)[mask]
        elif hasattr(X, 'iloc'):
            # integer = column position; keep every column but that one
            mask = numpy.asarray(X.iloc[:, key] == c)
            kept = numpy.delete(numpy.arange(X.shape[1]), key)
            x = X.iloc[mask, kept]
        else:
            mask = numpy.asarray(X[:, key] == c)
            x = numpy.delete(X[mask], key, axis=1)

        # a positional boolean array avoids any index alignment between
        # X, y and sample_weight
        sa = self._select_rows(sample_weight, mask)
        y = self._select_rows(y, mask)
        ind = positions[mask]

        if y is not None and x.shape[0] != y.shape[0]:
            raise RuntimeError(  # pragma: no cover
                "Input arrays have different shapes for value='{0}': {1} != {2} "
                "(expected: {3}) type(X)={4}".format(
                    c, x.shape[0], y.shape[0], ind.shape, type(X)))
        if sa is not None and x.shape[0] != sa.shape[0]:
            raise RuntimeError(  # pragma: no cover
                "Input arrays have different shapes for value='{0}': {1} != {2} "
                "(expected: {3}) type(X)={4}".format(
                    c, x.shape[0], sa.shape[0], ind.shape, type(X)))
        return ind, x, y, sa

    ###################
    # API scikit-learn
    ###################

    def fit(self, X, y=None, sample_weight=None, **kwargs):
        """
        Trains one model for every value of the category.

        @param      X               features (including the category column)
        @param      y               targets
        @param      sample_weight   weight of each observation
        @param      kwargs          additional parameters sent to the *fit*
                                    method of every per-category model
        @return                     self

        Raises *ValueError* if a row has a missing category.
        The function is not parallelized but it could be.
        """
        cats = set(self._get_cat(X))
        if any(self._is_missing(c) for c in cats):
            raise ValueError(  # pragma: no cover
                "One of the row has a missing category.")

        trained = {}
        for c in sorted(cats):
            _, xcat, ycat, scat = self._filter_cat(c, X, y, sample_weight)
            mod = clone(self.model)
            # every model gets its own set of fit parameters
            fit_params = dict(kwargs)
            if scat is not None:
                fit_params['sample_weight'] = scat
            mod.fit(xcat, ycat, **fit_params)
            trained[c] = mod
        self.models = trained
        return self

    def _any_predict(self, X, fct, *args):
        """
        Calls method *fct* of the model associated to each category and
        merges the results in the order of the rows of *X*.

        @param      X       features (including the category column)
        @param      fct     name of the method to call on every model
        @param      args    optional *y*, *sample_weight*, filtered the
                            same way as *X*
        @return             predictions, one row per row of *X*,
                            a vector if the output has a single column

        Raises *NotImplementedError* if a row has a missing category,
        *ValueError* if the outputs of the models cannot be stacked.
        The function is not parallelized but it could be.
        """
        cats = set(self._get_cat(X))
        if any(self._is_missing(c) for c in cats):
            raise NotImplementedError(  # pragma: no cover
                "No default value is implemented in case of missing value.")

        chunks = []
        positions = []
        for c in sorted(cats):
            ind, xcat, ycat, _ = self._filter_cat(c, X, *args)
            meth = getattr(self.models[c], fct)
            pred = meth(xcat) if ycat is None else meth(xcat, ycat)
            pred = numpy.asarray(pred)
            if len(pred.shape) == 1:
                pred = pred[:, numpy.newaxis]
            chunks.append(pred)
            positions.append(ind)

        if not chunks:
            # no row, no prediction
            return numpy.empty((0,))

        try:
            stacked = numpy.vstack(chunks)
        except ValueError as exc:  # pragma: no cover
            raise ValueError(
                "Method '{0}' returned outputs of different widths across "
                "categories, they cannot be merged.".format(fct)) from exc

        # Row j of *stacked* is the output for row positions[j] of X.
        # Sorting on the integer positions restores the order of X without
        # mixing positions and predictions in the same array, which would
        # alter the dtype (and the order for string labels).
        order = numpy.argsort(numpy.hstack(positions))
        final = stacked[order]
        if len(final.shape) == 2 and final.shape[1] == 1:
            final = final.ravel()
        return final

    def predict(self, X):
        """
        Predicts by calling the model associated to each category.

        @param      X       features (including the category column)
        @return             predictions

        The function is not parallelized but it could be.
        """
        return self._any_predict(X, 'predict')

    def decision_function(self, X):
        """
        Calls method *decision_function* of the model associated to
        each category.

        @param      X       features (including the category column)
        @return             outputs of *decision_function*, one row per sample

        Raises *NotImplementedError* if the model has no such method.
        """
        if hasattr(self.model, 'decision_function'):
            return self._any_predict(X, 'decision_function')
        raise NotImplementedError(
            "No decision_function for {0}".format(self.model))

    def predict_proba(self, X):
        """
        Calls method *predict_proba* of the model associated to
        each category.

        @param      X       features (including the category column)
        @return             outputs of *predict_proba*, one row per sample

        Raises *NotImplementedError* if the model has no such method.
        """
        if hasattr(self.model, 'predict_proba'):
            return self._any_predict(X, 'predict_proba')
        raise NotImplementedError(  # pragma: no cover
            "No method predict_proba for {0}".format(self.model))

    def score(self, X, y=None, sample_weight=None):
        """
        Returns the accuracy for a classifier, the R2 score for a
        regressor, computed on the given data.

        @param      X               features (including the category column)
        @param      y               expected targets
        @param      sample_weight   weight of each observation (optional)
        @return                     score, float

        Raises *RuntimeError* if the model is neither a classifier nor
        a regressor.
        """
        if self._estimator_type == 'classifier':
            from sklearn.metrics import accuracy_score
            return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
        if self._estimator_type == 'regressor':
            from sklearn.metrics import r2_score
            return r2_score(y, self.predict(X), sample_weight=sample_weight)
        raise RuntimeError(  # pragma: no cover
            "Unexpected estimator type '{0}', cannot guess default scoring metric.".format(
                self._estimator_type))

    ##############
    # cloning API
    ##############

    def get_params(self, deep=True):
        """
        Returns the parameters which define the object
        (all of those needed to clone it).

        @param      deep        if True, the parameters of the wrapped model
                                are added with prefix ``model__``
        @return                 dict
        """
        params = self.P.to_dict()
        params['model'] = self.model
        if deep:
            for name, value in self.model.get_params(deep).items():
                params['model__{0}'.format(name)] = value
        return params

    def set_params(self, **values):
        """
        Changes the parameters which define the object
        (all of those needed to clone it).

        @param      values      new values, key ``model`` replaces the wrapped
                                model, keys prefixed by ``model__`` are sent
                                to the wrapped model, the others update the
                                parameters of this object
        @return                 self

        Raises *KeyError* if no model is given and none is already set.
        """
        if 'model' in values:
            self.model = values.pop('model')
            # keep the estimator type in sync with the new model
            self._estimator_type = getattr(
                self.model, '_estimator_type', None)
        elif not hasattr(self, 'model') or self.model is None:
            raise KeyError(  # pragma: no cover
                "Missing key '{0}' in [{1}]".format(
                    'model', ', '.join(sorted(values))))

        prefix = 'model__'
        nested = {k[len(prefix):]: v for k, v in values.items()
                  if k.startswith(prefix)}
        self.model.set_params(**nested)

        existing = self.P.to_dict()
        own = {k: v for k, v in values.items() if not k.startswith(prefix)}
        if own:
            existing.update(own)
            self.P = SkLearnParameters(**existing)
        return self

    #################
    # common methods
    #################

    def __repr__(self):
        """
        Returns ``ClassName(<model>,<parameters>)``.
        """
        return "{0}({1},{2})".format(
            self.__class__.__name__, repr(self.model), repr(self.P))