Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""
@file
@brief Implements a transformation which can be put in a pipeline to transform
categories into integers.
"""
6import numpy
7import pandas
8from sklearn.base import BaseEstimator, TransformerMixin
class CategoriesToIntegers(BaseEstimator, TransformerMixin):
    """
    Does something similar to what
    `DictVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_
    does but in a transformer. The method *fit* retains all categories,
    the method *transform* transforms categories into integers.
    Categories are sorted by columns. If the method *transform* tries to convert
    a category which was not seen by method *fit*, it either raises an exception
    or ignores it and leaves a missing value (NaN) in the output,
    depending on *skip_errors*.

    .. exref::
        :title: DictVectorizer or CategoriesToIntegers
        :tag: sklearn

        Example which transforms text into integers:

        .. runpython::
            :showcode:

            import pandas
            from mlinsights.mlmodel import CategoriesToIntegers
            df = pandas.DataFrame( [{"cat": "a"}, {"cat": "b"}] )
            trans = CategoriesToIntegers()
            trans.fit(df)
            newdf = trans.transform(df)
            print(newdf)
    """

    def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
        """
        @param      columns         columns to convert, a list, a single column name
                                    (wrapped into a list) or None to let *fit* pick
                                    every column whose dtype is *object*
        @param      remove          output labels to drop, each label is formatted
                                    as ``column=value``
        @param      skip_errors     if True, a category unseen by *fit* does not raise
                                    an exception and produces a missing value instead
        @param      single          if True, every categorical column is replaced by a
                                    single column of integer codes, otherwise one
                                    indicator column is created per category
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self.skip_errors = skip_errors
        self.remove = remove
        self.single = single

    def __str__(self):
        """
        usual
        """
        return self.__repr__()

    @staticmethod
    def _is_missing(value):
        """
        Tells if *value* is a missing value (None or a float NaN).
        """
        return value is None or (
            isinstance(value, float) and numpy.isnan(value))

    @staticmethod
    def _known_values_preview(known):
        """
        Builds the list of known categories displayed in error messages,
        one per line, truncated after 20 values.

        Values are converted into strings: joining raw values fails with
        a *TypeError* as soon as the categories are not strings, which used
        to hide the *ValueError* the caller was about to raise.
        """
        values = sorted(known)
        if len(values) > 20:  # pragma: no cover
            values = values[:20]
            values.append("...")
        return "\n".join(str(v) for v in values)

    def fit(self, X, y=None, **fit_params):
        """
        Makes the list of all categories in input *X*.
        *X* must be a dataframe.

        :param X: dataframe
            Training data
        :param y: iterable, default=None
            Training targets, not used.
        :return: self

        Raises *TypeError* if *X* is not a dataframe, *ValueError* if a
        column holds too many distinct categories.
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                "this transformer only accept Dataframes, not {0}".format(type(X)))
        if self.columns:
            columns = self.columns
        else:
            # No explicit selection: keep every column of dtype object.
            columns = [c for c, d in zip(
                X.columns, X.dtypes) if d in (object,)]

        self._fit_columns = columns
        max_cat = max(len(X) // 2 + 1, 10000)

        self._categories = {}
        for c in columns:
            # Missing values are not a category.
            distinct = set(X[c].dropna())
            nb = len(distinct)
            if nb >= max_cat:
                raise ValueError(  # pragma: no cover
                    "Too many categories ({0}) for one column '{1}' max_cat={2}".format(nb, c, max_cat))
            # Maps every category to its rank in sorted order.
            self._categories[c] = {
                value: rank for rank, value in enumerate(sorted(distinct))}
        self._schema = self._build_schema()
        return self

    def _build_schema(self):
        """
        Concatenates all the categories
        given the information stored in *_categories*.

        @return     tuple *(schema, position, new_vector)*: *schema* is the list
                    of output labels ``column=value``, *position* maps a column
                    to the index of its first label in *schema*, *new_vector*
                    maps a column to a dictionary *{category: index}* relative
                    to that position
        """
        schema = []
        position = {}
        new_vector = {}
        last = 0
        for c, v in self._categories.items():
            # Categories ordered by the rank assigned in fit.
            ranked = sorted(v.items(), key=lambda pair: pair[1])
            sch = [(value, "{0}={1}".format(c, value)) for value, _ in ranked]
            if self.remove:
                sch = [d for d in sch if d[1] not in self.remove]
            position[c] = last
            new_vector[c] = {d[0]: i for i, d in enumerate(sch)}
            last += len(sch)
            schema.extend(d[1] for d in sch)

        return schema, position, new_vector

    def _encode_value(self, v, vec):
        """
        Converts one value into its integer code (option *single*).

        @param      v       value to convert
        @param      vec     dictionary *{category: code}* of the column
        @return             the code, or NaN for a missing value or for an
                            unknown category when *skip_errors* is True
        """
        if v in vec:
            return vec[v]
        if self._is_missing(v):
            return numpy.nan
        if not self.skip_errors:
            raise ValueError(  # pragma: no cover
                "Unable to find category value '{0}' type(v)={2} among\n{1}".format(
                    v, self._known_values_preview(vec), type(v)))
        return numpy.nan

    def transform(self, X, y=None, **fit_params):
        """
        Transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        With *single=True*, a copy of *X* is returned in which every fitted
        column is replaced by its integer codes. Otherwise the fitted columns
        are replaced by one indicator column per category, placed after
        the remaining columns; an indicator is 1.0 when the row holds that
        category and NaN otherwise.

        :param X: dataframe
            Data to convert
        :param y: iterable, default=None
            Training targets, not used.
        :return: DataFrame, *X* with categories.

        Raises *TypeError* if *X* is not a dataframe, *ValueError* if a
        category was not seen by *fit* and *skip_errors* is False.
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                "X is not a dataframe: {0}".format(type(X)))

        schema, position, new_vector = self._schema

        if self.single:
            X = X.copy()
            for c in self._fit_columns:
                codes = new_vector[c]
                X[c] = X[c].apply(
                    lambda v, codes=codes: self._encode_value(v, codes))
            return X

        dfcat = X[self._fit_columns]
        dfnum = X[[c for c in X.columns if c not in self._fit_columns]]

        # Dense matrix, every cell is NaN until a category is matched.
        res = numpy.full((X.shape[0], len(schema)), numpy.nan)
        raise_on_unknown = not self.skip_errors

        for i, row in enumerate(dfcat.to_dict("records")):
            for k, v in row.items():
                if self._is_missing(v):
                    # missing values
                    continue
                codes = new_vector[k]
                if v not in codes:
                    if raise_on_unknown:
                        raise ValueError(  # pragma: no cover
                            "unable to find category value '{0}': '{1}' type(v)={3} among\n{2}".format(
                                k, v, self._known_values_preview(codes), type(v)))
                else:
                    res[i, position[k] + codes[v]] = 1.0

        newdf = pandas.DataFrame(res, columns=schema, index=dfcat.index)
        if dfnum.shape[1] > 0:
            return pandas.concat([dfnum, newdf], axis=1)
        return newdf

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fits and transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: dataframe
            Training data
        :param y: iterable, default=None
            Training targets.
        :return: Dataframe, *X* with categories.
        """
        return self.fit(X, y=y, **fit_params).transform(X, y)