Coverage for mlinsights/mlmodel/categories_to_integers.py: 93%
84 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-28 08:46 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-28 08:46 +0100
1"""
2@file
3@brief Implements a transformation which can be put in a pipeline to transform categories into
4integers.
5"""
6import numpy
7import pandas
8from sklearn.base import BaseEstimator, TransformerMixin
class CategoriesToIntegers(BaseEstimator, TransformerMixin):
    """
    Does something similar to what
    `DictVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_
    does but in a transformer. The method *fit* retains all categories,
    the method *transform* transforms categories into integers.
    Categories are sorted by columns. If the method *transform* tries to convert
    a category which was not seen by method *fit*, it can raise an exception
    or ignore it and leave a missing value (NaN) instead.

    .. exref::
        :title: DictVectorizer or CategoriesToIntegers
        :tag: sklearn

        Example which transforms text into integers:

        .. runpython::
            :showcode:

            import pandas
            from mlinsights.mlmodel import CategoriesToIntegers
            df = pandas.DataFrame( [{"cat": "a"}, {"cat": "b"}] )
            trans = CategoriesToIntegers()
            trans.fit(df)
            newdf = trans.transform(df)
            print(newdf)
    """

    def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
        """
        @param      columns         specify a columns selection, a single name is
                                    wrapped into a list, None means every column
                                    whose dtype is object at fit time
        @param      remove          modalities to remove, they are compared to the
                                    output column names ``<column>=<category>``
        @param      skip_errors     skip when a new categories appear (no 1),
                                    otherwise method *transform* raises an exception
        @param      single          use a single column per category, do not multiply
                                    them for each value

        When *single* is False, method *transform* allocates a dense matrix
        with one column per retained modality.
        """
        super().__init__()
        self.columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self.skip_errors = skip_errors
        self.remove = remove
        self.single = single

    def __str__(self):
        """
        usual
        """
        return self.__repr__()

    def fit(self, X, y=None, **fit_params):
        """
        Makes the list of all categories in input *X*.
        *X* must be a dataframe.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets.
        :return: self
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if one column holds too many distinct categories
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                f"this transformer only accept Dataframes, not {type(X)}")
        if self.columns:
            columns = self.columns
        else:
            columns = [c for c, d in zip(X.columns, X.dtypes) if d == object]

        self._fit_columns = columns
        # A column with that many distinct values is unlikely to be a category.
        max_cat = max(len(X) // 2 + 1, 10000)

        self._categories = {}
        for c in columns:
            distinct = set(X[c].dropna())
            nb = len(distinct)
            if nb >= max_cat:
                raise ValueError(  # pragma: no cover
                    f"Too many categories ({nb}) for one column '{c}' max_cat={max_cat}")
            # Every category receives its rank in the sorted order.
            self._categories[c] = {
                cat: i for i, cat in enumerate(sorted(distinct))}
        self._schema = self._build_schema()
        return self

    def _build_schema(self):
        """
        Concatenates all the categories
        given the information stored in *_categories*.

        @return     list of columns, beginning of each, and for each column
                    a dictionary mapping every retained category to its index
        """
        schema = []
        position = {}
        new_vector = {}
        offset = 0
        for col, cats in self._categories.items():
            # Categories ordered by the index assigned in *fit*.
            kept = [(cat, f"{col}={cat}")
                    for cat in sorted(cats, key=cats.get)]
            if self.remove:
                kept = [item for item in kept if item[1] not in self.remove]
            position[col] = offset
            # Indices are renumbered after the removal.
            new_vector[col] = {cat: i for i, (cat, _) in enumerate(kept)}
            offset += len(kept)
            schema.extend(name for _, name in kept)

        return schema, position, new_vector

    @staticmethod
    def _known_categories(mapping, limit=20):
        """
        Builds the list of known categories displayed in error messages,
        one per line, truncated after *limit* values.
        Categories are converted into strings because they are not
        necessarily strings (integers for example).
        """
        known = sorted(mapping)
        if len(known) > limit:  # pragma: no cover
            known = known[:limit]
            known.append("...")
        return '\n'.join(map(str, known))

    def _to_integer(self, value, mapping):
        """
        Converts one value into its index in *mapping*.
        Missing values (None, NaN) become NaN. An unknown category raises
        an exception unless *skip_errors* is True, in which case it becomes NaN.
        """
        if value in mapping:
            return mapping[value]
        if value is None:
            return numpy.nan
        if isinstance(value, float) and numpy.isnan(value):
            return numpy.nan
        if not self.skip_errors:
            raise ValueError(  # pragma: no cover
                "Unable to find category value %r type(v)=%r "
                "among\n%s" % (value, type(value),
                               self._known_categories(mapping)))
        return numpy.nan

    def transform(self, X, y=None, **fit_params):
        """
        Transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        If *single* is True, every fitted column is replaced in a copy of *X*
        by the index of its category. Otherwise, the fitted columns are
        replaced by one column per retained modality, filled with 1 where the
        modality is observed and NaN everywhere else. A modality excluded
        with *remove* is handled like an unknown category.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets.
        :return: DataFrame, *X* with categories.
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if a category is unknown and *skip_errors* is False
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                f"X is not a dataframe: {type(X)}")

        schema, position, new_vector = self._schema

        if self.single:
            X = X.copy()
            for c in self._fit_columns:
                # The mapping is bound as a default argument to avoid
                # the late binding of the loop variable.
                X[c] = X[c].apply(
                    lambda v, mapping=new_vector[c]: self._to_integer(v, mapping))
            return X

        dfcat = X[self._fit_columns]
        dfnum = X[[c for c in X.columns if c not in self._fit_columns]]

        # Dense matrix, NaN everywhere except for the observed modalities.
        res = numpy.full((X.shape[0], len(schema)), numpy.nan)
        raise_on_unknown = not self.skip_errors

        for i, row in enumerate(dfcat.to_dict("records")):
            for k, v in row.items():
                if v is None or (isinstance(v, float) and numpy.isnan(v)):
                    # missing values
                    continue
                mapping = new_vector[k]
                if v in mapping:
                    res[i, position[k] + mapping[v]] = 1.0
                elif raise_on_unknown:
                    raise ValueError(  # pragma: no cover
                        "Unable to find category value %r: %r "
                        "type(v)=%r among\n%s" % (
                            k, v, type(v), self._known_categories(mapping)))

        newdf = pandas.DataFrame(res, columns=schema, index=dfcat.index)
        if dfnum.shape[1] > 0:
            return pandas.concat([dfnum, newdf], axis=1)
        return newdf

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fits and transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets.
        :return: Dataframe, *X* with categories.
        """
        return self.fit(X, y=y, **fit_params).transform(X, y)