Coverage for mlinsights/mlmodel/categories_to_integers.py

1"""

2@file

3@brief Implements a transformation which can be put in a pipeline to transform categories into

4integers.

5"""

6import numpy

7import pandas

8from sklearn.base import BaseEstimator, TransformerMixin

class CategoriesToIntegers(BaseEstimator, TransformerMixin):
    """
    Does something similar to what
    `DictVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_
    does but in a transformer. The method *fit* retains all categories,
    the method *transform* transforms categories into integers.
    Categories are sorted by columns. If the method *transform* tries to
    convert a category which was not seen by method *fit*, it either raises
    an exception or, when *skip_errors* is True, ignores it and leaves
    the corresponding output as NaN.

    .. exref::
        :title: DictVectorizer or CategoriesToIntegers
        :tag: sklearn

        Example which transforms text into integers:

        .. runpython::
            :showcode:

            import pandas
            from mlinsights.mlmodel import CategoriesToIntegers
            df = pandas.DataFrame( [{"cat": "a"}, {"cat": "b"}] )
            trans = CategoriesToIntegers()
            trans.fit(df)
            newdf = trans.transform(df)
            print(newdf)
    """

    def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
        """
        :param columns: columns to convert, a list or a single column name
            (a single name is wrapped into a list); when None or empty,
            *fit* selects every column whose dtype is ``object``
        :param remove: names of the form ``"<column>=<value>"`` to drop
            from the produced schema
        :param skip_errors: if True, a category unseen by *fit* is ignored
            (the output stays NaN) instead of raising an exception
        :param single: if True, every category column is replaced in place
            by a single column of integers; otherwise one new column is
            created per pair (column, value)

        When *single* is False, *transform* allocates a dense matrix with
        one row per row of the input and one column per retained category.
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self.skip_errors = skip_errors
        self.remove = remove
        self.single = single

    def __str__(self):
        """
        Returns the same string as ``__repr__``.
        """
        return self.__repr__()

    def fit(self, X, y=None, **fit_params):
        """
        Makes the list of all categories in input *X*.
        *X* must be a dataframe.

        :param X: dataframe holding the training data
        :param y: unused, default=None
        :param fit_params: unused
        :return: self
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if a column has too many distinct values
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                f"this transformer only accept Dataframes, not {type(X)}")
        if self.columns:
            columns = self.columns
        else:
            columns = [c for c, d in zip(
                X.columns, X.dtypes) if d in (object,)]

        self._fit_columns = columns
        # A column holding that many distinct values is very likely
        # not a categorical column.
        max_cat = max(len(X) // 2 + 1, 10000)

        self._categories = {}
        for c in columns:
            distinct = set(X[c].dropna())
            nb = len(distinct)
            if nb >= max_cat:
                raise ValueError(  # pragma: no cover
                    f"Too many categories ({nb}) for one column '{c}' max_cat={max_cat}")
            # value -> rank of the value among the sorted distinct values
            self._categories[c] = {
                value: rank for rank, value in enumerate(sorted(distinct))}
        self._schema = self._build_schema()
        return self

    def _build_schema(self):
        """
        Concatenates all the categories
        given the information stored in *_categories*.

        :return: tuple *(schema, position, new_vector)* where *schema* is
            the list of new column names (``"<column>=<value>"``),
            *position* maps every fitted column to the index of its first
            new column and *new_vector* maps every fitted column to a
            dictionary ``{value: index within that column}``
        """
        schema = []
        position = {}
        new_vector = {}
        last = 0
        for c, v in self._categories.items():
            # (value, new column name) ordered by the rank assigned in fit
            sch = [(_[1], f"{c}={_[1]}")
                   for _ in sorted((n, d) for d, n in v.items())]
            if self.remove:
                sch = [d for d in sch if d[1] not in self.remove]
            position[c] = last
            new_vector[c] = {d[0]: i for i, d in enumerate(sch)}
            last += len(sch)
            schema.extend(_[1] for _ in sch)

        return schema, position, new_vector

    @staticmethod
    def _is_missing(value):
        "Tells if *value* is None or a float NaN."
        return value is None or (
            isinstance(value, float) and numpy.isnan(value))

    @staticmethod
    def _known_values(vec):
        """
        Formats at most the first 20 sorted known categories, one per line.
        Values are converted into strings so that the message can be built
        whatever the type of the categories is.
        """
        lv = list(sorted(vec))
        if len(lv) > 20:  # pragma: no cover
            lv = lv[:20]
            lv.append("...")
        return '\n'.join(map(str, lv))

    def _transform_single(self, X):
        "Replaces every fitted column of a copy of *X* by integers."
        new_vector = self._schema[2]

        def transform(v, vec):
            "transform a value"
            if v in vec:
                return vec[v]
            if self._is_missing(v):
                return numpy.nan
            if not self.skip_errors:
                raise ValueError(  # pragma: no cover
                    "Unable to find category value %r type(v)=%r "
                    "among\n%s" % (v, type(v), self._known_values(vec)))
            return numpy.nan

        X = X.copy()
        for c in self._fit_columns:
            # cv=c binds the current column to the lambda
            X[c] = X[c].apply(lambda v, cv=c: transform(v, new_vector[cv]))
        return X

    def _transform_multiple(self, X):
        "Creates one column per retained pair (column, value)."
        dfcat = X[self._fit_columns]
        dfnum = X[[c for c in X.columns if c not in self._fit_columns]]
        sch, pos, vec = self._schema

        # Dense matrix, NaN everywhere except where a category is found.
        res = numpy.full((X.shape[0], len(sch)), numpy.nan)

        for i, row in enumerate(dfcat.to_dict("records")):
            for k, v in row.items():
                if self._is_missing(v):
                    # missing values
                    continue
                known = vec[k]
                if v in known:
                    res[i, pos[k] + known[v]] = 1.0
                elif not self.skip_errors:
                    raise ValueError(  # pragma: no cover
                        "Unable to find category value %r: %r "
                        "type(v)=%r among\n%s" % (
                            k, v, type(v), self._known_values(known)))
                # otherwise the unknown category is ignored

        if dfnum.shape[1] > 0:
            newdf = pandas.DataFrame(res, columns=sch, index=dfcat.index)
            allnum = pandas.concat([dfnum, newdf], axis=1)
        else:
            allnum = pandas.DataFrame(res, columns=sch, index=dfcat.index)

        return allnum

    def transform(self, X, y=None, **fit_params):
        """
        Transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: dataframe to transform
        :param y: unused, default=None
        :param fit_params: unused
        :return: DataFrame, *X* with categories.
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if a category was not seen by *fit*
            and *skip_errors* is False
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                f"X is not a dataframe: {type(X)}")

        if self.single:
            return self._transform_single(X)
        return self._transform_multiple(X)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fits and transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: dataframe holding the training data
        :param y: forwarded to *fit* and *transform*, default=None
        :param fit_params: forwarded to *fit*
        :return: Dataframe, *X* with categories.
        """
        return self.fit(X, y=y, **fit_params).transform(X, y)

Coverage for mlinsights/mlmodel/categories_to_integers.py: 93%

84 statements