Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Implements a transformation which can be put in a pipeline to transform categories into

4integers.

5""" 

6import numpy 

7import pandas 

8from sklearn.base import BaseEstimator, TransformerMixin 

9 

10 

class CategoriesToIntegers(BaseEstimator, TransformerMixin):
    """
    Does something similar to what
    `DictVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_
    does but in a transformer. The method *fit* retains all categories,
    the method *transform* transforms categories into integers.
    Categories are sorted by columns. If the method *transform* tries to convert
    a category which was not seen by method *fit*, it can raise an exception
    or ignore it and leave it as a missing value (NaN).

    .. exref::
        :title: DictVectorizer or CategoriesToIntegers
        :tag: sklearn

        Example which transforms text into integers:

        .. runpython::
            :showcode:

            import pandas
            from mlinsights.mlmodel import CategoriesToIntegers
            df = pandas.DataFrame( [{"cat": "a"}, {"cat": "b"}] )
            trans = CategoriesToIntegers()
            trans.fit(df)
            newdf = trans.transform(df)
            print(newdf)
    """

    def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
        """
        :param columns: column name or list of column names to convert,
            when it is empty or None, *fit* picks every column whose dtype
            is *object*
        :param remove: modalities to remove, given as output column names
            ``"<column>=<category>"``
        :param skip_errors: if True, a category which was not seen by *fit*
            is left as NaN, otherwise *transform* raises a *ValueError*
        :param single: if True, every categorical column is replaced by a single
            column of integers, otherwise one indicator column is created
            for each pair (column, category)
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        # A scalar selection is wrapped into a list, a list or None is kept as is.
        self.columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self.skip_errors = skip_errors
        self.remove = remove
        self.single = single

    def __str__(self):
        """
        Returns the same string as *__repr__*.
        """
        return repr(self)

    def fit(self, X, y=None, **fit_params):
        """
        Makes the list of all categories in input *X*.
        *X* must be a dataframe.

        :param X: dataframe
            Training data
        :param y: iterable, default=None
            Training targets, not used.
        :param fit_params: not used
        :return: self
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if one column holds too many distinct categories
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                "this transformer only accept Dataframes, not {0}".format(type(X)))
        if self.columns:
            columns = self.columns
        else:
            # No explicit selection: every column with dtype object is converted.
            columns = [name for name, dtype in zip(X.columns, X.dtypes)
                       if dtype in (object,)]

        self._fit_columns = columns
        # A column is rejected when it holds at least this many distinct values.
        max_cat = max(len(X) // 2 + 1, 10000)

        self._categories = {}
        for col in columns:
            # Missing values are not categories.
            distinct = set(X[col].dropna())
            nb = len(distinct)
            if nb >= max_cat:
                raise ValueError(  # pragma: no cover
                    "Too many categories ({0}) for one column '{1}' max_cat={2}".format(nb, col, max_cat))
            # Every category receives its rank in the sorted list of categories.
            self._categories[col] = {
                value: rank for rank, value in enumerate(sorted(distinct))}
        self._schema = self._build_schema()
        return self

    def _build_schema(self):
        """
        Concatenates all the categories
        given the information stored in *_categories*.

        :return: tuple *(schema, position, new_vector)*,
            *schema* is the list of output column names ``"<column>=<category>"``,
            *position* maps every fitted column to the index of its first
            output column in *schema*,
            *new_vector* maps every fitted column to a dictionary
            ``{category: offset}``, offsets are computed after the modalities
            listed in *remove* were dropped
        """
        schema = []
        position = {}
        new_vector = {}
        last = 0
        for col, codes in self._categories.items():
            # Categories ordered by the integer assigned in *fit*.
            ordered = sorted(codes, key=codes.get)
            named = [(value, "{0}={1}".format(col, value)) for value in ordered]
            if self.remove:
                named = [pair for pair in named if pair[1] not in self.remove]
            position[col] = last
            new_vector[col] = {pair[0]: i for i, pair in enumerate(named)}
            last += len(named)
            schema.extend(pair[1] for pair in named)

        return schema, position, new_vector

    @staticmethod
    def _known_categories_text(categories, limit=20):
        """
        Builds the listing of known categories inserted in error messages,
        one category per line, truncated after *limit* entries.
        Categories are converted into strings so that non-string categories
        do not break the construction of the message.
        """
        listing = sorted(categories)
        if len(listing) > limit:  # pragma: no cover
            listing = listing[:limit]
            listing.append("...")
        return "\n".join(str(item) for item in listing)

    def _encode_value(self, value, codes):
        """
        Converts one value into its integer given *codes* (category to integer).
        Missing values, and unknown categories when *skip_errors* is True,
        are converted into NaN.
        """
        if value in codes:
            return codes[value]
        is_missing = value is None or (
            isinstance(value, float) and numpy.isnan(value))
        if is_missing or self.skip_errors:
            return numpy.nan
        raise ValueError(  # pragma: no cover
            "Unable to find category value '{0}' type(v)={2} among\n{1}".format(
                value, self._known_categories_text(codes), type(value)))

    def _transform_single(self, X, new_vector):
        """
        Replaces every fitted column of a copy of *X* by its integer codes.
        """
        X = X.copy()
        for col in self._fit_columns:
            codes = new_vector[col]
            X[col] = X[col].apply(
                lambda v, cv=codes: self._encode_value(v, cv))
        return X

    def _transform_indicators(self, X, schema, position, new_vector):
        """
        Replaces the fitted columns of *X* by one indicator column
        per pair (column, category), appended after the other columns.
        """
        dfcat = X[self._fit_columns]
        dfnum = X[[c for c in X.columns if c not in self._fit_columns]]

        # Every cell starts as NaN, only the cell of the observed category
        # is set to 1 for each row and each fitted column.
        res = numpy.zeros((X.shape[0], len(schema)))
        res.fill(numpy.nan)
        raise_on_unknown = not self.skip_errors

        for i, row in enumerate(dfcat.to_dict("records")):
            for col, value in row.items():
                if value is None or (isinstance(value, float) and numpy.isnan(value)):
                    # missing values
                    continue
                codes = new_vector[col]
                if value in codes:
                    res[i, position[col] + codes[value]] = 1.0
                elif raise_on_unknown:
                    raise ValueError(  # pragma: no cover
                        "unable to find category value '{0}': '{1}' type(v)={3} among\n{2}".format(
                            col, value, self._known_categories_text(codes), type(value)))

        newdf = pandas.DataFrame(res, columns=schema, index=dfcat.index)
        if dfnum.shape[1] > 0:
            return pandas.concat([dfnum, newdf], axis=1)
        return newdf

    def transform(self, X, y=None, **fit_params):
        """
        Transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: dataframe
            Data to transform
        :param y: iterable, default=None
            Training targets, not used.
        :param fit_params: not used
        :return: DataFrame, *X* with categories.
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if a category was not seen by *fit*
            and *skip_errors* is False
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                "X is not a dataframe: {0}".format(type(X)))

        schema, position, new_vector = self._schema
        if self.single:
            return self._transform_single(X, new_vector)
        return self._transform_indicators(X, schema, position, new_vector)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fits and transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: dataframe
            Training data
        :param y: iterable, default=None
            Training targets.
        :param fit_params: forwarded to *fit*
        :return: Dataframe, *X* with categories.
        """
        return self.fit(X, y=y, **fit_params).transform(X, y)

200 

201 def fit_transform(self, X, y=None, **fit_params): 

202 """ 

203 Fits and transforms categories in numerical features based on the list 

204 of categories found by method *fit*. 

205 *X* must be a dataframe. The function does not preserve 

206 the order of the columns. 

207 

208 :param X: iterable 

209 Training data 

210 :param y: iterable, default=None 

211 Training targets. 

212 :return: Dataframe, *X* with categories. 

213 """ 

214 return self.fit(X, y=y, **fit_params).transform(X, y)