Coverage for mlinsights/mlmodel/categories_to_integers.py: 93%

84 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-28 08:46 +0100

1""" 

2@file 

3@brief Implements a transformation which can be put in a pipeline to transform categories into

4integers. 

5""" 

6import numpy 

7import pandas 

8from sklearn.base import BaseEstimator, TransformerMixin 

9 

10 

class CategoriesToIntegers(BaseEstimator, TransformerMixin):
    """
    Does something similar to what
    `DictVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_
    does but in a transformer. The method *fit* retains all categories,
    the method *transform* transforms categories into integers.
    Categories are sorted by columns. If the method *transform* tries to convert
    a category which was not seen by method *fit*, it either raises an exception
    or, if *skip_errors* is True, ignores it and leaves a missing value (NaN).

    .. exref::
        :title: DictVectorizer or CategoriesToIntegers
        :tag: sklearn

        Example which transforms text into integers:

        .. runpython::
            :showcode:

            import pandas
            from mlinsights.mlmodel import CategoriesToIntegers
            df = pandas.DataFrame( [{"cat": "a"}, {"cat": "b"}] )
            trans = CategoriesToIntegers()
            trans.fit(df)
            newdf = trans.transform(df)
            print(newdf)
    """

    def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
        """
        @param      columns         columns to convert, a list or a single column name
                                    (wrapped into a list), None or empty to let *fit*
                                    pick every column whose dtype is *object*
        @param      remove          modalities to remove, each one is compared to
                                    the label ``<column>=<category>``
        @param      skip_errors     if True, a category unseen by *fit* is left as
                                    a missing value instead of raising an exception
        @param      single          if True, every converted column is replaced by
                                    a single column of integers, otherwise one output
                                    column is created per couple (column, category)
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self.skip_errors = skip_errors
        self.remove = remove
        self.single = single

    def __str__(self):
        """
        usual
        """
        return self.__repr__()

    @staticmethod
    def _format_known_categories(known):
        """
        Builds the listing of known categories displayed in error messages.

        @param      known       iterable of categories (keys of a mapping)
        @return                 the sorted categories, one per line,
                                truncated after the first 20
        """
        lv = sorted(known)
        if len(lv) > 20:  # pragma: no cover
            lv = lv[:20]
            lv.append("...")
        # Categories are not necessarily strings (an object column may hold
        # integers for example), str.join would fail on them.
        return '\n'.join(map(str, lv))

    def fit(self, X, y=None, **fit_params):
        """
        Makes the list of all categories in input *X*.
        *X* must be a dataframe.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets.
        :return: self
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if one column holds too many distinct categories
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                f"this transformer only accept Dataframes, not {type(X)}")
        if self.columns:
            columns = self.columns
        else:
            # no explicit selection: every column of dtype object is converted
            columns = [c for c, d in zip(X.columns, X.dtypes) if d == object]

        self._fit_columns = columns
        max_cat = max(len(X) // 2 + 1, 10000)

        self._categories = {}
        for c in columns:
            distinct = set(X[c].dropna())
            nb = len(distinct)
            if nb >= max_cat:
                raise ValueError(  # pragma: no cover
                    f"Too many categories ({nb}) for one column '{c}' max_cat={max_cat}")
            # categories are numbered following their sorted order
            self._categories[c] = {
                value: i for i, value in enumerate(sorted(distinct))}
        self._schema = self._build_schema()
        return self

    def _build_schema(self):
        """
        Concatenates all the categories
        given the information stored in *_categories*.

        @return     tuple *(schema, position, new_vector)*:
                    *schema* is the list of labels ``<column>=<category>``,
                    *position* maps every column to the index of its first label
                    in *schema*, *new_vector* maps every column to a dictionary
                    ``{category: index}`` restricted to the categories not removed
        """
        schema = []
        position = {}
        new_vector = {}
        last = 0
        for c, v in self._categories.items():
            # indices are unique, sorting on them restores the order set by fit
            ordered = sorted(v.items(), key=lambda kv: kv[1])
            sch = [(cat, f"{c}={cat}") for cat, _ in ordered]
            if self.remove:
                sch = [d for d in sch if d[1] not in self.remove]
            position[c] = last
            new_vector[c] = {d[0]: i for i, d in enumerate(sch)}
            last += len(sch)
            schema.extend(d[1] for d in sch)

        return schema, position, new_vector

    def transform(self, X, y=None, **fit_params):
        """
        Transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets.
        :return: DataFrame, *X* with categories.
        :raises TypeError: if *X* is not a dataframe
        :raises ValueError: if a category was not seen by *fit*
            and *skip_errors* is False
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                f"X is not a dataframe: {type(X)}")

        sch, pos, new_vector = self._schema

        if self.single:

            def convert(v, vec):
                "converts one value into its integer"
                if v in vec:
                    return vec[v]
                if v is None:
                    return numpy.nan
                if isinstance(v, float) and numpy.isnan(v):
                    return numpy.nan
                if not self.skip_errors:
                    raise ValueError(  # pragma: no cover
                        "Unable to find category value %r type(v)=%r "
                        "among\n%s" % (
                            v, type(v), self._format_known_categories(vec)))
                return numpy.nan

            # the input dataframe is left untouched
            X = X.copy()
            for c in self._fit_columns:
                X[c] = X[c].apply(lambda v, cv=c: convert(v, new_vector[cv]))
            return X

        dfcat = X[self._fit_columns]
        dfnum = X[[c for c in X.columns if c not in self._fit_columns]]

        # every cell starts as NaN, only the cell of a known category gets 1
        res = numpy.full((X.shape[0], len(sch)), numpy.nan)
        raise_on_unknown = not self.skip_errors

        for i, row in enumerate(dfcat.to_dict("records")):
            for k, v in row.items():
                if v is None or (isinstance(v, float) and numpy.isnan(v)):
                    # missing values
                    continue
                known = new_vector[k]
                if v in known:
                    res[i, pos[k] + known[v]] = 1.0
                elif raise_on_unknown:
                    raise ValueError(  # pragma: no cover
                        "Unable to find category value %r: %r "
                        "type(v)=%r among\n%s" % (
                            k, v, type(v),
                            self._format_known_categories(known)))

        newdf = pandas.DataFrame(res, columns=sch, index=dfcat.index)
        if dfnum.shape[1] > 0:
            # untouched columns come first, followed by the new ones
            return pandas.concat([dfnum, newdf], axis=1)
        return newdf

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fits and transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets.
        :return: Dataframe, *X* with categories.
        """
        return self.fit(X, y=y, **fit_params).transform(X, y)