# Source code for mlinsights.mlmodel.categories_to_integers

"""
Implements a transformation which can be put in a pipeline to transform categories in
integers.


:githublink:`%|py|6`
"""
import numpy
import pandas
from sklearn.base import BaseEstimator, TransformerMixin


class CategoriesToIntegers(BaseEstimator, TransformerMixin):
    """
    Does something similar to what
    `DictVectorizer
    <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_
    does but in a transformer. The method *fit* retains all categories,
    the method *transform* transforms categories into integers.
    Categories are sorted by columns. If the method *transform* tries to
    convert a category which was not seen by method *fit*, it can raise
    an exception or ignore it and replace it by ``NaN``.

    .. exref::
        :title: DictVectorizer or CategoriesToIntegers
        :tag: sklearn

        Example which transforms text into integers:

        .. runpython::
            :showcode:

            import pandas
            from mlinsights.mlmodel import CategoriesToIntegers
            df = pandas.DataFrame( [{"cat": "a"}, {"cat": "b"}] )
            trans = CategoriesToIntegers()
            trans.fit(df)
            newdf = trans.transform(df)
            print(newdf)
    """

    def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
        """
        :param columns: specify a columns selection
            (a single name or a list of names, ``None`` means
            every *object*-typed column)
        :param remove: modalities to remove from the encoding
        :param skip_errors: if True, an unseen category becomes ``NaN``
            instead of raising an exception
        :param single: use a single column per category,
            do not multiply them for each value
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        # Normalize *columns* to either None (auto-detect) or a list.
        self.columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self.skip_errors = skip_errors
        self.remove = remove
        self.single = single

    def __str__(self):
        """
        usual
        """
        return self.__repr__()

    def fit(self, X, y=None, **fit_params):
        """
        Makes the list of all categories in input *X*.
        *X* must be a dataframe.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets (unused).
        :return: self
        :raises TypeError: if *X* is not a :class:`pandas.DataFrame`
        :raises ValueError: if one column holds too many distinct values
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                "this transformer only accept Dataframes, not {0}".format(type(X)))
        if self.columns:
            columns = self.columns
        else:
            # Auto-detect categorical columns: every object-typed one.
            columns = [c for c, d in zip(X.columns, X.dtypes)
                       if d in (object,)]

        self._fit_columns = columns
        # Guard against columns that are in fact identifiers:
        # refuse any column with more distinct values than this bound.
        max_cat = max(len(X) // 2 + 1, 10000)

        self._categories = {}
        for col in columns:
            distinct = set(X[col].dropna())
            nb = len(distinct)
            if nb >= max_cat:
                raise ValueError(  # pragma: no cover
                    "Too many categories ({0}) for one column '{1}' max_cat={2}".format(
                        nb, col, max_cat))
            # Map each category value to its rank in sorted order.
            self._categories[col] = {
                value: i for i, value in enumerate(sorted(distinct))}
        self._schema = self._build_schema()
        return self

    def _build_schema(self):
        """
        Concatenates all the categories
        given the information stored in *_categories*.

        :return: tuple ``(schema, position, new_vector)`` where *schema*
            is the list of output column names (``col=value``),
            *position* maps every original column to the index of its
            first output column, and *new_vector* maps every original
            column to a dictionary ``{category value: local index}``
        """
        schema = []
        position = {}
        new_vector = {}
        last = 0
        for col, mapping in self._categories.items():
            # Pairs (value, "col=value") sorted by the category code.
            sch = [(_[1], "{0}={1}".format(col, _[1]))
                   for _ in sorted((n, d) for d, n in mapping.items())]
            if self.remove:
                sch = [d for d in sch if d[1] not in self.remove]
            position[col] = last
            new_vector[col] = {d[0]: i for i, d in enumerate(sch)}
            last += len(sch)
            schema.extend(_[1] for _ in sch)
        return schema, position, new_vector

    def transform(self, X, y=None, **fit_params):
        """
        Transforms categories in numerical features based on the list
        of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets (unused).
        :return: DataFrame, *X* with categories.
        :raises TypeError: if *X* is not a :class:`pandas.DataFrame`
        :raises ValueError: if an unseen category is met and
            *skip_errors* is False
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError(  # pragma: no cover
                "X is not a dataframe: {0}".format(type(X)))
        if self.single:
            return self._transform_single(X)
        return self._transform_one_hot(X)

    def _transform_single(self, X):
        """Keeps one column per category and replaces every value by its
        integer code (``single=True``)."""
        _, _, new_vector = self._schema

        def transform(v, vec):
            "transform a vector"
            if v in vec:
                return vec[v]
            if v is None:
                return numpy.nan
            if isinstance(v, float) and numpy.isnan(v):
                return numpy.nan
            if not self.skip_errors:
                # Show at most 20 known categories in the message.
                lv = list(sorted(vec))
                if len(lv) > 20:
                    lv = lv[:20]
                    lv.append("...")
                raise ValueError(  # pragma: no cover
                    "Unable to find category value '{0}' type(v)={2} among\n{1}".format(
                        v, "\n".join(lv), type(v)))
            return numpy.nan

        X = X.copy()
        for c in self._fit_columns:
            # Bind the column name as a default to avoid the
            # late-binding closure pitfall.
            X[c] = X[c].apply(lambda v, cv=c: transform(v, new_vector[cv]))
        return X

    def _transform_one_hot(self, X):
        """One-hot encodes the categorical columns (``single=False``),
        keeping the non-categorical columns unchanged."""
        dfcat = X[self._fit_columns]
        dfnum = X[[c for c in X.columns if c not in self._fit_columns]]
        sch, pos, vec = self._schema
        raise_on_unknown = not self.skip_errors

        # NOTE(review): the matrix is initialised to NaN, not 0 —
        # unmatched cells stay NaN in the output; confirm intended.
        res = numpy.zeros((X.shape[0], len(sch)))
        res.fill(numpy.nan)
        for i, row in enumerate(dfcat.to_dict("records")):
            for k, v in row.items():
                if v is None or (isinstance(v, float) and numpy.isnan(v)):
                    # missing values
                    continue
                if v not in vec[k]:
                    if raise_on_unknown:
                        # Show at most 20 known categories in the message.
                        lv = list(sorted(vec[k]))
                        if len(lv) > 20:
                            lv = lv[:20]
                            lv.append("...")
                        raise ValueError(  # pragma: no cover
                            "unable to find category value '{0}': '{1}' type(v)={3} among\n{2}".format(
                                k, v, "\n".join(lv), type(v)))
                else:
                    p = pos[k] + vec[k][v]
                    res[i, p] = 1.0

        newdf = pandas.DataFrame(res, columns=sch, index=dfcat.index)
        if dfnum.shape[1] > 0:
            return pandas.concat([dfnum, newdf], axis=1)
        return newdf

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fits and transforms categories in numerical features based on the
        list of categories found by method *fit*.
        *X* must be a dataframe. The function does not preserve
        the order of the columns.

        :param X: iterable
            Training data
        :param y: iterable, default=None
            Training targets (unused).
        :return: Dataframe, *X* with categories.
        """
        return self.fit(X, y=y, **fit_params).transform(X, y)