Code source de papierstat.mltricks.text_vectorizer_transformer

"""
Implements TextVectorizerTransformer.


:githublink:`%|py|5`
"""
import numpy
from scipy import sparse
import pandas
from sklearn.base import BaseEstimator, TransformerMixin, clone


class TextVectorizerTransformer(BaseEstimator, TransformerMixin):
    """
    Wraps a text vectorizer such as *TfidfVectorizer* or *CountVectorizer*
    so that it can be used in a pipeline on an input holding several
    columns: one independent clone of the vectorizer is trained on, and
    applied to, every column.

    Parameters
    ----------

    estimator: vectorizer cloned and fitted on every column

    Attributes
    ----------

    estimators_: list of trained estimators, one per column, set by *fit*
    """

    def __init__(self, estimator):
        """
        :param      estimator:       *TfidfVectorizer* or *CountVectorizer*
        """
        self.estimator = estimator

    @staticmethod
    def _split_columns(X):
        """
        Returns the list of the columns of *X*.

        The type of *X* is validated before its shape is read so that an
        unsupported container always fails with the explicit *TypeError*
        below and not with an unrelated error raised by ``X.shape``.

        :param X: :class:`pandas.DataFrame` or :class:`numpy.ndarray`
            with two dimensions
        :return: list of columns, in the order they appear in *X*
        :raises TypeError: if *X* is neither a dataframe nor an array
        """
        if isinstance(X, pandas.DataFrame):
            return [X.iloc[:, i] for i in range(X.shape[1])]
        if isinstance(X, numpy.ndarray):
            return [X[:, i] for i in range(X.shape[1])]
        raise TypeError(  # pragma: no cover
            "X must be an array or a dataframe.")

    def fit(self, X, y=None):
        """
        Trains a clone of the estimator on every column.

        :param X: :class:`pandas.DataFrame` or :class:`numpy.ndarray`
            with two dimensions
        :param y: unused, only present to follow the *fit* signature
        :return: self
        :raises TypeError: if *X* is neither a dataframe nor an array
        """
        fitted = []
        for col in self._split_columns(X):
            est = clone(self.estimator)
            est.fit(col)
            fitted.append(est)
        # Assigned once every column succeeded: a failure in the middle
        # of the loop does not leave a partially trained transformer.
        self.estimators_ = fitted
        return self

    def transform(self, X):
        """
        Applies the trained vectorizers on the columns of *X* and
        concatenates the results horizontally.

        :param X: :class:`pandas.DataFrame` or :class:`numpy.ndarray`
            with as many columns as the data given to *fit*
        :return: the output of the single vectorizer if *X* has one column,
            a dense array built with :func:`numpy.hstack` if every
            vectorizer returned a dense array, otherwise the result of
            :func:`scipy.sparse.hstack`
        :raises TypeError: if *X* is neither a dataframe nor an array
        :raises ValueError: if the number of columns differs from the
            number of trained estimators
        """
        columns = self._split_columns(X)
        if len(self.estimators_) != len(columns):
            raise ValueError(  # pragma: no cover
                "Unexpected number of columns {}, expecting {}".format(
                    len(columns), len(self.estimators_)))
        res = [est.transform(col)
               for est, col in zip(self.estimators_, columns)]
        if len(res) == 1:
            # A single column needs no stacking, the vectorizer output
            # is returned as is.
            return res[0]
        if all(isinstance(r, numpy.ndarray) for r in res):
            return numpy.hstack(res)
        return sparse.hstack(res)
Liens

Contenu

Information

Code source de papierstat.mltricks.text_vectorizer_transformer