# Source code of papierstat.mltricks.text_vectorizer_transformer

"""
Implements TextVectorizerTransformer.


:githublink:`%|py|5`
"""
import numpy
from scipy import sparse
import pandas
from sklearn.base import BaseEstimator, TransformerMixin, clone


class TextVectorizerTransformer(BaseEstimator, TransformerMixin):
    """
    Wraps *TfIdfVectorizer* or *CountVectorizer* so that it can be used
    in a pipeline on a table of text columns. A separate clone of the
    estimator is trained on every column and the per-column outputs are
    stacked horizontally.

    Parameters
    ----------

    estimator: estimator to fit on every column

    Attributes
    ----------

    estimators_: trained estimators, one per column (set by :meth:`fit`)
    """

    def __init__(self, estimator):
        """
        :param estimator: *TfIdfVectorizer* or *CountVectorizer*
        """
        self.estimator = estimator

    @staticmethod
    def _split_columns(X):
        """
        Validates *X* and returns its columns as a list.

        The type check is done here, before any attribute of *X* is
        read, so that unsupported inputs (a list for example) fail with
        the intended ``TypeError`` instead of an unrelated
        ``AttributeError`` on ``X.shape``.

        :param X: :class:`pandas.DataFrame` or two-dimensional
            :class:`numpy.ndarray`
        :return: list of columns, ``X.iloc[:, i]`` for a dataframe,
            ``X[:, i]`` for an array
        :raises TypeError: if *X* is neither a dataframe nor an array
        :raises ValueError: if *X* is an array which is not
            two-dimensional
        """
        if isinstance(X, pandas.DataFrame):
            return [X.iloc[:, i] for i in range(X.shape[1])]
        if isinstance(X, numpy.ndarray):
            if X.ndim != 2:
                raise ValueError(
                    "X must be two-dimensional, got {} dimension(s).".format(
                        X.ndim))
            return [X[:, i] for i in range(X.shape[1])]
        raise TypeError(  # pragma: no cover
            "X must be an array or a dataframe.")

    def fit(self, X, y=None):
        """
        Trains an estimator on every column.

        :param X: :class:`pandas.DataFrame` or two-dimensional
            :class:`numpy.ndarray`, one text column per feature
        :param y: unused, only present to follow the *fit* signature
        :return: self
        :raises TypeError: if *X* is neither a dataframe nor an array
        :raises ValueError: if *X* is an array which is not
            two-dimensional
        """
        columns = self._split_columns(X)
        trained = []
        for col in columns:
            # Every column gets its own unfitted copy of the estimator.
            est = clone(self.estimator)
            est.fit(col)
            trained.append(est)
        # Assigned only once every column was trained successfully.
        self.estimators_ = trained
        return self

    def transform(self, X):
        """
        Applies the vectorizer on X.

        :param X: :class:`pandas.DataFrame` or two-dimensional
            :class:`numpy.ndarray` with as many columns as the data
            given to :meth:`fit`
        :return: the output of the single estimator if there is only
            one column, otherwise the per-column outputs stacked
            horizontally, with ``numpy.hstack`` if all of them are
            :class:`numpy.ndarray`, with ``scipy.sparse.hstack``
            otherwise
        :raises TypeError: if *X* is neither a dataframe nor an array
        :raises ValueError: if *X* is not two-dimensional or if its
            number of columns differs from the number of trained
            estimators
        """
        columns = self._split_columns(X)
        if len(self.estimators_) != len(columns):
            raise ValueError(  # pragma: no cover
                "Unexpected number of columns {}, expecting {}".format(
                    len(columns), len(self.estimators_)))

        res = [est.transform(col)
               for est, col in zip(self.estimators_, columns)]

        if len(res) == 1:
            # Nothing to stack, the output is returned unchanged.
            return res[0]
        if all(isinstance(r, numpy.ndarray) for r in res):
            return numpy.hstack(res)
        return sparse.hstack(res)