# Source code for mlinsights.mlmodel.sklearn_text

"""
Overloads :epkg:`TfidfVectorizer` and :epkg:`CountVectorizer`.


:githublink:`%|py|5`
"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
try:
    from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin
except ImportError:  # pragma: no cover
    # scikit-learn < 0.23
    from sklearn.feature_extraction.text import VectorizerMixin


class NGramsMixin(VectorizerMixin):
    """
    Overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to get tuples instead of strings in member `vocabulary_
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
    of :epkg:`TfidfVectorizer` or :epkg:`CountVectorizer`.
    It contains the list of n-grams used to process documents.
    See :class:`TraceableCountVectorizer
    <mlinsights.mlmodel.sklearn_text.TraceableCountVectorizer>` and
    :class:`TraceableTfidfVectorizer
    <mlinsights.mlmodel.sklearn_text.TraceableTfidfVectorizer>` for example.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """
        Turns tokens into a sequence of n-grams after stop words filtering.

        :param tokens: list of tokens, each one either a string or a
            tuple of strings (an already built n-gram)
        :param stop_words: optional collection of stop words to remove,
            or None to keep every token
        :return: list of n-grams, each one represented as a tuple of words
        """
        # Normalize every token to a tuple so that the resulting
        # vocabulary keeps the individual words instead of a joined string.
        if tokens is not None:
            tokens = [(token,) if isinstance(token, str) else token
                      for token in tokens]
        if stop_words is not None:
            # Tokens are tuples at this point. Drop unigrams made of a
            # stop word. Bug fix: the previous code compared the tuple
            # itself against *stop_words* (never a match, so nothing was
            # filtered) and wrapped each token into yet another tuple,
            # corrupting every token whenever stop words were given.
            tokens = [w for w in tokens
                      if not (len(w) == 1 and w[0] in stop_words)]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams,
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []
            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append

            def space_join(toks):
                # Flattens a sequence of strings and tuples of strings
                # into a single tuple of words.
                new_tokens = []
                for token in toks:
                    if isinstance(token, str):
                        new_tokens.append(token)
                    elif isinstance(token, tuple):
                        new_tokens.extend(token)
                    else:
                        raise TypeError(  # pragma: no cover
                            "Unable to build a n-grams out of {}.".format(
                                toks))
                return tuple(new_tokens)

            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))
        return tokens
class TraceableCountVectorizer(CountVectorizer, NGramsMixin):
    """
    Inherits from :class:`NGramsMixin
    <mlinsights.mlmodel.sklearn_text.NGramsMixin>` which overloads method
    `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to keep more information about n-grams but still produces the same
    outputs than :epkg:`CountVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import CountVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('CountVectorizer from scikit-learn')
        mod1 = CountVectorizer(ngram_range=(1, 2))
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableCountVectorizer from scikit-learn')
        mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])

    A weirder example with :class:`TraceableTfidfVectorizer
    <mlinsights.mlmodel.sklearn_text.TraceableTfidfVectorizer>`
    shows more differences.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Dispatch explicitly to the mixin so that its tuple-based
        # n-grams replace the string-based ones of *CountVectorizer*.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)
class TraceableTfidfVectorizer(TfidfVectorizer, NGramsMixin):
    """
    Inherits from :class:`NGramsMixin
    <mlinsights.mlmodel.sklearn_text.NGramsMixin>` which overloads method
    `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to keep more information about n-grams but still produces the same
    outputs than :epkg:`TfidfVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('TfidfVectorizer from scikit-learn')
        mod1 = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern="[a-zA-Z ]{1,4}")
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableTfidfVectorizer from scikit-learn')
        mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                        token_pattern="[a-zA-Z ]{1,4}")
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Dispatch explicitly to the mixin so that its tuple-based
        # n-grams replace the string-based ones of *TfidfVectorizer*.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)