"""
@file
@brief Overloads :epkg:`TfidfVectorizer` and :epkg:`CountVectorizer`.
"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
try:
    from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin
except ImportError:  # pragma: no cover
    # scikit-learn < 0.23
    from sklearn.feature_extraction.text import VectorizerMixin
class NGramsMixin(VectorizerMixin):
    """
    Overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to get tuples instead of string in member `vocabulary_
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_.
    of :epkg:`TfidfVectorizer` or :epkg:`CountVectorizer`.
    It contains the list of n-grams used to process documents.
    See @see cl TraceableCountVectorizer and @see cl TraceableTfidfVectorizer
    for example.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """
        Turns tokens into a sequence of n-grams after stop words filtering.

        @param      tokens      sequence of tokens, strings or tuples of words
        @param      stop_words  container of stop words or None
        @return     list of n-grams, every n-gram is a tuple of words
        """
        # handle stop words
        if tokens is not None:
            # every token becomes a tuple of words so that n-grams can be
            # kept as tuples instead of space-joined strings
            new_tokens = []
            for token in tokens:
                new_tokens.append(
                    (token,) if isinstance(token, str) else token)
            tokens = new_tokens

        if stop_words is not None:
            # tokens are tuples at this point: a one-word token is dropped
            # when its word is a stop word. The previous code compared the
            # tuple itself against *stop_words* (never matching, so stop
            # words were kept) and wrapped the tuple into yet another
            # tuple, producing nested tuples downstream.
            tokens = [w for w in tokens
                      if len(w) != 1 or w[0] not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append

            def space_join(tokens):
                # flattens a window of tokens (strings or tuples of words)
                # into a single tuple of words
                new_tokens = []
                for token in tokens:
                    if isinstance(token, str):
                        new_tokens.append(token)
                    elif isinstance(token, tuple):
                        new_tokens.extend(token)
                    else:
                        raise TypeError(  # pragma: no cover
                            "Unable to build a n-grams out of {}.".format(
                                token))
                return tuple(new_tokens)

            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))
        return tokens
class TraceableCountVectorizer(CountVectorizer, NGramsMixin):
    """
    A :epkg:`CountVectorizer` which keeps n-grams as tuples of words in
    member `vocabulary_` instead of concatenated strings, while still
    producing the same outputs than :epkg:`CountVectorizer`. The change
    comes from @see cl NGramsMixin which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import CountVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('CountVectorizer from scikit-learn')
        mod1 = CountVectorizer(ngram_range=(1, 2))
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableCountVectorizer from scikit-learn')
        mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])

    A weirder example with
    @see cl TraceableTfidfVectorizer shows more differences.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Dispatch explicitly to NGramsMixin: with bases
        # (CountVectorizer, NGramsMixin), a plain super() call would
        # resolve to CountVectorizer._word_ngrams first.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)
class TraceableTfidfVectorizer(TfidfVectorizer, NGramsMixin):
    """
    A :epkg:`TfidfVectorizer` which keeps n-grams as tuples of words in
    member `vocabulary_` instead of concatenated strings, while still
    producing the same outputs than :epkg:`TfidfVectorizer`. The change
    comes from @see cl NGramsMixin which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('TfidfVectorizer from scikit-learn')
        mod1 = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern="[a-zA-Z ]{1,4}")
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableTfidfVectorizer from scikit-learn')
        mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                        token_pattern="[a-zA-Z ]{1,4}")
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
    """

    def _word_ngrams(self, tokens, stop_words=None):
        # Dispatch explicitly to NGramsMixin: with bases
        # (TfidfVectorizer, NGramsMixin), a plain super() call would
        # resolve to TfidfVectorizer._word_ngrams first.
        return NGramsMixin._word_ngrams(self, tokens, stop_words)