"""
@file
@brief Overloads :epkg:`TfidfVectorizer` and :epkg:`CountVectorizer`.
"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
try:
    from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin
except ImportError:  # pragma: no cover
    # scikit-learn < 0.23
    from sklearn.feature_extraction.text import VectorizerMixin


class NGramsMixin(VectorizerMixin):
    """
    Overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to get tuples instead of strings in member `vocabulary_
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
    of :epkg:`TfidfVectorizer` or :epkg:`CountVectorizer`.
    This member contains the list of n-grams used to process documents.
    See @see cl TraceableCountVectorizer and @see cl TraceableTfidfVectorizer
    for examples.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering."""
        # handle stop words: wrap every string token into a 1-tuple
        if tokens is not None:
            new_tokens = []
            for token in tokens:
                new_tokens.append(
                    (token,) if isinstance(token, str) else token)
            tokens = new_tokens

        if stop_words is not None:
            # tokens are 1-tuples at this point,
            # so the filter compares the word each of them holds
            tokens = [w for w in tokens if w[0] not in stop_words]
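        # Illustrative trace (assumed inputs, not from the original source):
        # tokens=["this", "is", "first"] with stop_words={"is"} leaves
        # [("this",), ("first",)] here, each surviving token a 1-tuple.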


        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append

            def space_join(tokens):
                # flattens a mix of strings and tuples
                # into a single tuple of words
                new_tokens = []
                for token in tokens:
                    if isinstance(token, str):
                        new_tokens.append(token)
                    elif isinstance(token, tuple):
                        new_tokens.extend(token)
                    else:
                        raise TypeError(  # pragma: no cover
                            "Unable to build n-grams out of {}.".format(
                                tokens))
                return tuple(new_tokens)
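
            # Illustrative trace (assumed inputs, not from the original
            # source): with original_tokens [("this",), ("is",), ("first",)]
            # and ngram_range=(1, 2), the loop below appends ("this", "is")
            # and ("is", "first"), so vocabulary keys stay tuples of words
            # instead of being joined into a single string.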

            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))
        return tokens


class TraceableCountVectorizer(CountVectorizer, NGramsMixin):
    """
    Inherits from @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to keep more information about n-grams while still producing the same
    outputs as :epkg:`CountVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import CountVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('CountVectorizer from scikit-learn')
        mod1 = CountVectorizer(ngram_range=(1, 2))
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableCountVectorizer from mlinsights')
        mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
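        # Vocabulary keys above are tuples such as ('this', 'is') instead
        # of joined strings such as 'this is' (illustrative values; the
        # printed output is truncated by the [:100] slice).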

    A more unusual example with
    @see cl TraceableTfidfVectorizer shows bigger differences.
    """

    def _word_ngrams(self, tokens, stop_words=None):
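        # Explicit delegation: makes sure the tuple-producing implementation
        # from NGramsMixin is the one used, regardless of how the MRO
        # orders the base classes.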

        return NGramsMixin._word_ngrams(self, tokens=tokens, stop_words=stop_words)


class TraceableTfidfVectorizer(TfidfVectorizer, NGramsMixin):
    """
    Inherits from @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to keep more information about n-grams while still producing the same
    outputs as :epkg:`TfidfVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('TfidfVectorizer from scikit-learn')
        mod1 = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern="[a-zA-Z ]{1,4}")
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableTfidfVectorizer from mlinsights')
        mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                        token_pattern="[a-zA-Z ]{1,4}")
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
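        # With this token_pattern, tokens may themselves contain spaces, so
        # a bigram joined into a single string becomes ambiguous; the tuple
        # keys keep the boundary between tokens explicit (illustrative note).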

    """

    def _word_ngrams(self, tokens, stop_words=None):
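        # Same explicit delegation as in TraceableCountVectorizer.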

        return NGramsMixin._word_ngrams(self, tokens=tokens, stop_words=stop_words)