"""
@file
@brief Overloads :epkg:`TfidfVectorizer` and :epkg:`CountVectorizer`.
"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
try:
    from sklearn.feature_extraction.text import _VectorizerMixin as VectorizerMixin
except ImportError:  # pragma: no cover
    # scikit-learn < 0.23
    from sklearn.feature_extraction.text import VectorizerMixin


class NGramsMixin(VectorizerMixin):
    """
    Overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to get tuples instead of strings in member `vocabulary_
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
    of :epkg:`TfidfVectorizer` or :epkg:`CountVectorizer`.
    This member contains the list of n-grams used to process documents.
    See @see cl TraceableCountVectorizer and @see cl TraceableTfidfVectorizer
    for examples.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering."""
        # handle stop words: wrap every string token into a 1-tuple
        if tokens is not None:
            new_tokens = []
            for token in tokens:
                new_tokens.append(
                    (token,) if isinstance(token, str) else token)
            tokens = new_tokens

        if stop_words is not None:
            # tokens are 1-tuples at this point,
            # so the filter compares the word each of them holds
            tokens = [w for w in tokens if w[0] not in stop_words]
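        # Illustrative trace (assumed inputs, not from the original source):
        # tokens=["this", "is", "first"] with stop_words={"is"} leaves
        # [("this",), ("first",)] here, each surviving token a 1-tuple.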


        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append

            def space_join(tokens):
                # flattens a mix of strings and tuples
                # into a single tuple of words
                new_tokens = []
                for token in tokens:
                    if isinstance(token, str):
                        new_tokens.append(token)
                    elif isinstance(token, tuple):
                        new_tokens.extend(token)
                    else:
                        raise TypeError(  # pragma: no cover
                            "Unable to build n-grams out of {}.".format(
                                tokens))
                return tuple(new_tokens)
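
            # Illustrative trace (assumed inputs, not from the original
            # source): with original_tokens [("this",), ("is",), ("first",)]
            # and ngram_range=(1, 2), the loop below appends ("this", "is")
            # and ("is", "first"), so vocabulary keys stay tuples of words
            # instead of being joined into a single string.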

            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))
        return tokens


class TraceableCountVectorizer(CountVectorizer, NGramsMixin):
    """
    Inherits from @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to keep more information about n-grams while still producing the same
    outputs as :epkg:`CountVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import CountVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('CountVectorizer from scikit-learn')
        mod1 = CountVectorizer(ngram_range=(1, 2))
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableCountVectorizer from mlinsights')
        mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
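        # Vocabulary keys above are tuples such as ('this', 'is') instead
        # of joined strings such as 'this is' (illustrative values; the
        # printed output is truncated by the [:100] slice).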

    A more unusual example with
    @see cl TraceableTfidfVectorizer shows bigger differences.
    """

    def _word_ngrams(self, tokens, stop_words=None):
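        # Explicit delegation: makes sure the tuple-producing implementation
        # from NGramsMixin is the one used, regardless of how the MRO
        # orders the base classes.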

        return NGramsMixin._word_ngrams(self, tokens=tokens, stop_words=stop_words)


class TraceableTfidfVectorizer(TfidfVectorizer, NGramsMixin):
    """
    Inherits from @see cl NGramsMixin, which overloads method `_word_ngrams
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148>`_
    to keep more information about n-grams while still producing the same
    outputs as :epkg:`TfidfVectorizer`.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer
        from pprint import pformat

        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "Is this the first document?",
            "",
        ]).reshape((4, ))

        print('TfidfVectorizer from scikit-learn')
        mod1 = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern="[a-zA-Z ]{1,4}")
        mod1.fit(corpus)
        print(mod1.transform(corpus).todense()[:2])
        print(pformat(mod1.vocabulary_)[:100])

        print('TraceableTfidfVectorizer from mlinsights')
        mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                        token_pattern="[a-zA-Z ]{1,4}")
        mod2.fit(corpus)
        print(mod2.transform(corpus).todense()[:2])
        print(pformat(mod2.vocabulary_)[:100])
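        # With this token_pattern, tokens may themselves contain spaces, so
        # a bigram joined into a single string becomes ambiguous; the tuple
        # keys keep the boundary between tokens explicit (illustrative note).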

    """

    def _word_ngrams(self, tokens, stop_words=None):
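        # Same explicit delegation as in TraceableCountVectorizer.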

        return NGramsMixin._word_ngrams(self, tokens=tokens, stop_words=stop_words)