Traceable n-grams with tf-idf#
Links: notebook
, html, PDF
, python
, slides, GitHub
The notebook looks into the way n-grams are stored in CountVectorizer and TfidfVectorizer and shows how the current storage (scikit-learn <= 0.21) is ambiguous in some cases.
# Render a clickable table of contents for this notebook (jyquickhelper helper).
from jyquickhelper import add_notebook_menu
add_notebook_menu()
Example with CountVectorizer#
scikit-learn version#
# Baseline: scikit-learn's CountVectorizer on a tiny corpus,
# counting unigrams and bigrams (ngram_range=(1, 2)).
import numpy
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]
corpus = numpy.array(documents).reshape((4, ))
mod1 = CountVectorizer(ngram_range=(1, 2))
mod1.fit(corpus)
# Dense count matrix: one row per document, one column per n-gram.
mod1.transform(corpus).todense()
matrix([[1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
[2, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0],
[1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)
# Learned vocabulary: each n-gram is stored as a single whitespace-joined
# string, mapped to its column index in the matrix above.
mod1.vocabulary_
{'this': 12,
'is': 4,
'the': 9,
'first': 2,
'document': 0,
'this is': 14,
'is the': 5,
'the first': 10,
'first document': 3,
'second': 7,
'this document': 13,
'document is': 1,
'the second': 11,
'second document': 8,
'is this': 6,
'this the': 15}
# Same experiment with mlinsights' TraceableCountVectorizer,
# which keeps each n-gram as a tuple of tokens instead of one joined string.
import numpy
from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer

documents = [
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]
corpus = numpy.array(documents).reshape((4, ))
mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
mod2.fit(corpus)
# The counts are identical to CountVectorizer's; only the vocabulary
# representation differs.
mod2.transform(corpus).todense()
matrix([[1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
[2, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0],
[1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)
# Vocabulary of the traceable vectorizer: keys are tuples of tokens,
# so n-gram boundaries are unambiguous.
mod2.vocabulary_
{('this',): 12,
('is',): 4,
('the',): 9,
('first',): 2,
('document',): 0,
('this', 'is'): 14,
('is', 'the'): 5,
('the', 'first'): 10,
('first', 'document'): 3,
('second',): 7,
('this', 'document'): 13,
('document', 'is'): 1,
('the', 'second'): 11,
('second', 'document'): 8,
('is', 'this'): 6,
('this', 'the'): 15}
The new class does exactly the same thing but keeps n-grams in a more explicit form. The original form, a single joined string, is sometimes ambiguous, as the next example shows.
Funny example with TfidfVectorizer#
scikit-learn version#
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]
corpus = numpy.array(documents).reshape((4, ))
# The custom token pattern also matches spaces, so individual tokens may
# contain spaces — joining them with a space to form an n-gram string
# then becomes ambiguous.
mod1 = TfidfVectorizer(ngram_range=(1, 2),
                       token_pattern="[a-zA-Z ]{1,4}")
mod1.fit(corpus)
mod1.transform(corpus).todense()
matrix([[0. , 0. , 0.32940523, 0.32940523, 0. ,
0. , 0. , 0. , 0.25970687, 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0. , 0. , 0. , 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0.25970687, 0.25970687, 0.25970687, 0. ,
0.32940523, 0. , 0. ],
[0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0.24528087, 0.24528087, 0. , 0. ,
0.24528087, 0.24528087, 0. , 0. , 0. ,
0. , 0. , 0. , 0.24528087, 0. ,
0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0. , 0. , 0.19338226, 0.24528087,
0. , 0.24528087, 0.24528087],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.25453384, 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0.3228439 ,
0.3228439 , 0.3228439 , 0.3228439 , 0. , 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0. ,
0. , 0.25453384, 0.25453384, 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ]])
# Vocabulary with space-containing tokens: the joined strings no longer
# reveal where one token ends and the next begins.
mod1.vocabulary_
{'this': 28,
' is ': 2,
'the ': 26,
'firs': 12,
't do': 22,
'cume': 8,
'nt': 19,
'this is ': 30,
' is the ': 3,
'the firs': 27,
'firs t do': 13,
't do cume': 23,
'cume nt': 9,
' doc': 0,
'umen': 31,
't is': 24,
' the': 6,
' sec': 4,
'ond ': 20,
'docu': 10,
'ment': 18,
'this doc': 29,
' doc umen': 1,
'umen t is': 32,
't is the': 25,
' the sec': 7,
' sec ond ': 5,
'ond docu': 21,
'docu ment': 11,
'is t': 16,
'his ': 14,
'is t his ': 17,
'his the ': 15}
mlinsights version#
# Traceable counterpart with the exact same parameters; reuses the
# corpus defined above.
from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer

mod2 = TraceableTfidfVectorizer(token_pattern="[a-zA-Z ]{1,4}",
                                ngram_range=(1, 2))
mod2.fit(corpus)
# Same tf-idf values as TfidfVectorizer; only the vocabulary keys differ.
mod2.transform(corpus).todense()
matrix([[0. , 0. , 0.32940523, 0.32940523, 0. ,
0. , 0. , 0. , 0.25970687, 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0. , 0. , 0. , 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0.25970687, 0.25970687, 0.25970687, 0. ,
0.32940523, 0. , 0. ],
[0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0.24528087, 0.24528087, 0. , 0. ,
0.24528087, 0.24528087, 0. , 0. , 0. ,
0. , 0. , 0. , 0.24528087, 0. ,
0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0. , 0. , 0.19338226, 0.24528087,
0. , 0.24528087, 0.24528087],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.25453384, 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0.3228439 ,
0.3228439 , 0.3228439 , 0.3228439 , 0. , 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0. ,
0. , 0.25453384, 0.25453384, 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ]])
# Tuple-keyed vocabulary: token boundaries are preserved even though
# the tokens themselves contain spaces.
mod2.vocabulary_
{('this',): 28,
(' is ',): 2,
('the ',): 26,
('firs',): 12,
('t do',): 22,
('cume',): 8,
('nt',): 19,
('this', ' is '): 30,
(' is ', 'the '): 3,
('the ', 'firs'): 27,
('firs', 't do'): 13,
('t do', 'cume'): 23,
('cume', 'nt'): 9,
(' doc',): 0,
('umen',): 31,
('t is',): 24,
(' the',): 6,
(' sec',): 4,
('ond ',): 20,
('docu',): 10,
('ment',): 18,
('this', ' doc'): 29,
(' doc', 'umen'): 1,
('umen', 't is'): 32,
('t is', ' the'): 25,
(' the', ' sec'): 7,
(' sec', 'ond '): 5,
('ond ', 'docu'): 21,
('docu', 'ment'): 11,
('is t',): 16,
('his ',): 14,
('is t', 'his '): 17,
('his ', 'the '): 15}
As you can see, the original 30th n-gram 't is the'
is a little bit ambiguous. It is in fact ('t is', ' the')
as the TraceableTfidfVectorizer lets you know. The original form could have
been ('t', 'is the')
, ('t is', ' the')
, ('t is ', ' the')
,
('t is ', 'the')
, ('t', 'is ', 'the')
… The regular
expression gives some insight, but not enough information to
easily recover the right split.