Traceable n-grams with tf-idf#


The notebook looks into the way n-grams are stored in CountVectorizer and TfidfVectorizer and shows how the current storage (scikit-learn <= 0.21) is ambiguous in some cases.

Example with CountVectorizer#

scikit-learn version#

import numpy
from sklearn.feature_extraction.text import CountVectorizer

corpus = numpy.array([
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]).reshape((4, ))

mod1 = CountVectorizer(ngram_range=(1, 2))
mod1.fit(corpus)
mod1.transform(corpus).todense()
matrix([[1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
        [2, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0],
        [1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)
mod1.vocabulary_
{'this': 12,
 'is': 4,
 'the': 9,
 'first': 2,
 'document': 0,
 'this is': 14,
 'is the': 5,
 'the first': 10,
 'first document': 3,
 'second': 7,
 'this document': 13,
 'document is': 1,
 'the second': 11,
 'second document': 8,
 'is this': 6,
 'this the': 15}
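
For reference, scikit-learn builds these vocabulary keys by joining consecutive tokens with a single space (internally, _word_ngrams uses " ".join). A minimal sketch, assuming the default token pattern, reproduces the bigram keys above:

import re

doc = "this is the first document."
# default CountVectorizer token pattern: words of two or more characters
tokens = re.findall(r"(?u)\b\w\w+\b", doc)
# n-gram keys are consecutive tokens joined with a single space
bigrams = [" ".join(pair) for pair in zip(tokens, tokens[1:])]
print(tokens)    # ['this', 'is', 'the', 'first', 'document']
print(bigrams)   # ['this is', 'is the', 'the first', 'first document']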
mlinsights version#

import numpy
from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer

corpus = numpy.array([
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]).reshape((4, ))

mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
mod2.fit(corpus)
mod2.transform(corpus).todense()
matrix([[1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
        [2, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0],
        [1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)
mod2.vocabulary_
{('this',): 12,
 ('is',): 4,
 ('the',): 9,
 ('first',): 2,
 ('document',): 0,
 ('this', 'is'): 14,
 ('is', 'the'): 5,
 ('the', 'first'): 10,
 ('first', 'document'): 3,
 ('second',): 7,
 ('this', 'document'): 13,
 ('document', 'is'): 1,
 ('the', 'second'): 11,
 ('second', 'document'): 8,
 ('is', 'this'): 6,
 ('this', 'the'): 15}

The new class does exactly the same thing but keeps n-grams in a more explicit form. The original string form is sometimes ambiguous, as the next example shows.
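A quick check, a sketch rather than part of the original notebook, confirms the two vectorizers are equivalent: the matrices match (the fourth row is zero because the fourth document is empty) and joining each tuple key with a single space recovers the string key used by CountVectorizer.

assert numpy.array_equal(mod1.transform(corpus).todense(),
                         mod2.transform(corpus).todense())
# rebuilding the string keys from the tuples gives back the original vocabulary
assert {" ".join(k): v for k, v in mod2.vocabulary_.items()} == mod1.vocabulary_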

Funny example with TfidfVectorizer#

scikit-learn version#

import numpy
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = numpy.array([
    "This is the first document.",
    "This document is the second document.",
    "Is this the first document?",
    "",
]).reshape((4, ))

mod1 = TfidfVectorizer(ngram_range=(1, 2),
                       token_pattern="[a-zA-Z ]{1,4}")
mod1.fit(corpus)
mod1.transform(corpus).todense()
matrix([[0.        , 0.        , 0.32940523, 0.32940523, 0.        ,
         0.        , 0.        , 0.        , 0.25970687, 0.25970687,
         0.        , 0.        , 0.25970687, 0.25970687, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.25970687,
         0.        , 0.        , 0.25970687, 0.25970687, 0.        ,
         0.        , 0.25970687, 0.25970687, 0.25970687, 0.        ,
         0.32940523, 0.        , 0.        ],
        [0.24528087, 0.24528087, 0.        , 0.        , 0.24528087,
         0.24528087, 0.24528087, 0.24528087, 0.        , 0.        ,
         0.24528087, 0.24528087, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.24528087, 0.        ,
         0.24528087, 0.24528087, 0.        , 0.        , 0.24528087,
         0.24528087, 0.        , 0.        , 0.19338226, 0.24528087,
         0.        , 0.24528087, 0.24528087],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.25453384, 0.25453384,
         0.        , 0.        , 0.25453384, 0.25453384, 0.3228439 ,
         0.3228439 , 0.3228439 , 0.3228439 , 0.        , 0.25453384,
         0.        , 0.        , 0.25453384, 0.25453384, 0.        ,
         0.        , 0.25453384, 0.25453384, 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ]])
mod1.vocabulary_
{'this': 28,
 ' is ': 2,
 'the ': 26,
 'firs': 12,
 't do': 22,
 'cume': 8,
 'nt': 19,
 'this  is ': 30,
 ' is  the ': 3,
 'the  firs': 27,
 'firs t do': 13,
 't do cume': 23,
 'cume nt': 9,
 ' doc': 0,
 'umen': 31,
 't is': 24,
 ' the': 6,
 ' sec': 4,
 'ond ': 20,
 'docu': 10,
 'ment': 18,
 'this  doc': 29,
 ' doc umen': 1,
 'umen t is': 32,
 't is  the': 25,
 ' the  sec': 7,
 ' sec ond ': 5,
 'ond  docu': 21,
 'docu ment': 11,
 'is t': 16,
 'his ': 14,
 'is t his ': 17,
 'his  the ': 15}
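
The odd entries become clearer once you look at the raw tokens: the pattern [a-zA-Z ]{1,4} greedily grabs chunks of up to four letters or spaces, so a token may begin or end with a space. A small sketch (re.findall is essentially what build_tokenizer returns, and build_analyzer adds the n-gram join):

import re

# tokens produced by the pattern on the lowercased first document
print(re.findall("[a-zA-Z ]{1,4}", "this is the first document."))
# ['this', ' is ', 'the ', 'firs', 't do', 'cume', 'nt']
print(mod1.build_analyzer()("This is the first document."))
# the unigrams above followed by bigrams joined with a single space,
# e.g. 'this  is ', ' is  the ', ...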

mlinsights version#

from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer

mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2),
                                token_pattern="[a-zA-Z ]{1,4}")
mod2.fit(corpus)
mod2.transform(corpus).todense()
matrix([[0.        , 0.        , 0.32940523, 0.32940523, 0.        ,
         0.        , 0.        , 0.        , 0.25970687, 0.25970687,
         0.        , 0.        , 0.25970687, 0.25970687, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.25970687,
         0.        , 0.        , 0.25970687, 0.25970687, 0.        ,
         0.        , 0.25970687, 0.25970687, 0.25970687, 0.        ,
         0.32940523, 0.        , 0.        ],
        [0.24528087, 0.24528087, 0.        , 0.        , 0.24528087,
         0.24528087, 0.24528087, 0.24528087, 0.        , 0.        ,
         0.24528087, 0.24528087, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.24528087, 0.        ,
         0.24528087, 0.24528087, 0.        , 0.        , 0.24528087,
         0.24528087, 0.        , 0.        , 0.19338226, 0.24528087,
         0.        , 0.24528087, 0.24528087],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.25453384, 0.25453384,
         0.        , 0.        , 0.25453384, 0.25453384, 0.3228439 ,
         0.3228439 , 0.3228439 , 0.3228439 , 0.        , 0.25453384,
         0.        , 0.        , 0.25453384, 0.25453384, 0.        ,
         0.        , 0.25453384, 0.25453384, 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ]])
mod2.vocabulary_
{('this',): 28,
 (' is ',): 2,
 ('the ',): 26,
 ('firs',): 12,
 ('t do',): 22,
 ('cume',): 8,
 ('nt',): 19,
 ('this', ' is '): 30,
 (' is ', 'the '): 3,
 ('the ', 'firs'): 27,
 ('firs', 't do'): 13,
 ('t do', 'cume'): 23,
 ('cume', 'nt'): 9,
 (' doc',): 0,
 ('umen',): 31,
 ('t is',): 24,
 (' the',): 6,
 (' sec',): 4,
 ('ond ',): 20,
 ('docu',): 10,
 ('ment',): 18,
 ('this', ' doc'): 29,
 (' doc', 'umen'): 1,
 ('umen', 't is'): 32,
 ('t is', ' the'): 25,
 (' the', ' sec'): 7,
 (' sec', 'ond '): 5,
 ('ond ', 'docu'): 21,
 ('docu', 'ment'): 11,
 ('is t',): 16,
 ('his ',): 14,
 ('is t', 'his '): 17,
 ('his ', 'the '): 15}

As you can see, the n-gram 't is  the' (feature index 25) is a little bit ambiguous. It is in fact ('t is', ' the'), as the TraceableTfidfVectorizer lets you know. Without knowing how the tokens were joined, the original string could have come from ('t', 'is  the'), ('t is', '  the'), ('t is ', ' the'), ('t is  ', 'the'), ('t', 'is  ', 'the')… The regular expression gives some insight but no information that can easily be used to guess the right split.
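
Because the keys are built with a single-space join, the collision can be made explicit: several distinct token splits, all consistent with the two spaces in the key, produce exactly the same string (the tuples below are hypothetical splits, not actual tokens):

candidates = [("t", "is  the"), ("t is", " the"), ("t is ", "the")]
print({" ".join(c) for c in candidates})
# {'t is  the'} -- three different splits, one identical string key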