# SPDX-License-Identifier: Apache-2.0


import warnings
from collections import OrderedDict, Counter
import numpy as np
from ..common._apply_operation import (
    apply_cast, apply_reshape, apply_identity)
from ..common._registration import register_converter
from ..common._topology import Scope, Operator
from ..common._container import ModelComponentContainer
from ..common.data_types import guess_proto_type, StringTensorType
from ..proto import onnx_proto
from ..algebra.onnx_ops import OnnxStringNormalizer


def _intelligent_split(text, op, tokenizer, existing):
    """
    Splits text into tokens. *scikit-learn*
    merges tokens with ``' '.join(tokens)``
    to name n-grams, so ``'a  b'`` could be ``('a ', 'b')``
    or ``('a', ' b')``.
    See `ngram sequence
    <https://github.com/scikit-learn/scikit-learn/blob/master/
    sklearn/feature_extraction/text.py#L169>`_.
    """
    if op.analyzer == 'word':
        if op.ngram_range[0] == op.ngram_range[1] == 1:
            spl = [text]
        elif op.ngram_range[0] == 1 and len(text) >= 2:
            # Every element is in the vocabulary.
            # Naive method: split on whitespace, then re-attach
            # leading/trailing spaces to the first/last token.
            p1 = len(text) - len(text.lstrip())
            p2_ = len(text) - len(text.rstrip())
            if p2_ == 0:
                p2 = len(text)
            else:
                p2 = -p2_
            spl = text[p1:p2].split()
            if len(spl) <= 1:
                spl = [text]
            else:
                spl[0] = " " * p1 + spl[0]
                spl[-1] = spl[-1] + " " * p2_
            exc = None
            if len(spl) == 1:
                pass
            elif len(spl) == 2:
                if (spl[0] not in op.vocabulary_ or
                        spl[1] not in op.vocabulary_):
                    # This is necessarily a single token.
                    spl = [text]
                elif spl[0] in op.vocabulary_ and spl[1] in op.vocabulary_:
                    # Ambiguity: "w1 w2" can be either a 2-gram
                    # or a single token containing a space.
                    # Usually, ' ' is not part of any token.
                    pass
            elif len(spl) == 3:
                stok = (all([s in op.vocabulary_ for s in spl]), spl)
                spl12 = (spl[2] in op.vocabulary_ and
                         (spl[0] + ' ' + spl[1]) in op.vocabulary_,
                         [spl[0] + ' ' + spl[1], spl[2]])
                spl23 = (spl[0] in op.vocabulary_ and
                         (spl[1] + ' ' + spl[2]) in op.vocabulary_,
                         [spl[0], spl[1] + ' ' + spl[2]])
                c = Counter(map(lambda t: t[0], [stok, spl12, spl23]))
                if c.get(True, -1) == 0:
                    spl = [text]
                found = [el[1] for el in [stok, spl12, spl23] if el[0]]
                if len(found) == 1:
                    spl = found[0]
                elif len(found) == 0:
                    spl = [text]
                elif stok[0]:
                    # By default, we assume the token is just the sum of
                    # single words.
                    pass
                else:
                    exc = (
                        "More than one decomposition in tokens: [" +
                        ", ".join(map(lambda t: "-".join(t), found)) + "].")
            elif any(map(lambda g: g in op.vocabulary_, spl)):
                # TODO: handle this case with an algorithm
                # which is able to break a string into
                # known substrings.
                exc = "Unable to identify tokens in n-grams."
            if exc:
                raise RuntimeError(
                    "Unable to split n-grams '{}' into tokens. "
                    "{} This happens when a token contain "
                    "spaces. Token '{}' may be a token or a n-gram '{}'."
                    "".format(text, exc, text, spl))
        else:
            # We reuse the tokenizer hoping that it will clear
            # ambiguities, but this might be slow.
            spl = tokenizer(text)
    else:
        spl = list(text)

    spl = tuple(spl)
    if spl in existing:
        raise RuntimeError(
            f"The converter cannot guess how to split expression "
            f"{text!r} into tokens. This case happens when tokens have "
            f"spaces.")
    if (op.ngram_range[0] == 1 and
            (len(op.ngram_range) == 1 or op.ngram_range[1] > 1)):
        # Every token of the n-gram should exist in the vocabulary.
        for g in spl:
            if g not in op.vocabulary_:
                raise RuntimeError(
                    "Unable to split n-grams '{}' into tokens {} "
                    "existing in the vocabulary. Token '{}' does not "
                    "exist in the vocabulary."
                    ".".format(text, spl, g))
    existing.add(spl)
    return spl
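

# A small illustration (not used by the converter) of the ambiguity that
# _intelligent_split has to resolve: scikit-learn names n-grams by joining
# tokens with a single space, so a vocabulary key such as 'machine learning'
# may be the 2-gram ('machine', 'learning') or, with a custom token_pattern,
# a single token containing a space. This helper is hypothetical and only
# shows the vocabulary layout.
def _example_ngram_vocabulary():
    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(ngram_range=(1, 2)).fit(["machine learning rocks"])
    # Keys mix unigrams and space-joined bigrams:
    # ['learning', 'learning rocks', 'machine', 'machine learning', 'rocks']
    return sorted(vect.vocabulary_)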


def convert_sklearn_text_vectorizer(
        scope: Scope, operator: Operator,
        container: ModelComponentContainer):
    """
    Converters for class `TfidfVectorizer
    <https://scikit-learn.org/stable/modules/generated/
    sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
    The current implementation is a work in progress and the ONNX version
    does not produce the exact same results. The converter lets the user
    change some of its parameters.

    Additional options
    ------------------

    tokenexp: string
        The default will change to true in version 1.6.0.
        The tokenizer splits into words using this regular
        expression or the regular expression specified by
        *scikit-learn* if the value is an empty string.
        See also note below.
        Default value: None
    separators: list of separators
        These separators are used to split a string into words.
        Option *separators* is ignored if option *tokenexp* is not None.
        Default value: ``[' ', '[.]', '\\\\?', ',', ';', ':', '\\\\!']``.

    Example (from :ref:`l-example-tfidfvectorizer`):

    ::

        seps = {TfidfVectorizer: {"separators": [
            ' ', '[.]', '\\\\?', ',', ';', ':', '!',
            '\\\\(', '\\\\)', '\\n', '\\\\"', "'", "-",
            "\\\\[", "\\\\]", "@"]}}
        model_onnx = convert_sklearn(
            pipeline, "tfidf",
            initial_types=[("input", StringTensorType([None, 2]))],
            options=seps)

    The default regular expression of the tokenizer is
    ``(?u)\\\\b\\\\w\\\\w+\\\\b`` (see `re
    <https://docs.python.org/3/library/re.html>`_).
    This expression may not be supported by the library handling the
    backend. `onnxruntime <https://github.com/Microsoft/onnxruntime>`_
    uses `re2 <https://github.com/google/re2>`_. You may need to switch
    to a custom tokenizer based on
    `python wrapper for re2 <https://pypi.org/project/re2/>`_
    or its sources `pyre2 <https://github.com/facebook/pyre2>`_
    (`syntax <https://github.com/google/re2/blob/master/doc/syntax.txt>`_).
    If the regular expression is not specified and if
    the instance of TfidfVectorizer is using the default
    pattern ``(?u)\\\\b\\\\w\\\\w+\\\\b``, it is replaced by
    ``[a-zA-Z0-9_]+``. Any other case has to be
    manually handled.

    Regular expression ``[^\\\\\\\\n]`` is used to split
    a sentence into characters (and not words)
    if ``analyzer=='char'``. The mode ``analyzer=='char_wb'``
    is not implemented.

    .. versionchanged:: 1.6
        Parameters have been renamed: *sep* into *separators*,
        *regex* into *tokenexp*.
    """  # noqa
    op = operator.raw_operator
    if (container.target_opset is not None and
            container.target_opset < 9):
        raise RuntimeError(
            "Converter for '{}' only works for opset >= 9."
            "".format(op.__class__.__name__))
    if op.analyzer == "char_wb":
        raise NotImplementedError(
            "CountVectorizer cannot be converted, "
            "only analyzer='word' is fully supported. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues.")
    if op.analyzer == "char":
        warnings.warn(
            "The conversion of CountVectorizer may not work, "
            "only analyzer='word' is fully supported. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues.",
            UserWarning)
    if op.strip_accents is not None:
        raise NotImplementedError(
            "CountVectorizer cannot be converted, "
            "only strip_accents=None is supported. "
" "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") options = container.get_options( op, dict(separators="DEFAULT", tokenexp=None, nan=False, keep_empty_string=False)) if set(options) != {'separators', 'tokenexp', 'nan', 'keep_empty_string'}: raise RuntimeError("Unknown option {} for {}".format( set(options) - {'separators'}, type(op))) if op.analyzer == 'word': default_pattern = '(?u)\\b\\w\\w+\\b' if options['separators'] == "DEFAULT" and options['tokenexp'] is None: regex = op.token_pattern if regex == default_pattern: regex = '[a-zA-Z0-9_]+' default_separators = None elif options['tokenexp'] is not None: if options['tokenexp']: regex = options['tokenexp'] else: regex = op.token_pattern if regex == default_pattern: regex = '[a-zA-Z0-9_]+' default_separators = None else: regex = None default_separators = options['separators'] else: if options['separators'] != 'DEFAULT': raise RuntimeError("Option separators has no effect " "if analyser != 'word'.") regex = options['tokenexp'] if options['tokenexp'] else '.' default_separators = None if op.preprocessor is not None: raise NotImplementedError( "Custom preprocessor cannot be converted into ONNX. " "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") if op.tokenizer is not None: raise NotImplementedError( "Custom tokenizer cannot be converted into ONNX. " "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") if op.strip_accents is not None: raise NotImplementedError( "Operator StringNormalizer cannot remove accents. " "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") if hasattr(op, "stop_words_"): stop_words = op.stop_words_ | ( set(op.stop_words) if op.stop_words else set()) else: stop_words = set() for w in stop_words: if not isinstance(w, str): raise TypeError( f"One stop word is not a string {w!r} " f"in stop_words={stop_words}.") if op.lowercase or stop_words: if len(operator.input_full_names) != 1: raise RuntimeError("Only one input is allowed, found {}.".format( operator.input_full_names)) # StringNormalizer op_type = 'StringNormalizer' attrs = {'name': scope.get_unique_operator_name(op_type)} normalized = scope.get_unique_variable_name('normalized') if container.target_opset >= 10: attrs.update({ 'case_change_action': 'LOWER', 'is_case_sensitive': not op.lowercase, }) op_version = 10 domain = '' else: attrs.update({ 'casechangeaction': 'LOWER', 'is_case_sensitive': not op.lowercase, }) op_version = 9 domain = 'com.microsoft' opvs = 1 if domain == 'com.microsoft' else op_version if stop_words: attrs['stopwords'] = list(sorted(stop_words)) if options['keep_empty_string']: del attrs['name'] op_norm = OnnxStringNormalizer( 'text_in', op_version=container.target_opset, output_names=['text_out'], **attrs) scan_body = op_norm.to_onnx( OrderedDict([('text_in', StringTensorType())]), outputs=[('text_out', StringTensorType())], target_opset=op_version) vector = scope.get_unique_variable_name('vector') apply_reshape(scope, operator.input_full_names[0], vector, container, desired_shape=(-1, 1)) container.add_node('Scan', vector, normalized, body=scan_body.graph, num_scan_inputs=1) else: flatten = scope.get_unique_variable_name('flattened') apply_reshape(scope, operator.input_full_names[0], flatten, container, desired_shape=(-1, )) container.add_node(op_type, flatten, normalized, op_version=opvs, op_domain=domain, **attrs) else: normalized = operator.input_full_names # Tokenizer padvalue = "#" while padvalue in op.vocabulary_: padvalue += "#" 
    op_type = 'Tokenizer'
    attrs = {'name': scope.get_unique_operator_name(op_type)}
    attrs.update({
        'pad_value': padvalue,
        'mark': False,
        'mincharnum': 1,
    })
    if regex is None:
        attrs['separators'] = default_separators
    else:
        attrs['tokenexp'] = regex

    tokenized = scope.get_unique_variable_name('tokenized')
    container.add_node(op_type, normalized, tokenized,
                       op_domain='com.microsoft', **attrs)

    # Flatten
    # Tokenizer outputs shape {1, C} or {1, 1, C}.
    # The second shape is not allowed by TfIdfVectorizer.
    # We use Flatten which produces {1, C} in both cases.
    flatt_tokenized = scope.get_unique_variable_name('flattened')
    container.add_node("Flatten", tokenized, flatt_tokenized,
                       name=scope.get_unique_operator_name('Flatten'))
    tokenized = flatt_tokenized

    # Ngram - TfIdfVectorizer
    C = max(op.vocabulary_.values()) + 1
    words = [None for i in range(C)]
    weights = [0 for i in range(C)]
    for k, v in op.vocabulary_.items():
        words[v] = k
        weights[v] = 1.
    mode = 'TF'

    # Scikit-learn sorts n-grams in alphabetical order.
    # ONNX assumes they are sorted by n-gram length.
    tokenizer = op.build_tokenizer()
    split_words = []
    existing = set()
    errors = []
    for w in words:
        if isinstance(w, tuple):
            # TraceableCountVectorizer, TraceableTfIdfVectorizer
            spl = list(w)
            w = ' '.join(w)
        else:
            # CountVectorizer, TfIdfVectorizer
            try:
                spl = _intelligent_split(w, op, tokenizer, existing)
            except RuntimeError as e:
                errors.append(e)
                continue
        split_words.append((spl, w))

    if len(errors) > 0:
        err = "\n".join(map(str, errors))
        raise RuntimeError(
            f"There were ambiguities between n-grams and tokens. "
            f"{len(errors)} errors occurred. You can fix it by using "
            f"class Traceable{op.__class__.__name__}.\n"
            f"You can learn more at https://github.com/scikit-learn/"
            f"scikit-learn/issues/13733.\n{err}")

    ng_split_words = sorted([(len(a[0]), a[0], i)
                             for i, a in enumerate(split_words)])
    key_indices = [a[2] for a in ng_split_words]
    ngcounts = [0 for i in range(op.ngram_range[0])]

    words = list(ng_split_words[0][1])
    for i in range(1, len(ng_split_words)):
        if ng_split_words[i - 1][0] != ng_split_words[i][0]:
            ngcounts.append(len(words))
        words.extend(ng_split_words[i][1])

    weights_ = [weights[a[2]] for a in ng_split_words]
    weights = list(weights_)
    for i, ind in enumerate(key_indices):
        weights[ind] = weights_[i]

    # Create the node.
    attrs = {'name': scope.get_unique_operator_name("TfIdfVectorizer")}
    attrs.update({
        'min_gram_length': op.ngram_range[0],
        'max_gram_length': op.ngram_range[1],
        'mode': mode,
        'max_skip_count': 0,
        'pool_strings': words,
        'ngram_indexes': key_indices,
        'ngram_counts': ngcounts,
        'weights': list(map(np.float32, weights)),
    })
    output = scope.get_unique_variable_name('output')

    proto_dtype = guess_proto_type(operator.inputs[0].type)
    if proto_dtype != onnx_proto.TensorProto.DOUBLE:
        proto_dtype = onnx_proto.TensorProto.FLOAT

    if proto_dtype == onnx_proto.TensorProto.DOUBLE:
        output_tf = scope.get_unique_variable_name('cast_result')
    else:
        output_tf = output

    if container.target_opset < 9:
        op_type = 'Ngram'
        container.add_node(op_type, tokenized, output_tf,
                           op_domain='com.microsoft', **attrs)
    else:
        op_type = 'TfIdfVectorizer'
        container.add_node(op_type, tokenized, output_tf,
                           op_domain='', op_version=9, **attrs)

    if proto_dtype == onnx_proto.TensorProto.DOUBLE:
        apply_cast(scope, output_tf, output, container, to=proto_dtype)

    if op.binary:
        cast_result_name = scope.get_unique_variable_name('cast_result')
        output_name = scope.get_unique_variable_name('output_name')

        apply_cast(scope, output, cast_result_name, container,
                   to=onnx_proto.TensorProto.BOOL)
        apply_cast(scope, cast_result_name, output_name, container,
                   to=onnx_proto.TensorProto.FLOAT)
        output = output_name

    options = container.get_options(op, dict(nan=False))
    replace_by_nan = options.get('nan', False)
    if replace_by_nan:
        # This part replaces all null values by nan.
        cst_nan_name = scope.get_unique_variable_name('nan_name')
        container.add_initializer(cst_nan_name, proto_dtype, [1], [np.nan])
        cst_zero_name = scope.get_unique_variable_name('zero_name')
        container.add_initializer(cst_zero_name, proto_dtype, [1], [0])

        mask_name = scope.get_unique_variable_name('mask_name')
        container.add_node('Equal', [output, cst_zero_name],
                           mask_name,
                           name=scope.get_unique_operator_name('Equal'))

        where_name = scope.get_unique_variable_name('where_name')
        container.add_node('Where', [mask_name, cst_nan_name, output],
                           where_name,
                           name=scope.get_unique_operator_name('Where'))
        output = where_name

    apply_identity(scope, output, operator.output_full_names, container)

register_converter('SklearnCountVectorizer',
                   convert_sklearn_text_vectorizer,
                   options={'tokenexp': None, 'separators': None,
                            'nan': [True, False],
                            'keep_empty_string': [True, False]})
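

# A minimal usage sketch (not part of the converter): it shows how the
# options documented in convert_sklearn_text_vectorizer are passed at
# conversion time. The fitted vectorizer ``model``, the output name and the
# regular expression below are illustrative assumptions.
def _example_convert_count_vectorizer(model):
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import StringTensorType
    # 'tokenexp' overrides the tokenizer regular expression; 'separators'
    # would be ignored here because 'tokenexp' is not None.
    return convert_sklearn(
        model, "count_vectorizer",
        initial_types=[("input", StringTensorType([None, 1]))],
        options={type(model): {"tokenexp": "[a-zA-Z0-9_]+"}})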