# SPDX-License-Identifier: Apache-2.0


import warnings
from collections import OrderedDict, Counter
import numpy as np
from ..common._apply_operation import (
    apply_cast, apply_reshape, apply_identity)
from ..common._registration import register_converter
from ..common._topology import Scope, Operator
from ..common._container import ModelComponentContainer
from ..common.data_types import guess_proto_type, StringTensorType
from ..proto import onnx_proto
from ..algebra.onnx_ops import OnnxStringNormalizer


def _intelligent_split(text, op, tokenizer, existing):
    """
    Splits text into tokens. *scikit-learn*
    merges tokens with ``' '.join(tokens)``
    to name n-grams, so ``'a  b'`` could be ``('a ', 'b')``
    or ``('a', ' b')``.
    See `ngram sequence
    <https://github.com/scikit-learn/scikit-learn/blob/master/
    sklearn/feature_extraction/text.py#L169>`_.
    """
    if op.analyzer == 'word':
        if op.ngram_range[0] == op.ngram_range[1] == 1:
            spl = [text]
        elif op.ngram_range[0] == 1 and len(text) >= 2:
            # Every element is in the vocabulary.
            # Naive method: split on whitespace, then re-attach
            # leading/trailing spaces to the first/last token.
            p1 = len(text) - len(text.lstrip())
            p2_ = len(text) - len(text.rstrip())
            if p2_ == 0:
                p2 = len(text)
            else:
                p2 = -p2_
            spl = text[p1:p2].split()
            if len(spl) <= 1:
                spl = [text]
            else:
                spl[0] = " " * p1 + spl[0]
                spl[-1] = spl[-1] + " " * p2_
            exc = None
            if len(spl) == 1:
                pass
            elif len(spl) == 2:
                if (spl[0] not in op.vocabulary_ or
                        spl[1] not in op.vocabulary_):
                    # This is necessarily a single token.
                    spl = [text]
                elif spl[0] in op.vocabulary_ and spl[1] in op.vocabulary_:
                    # Ambiguity: "w1 w2" can be either a 2-gram
                    # or a single token containing a space.
                    # Usually, ' ' is not part of any token.
                    pass
            elif len(spl) == 3:
                stok = (all([s in op.vocabulary_ for s in spl]), spl)
                spl12 = (spl[2] in op.vocabulary_ and
                         (spl[0] + ' ' + spl[1]) in op.vocabulary_,
                         [spl[0] + ' ' + spl[1], spl[2]])
                spl23 = (spl[0] in op.vocabulary_ and
                         (spl[1] + ' ' + spl[2]) in op.vocabulary_,
                         [spl[0], spl[1] + ' ' + spl[2]])
                c = Counter(map(lambda t: t[0], [stok, spl12, spl23]))
                if c.get(True, -1) == 0:
                    spl = [text]
                found = [el[1] for el in [stok, spl12, spl23] if el[0]]
                if len(found) == 1:
                    spl = found[0]
                elif len(found) == 0:
                    spl = [text]
                elif stok[0]:
                    # By default, we assume the token is just the sum of
                    # single words.
                    pass
                else:
                    exc = (
                        "More than one decomposition in tokens: [" +
                        ", ".join(map(lambda t: "-".join(t), found)) + "].")
            elif any(map(lambda g: g in op.vocabulary_, spl)):
                # TODO: handle this case with an algorithm
                # which is able to break a string into
                # known substrings.
                exc = "Unable to identify tokens in n-grams."
            if exc:
                raise RuntimeError(
                    "Unable to split n-grams '{}' into tokens. "
                    "{} This happens when a token contain "
                    "spaces. Token '{}' may be a token or a n-gram '{}'."
                    "".format(text, exc, text, spl))
        else:
            # We reuse the tokenizer hoping that it will clear
            # ambiguities, but this might be slow.
            spl = tokenizer(text)
    else:
        spl = list(text)

    spl = tuple(spl)
    if spl in existing:
        raise RuntimeError(
            f"The converter cannot guess how to split expression "
            f"{text!r} into tokens. This case happens when tokens have "
            f"spaces.")
    if (op.ngram_range[0] == 1 and
            (len(op.ngram_range) == 1 or op.ngram_range[1] > 1)):
        # Every token of the n-gram should exist in the vocabulary.
        for g in spl:
            if g not in op.vocabulary_:
                raise RuntimeError(
                    "Unable to split n-grams '{}' into tokens {} "
                    "existing in the vocabulary. Token '{}' does not "
                    "exist in the vocabulary."
                    ".".format(text, spl, g))
    existing.add(spl)
    return spl
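

# A small illustration (not used by the converter) of the ambiguity that
# _intelligent_split has to resolve: scikit-learn names n-grams by joining
# tokens with a single space, so a vocabulary key such as 'machine learning'
# may be the 2-gram ('machine', 'learning') or, with a custom token_pattern,
# a single token containing a space. This helper is hypothetical and only
# shows the vocabulary layout.
def _example_ngram_vocabulary():
    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(ngram_range=(1, 2)).fit(["machine learning rocks"])
    # Keys mix unigrams and space-joined bigrams:
    # ['learning', 'learning rocks', 'machine', 'machine learning', 'rocks']
    return sorted(vect.vocabulary_)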


def convert_sklearn_text_vectorizer(
        scope: Scope, operator: Operator,
        container: ModelComponentContainer):
    """
    Converters for class `TfidfVectorizer
    <https://scikit-learn.org/stable/modules/generated/
    sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
    The current implementation is a work in progress and the ONNX version
    does not produce the exact same results. The converter lets the user
    change some of its parameters.

    Additional options
    ------------------

    tokenexp: string
        The default will change to true in version 1.6.0.
        The tokenizer splits into words using this regular
        expression or the regular expression specified by
        *scikit-learn* if the value is an empty string.
        See also note below.
        Default value: None
    separators: list of separators
        These separators are used to split a string into words.
        Option *separators* is ignored if option *tokenexp* is not None.
        Default value: ``[' ', '[.]', '\\\\?', ',', ';', ':', '\\\\!']``.

    Example (from :ref:`l-example-tfidfvectorizer`):

    ::

        seps = {TfidfVectorizer: {"separators": [
            ' ', '[.]', '\\\\?', ',', ';', ':', '!',
            '\\\\(', '\\\\)', '\\n', '\\\\"', "'", "-",
            "\\\\[", "\\\\]", "@"]}}
        model_onnx = convert_sklearn(
            pipeline, "tfidf",
            initial_types=[("input", StringTensorType([None, 2]))],
            options=seps)

    The default regular expression of the tokenizer is
    ``(?u)\\\\b\\\\w\\\\w+\\\\b`` (see `re
    <https://docs.python.org/3/library/re.html>`_).
    This expression may not be supported by the library handling the
    backend. `onnxruntime <https://github.com/Microsoft/onnxruntime>`_
    uses `re2 <https://github.com/google/re2>`_. You may need to switch
    to a custom tokenizer based on
    `python wrapper for re2 <https://pypi.org/project/re2/>`_
    or its sources `pyre2 <https://github.com/facebook/pyre2>`_
    (`syntax <https://github.com/google/re2/blob/master/doc/syntax.txt>`_).
    If the regular expression is not specified and if
    the instance of TfidfVectorizer is using the default
    pattern ``(?u)\\\\b\\\\w\\\\w+\\\\b``, it is replaced by
    ``[a-zA-Z0-9_]+``. Any other case has to be
    manually handled.

    Regular expression ``[^\\\\\\\\n]`` is used to split
    a sentence into characters (and not words)
    if ``analyzer=='char'``. The mode ``analyzer=='char_wb'``
    is not implemented.

    .. versionchanged:: 1.6
        Parameters have been renamed: *sep* into *separators*,
        *regex* into *tokenexp*.
    """  # noqa
    op = operator.raw_operator
    if (container.target_opset is not None and
            container.target_opset < 9):
        raise RuntimeError(
            "Converter for '{}' only works for opset >= 9."
            "".format(op.__class__.__name__))
    if op.analyzer == "char_wb":
        raise NotImplementedError(
            "CountVectorizer cannot be converted, "
            "only analyzer='word' is fully supported. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues.")
    if op.analyzer == "char":
        warnings.warn(
            "The conversion of CountVectorizer may not work, "
            "only analyzer='word' is fully supported. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues.",
            UserWarning)
    if op.strip_accents is not None:
        raise NotImplementedError(
            "CountVectorizer cannot be converted, "
            "only strip_accents=None is supported. "
" "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") options = container.get_options( op, dict(separators="DEFAULT", tokenexp=None, nan=False, keep_empty_string=False)) if set(options) != {'separators', 'tokenexp', 'nan', 'keep_empty_string'}: raise RuntimeError("Unknown option {} for {}".format( set(options) - {'separators'}, type(op))) if op.analyzer == 'word': default_pattern = '(?u)\\b\\w\\w+\\b' if options['separators'] == "DEFAULT" and options['tokenexp'] is None: regex = op.token_pattern if regex == default_pattern: regex = '[a-zA-Z0-9_]+' default_separators = None elif options['tokenexp'] is not None: if options['tokenexp']: regex = options['tokenexp'] else: regex = op.token_pattern if regex == default_pattern: regex = '[a-zA-Z0-9_]+' default_separators = None else: regex = None default_separators = options['separators'] else: if options['separators'] != 'DEFAULT': raise RuntimeError("Option separators has no effect " "if analyser != 'word'.") regex = options['tokenexp'] if options['tokenexp'] else '.' default_separators = None if op.preprocessor is not None: raise NotImplementedError( "Custom preprocessor cannot be converted into ONNX. " "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") if op.tokenizer is not None: raise NotImplementedError( "Custom tokenizer cannot be converted into ONNX. " "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") if op.strip_accents is not None: raise NotImplementedError( "Operator StringNormalizer cannot remove accents. " "You may raise an issue at " "https://github.com/onnx/sklearn-onnx/issues.") if hasattr(op, "stop_words_"): stop_words = op.stop_words_ | ( set(op.stop_words) if op.stop_words else set()) else: stop_words = set() for w in stop_words: if not isinstance(w, str): raise TypeError( f"One stop word is not a string {w!r} " f"in stop_words={stop_words}.") if op.lowercase or stop_words: if len(operator.input_full_names) != 1: raise RuntimeError("Only one input is allowed, found {}.".format( operator.input_full_names)) # StringNormalizer op_type = 'StringNormalizer' attrs = {'name': scope.get_unique_operator_name(op_type)} normalized = scope.get_unique_variable_name('normalized') if container.target_opset >= 10: attrs.update({ 'case_change_action': 'LOWER', 'is_case_sensitive': not op.lowercase, }) op_version = 10 domain = '' else: attrs.update({ 'casechangeaction': 'LOWER', 'is_case_sensitive': not op.lowercase, }) op_version = 9 domain = 'com.microsoft' opvs = 1 if domain == 'com.microsoft' else op_version if stop_words: attrs['stopwords'] = list(sorted(stop_words)) if options['keep_empty_string']: del attrs['name'] op_norm = OnnxStringNormalizer( 'text_in', op_version=container.target_opset, output_names=['text_out'], **attrs) scan_body = op_norm.to_onnx( OrderedDict([('text_in', StringTensorType())]), outputs=[('text_out', StringTensorType())], target_opset=op_version) vector = scope.get_unique_variable_name('vector') apply_reshape(scope, operator.input_full_names[0], vector, container, desired_shape=(-1, 1)) container.add_node('Scan', vector, normalized, body=scan_body.graph, num_scan_inputs=1) else: flatten = scope.get_unique_variable_name('flattened') apply_reshape(scope, operator.input_full_names[0], flatten, container, desired_shape=(-1, )) container.add_node(op_type, flatten, normalized, op_version=opvs, op_domain=domain, **attrs) else: normalized = operator.input_full_names # Tokenizer padvalue = "#" while padvalue in op.vocabulary_: padvalue += "#" 
    op_type = 'Tokenizer'
    attrs = {'name': scope.get_unique_operator_name(op_type)}
    attrs.update({
        'pad_value': padvalue,
        'mark': False,
        'mincharnum': 1,
    })
    if regex is None:
        attrs['separators'] = default_separators
    else:
        attrs['tokenexp'] = regex

    tokenized = scope.get_unique_variable_name('tokenized')
    container.add_node(op_type, normalized, tokenized,
                       op_domain='com.microsoft', **attrs)

    # Flatten
    # Tokenizer outputs shape {1, C} or {1, 1, C}.
    # The second shape is not allowed by TfIdfVectorizer.
    # We use Flatten which produces {1, C} in both cases.
    flatt_tokenized = scope.get_unique_variable_name('flattened')
    container.add_node("Flatten", tokenized, flatt_tokenized,
                       name=scope.get_unique_operator_name('Flatten'))
    tokenized = flatt_tokenized

    # Ngram - TfIdfVectorizer
    C = max(op.vocabulary_.values()) + 1
    words = [None for i in range(C)]
    weights = [0 for i in range(C)]
    for k, v in op.vocabulary_.items():
        words[v] = k
        weights[v] = 1.
    mode = 'TF'

    # Scikit-learn sorts n-grams in alphabetical order.
    # ONNX assumes they are sorted by n-gram length.
    tokenizer = op.build_tokenizer()
    split_words = []
    existing = set()
    errors = []
    for w in words:
        if isinstance(w, tuple):
            # TraceableCountVectorizer, TraceableTfIdfVectorizer
            spl = list(w)
            w = ' '.join(w)
        else:
            # CountVectorizer, TfIdfVectorizer
            try:
                spl = _intelligent_split(w, op, tokenizer, existing)
            except RuntimeError as e:
                errors.append(e)
                continue
        split_words.append((spl, w))

    if len(errors) > 0:
        err = "\n".join(map(str, errors))
        raise RuntimeError(
            f"There were ambiguities between n-grams and tokens. "
            f"{len(errors)} errors occurred. You can fix it by using "
            f"class Traceable{op.__class__.__name__}.\n"
            f"You can learn more at https://github.com/scikit-learn/"
            f"scikit-learn/issues/13733.\n{err}")

    ng_split_words = sorted([(len(a[0]), a[0], i)
                             for i, a in enumerate(split_words)])
    key_indices = [a[2] for a in ng_split_words]
    ngcounts = [0 for i in range(op.ngram_range[0])]

    words = list(ng_split_words[0][1])
    for i in range(1, len(ng_split_words)):
        if ng_split_words[i - 1][0] != ng_split_words[i][0]:
            ngcounts.append(len(words))
        words.extend(ng_split_words[i][1])

    weights_ = [weights[a[2]] for a in ng_split_words]
    weights = list(weights_)
    for i, ind in enumerate(key_indices):
        weights[ind] = weights_[i]

    # Create the node.
    attrs = {'name': scope.get_unique_operator_name("TfIdfVectorizer")}
    attrs.update({
        'min_gram_length': op.ngram_range[0],
        'max_gram_length': op.ngram_range[1],
        'mode': mode,
        'max_skip_count': 0,
        'pool_strings': words,
        'ngram_indexes': key_indices,
        'ngram_counts': ngcounts,
        'weights': list(map(np.float32, weights)),
    })
    output = scope.get_unique_variable_name('output')

    proto_dtype = guess_proto_type(operator.inputs[0].type)
    if proto_dtype != onnx_proto.TensorProto.DOUBLE:
        proto_dtype = onnx_proto.TensorProto.FLOAT

    if proto_dtype == onnx_proto.TensorProto.DOUBLE:
        output_tf = scope.get_unique_variable_name('cast_result')
    else:
        output_tf = output

    if container.target_opset < 9:
        op_type = 'Ngram'
        container.add_node(op_type, tokenized, output_tf,
                           op_domain='com.microsoft', **attrs)
    else:
        op_type = 'TfIdfVectorizer'
        container.add_node(op_type, tokenized, output_tf,
                           op_domain='', op_version=9, **attrs)

    if proto_dtype == onnx_proto.TensorProto.DOUBLE:
        apply_cast(scope, output_tf, output, container, to=proto_dtype)

    if op.binary:
        cast_result_name = scope.get_unique_variable_name('cast_result')
        output_name = scope.get_unique_variable_name('output_name')

        apply_cast(scope, output, cast_result_name, container,
                   to=onnx_proto.TensorProto.BOOL)
        apply_cast(scope, cast_result_name, output_name, container,
                   to=onnx_proto.TensorProto.FLOAT)
        output = output_name

    options = container.get_options(op, dict(nan=False))
    replace_by_nan = options.get('nan', False)
    if replace_by_nan:
        # This part replaces all null values by nan.
        cst_nan_name = scope.get_unique_variable_name('nan_name')
        container.add_initializer(cst_nan_name, proto_dtype, [1], [np.nan])
        cst_zero_name = scope.get_unique_variable_name('zero_name')
        container.add_initializer(cst_zero_name, proto_dtype, [1], [0])

        mask_name = scope.get_unique_variable_name('mask_name')
        container.add_node('Equal', [output, cst_zero_name],
                           mask_name,
                           name=scope.get_unique_operator_name('Equal'))

        where_name = scope.get_unique_variable_name('where_name')
        container.add_node('Where', [mask_name, cst_nan_name, output],
                           where_name,
                           name=scope.get_unique_operator_name('Where'))
        output = where_name

    apply_identity(scope, output, operator.output_full_names, container)

register_converter('SklearnCountVectorizer',
                   convert_sklearn_text_vectorizer,
                   options={'tokenexp': None, 'separators': None,
                            'nan': [True, False],
                            'keep_empty_string': [True, False]})
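

# A minimal usage sketch (not part of the converter): it shows how the
# options documented in convert_sklearn_text_vectorizer are passed at
# conversion time. The fitted vectorizer ``model``, the output name and the
# regular expression below are illustrative assumptions.
def _example_convert_count_vectorizer(model):
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import StringTensorType
    # 'tokenexp' overrides the tokenizer regular expression; 'separators'
    # would be ignored here because 'tokenexp' is not None.
    return convert_sklearn(
        model, "count_vectorizer",
        initial_types=[("input", StringTensorType([None, 1]))],
        options={type(model): {"tokenexp": "[a-zA-Z0-9_]+"}})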