# Source code for mlprodict.onnx_conv.operator_converters.conv_lightgbm

"""
Modified converter from
`LightGbm.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
lightgbm/operator_converters/LightGbm.py>`_.


:githublink:`%|py|7`
"""
from collections import Counter
import copy
import numbers
import numpy
from skl2onnx.common._apply_operation import apply_div, apply_reshape, apply_sub  # pylint: disable=E0611
from skl2onnx.common.tree_ensemble import get_default_tree_classifier_attribute_pairs
from skl2onnx.proto import onnx_proto
from skl2onnx.common.shape_calculator import (
    calculate_linear_regressor_output_shapes,
    calculate_linear_classifier_output_shapes)
from skl2onnx.common.data_types import guess_numpy_type


def calculate_lightgbm_output_shapes(operator):
    """
    Shape calculator for LightGBM Booster
    (see :epkg:`lightgbm`).

    :githublink:`%|py|24`
    """
    booster = operator.raw_operator
    # A wrapped booster exposes its objective through _model_dict,
    # a scikit-learn estimator through objective_.
    model_dict = getattr(booster, "_model_dict", None)
    objective = (booster.objective_ if model_dict is None
                 else model_dict['objective'])
    if objective.startswith('regression'):  # pragma: no cover
        return calculate_linear_regressor_output_shapes(operator)
    if objective.startswith(('binary', 'multiclass')):
        return calculate_linear_classifier_output_shapes(operator)
    raise NotImplementedError(  # pragma: no cover
        "Objective '{}' is not implemented yet.".format(objective))
[docs]def _translate_split_criterion(criterion): # If the criterion is true, LightGBM use the left child. Otherwise, right child is selected. if criterion == '<=': return 'BRANCH_LEQ' if criterion == '<': # pragma: no cover return 'BRANCH_LT' if criterion == '>=': # pragma: no cover return 'BRANCH_GTE' if criterion == '>': # pragma: no cover return 'BRANCH_GT' if criterion == '==': # pragma: no cover return 'BRANCH_EQ' if criterion == '!=': # pragma: no cover return 'BRANCH_NEQ' raise ValueError( # pragma: no cover 'Unsupported splitting criterion: %s. Only <=, <, >=, and > are allowed.')
[docs]def _create_node_id(node_id_pool): i = 0 while i in node_id_pool: i += 1 node_id_pool.add(i) return i
def _parse_tree_structure(tree_id, class_id, learning_rate, tree_structure, attrs):
    """
    Parses one whole tree returned by ``Booster.dump_model`` and fills
    *attrs* (a dictionary of lists) with the corresponding ONNX
    ``TreeEnsemble*`` attributes.

    The pool of all nodes' indexes created when parsing a single tree.
    Different tree use different pools.

    :param tree_id: index of the tree in the booster
    :param class_id: class (or target) this tree contributes to
    :param learning_rate: multiplier applied to leaf values
    :param tree_structure: root node of the tree (a dictionary)
    :param attrs: dictionary of lists filled in place

    :githublink:`%|py|68`
    """
    # Both pools are local to this call: node ids restart at 0 per tree.
    node_id_pool = set()
    node_pyid_pool = dict()
    node_id = _create_node_id(node_id_pool)
    node_pyid_pool[id(tree_structure)] = node_id
    # The root node is a leaf node.
    if 'left_child' not in tree_structure or 'right_child' not in tree_structure:
        _parse_node(tree_id, class_id, node_id, node_id_pool, node_pyid_pool,
                    learning_rate, tree_structure, attrs)
        return
    # Allocate (or reuse) an id for each child; a child dictionary already
    # seen (same python object, tracked through id()) keeps its node id
    # and is not parsed a second time.
    left_pyid = id(tree_structure['left_child'])
    right_pyid = id(tree_structure['right_child'])
    if left_pyid in node_pyid_pool:
        left_id = node_pyid_pool[left_pyid]
        left_parse = False
    else:
        left_id = _create_node_id(node_id_pool)
        node_pyid_pool[left_pyid] = left_id
        left_parse = True
    if right_pyid in node_pyid_pool:
        right_id = node_pyid_pool[right_pyid]
        right_parse = False
    else:
        right_id = _create_node_id(node_id_pool)
        node_pyid_pool[right_pyid] = right_id
        right_parse = True
    # Root node attributes.
    attrs['nodes_treeids'].append(tree_id)
    attrs['nodes_nodeids'].append(node_id)
    attrs['nodes_featureids'].append(tree_structure['split_feature'])
    attrs['nodes_modes'].append(
        _translate_split_criterion(tree_structure['decision_type']))
    if isinstance(tree_structure['threshold'], str):
        # String thresholds should have been converted beforehand
        # (see modify_tree_for_rule_in_set); float() is a last resort.
        try:  # pragma: no cover
            attrs['nodes_values'].append(  # pragma: no cover
                float(tree_structure['threshold']))
        except ValueError as e:  # pragma: no cover
            import pprint
            # Truncate the dump so the error message stays readable.
            text = pprint.pformat(tree_structure)
            if len(text) > 99999:
                text = text[:99999] + "\n..."
            raise TypeError("threshold must be a number not '{}'"
                            "\n{}".format(tree_structure['threshold'], text)) from e
    else:
        attrs['nodes_values'].append(tree_structure['threshold'])
    # Assume left is the true branch and right is the false branch
    attrs['nodes_truenodeids'].append(left_id)
    attrs['nodes_falsenodeids'].append(right_id)
    if tree_structure['default_left']:
        attrs['nodes_missing_value_tracks_true'].append(1)
    else:
        attrs['nodes_missing_value_tracks_true'].append(0)
    attrs['nodes_hitrates'].append(1.)
    # Recurse into children that have not been parsed yet.
    if left_parse:
        _parse_node(
            tree_id, class_id, left_id, node_id_pool, node_pyid_pool,
            learning_rate, tree_structure['left_child'], attrs)
    if right_parse:
        _parse_node(
            tree_id, class_id, right_id, node_id_pool, node_pyid_pool,
            learning_rate, tree_structure['right_child'], attrs)
[docs]def _parse_node(tree_id, class_id, node_id, node_id_pool, node_pyid_pool, learning_rate, node, attrs): """ Parses nodes. :githublink:`%|py|140` """ if (hasattr(node, 'left_child') and hasattr(node, 'right_child')) or \ ('left_child' in node and 'right_child' in node): left_pyid = id(node['left_child']) right_pyid = id(node['right_child']) if left_pyid in node_pyid_pool: left_id = node_pyid_pool[left_pyid] left_parse = False else: left_id = _create_node_id(node_id_pool) node_pyid_pool[left_pyid] = left_id left_parse = True if right_pyid in node_pyid_pool: right_id = node_pyid_pool[right_pyid] right_parse = False else: right_id = _create_node_id(node_id_pool) node_pyid_pool[right_pyid] = right_id right_parse = True attrs['nodes_treeids'].append(tree_id) attrs['nodes_nodeids'].append(node_id) attrs['nodes_featureids'].append(node['split_feature']) attrs['nodes_modes'].append( _translate_split_criterion(node['decision_type'])) if isinstance(node['threshold'], str): try: # pragma: no cover attrs['nodes_values'].append( # pragma: no cover float(node['threshold'])) except ValueError as e: # pragma: no cover import pprint text = pprint.pformat(node) if len(text) > 99999: text = text[:99999] + "\n..." raise TypeError("threshold must be a number not '{}'" "\n{}".format(node['threshold'], text)) from e else: attrs['nodes_values'].append(node['threshold']) # Assume left is the true branch and right is the false branch attrs['nodes_truenodeids'].append(left_id) attrs['nodes_falsenodeids'].append(right_id) if node['default_left']: attrs['nodes_missing_value_tracks_true'].append(1) else: attrs['nodes_missing_value_tracks_true'].append(0) attrs['nodes_hitrates'].append(1.) 
# Recursively dive into the child nodes if left_parse: _parse_node(tree_id, class_id, left_id, node_id_pool, node_pyid_pool, learning_rate, node['left_child'], attrs) if right_parse: _parse_node(tree_id, class_id, right_id, node_id_pool, node_pyid_pool, learning_rate, node['right_child'], attrs) elif hasattr(node, 'left_child') or hasattr(node, 'right_child'): raise ValueError('Need two branches') # pragma: no cover else: # Node attributes attrs['nodes_treeids'].append(tree_id) attrs['nodes_nodeids'].append(node_id) attrs['nodes_featureids'].append(0) attrs['nodes_modes'].append('LEAF') # Leaf node has no threshold. A zero is appended but it will never be used. attrs['nodes_values'].append(0.) # Leaf node has no child. A zero is appended but it will never be used. attrs['nodes_truenodeids'].append(0) # Leaf node has no child. A zero is appended but it will never be used. attrs['nodes_falsenodeids'].append(0) # Leaf node has no split function. A zero is appended but it will never be used. attrs['nodes_missing_value_tracks_true'].append(0) attrs['nodes_hitrates'].append(1.) # Leaf attributes attrs['class_treeids'].append(tree_id) attrs['class_nodeids'].append(node_id) attrs['class_ids'].append(class_id) attrs['class_weights'].append( float(node['leaf_value']) * learning_rate)
def convert_lightgbm(scope, operator, container):
    """
    This converters reuses the code from
    `LightGbm.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
    lightgbm/operator_converters/LightGbm.py>`_ and makes some modifications.
    It implements converters for models in :epkg:`lightgbm`.

    :param scope: scope used to generate unique variable and operator names
    :param operator: operator to convert; ``operator.raw_operator`` is the
        fitted :epkg:`lightgbm` model
    :param container: container the ONNX nodes and initializers are added to

    :githublink:`%|py|232`
    """
    gbm_model = operator.raw_operator
    # Dictionary dump of the trained booster.
    gbm_text = gbm_model.booster_.dump_model()
    # Unfold '==' rules against a set of values (e.g. '10||12||13').
    modify_tree_for_rule_in_set(gbm_text, use_float=True)

    attrs = get_default_tree_classifier_attribute_pairs()
    attrs['name'] = operator.full_name

    # Create different attributes for classifier and regressor, respectively
    if gbm_text['objective'].startswith('binary'):
        n_classes = 1
        attrs['post_transform'] = 'LOGISTIC'
    elif gbm_text['objective'].startswith('multiclass'):
        n_classes = gbm_text['num_class']
        attrs['post_transform'] = 'SOFTMAX'
    elif gbm_text['objective'].startswith('regression'):
        n_classes = 1  # Regressor has only one output variable
        attrs['post_transform'] = 'NONE'
        attrs['n_targets'] = n_classes
    else:
        raise RuntimeError(  # pragma: no cover
            "LightGBM objective should be cleaned already not '{}'.".format(
                gbm_text['objective']))

    # Use the same algorithm to parse the tree
    for i, tree in enumerate(gbm_text['tree_info']):
        tree_id = i
        # In a multi-class model, trees are assigned to classes round-robin.
        class_id = tree_id % n_classes
        # tree['shrinkage'] --> LightGbm provides figures with it already.
        learning_rate = 1.
        _parse_tree_structure(
            tree_id, class_id, learning_rate, tree['tree_structure'], attrs)

    # Sort nodes_* attributes. For one tree, its node indexes should appear
    # in an ascent order in nodes_nodeids. Nodes from a tree with a smaller
    # tree index should appear before trees with larger indexes in
    # nodes_nodeids.
    node_numbers_per_tree = Counter(attrs['nodes_treeids'])
    tree_number = len(node_numbers_per_tree.keys())
    accumulated_node_numbers = [0] * tree_number
    for i in range(1, tree_number):
        accumulated_node_numbers[i] = (
            accumulated_node_numbers[i - 1] + node_numbers_per_tree[i - 1])
    # Global index = offset of the tree + node id inside the tree.
    global_node_indexes = []
    for i in range(len(attrs['nodes_nodeids'])):
        tree_id = attrs['nodes_treeids'][i]
        node_id = attrs['nodes_nodeids'][i]
        global_node_indexes.append(
            accumulated_node_numbers[tree_id] + node_id)
    # Reorder every nodes_* list consistently with the global indexes.
    for k, v in attrs.items():
        if k.startswith('nodes_'):
            merged_indexes = zip(
                copy.deepcopy(global_node_indexes), v)
            sorted_list = [pair[1]
                           for pair in sorted(merged_indexes,
                                              key=lambda x: x[0])]
            attrs[k] = sorted_list

    # Only float32 and float64 are supported; anything else falls back
    # to float32.
    dtype = guess_numpy_type(operator.inputs[0].type)
    if dtype != numpy.float64:
        dtype = numpy.float32

    # Create ONNX object
    if (gbm_text['objective'].startswith('binary') or
            gbm_text['objective'].startswith('multiclass')):
        # Prepare label information for both of TreeEnsembleClassifier
        # and ZipMap
        class_type = onnx_proto.TensorProto.STRING  # pylint: disable=E1101
        # NOTE(review): zipmap_attrs is populated below but never consumed
        # in this function -- presumably left over from the original
        # onnxmltools converter; confirm before removing.
        zipmap_attrs = {'name': scope.get_unique_variable_name('ZipMap')}
        if all(isinstance(i, (numbers.Real, bool, numpy.bool_))
               for i in gbm_model.classes_):
            class_type = onnx_proto.TensorProto.INT64  # pylint: disable=E1101
            class_labels = [int(i) for i in gbm_model.classes_]
            attrs['classlabels_int64s'] = class_labels
            zipmap_attrs['classlabels_int64s'] = class_labels
        elif all(isinstance(i, str) for i in gbm_model.classes_):
            class_labels = [str(i) for i in gbm_model.classes_]
            attrs['classlabels_strings'] = class_labels
            zipmap_attrs['classlabels_strings'] = class_labels
        else:
            raise ValueError(  # pragma: no cover
                'Only string and integer class labels are allowed')

        # Create tree classifier
        probability_tensor_name = scope.get_unique_variable_name(
            'probability_tensor')
        label_tensor_name = scope.get_unique_variable_name('label_tensor')

        if dtype == numpy.float64:
            # Double-precision variant lives in the custom 'mlprodict'
            # domain.
            container.add_node('TreeEnsembleClassifierDouble',
                               operator.input_full_names,
                               [label_tensor_name, probability_tensor_name],
                               op_domain='mlprodict', **attrs)
        else:
            container.add_node('TreeEnsembleClassifier',
                               operator.input_full_names,
                               [label_tensor_name, probability_tensor_name],
                               op_domain='ai.onnx.ml', **attrs)

        prob_tensor = probability_tensor_name

        if gbm_model.boosting_type == 'rf':
            # Random forest: the graph below extracts column 1 of the raw
            # probabilities, divides it by 100, computes its complement
            # (1 - x) as column 0 and recomputes the label with an ArgMax.
            col_index_name = scope.get_unique_variable_name('col_index')
            first_col_name = scope.get_unique_variable_name('first_col')
            zeroth_col_name = scope.get_unique_variable_name('zeroth_col')
            denominator_name = scope.get_unique_variable_name('denominator')
            modified_first_col_name = scope.get_unique_variable_name(
                'modified_first_col')
            unit_float_tensor_name = scope.get_unique_variable_name(
                'unit_float_tensor')
            merged_prob_name = scope.get_unique_variable_name('merged_prob')
            predicted_label_name = scope.get_unique_variable_name(
                'predicted_label')
            classes_name = scope.get_unique_variable_name('classes')
            final_label_name = scope.get_unique_variable_name('final_label')

            container.add_initializer(
                col_index_name, onnx_proto.TensorProto.INT64, [], [1])  # pylint: disable=E1101
            container.add_initializer(
                unit_float_tensor_name, onnx_proto.TensorProto.FLOAT, [], [1.0])  # pylint: disable=E1101
            container.add_initializer(
                denominator_name, onnx_proto.TensorProto.FLOAT, [], [100.0])  # pylint: disable=E1101
            container.add_initializer(classes_name, class_type,
                                      [len(class_labels)], class_labels)

            container.add_node(
                'ArrayFeatureExtractor',
                [probability_tensor_name, col_index_name],
                first_col_name,
                name=scope.get_unique_operator_name(
                    'ArrayFeatureExtractor'),
                op_domain='ai.onnx.ml')
            apply_div(scope, [first_col_name, denominator_name],
                      modified_first_col_name, container, broadcast=1)
            apply_sub(scope,
                      [unit_float_tensor_name, modified_first_col_name],
                      zeroth_col_name, container, broadcast=1)
            container.add_node(
                'Concat', [zeroth_col_name, modified_first_col_name],
                merged_prob_name,
                name=scope.get_unique_operator_name('Concat'), axis=1)
            container.add_node(
                'ArgMax', merged_prob_name, predicted_label_name,
                name=scope.get_unique_operator_name('ArgMax'), axis=1)
            container.add_node(
                'ArrayFeatureExtractor',
                [classes_name, predicted_label_name], final_label_name,
                name=scope.get_unique_operator_name('ArrayFeatureExtractor'),
                op_domain='ai.onnx.ml')
            apply_reshape(scope, final_label_name,
                          operator.outputs[0].full_name, container,
                          desired_shape=[-1, ])
            prob_tensor = merged_prob_name
        else:
            container.add_node('Identity', label_tensor_name,
                               operator.outputs[0].full_name,
                               name=scope.get_unique_operator_name('Identity'))

        # Convert probability tensor to probability map
        # (keys are labels while values are the associated probabilities)
        container.add_node('Identity', prob_tensor,
                           operator.outputs[1].full_name)
    else:
        # Create tree regressor
        output_name = scope.get_unique_variable_name('output')

        keys_to_be_renamed = list(
            k for k in attrs if k.startswith('class_'))
        for k in keys_to_be_renamed:
            # Rename class_* attribute to target_* because
            # TreeEnsembleClassifier and TreeEnsembleRegressor have
            # different ONNX attributes
            attrs['target' + k[5:]] = copy.deepcopy(attrs[k])
            del attrs[k]

        if dtype == numpy.float64:
            # Double-precision variant lives in the custom 'mlprodict'
            # domain.
            container.add_node('TreeEnsembleRegressorDouble',
                               operator.input_full_names, output_name,
                               op_domain='mlprodict', **attrs)
        else:
            container.add_node('TreeEnsembleRegressor',
                               operator.input_full_names, output_name,
                               op_domain='ai.onnx.ml', **attrs)

        if gbm_model.boosting_type == 'rf':
            # Random forest: rescale the raw output by dividing it by 100.
            denominator_name = scope.get_unique_variable_name('denominator')
            container.add_initializer(
                denominator_name, onnx_proto.TensorProto.FLOAT, [], [100.0])  # pylint: disable=E1101
            apply_div(scope, [output_name, denominator_name],
                      operator.output_full_names, container, broadcast=1)
        else:
            container.add_node('Identity', output_name,
                               operator.output_full_names,
                               name=scope.get_unique_operator_name('Identity'))
def modify_tree_for_rule_in_set(gbm, use_float=False):  # pylint: disable=R1710
    """
    LightGBM produces sometimes a tree with a node set to use rule ``==``
    against a set of values separated by ``||`` (a membership test).
    This function unfolds such a node into a chain of binary ``==`` nodes,
    modifying the dictionary in place.

    A child looks like the following:

    .. runpython::
        :showcode:

        import pprint
        from mlprodict.onnx_conv.operator_converters.conv_lightgbm import modify_tree_for_rule_in_set

        tree = {'decision_type': '==',
                'default_left': True,
                'internal_count': 6805,
                'internal_value': 0.117558,
                'left_child': {'leaf_count': 4293,
                               'leaf_index': 18,
                               'leaf_value': 0.003519117642745049},
                'missing_type': 'None',
                'right_child': {'leaf_count': 2512,
                                'leaf_index': 25,
                                'leaf_value': 0.012305307958365394},
                'split_feature': 24,
                'split_gain': 12.233599662780762,
                'split_index': 24,
                'threshold': '10||12||13'}

        modify_tree_for_rule_in_set(tree)

        pprint.pprint(tree)

    :param gbm: dumped model, tree or node (a dictionary), modified in place
    :param use_float: convert thresholds with :func:`float` directly
        instead of trying :func:`int` first

    :githublink:`%|py|441`
    """
    # Dispatch on the kind of dictionary received: whole dump, one tree,
    # or a node.
    if 'tree_info' in gbm:
        for subtree in gbm['tree_info']:
            modify_tree_for_rule_in_set(subtree, use_float=use_float)
        return
    if 'tree_structure' in gbm:
        modify_tree_for_rule_in_set(gbm['tree_structure'],
                                    use_float=use_float)
        return
    if 'decision_type' not in gbm:
        # Leaf node: nothing to unfold.
        return

    def process_children(node):
        # Recurse into whichever children exist.
        for side in ('left_child', 'right_child'):
            if side in node:
                modify_tree_for_rule_in_set(node[side], use_float=use_float)

    def to_number(text):
        if use_float:
            return float(text)
        try:
            return int(text)
        except ValueError:  # pragma: no cover
            return float(text)

    if gbm['decision_type'] != '==':
        return process_children(gbm)
    threshold = gbm['threshold']
    if not isinstance(threshold, str) or '||' not in threshold:
        return process_children(gbm)

    # Split off the first value; the current node keeps it and a shallow
    # copy of the node becomes the right child carrying the remainder
    # (further unfolded by the recursion below if it still contains '||').
    first, remainder = threshold.split('||', 1)
    gbm['threshold'] = to_number(first)
    duplicated = gbm.copy()
    duplicated['threshold'] = (
        remainder if '||' in remainder else to_number(remainder))
    gbm['right_child'] = duplicated
    return process_children(gbm)