Source code for mlprodict.onnx_conv.operator_converters.conv_xgboost

"""
Modified converter from
`XGBoost.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
xgboost/operator_converters/XGBoost.py>`_.
"""
import json
from pprint import pformat
import numpy
from xgboost import XGBClassifier
from skl2onnx.common.data_types import guess_numpy_type  # pylint: disable=C0411


class XGBConverter:
    "common methods for converters"

    @staticmethod
    def get_xgb_params(xgb_node):
        """
        Retrieves parameters of a model.
        """
        pars = xgb_node.get_xgb_params()
        # xgboost >= 1.0
        if 'n_estimators' not in pars:
            pars['n_estimators'] = xgb_node.n_estimators
        return pars

    @staticmethod
    def validate(xgb_node):
        "validates the model"
        params = XGBConverter.get_xgb_params(xgb_node)
        try:
            if "objective" not in params:
                raise AttributeError('objective')
        except AttributeError as e:
            raise RuntimeError('Missing attribute in XGBoost model.') from e

    @staticmethod
    def common_members(xgb_node, inputs):
        "common to regressor and classifier"
        params = XGBConverter.get_xgb_params(xgb_node)
        objective = params["objective"]
        base_score = params["base_score"]
        booster = xgb_node.get_booster()
        # The json format was available in October 2017.
        # XGBoost 0.7 was the first version released with it.
        js_tree_list = booster.get_dump(with_stats=True, dump_format='json')
        js_trees = [json.loads(s) for s in js_tree_list]
        return objective, base_score, js_trees
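
    # For reference, each string returned by
    # ``booster.get_dump(with_stats=True, dump_format='json')`` parses into a
    # nested dictionary. A minimal sketch (values are made up): a branch node
    # carries 'split', 'split_condition', 'yes', 'no', 'missing' and
    # 'children'; a leaf node only carries 'leaf' (plus 'cover' with stats):
    #
    # {"nodeid": 0, "split": "f2", "split_condition": 2.45,
    #  "yes": 1, "no": 2, "missing": 1, "cover": 100.0,
    #  "children": [
    #      {"nodeid": 1, "leaf": 0.42, "cover": 33.0},
    #      {"nodeid": 2, "leaf": -0.21, "cover": 67.0}]}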

    @staticmethod
    def _get_default_tree_attribute_pairs(is_classifier):
        attrs = {}
        for k in {'nodes_treeids', 'nodes_nodeids', 'nodes_featureids',
                  'nodes_modes', 'nodes_values', 'nodes_truenodeids',
                  'nodes_falsenodeids', 'nodes_missing_value_tracks_true'}:
            attrs[k] = []
        if is_classifier:
            for k in {'class_treeids', 'class_nodeids', 'class_ids',
                      'class_weights'}:
                attrs[k] = []
        else:
            for k in {'target_treeids', 'target_nodeids', 'target_ids',
                      'target_weights'}:
                attrs[k] = []
        return attrs

    @staticmethod
    def _add_node(attr_pairs, is_classifier, tree_id, tree_weight, node_id,
                  feature_id, mode, value, true_child_id, false_child_id,
                  weights, weight_id_bias, missing, hitrate):
        if isinstance(feature_id, str):
            # Something like f0, f1...
            if feature_id[0] == "f":
                try:
                    feature_id = int(feature_id[1:])
                except ValueError as e:  # pragma: no cover
                    raise RuntimeError(
                        "Unable to interpret '{0}'".format(feature_id)) from e
            else:  # pragma: no cover
                try:
                    feature_id = int(feature_id)
                except ValueError as e:
                    raise RuntimeError(
                        "Unable to interpret '{0}'".format(feature_id)) from e

        # Split condition for sklearn
        # * if X_ptr[X_sample_stride * i + X_fx_stride * node.feature] <= node.threshold:
        # * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L946
        # Split condition for xgboost
        # * if (fvalue < split_value)
        # * https://github.com/dmlc/xgboost/blob/master/include/xgboost/tree_model.h#L804

        attr_pairs['nodes_treeids'].append(tree_id)
        attr_pairs['nodes_nodeids'].append(node_id)
        attr_pairs['nodes_featureids'].append(feature_id)
        attr_pairs['nodes_modes'].append(mode)
        attr_pairs['nodes_values'].append(float(value))
        attr_pairs['nodes_truenodeids'].append(true_child_id)
        attr_pairs['nodes_falsenodeids'].append(false_child_id)
        attr_pairs['nodes_missing_value_tracks_true'].append(missing)
        if 'nodes_hitrates' in attr_pairs:
            attr_pairs['nodes_hitrates'].append(hitrate)  # pragma: no cover
        if mode == 'LEAF':
            if is_classifier:
                for i, w in enumerate(weights):
                    attr_pairs['class_treeids'].append(tree_id)
                    attr_pairs['class_nodeids'].append(node_id)
                    attr_pairs['class_ids'].append(i + weight_id_bias)
                    attr_pairs['class_weights'].append(float(tree_weight * w))
            else:
                for i, w in enumerate(weights):
                    attr_pairs['target_treeids'].append(tree_id)
                    attr_pairs['target_nodeids'].append(node_id)
                    attr_pairs['target_ids'].append(i + weight_id_bias)
                    attr_pairs['target_weights'].append(float(tree_weight * w))
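
    # Illustration of the split semantics documented above: for
    # split_condition=2.45, xgboost evaluates ``fvalue < 2.45``, so a sample
    # with fvalue == 2.45 goes to the 'no' branch. This is why the ONNX mode
    # is 'BRANCH_LT' and not sklearn's 'BRANCH_LEQ', which would route that
    # same sample to the true branch instead.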

    @staticmethod
    def _fill_node_attributes(treeid, tree_weight, jsnode, attr_pairs,
                              is_classifier, remap):
        if 'children' in jsnode:
            XGBConverter._add_node(
                attr_pairs=attr_pairs, is_classifier=is_classifier,
                tree_id=treeid, tree_weight=tree_weight,
                value=jsnode['split_condition'],
                node_id=remap[jsnode['nodeid']],
                feature_id=jsnode['split'],
                mode='BRANCH_LT',  # 'BRANCH_LEQ' --> is for sklearn
                # ['children'][0]['nodeid'],
                true_child_id=remap[jsnode['yes']],
                # ['children'][1]['nodeid'],
                false_child_id=remap[jsnode['no']],
                weights=None, weight_id_bias=None,
                # ['children'][0]['nodeid'],
                missing=jsnode.get('missing', -1) == jsnode['yes'],
                hitrate=jsnode.get('cover', 0))

            for ch in jsnode['children']:
                if 'children' in ch or 'leaf' in ch:
                    XGBConverter._fill_node_attributes(
                        treeid, tree_weight, ch, attr_pairs,
                        is_classifier, remap)
                else:
                    raise RuntimeError(  # pragma: no cover
                        "Unable to convert this node {0}".format(ch))
        else:
            weights = [jsnode['leaf']]
            weights_id_bias = 0
            XGBConverter._add_node(
                attr_pairs=attr_pairs, is_classifier=is_classifier,
                tree_id=treeid, tree_weight=tree_weight,
                value=0., node_id=remap[jsnode['nodeid']],
                feature_id=0, mode='LEAF',
                true_child_id=0, false_child_id=0,
                weights=weights, weight_id_bias=weights_id_bias,
                missing=False, hitrate=jsnode.get('cover', 0))

    @staticmethod
    def _remap_nodeid(jsnode, remap=None):
        if remap is None:
            remap = {}
        nid = jsnode['nodeid']
        remap[nid] = len(remap)
        if 'children' in jsnode:
            for ch in jsnode['children']:
                XGBConverter._remap_nodeid(ch, remap)
        return remap

    @staticmethod
    def fill_tree_attributes(js_xgb_node, attr_pairs, tree_weights,
                             is_classifier):
        "fills tree attributes"
        if not isinstance(js_xgb_node, list):
            raise TypeError(  # pragma: no cover
                "js_xgb_node must be a list")
        for treeid, (jstree, w) in enumerate(zip(js_xgb_node, tree_weights)):
            remap = XGBConverter._remap_nodeid(jstree)
            XGBConverter._fill_node_attributes(
                treeid, w, jstree, attr_pairs, is_classifier, remap)
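
# A sketch of what ``fill_tree_attributes`` accumulates (hypothetical values
# for a single two-leaf regression tree): all lists are aligned per node, the
# way the ONNX TreeEnsemble* operators expect their attributes.
#
# {'nodes_treeids': [0, 0, 0], 'nodes_nodeids': [0, 1, 2],
#  'nodes_featureids': [2, 0, 0],
#  'nodes_modes': ['BRANCH_LT', 'LEAF', 'LEAF'],
#  'nodes_values': [2.45, 0.0, 0.0], 'nodes_truenodeids': [1, 0, 0],
#  'nodes_falsenodeids': [2, 0, 0],
#  'nodes_missing_value_tracks_true': [True, False, False],
#  'target_treeids': [0, 0], 'target_nodeids': [1, 2],
#  'target_ids': [0, 0], 'target_weights': [0.42, -0.21]}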


class XGBRegressorConverter(XGBConverter):
    "converter for XGBRegressor"

    @staticmethod
    def validate(xgb_node):
        return XGBConverter.validate(xgb_node)

    @staticmethod
    def _get_default_tree_attribute_pairs():  # pylint: disable=W0221
        attrs = XGBConverter._get_default_tree_attribute_pairs(False)
        attrs['post_transform'] = 'NONE'
        attrs['n_targets'] = 1
        return attrs

    @staticmethod
    def convert(scope, operator, container):
        "converter method"
        dtype = guess_numpy_type(operator.inputs[0].type)
        if dtype != numpy.float64:
            dtype = numpy.float32
        xgb_node = operator.raw_operator
        inputs = operator.inputs

        objective, base_score, js_trees = XGBConverter.common_members(
            xgb_node, inputs)
        if objective in ["reg:gamma", "reg:tweedie"]:
            raise RuntimeError(  # pragma: no cover
                "Objective '{}' not supported.".format(objective))

        booster = xgb_node.get_booster()
        if booster is None:
            raise RuntimeError(  # pragma: no cover
                "The model was probably not trained.")
        best_ntree_limit = getattr(booster, 'best_ntree_limit', len(js_trees))
        if best_ntree_limit < len(js_trees):
            js_trees = js_trees[:best_ntree_limit]

        attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs()
        attr_pairs['base_values'] = [base_score]
        XGBConverter.fill_tree_attributes(
            js_trees, attr_pairs, [1 for _ in js_trees], False)

        # add nodes
        if dtype == numpy.float64:
            container.add_node(
                'TreeEnsembleRegressorDouble', operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(
                    'TreeEnsembleRegressorDouble'),
                op_domain='mlprodict', **attr_pairs)
        else:
            container.add_node(
                'TreeEnsembleRegressor', operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name('TreeEnsembleRegressor'),
                op_domain='ai.onnx.ml', **attr_pairs)


class XGBClassifierConverter(XGBConverter):
    "converter for XGBClassifier"

    @staticmethod
    def validate(xgb_node):
        return XGBConverter.validate(xgb_node)

    @staticmethod
    def _get_default_tree_attribute_pairs():  # pylint: disable=W0221
        attrs = XGBConverter._get_default_tree_attribute_pairs(True)
        # attrs['nodes_hitrates'] = []
        return attrs

    @staticmethod
    def convert(scope, operator, container):
        "convert method"
        dtype = guess_numpy_type(operator.inputs[0].type)
        if dtype != numpy.float64:
            dtype = numpy.float32
        xgb_node = operator.raw_operator
        inputs = operator.inputs

        objective, base_score, js_trees = XGBConverter.common_members(
            xgb_node, inputs)
        if base_score is None:
            raise RuntimeError(  # pragma: no cover
                "base_score cannot be None")
        params = XGBConverter.get_xgb_params(xgb_node)

        attr_pairs = XGBClassifierConverter._get_default_tree_attribute_pairs()
        XGBConverter.fill_tree_attributes(
            js_trees, attr_pairs, [1 for _ in js_trees], True)
        ncl = (max(attr_pairs['class_treeids']) + 1) // params['n_estimators']

        bst = xgb_node.get_booster()
        best_ntree_limit = getattr(
            bst, 'best_ntree_limit', len(js_trees)) * ncl
        if best_ntree_limit < len(js_trees):
            js_trees = js_trees[:best_ntree_limit]
            attr_pairs = \
                XGBClassifierConverter._get_default_tree_attribute_pairs()
            XGBConverter.fill_tree_attributes(
                js_trees, attr_pairs, [1 for _ in js_trees], True)

        if len(attr_pairs['class_treeids']) == 0:
            raise RuntimeError(  # pragma: no cover
                "XGBoost model is empty.")
        if 'n_estimators' not in params:
            raise RuntimeError(  # pragma: no cover
                "Parameters not found, existing:\n{}".format(
                    pformat(params)))
        if ncl <= 1:
            ncl = 2
            # See https://github.com/dmlc/xgboost/blob/master/src/common/math.h#L23.
            attr_pairs['post_transform'] = "LOGISTIC"
            attr_pairs['class_ids'] = [0 for v in attr_pairs['class_treeids']]
        else:
            # See https://github.com/dmlc/xgboost/blob/master/src/common/math.h#L35.
            attr_pairs['post_transform'] = "SOFTMAX"
            # attr_pairs['base_values'] = [base_score for n in range(ncl)]
            attr_pairs['class_ids'] = [v % ncl
                                       for v in attr_pairs['class_treeids']]

        classes = xgb_node.classes_
        if (numpy.issubdtype(classes.dtype, numpy.floating) or
                numpy.issubdtype(classes.dtype, numpy.signedinteger)):
            attr_pairs['classlabels_int64s'] = classes.astype('int')
        else:
            classes = numpy.array([s.encode('utf-8') for s in classes])
            attr_pairs['classlabels_strings'] = classes

        if dtype == numpy.float64:
            op_name = "TreeEnsembleClassifierDouble"
        else:
            op_name = "TreeEnsembleClassifier"

        # add nodes
        if objective == "binary:logistic":
            ncl = 2
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', **attr_pairs)
        elif objective == "multi:softprob":
            ncl = len(js_trees) // params['n_estimators']
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', **attr_pairs)
        elif objective == "reg:logistic":
            ncl = len(js_trees) // params['n_estimators']
            if ncl == 1:
                ncl = 2
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', **attr_pairs)
        else:
            raise RuntimeError(  # pragma: no cover
                "Unexpected objective: {0}".format(objective))
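
# Example of the class id mapping used above: for a multi-class model with
# n_estimators=2 and ncl=3, xgboost dumps 6 trees, one per class and per
# boosting round, ordered [round0_class0, round0_class1, round0_class2,
# round1_class0, ...]. ``class_ids = tree_id % ncl`` therefore assigns trees
# 0 and 3 to class 0, trees 1 and 4 to class 1, and trees 2 and 5 to class 2.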


def convert_xgboost(scope, operator, container):
    """
    This converter reuses the code from
    `XGBoost.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
    xgboost/operator_converters/XGBoost.py>`_
    and makes some modifications. It implements converters
    for models in :epkg:`xgboost`.
    """
    xgb_node = operator.raw_operator
    if isinstance(xgb_node, XGBClassifier):
        cls = XGBClassifierConverter
    else:
        cls = XGBRegressorConverter
    cls.convert(scope, operator, container)
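
# A minimal sketch of how ``convert_xgboost`` can be plugged into
# :epkg:`skl2onnx` (``model`` is assumed to be a trained ``XGBRegressor``
# and ``X`` its training matrix; the shape calculator comes from skl2onnx,
# not from this module):
#
# from skl2onnx import to_onnx, update_registered_converter
# from skl2onnx.common.shape_calculator import (
#     calculate_linear_regressor_output_shapes)
# from xgboost import XGBRegressor
#
# update_registered_converter(
#     XGBRegressor, 'XGBoostXGBRegressor',
#     calculate_linear_regressor_output_shapes, convert_xgboost)
# onx = to_onnx(model, X[:1].astype(numpy.float32))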