"""
Modified converter from
`LightGbm.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
lightgbm/operator_converters/LightGbm.py>`_.
:githublink:`%|py|7`
"""
from collections import Counter
import copy
import numbers
import numpy
from skl2onnx.common._apply_operation import apply_div, apply_reshape, apply_sub # pylint: disable=E0611
from skl2onnx.common.tree_ensemble import get_default_tree_classifier_attribute_pairs
from skl2onnx.proto import onnx_proto
from skl2onnx.common.shape_calculator import (
calculate_linear_regressor_output_shapes,
calculate_linear_classifier_output_shapes)
from skl2onnx.common.data_types import guess_numpy_type
def calculate_lightgbm_output_shapes(operator):
    """
    Shape calculator for LightGBM Booster
    (see :epkg:`lightgbm`).

    Dispatches to the linear classifier or regressor shape calculator
    depending on the booster objective.
    :githublink:`%|py|24`
    """
    model = operator.raw_operator
    # A wrapped booster exposes its dump through ``_model_dict``;
    # a scikit-learn estimator exposes ``objective_`` directly.
    if hasattr(model, "_model_dict"):
        objective = model._model_dict['objective']
    else:
        objective = model.objective_
    if objective.startswith(('binary', 'multiclass')):
        return calculate_linear_classifier_output_shapes(operator)
    if objective.startswith('regression'):  # pragma: no cover
        return calculate_linear_regressor_output_shapes(operator)
    raise NotImplementedError(  # pragma: no cover
        "Objective '{}' is not implemented yet.".format(objective))
[docs]def _translate_split_criterion(criterion):
# If the criterion is true, LightGBM use the left child. Otherwise, right child is selected.
if criterion == '<=':
return 'BRANCH_LEQ'
if criterion == '<': # pragma: no cover
return 'BRANCH_LT'
if criterion == '>=': # pragma: no cover
return 'BRANCH_GTE'
if criterion == '>': # pragma: no cover
return 'BRANCH_GT'
if criterion == '==': # pragma: no cover
return 'BRANCH_EQ'
if criterion == '!=': # pragma: no cover
return 'BRANCH_NEQ'
raise ValueError( # pragma: no cover
'Unsupported splitting criterion: %s. Only <=, <, >=, and > are allowed.')
[docs]def _create_node_id(node_id_pool):
i = 0
while i in node_id_pool:
i += 1
node_id_pool.add(i)
return i
def _parse_tree_structure(tree_id, class_id, learning_rate, tree_structure, attrs):
    """
    Parses one tree dumped by LightGBM and appends its description to
    *attrs* (the ONNX *TreeEnsemble* attribute lists).

    The pool of all nodes' indexes created when parsing a single tree.
    Different tree use different pools.
    :githublink:`%|py|68`
    """
    # Node ids are allocated per tree; ``node_pyid_pool`` maps python
    # object ids to allocated node ids so a shared subtree keeps one id.
    node_id_pool = set()
    node_pyid_pool = dict()
    node_id = _create_node_id(node_id_pool)
    node_pyid_pool[id(tree_structure)] = node_id
    # ``_parse_node`` handles both split nodes and leaves with exactly the
    # same logic this function previously duplicated inline for the root
    # (the dicts produced by ``dump_model`` never trigger its ``hasattr``
    # branches), so the root can simply be delegated to it.
    _parse_node(tree_id, class_id, node_id, node_id_pool, node_pyid_pool,
                learning_rate, tree_structure, attrs)
def _parse_node(tree_id, class_id, node_id, node_id_pool, node_pyid_pool,
                learning_rate, node, attrs):
    """
    Parses nodes.

    Recursively walks *node* (a dict from LightGBM's ``dump_model``) and
    appends its description to the parallel ``nodes_*`` / ``class_*``
    lists in *attrs*, the ONNX *TreeEnsemble* attribute dictionary.
    *node_id_pool* holds the node ids already used in the current tree;
    *node_pyid_pool* maps python object ids to node ids so that a child
    object shared by several parents is emitted only once.
    :githublink:`%|py|140`
    """
    # A split node must carry both children; the dicts from dump_model only
    # satisfy the 'in' test, the hasattr test guards against odd inputs.
    if (hasattr(node, 'left_child') and hasattr(node, 'right_child')) or \
            ('left_child' in node and 'right_child' in node):
        left_pyid = id(node['left_child'])
        right_pyid = id(node['right_child'])
        # Reuse the id of a child object seen before (shared subtree),
        # otherwise allocate a fresh node id and schedule the child for parsing.
        if left_pyid in node_pyid_pool:
            left_id = node_pyid_pool[left_pyid]
            left_parse = False
        else:
            left_id = _create_node_id(node_id_pool)
            node_pyid_pool[left_pyid] = left_id
            left_parse = True
        if right_pyid in node_pyid_pool:
            right_id = node_pyid_pool[right_pyid]
            right_parse = False
        else:
            right_id = _create_node_id(node_id_pool)
            node_pyid_pool[right_pyid] = right_id
            right_parse = True
        attrs['nodes_treeids'].append(tree_id)
        attrs['nodes_nodeids'].append(node_id)
        attrs['nodes_featureids'].append(node['split_feature'])
        attrs['nodes_modes'].append(
            _translate_split_criterion(node['decision_type']))
        # A string threshold must be numeric at this point; '||' sets are
        # expected to have been unfolded by modify_tree_for_rule_in_set.
        if isinstance(node['threshold'], str):
            try:  # pragma: no cover
                attrs['nodes_values'].append(  # pragma: no cover
                    float(node['threshold']))
            except ValueError as e:  # pragma: no cover
                import pprint
                # Include a (truncated) dump of the offending node to ease debugging.
                text = pprint.pformat(node)
                if len(text) > 99999:
                    text = text[:99999] + "\n..."
                raise TypeError("threshold must be a number not '{}'"
                                "\n{}".format(node['threshold'], text)) from e
        else:
            attrs['nodes_values'].append(node['threshold'])
        # Assume left is the true branch and right is the false branch
        attrs['nodes_truenodeids'].append(left_id)
        attrs['nodes_falsenodeids'].append(right_id)
        # default_left tells where missing values go in LightGBM.
        if node['default_left']:
            attrs['nodes_missing_value_tracks_true'].append(1)
        else:
            attrs['nodes_missing_value_tracks_true'].append(0)
        attrs['nodes_hitrates'].append(1.)
        # Recursively dive into the child nodes
        if left_parse:
            _parse_node(tree_id, class_id, left_id, node_id_pool, node_pyid_pool,
                        learning_rate, node['left_child'], attrs)
        if right_parse:
            _parse_node(tree_id, class_id, right_id, node_id_pool, node_pyid_pool,
                        learning_rate, node['right_child'], attrs)
    elif hasattr(node, 'left_child') or hasattr(node, 'right_child'):
        raise ValueError('Need two branches')  # pragma: no cover
    else:
        # Node attributes
        attrs['nodes_treeids'].append(tree_id)
        attrs['nodes_nodeids'].append(node_id)
        attrs['nodes_featureids'].append(0)
        attrs['nodes_modes'].append('LEAF')
        # Leaf node has no threshold. A zero is appended but it will never be used.
        attrs['nodes_values'].append(0.)
        # Leaf node has no child. A zero is appended but it will never be used.
        attrs['nodes_truenodeids'].append(0)
        # Leaf node has no child. A zero is appended but it will never be used.
        attrs['nodes_falsenodeids'].append(0)
        # Leaf node has no split function. A zero is appended but it will never be used.
        attrs['nodes_missing_value_tracks_true'].append(0)
        attrs['nodes_hitrates'].append(1.)
        # Leaf attributes
        attrs['class_treeids'].append(tree_id)
        attrs['class_nodeids'].append(node_id)
        attrs['class_ids'].append(class_id)
        attrs['class_weights'].append(
            float(node['leaf_value']) * learning_rate)
def convert_lightgbm(scope, operator, container):
    """
    This converters reuses the code from
    `LightGbm.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
    lightgbm/operator_converters/LightGbm.py>`_ and makes
    some modifications. It implements converters
    for models in :epkg:`lightgbm`.

    :param scope: skl2onnx scope, used to create unique variable
        and operator names
    :param operator: skl2onnx operator wrapping the LightGBM model
    :param container: ONNX model container receiving nodes and initializers
    :raises RuntimeError: if the booster objective is not binary,
        multiclass or regression
    :raises ValueError: if class labels are neither all integers
        nor all strings
    :githublink:`%|py|232`
    """
    gbm_model = operator.raw_operator
    # JSON-like dump of the trained booster (dict of dicts).
    gbm_text = gbm_model.booster_.dump_model()
    # Unfold '==' rules over value sets ('a||b||c') into chained binary splits.
    modify_tree_for_rule_in_set(gbm_text, use_float=True)
    attrs = get_default_tree_classifier_attribute_pairs()
    attrs['name'] = operator.full_name
    # Create different attributes for classifier and regressor, respectively
    if gbm_text['objective'].startswith('binary'):
        n_classes = 1
        attrs['post_transform'] = 'LOGISTIC'
    elif gbm_text['objective'].startswith('multiclass'):
        n_classes = gbm_text['num_class']
        attrs['post_transform'] = 'SOFTMAX'
    elif gbm_text['objective'].startswith('regression'):
        n_classes = 1  # Regressor has only one output variable
        attrs['post_transform'] = 'NONE'
        attrs['n_targets'] = n_classes
    else:
        raise RuntimeError(  # pragma: no cover
            "LightGBM objective should be cleaned already not '{}'.".format(
                gbm_text['objective']))
    # Use the same algorithm to parse the tree
    for i, tree in enumerate(gbm_text['tree_info']):
        tree_id = i
        # In multiclass mode LightGBM interleaves one tree per class.
        class_id = tree_id % n_classes
        # tree['shrinkage'] --> LightGbm provides figures with it already.
        learning_rate = 1.
        _parse_tree_structure(
            tree_id, class_id, learning_rate, tree['tree_structure'], attrs)
    # Sort nodes_* attributes. For one tree, its node indexes should appear in an ascent order in nodes_nodeids. Nodes
    # from a tree with a smaller tree index should appear before trees with larger indexes in nodes_nodeids.
    node_numbers_per_tree = Counter(attrs['nodes_treeids'])
    tree_number = len(node_numbers_per_tree.keys())
    # accumulated_node_numbers[t] = number of nodes in trees 0..t-1,
    # i.e. the global offset of tree t's first node.
    accumulated_node_numbers = [0] * tree_number
    for i in range(1, tree_number):
        accumulated_node_numbers[i] = (accumulated_node_numbers[i - 1] +
                                       node_numbers_per_tree[i - 1])
    global_node_indexes = []
    for i in range(len(attrs['nodes_nodeids'])):
        tree_id = attrs['nodes_treeids'][i]
        node_id = attrs['nodes_nodeids'][i]
        global_node_indexes.append(accumulated_node_numbers[tree_id] + node_id)
    # Reorder every nodes_* list by the global node index computed above.
    for k, v in attrs.items():
        if k.startswith('nodes_'):
            merged_indexes = zip(copy.deepcopy(global_node_indexes), v)
            sorted_list = [pair[1]
                           for pair in sorted(merged_indexes, key=lambda x: x[0])]
            attrs[k] = sorted_list
    # Double inputs map onto the custom 'mlprodict' double operators below;
    # anything else falls back to float32 and the standard ai.onnx.ml ops.
    dtype = guess_numpy_type(operator.inputs[0].type)
    if dtype != numpy.float64:
        dtype = numpy.float32
    # Create ONNX object
    if (gbm_text['objective'].startswith('binary') or
            gbm_text['objective'].startswith('multiclass')):
        # Prepare label information for both of TreeEnsembleClassifier
        # and ZipMap
        class_type = onnx_proto.TensorProto.STRING  # pylint: disable=E1101
        zipmap_attrs = {'name': scope.get_unique_variable_name('ZipMap')}
        if all(isinstance(i, (numbers.Real, bool, numpy.bool_))
               for i in gbm_model.classes_):
            class_type = onnx_proto.TensorProto.INT64  # pylint: disable=E1101
            class_labels = [int(i) for i in gbm_model.classes_]
            attrs['classlabels_int64s'] = class_labels
            zipmap_attrs['classlabels_int64s'] = class_labels
        elif all(isinstance(i, str) for i in gbm_model.classes_):
            class_labels = [str(i) for i in gbm_model.classes_]
            attrs['classlabels_strings'] = class_labels
            zipmap_attrs['classlabels_strings'] = class_labels
        else:
            raise ValueError(  # pragma: no cover
                'Only string and integer class labels are allowed')
        # Create tree classifier
        probability_tensor_name = scope.get_unique_variable_name(
            'probability_tensor')
        label_tensor_name = scope.get_unique_variable_name('label_tensor')
        if dtype == numpy.float64:
            container.add_node('TreeEnsembleClassifierDouble', operator.input_full_names,
                               [label_tensor_name, probability_tensor_name],
                               op_domain='mlprodict', **attrs)
        else:
            container.add_node('TreeEnsembleClassifier', operator.input_full_names,
                               [label_tensor_name, probability_tensor_name],
                               op_domain='ai.onnx.ml', **attrs)
        prob_tensor = probability_tensor_name
        if gbm_model.boosting_type == 'rf':
            # NOTE(review): random-forest scores are rescaled by dividing
            # by 100 and the label is recomputed with an ArgMax over the
            # two rebuilt probability columns — presumably matching
            # LightGBM's rf score scaling; confirm against LightGBM output.
            col_index_name = scope.get_unique_variable_name('col_index')
            first_col_name = scope.get_unique_variable_name('first_col')
            zeroth_col_name = scope.get_unique_variable_name('zeroth_col')
            denominator_name = scope.get_unique_variable_name('denominator')
            modified_first_col_name = scope.get_unique_variable_name(
                'modified_first_col')
            unit_float_tensor_name = scope.get_unique_variable_name(
                'unit_float_tensor')
            merged_prob_name = scope.get_unique_variable_name('merged_prob')
            predicted_label_name = scope.get_unique_variable_name(
                'predicted_label')
            classes_name = scope.get_unique_variable_name('classes')
            final_label_name = scope.get_unique_variable_name('final_label')
            container.add_initializer(
                col_index_name, onnx_proto.TensorProto.INT64, [], [1])  # pylint: disable=E1101
            container.add_initializer(
                unit_float_tensor_name, onnx_proto.TensorProto.FLOAT, [], [1.0])  # pylint: disable=E1101
            container.add_initializer(
                denominator_name, onnx_proto.TensorProto.FLOAT, [], [100.0])  # pylint: disable=E1101
            container.add_initializer(classes_name, class_type,
                                      [len(class_labels)], class_labels)
            # Extract the positive-class column, rescale it, and derive the
            # complementary column so probabilities sum to 1.
            container.add_node('ArrayFeatureExtractor', [probability_tensor_name, col_index_name],
                               first_col_name, name=scope.get_unique_operator_name(
                                   'ArrayFeatureExtractor'),
                               op_domain='ai.onnx.ml')
            apply_div(scope, [first_col_name, denominator_name],
                      modified_first_col_name, container, broadcast=1)
            apply_sub(scope, [unit_float_tensor_name, modified_first_col_name],
                      zeroth_col_name, container, broadcast=1)
            container.add_node('Concat', [zeroth_col_name, modified_first_col_name],
                               merged_prob_name, name=scope.get_unique_operator_name('Concat'), axis=1)
            container.add_node('ArgMax', merged_prob_name,
                               predicted_label_name, name=scope.get_unique_operator_name('ArgMax'), axis=1)
            container.add_node('ArrayFeatureExtractor', [classes_name, predicted_label_name], final_label_name,
                               name=scope.get_unique_operator_name('ArrayFeatureExtractor'), op_domain='ai.onnx.ml')
            apply_reshape(scope, final_label_name,
                          operator.outputs[0].full_name, container, desired_shape=[-1, ])
            prob_tensor = merged_prob_name
        else:
            container.add_node('Identity', label_tensor_name,
                               operator.outputs[0].full_name,
                               name=scope.get_unique_operator_name('Identity'))
        # Convert probability tensor to probability map
        # (keys are labels while values are the associated probabilities)
        container.add_node('Identity', prob_tensor,
                           operator.outputs[1].full_name)
    else:
        # Create tree regressor
        output_name = scope.get_unique_variable_name('output')
        keys_to_be_renamed = list(
            k for k in attrs if k.startswith('class_'))
        for k in keys_to_be_renamed:
            # Rename class_* attributes to target_* because
            # TreeEnsembleClassifier and TreeEnsembleRegressor
            # use different ONNX attribute names.
            attrs['target' + k[5:]] = copy.deepcopy(attrs[k])
            del attrs[k]
        if dtype == numpy.float64:
            container.add_node('TreeEnsembleRegressorDouble', operator.input_full_names,
                               output_name, op_domain='mlprodict', **attrs)
        else:
            container.add_node('TreeEnsembleRegressor', operator.input_full_names,
                               output_name, op_domain='ai.onnx.ml', **attrs)
        if gbm_model.boosting_type == 'rf':
            # NOTE(review): same /100 rescaling as the classifier rf branch
            # — confirm against LightGBM rf output scaling.
            denominator_name = scope.get_unique_variable_name('denominator')
            container.add_initializer(
                denominator_name, onnx_proto.TensorProto.FLOAT, [], [100.0])  # pylint: disable=E1101
            apply_div(scope, [output_name, denominator_name],
                      operator.output_full_names, container, broadcast=1)
        else:
            container.add_node('Identity', output_name,
                               operator.output_full_names,
                               name=scope.get_unique_operator_name('Identity'))
def modify_tree_for_rule_in_set(gbm, use_float=False):  # pylint: disable=R1710
    """
    LightGBM produces sometimes a tree with a node set
    to use rule ``==`` to a set of values (= in set),
    the values are separated by ``||``.
    This function unfold theses nodes. A child looks
    like the following:

    .. runpython::
        :showcode:

        import pprint
        from mlprodict.onnx_conv.operator_converters.conv_lightgbm import modify_tree_for_rule_in_set

        tree = {'decision_type': '==',
                'default_left': True,
                'internal_count': 6805,
                'internal_value': 0.117558,
                'left_child': {'leaf_count': 4293,
                               'leaf_index': 18,
                               'leaf_value': 0.003519117642745049},
                'missing_type': 'None',
                'right_child': {'leaf_count': 2512,
                                'leaf_index': 25,
                                'leaf_value': 0.012305307958365394},
                'split_feature': 24,
                'split_gain': 12.233599662780762,
                'split_index': 24,
                'threshold': '10||12||13'}

        modify_tree_for_rule_in_set(tree)
        pprint.pprint(tree)

    :param gbm: a full booster dump, a single tree dump or a node dict
        (the function dispatches on the keys present)
    :param use_float: convert every threshold to float; otherwise
        integers are tried first
    :githublink:`%|py|441`
    """
    # Dispatch on the dictionary shape: whole model, single tree, or node.
    if 'tree_info' in gbm:
        for subtree in gbm['tree_info']:
            modify_tree_for_rule_in_set(subtree, use_float=use_float)
        return
    if 'tree_structure' in gbm:
        modify_tree_for_rule_in_set(gbm['tree_structure'], use_float=use_float)
        return
    if 'decision_type' not in gbm:
        # Leaf node: nothing to unfold.
        return

    def _descend(node):
        # Recurse into whichever children exist.
        for side in ('left_child', 'right_child'):
            if side in node:
                modify_tree_for_rule_in_set(node[side], use_float=use_float)

    def _convert(text):
        if use_float:
            return float(text)
        try:
            return int(text)
        except ValueError:  # pragma: no cover
            return float(text)

    if gbm['decision_type'] != '==':
        return _descend(gbm)
    threshold = gbm['threshold']
    if not isinstance(threshold, str) or '||' not in threshold:
        return _descend(gbm)
    # Peel off the first value of the set; the remainder becomes a new
    # '==' node inserted as the right child (sharing the other children),
    # which the recursion below unfolds in turn.
    first, remainder = threshold.split('||', 1)
    gbm['threshold'] = _convert(first)
    duplicate = gbm.copy()
    gbm['right_child'] = duplicate
    duplicate['threshold'] = (
        remainder if '||' in remainder else _convert(remainder))
    return _descend(gbm)