Source code for mlprodict.tools.model_info

"""
Functions to help get more information about the models.


:githublink:`%|py|5`
"""
import inspect
from collections import Counter
import numpy


def _analyse_tree(tree):
    """
    Extracts information from a tree.

    :githublink:`%|py|13`
    """
    info = {}
    if hasattr(tree, 'node_count'):
        info['node_count'] = tree.node_count

    n_nodes = tree.node_count
    children_left = tree.children_left
    children_right = tree.children_right
    node_depth = numpy.zeros(shape=n_nodes, dtype=numpy.int64)
    is_leaves = numpy.zeros(shape=n_nodes, dtype=bool)

    # Depth-first traversal: a node is a leaf when both children
    # point to the same index.
    stack = [(0, -1)]
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    info['leave_count'] = sum(is_leaves)
    info['max_depth'] = max(node_depth)
    return info

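# A minimal usage sketch (illustration only, not part of the original
# module; the helper name _example_analyse_tree is hypothetical):
# summarizing the fitted ``tree_`` attribute of a scikit-learn tree.
def _example_analyse_tree():  # pragma: no cover
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    X, y = load_iris(return_X_y=True)
    clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
    # Returns a dictionary with 'node_count', 'leave_count', 'max_depth'.
    return _analyse_tree(clf.tree_)
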
def _analyse_tree_h(tree):
    """
    Extracts information from a tree in a
    HistGradientBoosting.

    :githublink:`%|py|42`
    """
    info = {}
    info['leave_count'] = tree.get_n_leaf_nodes()
    info['node_count'] = len(tree.nodes)
    info['max_depth'] = tree.get_max_depth()
    return info

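# A minimal usage sketch (illustration only, not part of the original
# module; the helper name is hypothetical and a recent scikit-learn,
# where HistGradientBoosting* is no longer experimental, is assumed):
# the function expects one predictor of a fitted model, an element of
# the private nested attribute ``_predictors``, as *analyze_model*
# uses it below.
def _example_analyse_tree_h():  # pragma: no cover
    from sklearn.datasets import load_iris
    from sklearn.ensemble import HistGradientBoostingClassifier
    X, y = load_iris(return_X_y=True)
    model = HistGradientBoostingClassifier(max_iter=5).fit(X, y)
    # One TreePredictor per iteration and per class.
    return _analyse_tree_h(model._predictors[0][0])
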
def _reduce_infos(infos):
    """
    Produces aggregated features.

    :githublink:`%|py|53`
    """
    def tof(obj):
        try:
            return obj[0]
        except TypeError:  # pragma: no cover
            return obj

    if not isinstance(infos, list):
        raise TypeError(  # pragma: no cover
            "infos must be a list not {}.".format(type(infos)))
    keys = set()
    for info in infos:
        if not isinstance(info, dict):
            raise TypeError(  # pragma: no cover
                "info must be a dictionary not {}.".format(type(info)))
        keys |= set(info)

    info = {}
    for k in keys:
        values = [d.get(k, None) for d in infos]
        values = [_ for _ in values if _ is not None]
        if k.endswith('.leave_count') or k.endswith('.node_count'):
            info['sum|%s' % k] = sum(values)
        elif k.endswith('.max_depth'):
            info['max|%s' % k] = max(values)
        elif k.endswith('.size'):
            info['sum|%s' % k] = sum(values)  # pragma: no cover
        else:
            try:
                un = set(values)
            except TypeError:
                un = set()
            if len(un) == 1:
                info[k] = list(un)[0]
                continue
            if k.endswith('.shape'):
                row = [_[0] for _ in values]
                col = [_[1] for _ in values if len(_) > 1]
                if len(col) == 0:
                    info['max|%s' % k] = (max(row), )
                else:
                    info['max|%s' % k] = (max(row), max(col))
                continue
            if k == 'n_classes_':
                info['n_classes_'] = max(tof(_) for _ in values)
                continue
            raise NotImplementedError(  # pragma: no cover
                "Unable to reduce key '{}', values={}.".format(k, values))
    return info

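# A minimal sketch (illustration only, not part of the original module;
# the helper name is hypothetical) of how *_reduce_infos* aggregates
# per-estimator dictionaries: counts are summed, depths are maximized,
# values shared by every estimator are kept as they are.
def _example_reduce_infos():  # pragma: no cover
    infos = [
        {'tree_.node_count': 15, 'tree_.max_depth': 3, 'n_classes_': 3},
        {'tree_.node_count': 11, 'tree_.max_depth': 4, 'n_classes_': 3},
    ]
    # -> {'sum|tree_.node_count': 26, 'max|tree_.max_depth': 4,
    #     'n_classes_': 3}
    return _reduce_infos(infos)
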
def _get_info_lgb(model):
    """
    Gets information from :epkg:`lightgbm` trees.

    :githublink:`%|py|107`
    """
    from ..onnx_conv.operator_converters.conv_lightgbm import (
        _parse_tree_structure,
        get_default_tree_classifier_attribute_pairs
    )
    gbm_text = model.dump_model()

    info = {'objective': gbm_text['objective']}
    if gbm_text['objective'].startswith('binary'):
        info['n_classes'] = 1
    elif gbm_text['objective'].startswith('multiclass'):
        info['n_classes'] = gbm_text['num_class']
    elif gbm_text['objective'].startswith('regression'):
        info['n_targets'] = 1
    else:
        raise NotImplementedError(  # pragma: no cover
            "Unknown objective '{}'.".format(gbm_text['objective']))
    n_classes = info.get('n_classes', info.get('n_targets', -1))

    info['estimators_.size'] = len(gbm_text['tree_info'])
    attrs = get_default_tree_classifier_attribute_pairs()
    for i, tree in enumerate(gbm_text['tree_info']):
        tree_id = i
        class_id = tree_id % n_classes
        learning_rate = 1.
        _parse_tree_structure(
            tree_id, class_id, learning_rate, tree['tree_structure'], attrs)

    info['node_count'] = len(attrs['nodes_nodeids'])
    info['ntrees'] = len(set(attrs['nodes_treeids']))
    dist = Counter(attrs['nodes_modes'])
    info['leave_count'] = dist['LEAF']
    info['mode_count'] = len(dist)
    return info

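# A minimal usage sketch (illustration only, not part of the original
# module; assumes :epkg:`lightgbm` is installed and the helper name is
# hypothetical): the function expects the fitted ``booster_`` of a LGBM
# model, as *analyze_model* calls it below.
def _example_get_info_lgb():  # pragma: no cover
    from lightgbm import LGBMClassifier
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    model = LGBMClassifier(n_estimators=5).fit(X, y)
    # Expected keys include 'objective', 'n_classes', 'estimators_.size',
    # 'node_count', 'ntrees', 'leave_count', 'mode_count'.
    return _get_info_lgb(model.booster_)
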
def _get_info_xgb(model):
    """
    Gets information from :epkg:`xgboost` trees.

    :githublink:`%|py|146`
    """
    from ..onnx_conv.operator_converters.conv_xgboost import (
        XGBConverter, XGBClassifierConverter)
    objective, _, js_trees = XGBConverter.common_members(model, None)
    attrs = XGBClassifierConverter._get_default_tree_attribute_pairs()
    XGBConverter.fill_tree_attributes(
        js_trees, attrs, [1 for _ in js_trees], True)
    info = {'objective': objective}
    info['estimators_.size'] = len(js_trees)
    info['node_count'] = len(attrs['nodes_nodeids'])
    info['ntrees'] = len(set(attrs['nodes_treeids']))
    dist = Counter(attrs['nodes_modes'])
    info['leave_count'] = dist['LEAF']
    info['mode_count'] = len(dist)
    return info

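# A minimal usage sketch (illustration only, not part of the original
# module; assumes :epkg:`xgboost` is installed and the helper name is
# hypothetical): unlike *_get_info_lgb*, the function takes the
# scikit-learn wrapper itself, as *analyze_model* does below.
def _example_get_info_xgb():  # pragma: no cover
    from xgboost import XGBClassifier
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    model = XGBClassifier(n_estimators=5).fit(X, y)
    return _get_info_xgb(model)
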
def analyze_model(model, simplify=True):
    """
    Returns information and statistics about a model,
    its number of nodes, its size...

    :param model: any model
    :param simplify: replaces tuples of length 1 by their single value
    :return: dictionary

    .. exref::
        :title: Extract information from a model

        The function :func:`analyze_model
        <mlprodict.tools.model_info.analyze_model>` extracts global
        figures about a model, whatever it is.

        .. runpython::
            :showcode:

            import pprint
            from sklearn.datasets import load_iris
            from sklearn.ensemble import RandomForestClassifier
            from mlprodict.tools.model_info import analyze_model

            data = load_iris()
            X, y = data.data, data.target
            model = RandomForestClassifier().fit(X, y)
            infos = analyze_model(model)
            pprint.pprint(infos)

    :githublink:`%|py|191`
    """
    if hasattr(model, 'SerializeToString'):
        # ONNX model
        from ..onnxrt.optim.onnx_helper import onnx_statistics
        return onnx_statistics(model)

    if isinstance(model, numpy.ndarray):
        info = {'shape': model.shape}
        infos = []
        for v in model.ravel():
            if hasattr(v, 'fit'):
                ii = analyze_model(v, False)
                infos.append(ii)
        if len(infos) == 0:
            return info  # pragma: no cover
        for k, v in _reduce_infos(infos).items():
            info['.%s' % k] = v
        return info

    # linear model
    info = {}
    for k in model.__dict__:
        if k in ['tree_']:
            continue
        if k.endswith('_') and not k.startswith('_'):
            v = getattr(model, k)
            if isinstance(v, numpy.ndarray):
                info['%s.shape' % k] = v.shape
            elif isinstance(v, numpy.float64):
                info['%s.shape' % k] = 1
        elif k in ('_fit_X', ):
            v = getattr(model, k)
            info['%s.shape' % k] = v.shape

    # classification
    for f in ['n_classes_', 'n_outputs', 'n_features_']:
        if hasattr(model, f):
            info[f] = getattr(model, f)

    # tree
    if hasattr(model, 'tree_'):
        for k, v in _analyse_tree(model.tree_).items():
            info['tree_.%s' % k] = v

    # tree (HistGradientBoosting)
    if hasattr(model, 'get_n_leaf_nodes'):
        for k, v in _analyse_tree_h(model).items():
            info['tree_.%s' % k] = v

    # estimators
    if hasattr(model, 'estimators_'):
        info['estimators_.size'] = len(model.estimators_)
        infos = [analyze_model(est, False) for est in model.estimators_]
        for k, v in _reduce_infos(infos).items():
            info['estimators_.%s' % k] = v

    # predictors
    if hasattr(model, '_predictors'):
        info['_predictors.size'] = len(model._predictors)
        infos = []
        for est in model._predictors:
            ii = [analyze_model(e, False) for e in est]
            infos.extend(ii)
        for k, v in _reduce_infos(infos).items():
            info['_predictors.%s' % k] = v

    # LGBM
    if hasattr(model, 'booster_'):
        info.update(_get_info_lgb(model.booster_))

    # XGB
    if hasattr(model, 'get_booster'):
        info.update(_get_info_xgb(model))

    # end
    if simplify:
        up = {}
        for k, v in info.items():
            if isinstance(v, tuple) and len(v) == 1:
                up[k] = v[0]
        info.update(up)

    return info

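# A minimal sketch (illustration only, not part of the original module;
# assumes :epkg:`skl2onnx` is installed and the helper name is
# hypothetical): *analyze_model* also accepts a serialized ONNX model,
# detected through ``SerializeToString``, and then delegates to
# *onnx_statistics*.
def _example_analyze_onnx_model():  # pragma: no cover
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from skl2onnx import to_onnx
    X, y = load_iris(return_X_y=True)
    model = LogisticRegression(max_iter=500).fit(X, y)
    onx = to_onnx(model, X[:1].astype(numpy.float32))
    return analyze_model(onx)
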
def enumerate_models(model):
    """
    Enumerates a model and the models nested inside it,
    i.e. models given as parameters of the constructor.

    :param model: :epkg:`scikit-learn` model
    :return: enumerator on models

    :githublink:`%|py|282`
    """
    yield model
    sig = inspect.signature(model.__init__)
    for k in sig.parameters:
        sub = getattr(model, k, None)
        if sub is None:
            continue
        if not hasattr(sub, 'fit'):
            continue
        for m in enumerate_models(sub):
            yield m

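# A minimal usage sketch (illustration only, not part of the original
# module; the helper name is hypothetical): *enumerate_models* yields
# the model itself, then recursively any model given as a constructor
# parameter, such as the estimator wrapped by a grid search.
def _example_enumerate_models():  # pragma: no cover
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier
    model = GridSearchCV(DecisionTreeClassifier(), {'max_depth': [2, 3]})
    # [GridSearchCV(...), DecisionTreeClassifier(...)]
    return list(enumerate_models(model))
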
def set_random_state(model, value=0):
    """
    Sets every possible parameter *random_state* to *value*
    (0 by default).

    :param model: :epkg:`scikit-learn` model
    :param value: new value
    :return: model (same one)

    :githublink:`%|py|302`
    """
    for m in enumerate_models(model):
        sig = inspect.signature(m.__init__)
        hasit = any(filter(lambda p: p == 'random_state',
                           sig.parameters))
        if hasit and hasattr(m, 'random_state'):
            m.random_state = value
    return model

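# A minimal usage sketch (illustration only, not part of the original
# module; the helper name is hypothetical): fixing the seed of a model
# and of the estimator nested inside it in one call, which makes
# experiments reproducible.
def _example_set_random_state():  # pragma: no cover
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    model = BaggingClassifier(DecisionTreeClassifier())
    set_random_state(model, value=42)
    # Both the bagging model and its base estimator now hold
    # random_state=42.
    return model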