Source code for mlprodict.tools.model_info

"""
Functions to help get more information about the models.


:githublink:`%|py|5`
"""
import inspect
from collections import Counter
import numpy


def _analyse_tree(tree):
    """
    Extracts information from a tree.

    :githublink:`%|py|13`
    """
    info = {}
    if hasattr(tree, 'node_count'):
        info['node_count'] = tree.node_count

    n_nodes = tree.node_count
    children_left = tree.children_left
    children_right = tree.children_right
    node_depth = numpy.zeros(shape=n_nodes, dtype=numpy.int64)
    is_leaves = numpy.zeros(shape=n_nodes, dtype=bool)

    # Depth-first traversal: a node is a leaf when both children
    # point to the same index.
    stack = [(0, -1)]
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    info['leave_count'] = sum(is_leaves)
    info['max_depth'] = max(node_depth)
    return info

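# A minimal usage sketch (illustration only, not part of the original
# module; the helper name _example_analyse_tree is hypothetical):
# summarizing the fitted ``tree_`` attribute of a scikit-learn tree.
def _example_analyse_tree():  # pragma: no cover
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    X, y = load_iris(return_X_y=True)
    clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
    # Returns a dictionary with 'node_count', 'leave_count', 'max_depth'.
    return _analyse_tree(clf.tree_)
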
def _analyse_tree_h(tree):
    """
    Extracts information from a tree in a
    HistGradientBoosting.

    :githublink:`%|py|42`
    """
    info = {}
    info['leave_count'] = tree.get_n_leaf_nodes()
    info['node_count'] = len(tree.nodes)
    info['max_depth'] = tree.get_max_depth()
    return info

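# A minimal usage sketch (illustration only, not part of the original
# module; the helper name is hypothetical and a recent scikit-learn,
# where HistGradientBoosting* is no longer experimental, is assumed):
# the function expects one predictor of a fitted model, an element of
# the private nested attribute ``_predictors``, as *analyze_model*
# uses it below.
def _example_analyse_tree_h():  # pragma: no cover
    from sklearn.datasets import load_iris
    from sklearn.ensemble import HistGradientBoostingClassifier
    X, y = load_iris(return_X_y=True)
    model = HistGradientBoostingClassifier(max_iter=5).fit(X, y)
    # One TreePredictor per iteration and per class.
    return _analyse_tree_h(model._predictors[0][0])
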
def _reduce_infos(infos):
    """
    Produces aggregated features.

    :githublink:`%|py|53`
    """
    def tof(obj):
        try:
            return obj[0]
        except TypeError:  # pragma: no cover
            return obj

    if not isinstance(infos, list):
        raise TypeError(  # pragma: no cover
            "infos must be a list not {}.".format(type(infos)))
    keys = set()
    for info in infos:
        if not isinstance(info, dict):
            raise TypeError(  # pragma: no cover
                "info must be a dictionary not {}.".format(type(info)))
        keys |= set(info)

    info = {}
    for k in keys:
        values = [d.get(k, None) for d in infos]
        values = [_ for _ in values if _ is not None]
        if k.endswith('.leave_count') or k.endswith('.node_count'):
            info['sum|%s' % k] = sum(values)
        elif k.endswith('.max_depth'):
            info['max|%s' % k] = max(values)
        elif k.endswith('.size'):
            info['sum|%s' % k] = sum(values)  # pragma: no cover
        else:
            try:
                un = set(values)
            except TypeError:
                un = set()
            if len(un) == 1:
                info[k] = list(un)[0]
                continue
            if k.endswith('.shape'):
                row = [_[0] for _ in values]
                col = [_[1] for _ in values if len(_) > 1]
                if len(col) == 0:
                    info['max|%s' % k] = (max(row), )
                else:
                    info['max|%s' % k] = (max(row), max(col))
                continue
            if k == 'n_classes_':
                info['n_classes_'] = max(tof(_) for _ in values)
                continue
            raise NotImplementedError(  # pragma: no cover
                "Unable to reduce key '{}', values={}.".format(k, values))
    return info

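# A minimal sketch (illustration only, not part of the original module;
# the helper name is hypothetical) of how *_reduce_infos* aggregates
# per-estimator dictionaries: counts are summed, depths are maximized,
# values shared by every estimator are kept as they are.
def _example_reduce_infos():  # pragma: no cover
    infos = [
        {'tree_.node_count': 15, 'tree_.max_depth': 3, 'n_classes_': 3},
        {'tree_.node_count': 11, 'tree_.max_depth': 4, 'n_classes_': 3},
    ]
    # -> {'sum|tree_.node_count': 26, 'max|tree_.max_depth': 4,
    #     'n_classes_': 3}
    return _reduce_infos(infos)
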
def _get_info_lgb(model):
    """
    Gets information from :epkg:`lightgbm` trees.

    :githublink:`%|py|107`
    """
    from ..onnx_conv.operator_converters.conv_lightgbm import (
        _parse_tree_structure,
        get_default_tree_classifier_attribute_pairs
    )
    gbm_text = model.dump_model()

    info = {'objective': gbm_text['objective']}
    if gbm_text['objective'].startswith('binary'):
        info['n_classes'] = 1
    elif gbm_text['objective'].startswith('multiclass'):
        info['n_classes'] = gbm_text['num_class']
    elif gbm_text['objective'].startswith('regression'):
        info['n_targets'] = 1
    else:
        raise NotImplementedError(  # pragma: no cover
            "Unknown objective '{}'.".format(gbm_text['objective']))
    n_classes = info.get('n_classes', info.get('n_targets', -1))

    info['estimators_.size'] = len(gbm_text['tree_info'])
    attrs = get_default_tree_classifier_attribute_pairs()
    for i, tree in enumerate(gbm_text['tree_info']):
        tree_id = i
        class_id = tree_id % n_classes
        learning_rate = 1.
        _parse_tree_structure(
            tree_id, class_id, learning_rate, tree['tree_structure'], attrs)

    info['node_count'] = len(attrs['nodes_nodeids'])
    info['ntrees'] = len(set(attrs['nodes_treeids']))
    dist = Counter(attrs['nodes_modes'])
    info['leave_count'] = dist['LEAF']
    info['mode_count'] = len(dist)
    return info

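# A minimal usage sketch (illustration only, not part of the original
# module; assumes :epkg:`lightgbm` is installed and the helper name is
# hypothetical): the function expects the fitted ``booster_`` of a LGBM
# model, as *analyze_model* calls it below.
def _example_get_info_lgb():  # pragma: no cover
    from lightgbm import LGBMClassifier
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    model = LGBMClassifier(n_estimators=5).fit(X, y)
    # Expected keys include 'objective', 'n_classes', 'estimators_.size',
    # 'node_count', 'ntrees', 'leave_count', 'mode_count'.
    return _get_info_lgb(model.booster_)
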
def _get_info_xgb(model):
    """
    Gets information from :epkg:`xgboost` trees.

    :githublink:`%|py|146`
    """
    from ..onnx_conv.operator_converters.conv_xgboost import (
        XGBConverter, XGBClassifierConverter)
    objective, _, js_trees = XGBConverter.common_members(model, None)
    attrs = XGBClassifierConverter._get_default_tree_attribute_pairs()
    XGBConverter.fill_tree_attributes(
        js_trees, attrs, [1 for _ in js_trees], True)
    info = {'objective': objective}
    info['estimators_.size'] = len(js_trees)
    info['node_count'] = len(attrs['nodes_nodeids'])
    info['ntrees'] = len(set(attrs['nodes_treeids']))
    dist = Counter(attrs['nodes_modes'])
    info['leave_count'] = dist['LEAF']
    info['mode_count'] = len(dist)
    return info

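# A minimal usage sketch (illustration only, not part of the original
# module; assumes :epkg:`xgboost` is installed and the helper name is
# hypothetical): unlike *_get_info_lgb*, the function takes the
# scikit-learn wrapper itself, as *analyze_model* does below.
def _example_get_info_xgb():  # pragma: no cover
    from xgboost import XGBClassifier
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    model = XGBClassifier(n_estimators=5).fit(X, y)
    return _get_info_xgb(model)
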
def analyze_model(model, simplify=True):
    """
    Returns information and statistics about a model,
    its number of nodes, its size...

    :param model: any model
    :param simplify: replaces tuples of length 1 by their single value
    :return: dictionary

    .. exref::
        :title: Extract information from a model

        The function :func:`analyze_model
        <mlprodict.tools.model_info.analyze_model>` extracts global
        figures about a model, whatever it is.

        .. runpython::
            :showcode:

            import pprint
            from sklearn.datasets import load_iris
            from sklearn.ensemble import RandomForestClassifier
            from mlprodict.tools.model_info import analyze_model

            data = load_iris()
            X, y = data.data, data.target
            model = RandomForestClassifier().fit(X, y)
            infos = analyze_model(model)
            pprint.pprint(infos)

    :githublink:`%|py|191`
    """
    if hasattr(model, 'SerializeToString'):
        # ONNX model
        from ..onnxrt.optim.onnx_helper import onnx_statistics
        return onnx_statistics(model)

    if isinstance(model, numpy.ndarray):
        info = {'shape': model.shape}
        infos = []
        for v in model.ravel():
            if hasattr(v, 'fit'):
                ii = analyze_model(v, False)
                infos.append(ii)
        if len(infos) == 0:
            return info  # pragma: no cover
        for k, v in _reduce_infos(infos).items():
            info['.%s' % k] = v
        return info

    # linear model
    info = {}
    for k in model.__dict__:
        if k in ['tree_']:
            continue
        if k.endswith('_') and not k.startswith('_'):
            v = getattr(model, k)
            if isinstance(v, numpy.ndarray):
                info['%s.shape' % k] = v.shape
            elif isinstance(v, numpy.float64):
                info['%s.shape' % k] = 1
        elif k in ('_fit_X', ):
            v = getattr(model, k)
            info['%s.shape' % k] = v.shape

    # classification
    for f in ['n_classes_', 'n_outputs', 'n_features_']:
        if hasattr(model, f):
            info[f] = getattr(model, f)

    # tree
    if hasattr(model, 'tree_'):
        for k, v in _analyse_tree(model.tree_).items():
            info['tree_.%s' % k] = v

    # tree (HistGradientBoosting)
    if hasattr(model, 'get_n_leaf_nodes'):
        for k, v in _analyse_tree_h(model).items():
            info['tree_.%s' % k] = v

    # estimators
    if hasattr(model, 'estimators_'):
        info['estimators_.size'] = len(model.estimators_)
        infos = [analyze_model(est, False) for est in model.estimators_]
        for k, v in _reduce_infos(infos).items():
            info['estimators_.%s' % k] = v

    # predictors
    if hasattr(model, '_predictors'):
        info['_predictors.size'] = len(model._predictors)
        infos = []
        for est in model._predictors:
            ii = [analyze_model(e, False) for e in est]
            infos.extend(ii)
        for k, v in _reduce_infos(infos).items():
            info['_predictors.%s' % k] = v

    # LGBM
    if hasattr(model, 'booster_'):
        info.update(_get_info_lgb(model.booster_))

    # XGB
    if hasattr(model, 'get_booster'):
        info.update(_get_info_xgb(model))

    # end
    if simplify:
        up = {}
        for k, v in info.items():
            if isinstance(v, tuple) and len(v) == 1:
                up[k] = v[0]
        info.update(up)

    return info

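# A minimal sketch (illustration only, not part of the original module;
# assumes :epkg:`skl2onnx` is installed and the helper name is
# hypothetical): *analyze_model* also accepts a serialized ONNX model,
# detected through ``SerializeToString``, and then delegates to
# *onnx_statistics*.
def _example_analyze_onnx_model():  # pragma: no cover
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from skl2onnx import to_onnx
    X, y = load_iris(return_X_y=True)
    model = LogisticRegression(max_iter=500).fit(X, y)
    onx = to_onnx(model, X[:1].astype(numpy.float32))
    return analyze_model(onx)
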
def enumerate_models(model):
    """
    Enumerates a model and the models nested inside it,
    i.e. models given as parameters of the constructor.

    :param model: :epkg:`scikit-learn` model
    :return: enumerator on models

    :githublink:`%|py|282`
    """
    yield model
    sig = inspect.signature(model.__init__)
    for k in sig.parameters:
        sub = getattr(model, k, None)
        if sub is None:
            continue
        if not hasattr(sub, 'fit'):
            continue
        for m in enumerate_models(sub):
            yield m

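# A minimal usage sketch (illustration only, not part of the original
# module; the helper name is hypothetical): *enumerate_models* yields
# the model itself, then recursively any model given as a constructor
# parameter, such as the estimator wrapped by a grid search.
def _example_enumerate_models():  # pragma: no cover
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier
    model = GridSearchCV(DecisionTreeClassifier(), {'max_depth': [2, 3]})
    # [GridSearchCV(...), DecisionTreeClassifier(...)]
    return list(enumerate_models(model))
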
def set_random_state(model, value=0):
    """
    Sets every possible parameter *random_state* to *value*
    (0 by default).

    :param model: :epkg:`scikit-learn` model
    :param value: new value
    :return: model (same one)

    :githublink:`%|py|302`
    """
    for m in enumerate_models(model):
        sig = inspect.signature(m.__init__)
        hasit = any(filter(lambda p: p == 'random_state',
                           sig.parameters))
        if hasit and hasattr(m, 'random_state'):
            m.random_state = value
    return model

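# A minimal usage sketch (illustration only, not part of the original
# module; the helper name is hypothetical): fixing the seed of a model
# and of the estimator nested inside it in one call, which makes
# experiments reproducible.
def _example_set_random_state():  # pragma: no cover
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    model = BaggingClassifier(DecisionTreeClassifier())
    set_random_state(model, value=42)
    # Both the bagging model and its base estimator now hold
    # random_state=42.
    return model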