Source code for mlinsights.mltree.tree_structure

"""
Helpers to investigate a tree structure.


:githublink:`%|py|5`
"""
import numpy
from sklearn.tree._tree import TREE_LEAF  # pylint: disable=E0611


def _get_tree(obj):
    """
    Returns the tree object.
    """
    if hasattr(obj, "children_left"):
        return obj
    if hasattr(obj, "tree_"):
        return obj.tree_
    raise AttributeError("obj is no tree: {}".format(type(obj)))
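
A minimal sketch of how this private helper behaves; the classifier and
data below are only illustrative::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import _get_tree

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])

    # both calls resolve to the same low-level Tree object
    assert _get_tree(clr) is _get_tree(clr.tree_)
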
def tree_leave_index(model):
    """
    Returns the indices of every leaf in a tree.

    :param model: something which has a member ``tree_``
    :return: leaf indices
    """
    tree = _get_tree(model)
    res = []
    for i in range(tree.node_count):
        if tree.children_left[i] == TREE_LEAF:
            res.append(i)
    return res
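
A short usage sketch (illustrative data; the exact indices depend on the
fitted tree)::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree import tree_leave_index

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    print(tree_leave_index(clr))  # e.g. [1, 2]
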
def tree_find_path_to_root(tree, i, parents=None):
    """
    Lists the nodes involved in the path from the root to node *i*.

    :param tree: tree
    :param i: node index (``tree.nodes[i]``)
    :param parents: precomputed parents (None -> calls
        :func:`tree_node_parents
        <mlinsights.mltree.tree_structure.tree_node_parents>`)
    :return: the list of node indices from the root down to node *i*
    """
    tree = _get_tree(tree)
    if parents is None:
        parents = tree_node_parents(tree)
    path_i = [i]
    current_i = i
    while current_i in parents:
        current_i = parents[current_i]
        if current_i < 0:
            # right children are stored with a negative parent id
            current_i = -current_i
        path_i.append(current_i)
    return list(reversed(path_i))
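
For instance, on a small fitted tree, the returned path always starts at
the root (node 0); the data is only illustrative::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import (
        tree_leave_index, tree_find_path_to_root, tree_node_parents)

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    parents = tree_node_parents(clr)
    leaf = tree_leave_index(clr)[-1]
    print(tree_find_path_to_root(clr, leaf, parents))  # e.g. [0, 2]
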
def tree_find_common_node(tree, i, j, parents=None):
    """
    Finds the common node to nodes *i* and *j*.

    :param tree: tree
    :param i: node index (``tree.nodes[i]``)
    :param j: node index (``tree.nodes[j]``)
    :param parents: precomputed parents (None -> calls
        :func:`tree_node_parents
        <mlinsights.mltree.tree_structure.tree_node_parents>`)
    :return: common root, remaining path to *i*, remaining path to *j*
    """
    tree = _get_tree(tree)
    if parents is None:
        parents = tree_node_parents(tree)
    path_i = tree_find_path_to_root(tree, i, parents)
    path_j = tree_find_path_to_root(tree, j, parents)
    for pos, (a, b) in enumerate(zip(path_i, path_j)):
        if a != b:
            return a, path_i[pos:], path_j[pos:]
    pi = parents.get(i, None)
    pj = parents.get(j, None)
    pos = min(len(path_i), len(path_j))
    if pi is not None and pi == j:
        return j, path_i[pos:], path_j[pos:]
    if pj is not None and pj == i:
        return i, path_i[pos:], path_j[pos:]
    raise RuntimeError(  # pragma: no cover
        "Paths are equal, i={} and j={} must be different.".format(i, j))
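
A short usage sketch with two leaves of the same small tree as above; the
printed values depend on the fitted structure::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import (
        tree_leave_index, tree_find_common_node)

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    leaves = tree_leave_index(clr)
    node, path_i, path_j = tree_find_common_node(clr, leaves[0], leaves[-1])
    print(node, path_i, path_j)
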
def tree_node_parents(tree):
    """
    Returns a dictionary ``{node_id: parent_id}``.
    A right child is stored with the opposite of its parent id
    so that the side can be recovered from the sign.

    :param tree: tree
    :return: parents
    """
    tree = _get_tree(tree)
    parents = {}
    for i in range(tree.node_count):
        if tree.children_left[i] == TREE_LEAF:
            continue
        parents[tree.children_left[i]] = i
        parents[tree.children_right[i]] = -i
    return parents
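
The sign convention can be checked on a small tree; note that the sign is
lost for children of the root since ``-0 == 0`` (illustrative data)::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import tree_node_parents

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    print(tree_node_parents(clr))  # e.g. {1: 0, 2: 0}
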
def tree_node_range(tree, i, parents=None):
    """
    Determines the ranges of a node over all dimensions.
    ``nan`` means infinity.

    :param tree: tree
    :param i: node index (``tree.nodes[i]``)
    :param parents: precomputed parents (None -> calls
        :func:`tree_node_parents
        <mlinsights.mltree.tree_structure.tree_node_parents>`)
    :return: one array of size *(D, 2)* where *D* is the number of dimensions

    The following example shows what the function returns
    in case of a simple grid in two dimensions.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import DecisionTreeClassifier
        from mlinsights.mltree import tree_leave_index, tree_node_range

        X = numpy.array([[0, 0], [0, 1], [0, 2],
                         [1, 0], [1, 1], [1, 2],
                         [2, 0], [2, 1], [2, 2]])
        y = list(range(X.shape[0]))
        clr = DecisionTreeClassifier(max_depth=4)
        clr.fit(X, y)

        leaves = tree_leave_index(clr)
        ra = tree_node_range(clr, leaves[0])
        print(ra)
    """
    tree = _get_tree(tree)
    if parents is None:
        parents = tree_node_parents(tree)
    path = tree_find_path_to_root(tree, i, parents)
    mx = max([tree.feature[p] for p in path])
    res = numpy.full((mx + 1, 2), numpy.nan)
    for ind, p in enumerate(path):
        if p == i:
            break
        fn = tree.feature[p]
        lr = tree.children_left[p] == path[ind + 1]
        th = tree.threshold[p]
        if lr:
            res[fn, 1] = (min(res[fn, 1], th)
                          if not numpy.isnan(res[fn, 1]) else th)
        else:
            res[fn, 0] = (max(res[fn, 0], th)
                          if not numpy.isnan(res[fn, 0]) else th)
    return res
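
Since ``nan`` encodes an unbounded side, a caller may prefer explicit
intervals. The small helper below is a possible post-processing, not part
of the library::

    import numpy

    def range_to_intervals(ra):
        # replace nan by -inf / +inf so every dimension gets explicit bounds
        low = numpy.where(numpy.isnan(ra[:, 0]), -numpy.inf, ra[:, 0])
        high = numpy.where(numpy.isnan(ra[:, 1]), numpy.inf, ra[:, 1])
        return list(zip(low, high))
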
def predict_leaves(model, X):
    """
    Returns the leaf every observation of *X* falls into.

    :param model: a decision tree
    :param X: observations
    :return: array of leaves
    """
    if hasattr(model, 'get_leaves_index'):
        leaves_index = model.get_leaves_index()
    else:
        leaves_index = [i for i in range(len(model.tree_.children_left))
                        if model.tree_.children_left[i] == TREE_LEAF]
    leaves = model.decision_path(X)
    leaves = leaves[:, leaves_index]
    mat = numpy.argmax(leaves, 1)
    res = numpy.asarray(mat).ravel()
    res = numpy.array([leaves_index[r] for r in res])
    return res
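
A short usage sketch (illustrative data)::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import predict_leaves

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    print(predict_leaves(clr, X))  # one leaf index per row of X
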
def tree_leave_neighbors(model):
    """
    Determines which leaves are neighbors.
    The method uses some memory as it creates a grid of the
    feature space: each split multiplies the number of cells by two.

    :param model: a :epkg:`sklearn:tree:DecisionTreeRegressor`,
        a :epkg:`sklearn:tree:DecisionTreeClassifier`,
        a model which has a member ``tree_``
    :return: a dictionary ``{(i, j): [(dimension, x1, x2), ...]}``,
        *i*, *j* are node indices with *i < j*; if
        :math:`X_d * sign < th * sign`, the observation goes to
        node *i*, to node *j* otherwise. The border is somewhere
        in the segment ``[x1, x2]``.

    The following example shows what the function returns
    in case of a simple grid in two dimensions.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import DecisionTreeClassifier
        from mlinsights.mltree import tree_leave_neighbors

        X = numpy.array([[0, 0], [0, 1], [0, 2],
                         [1, 0], [1, 1], [1, 2],
                         [2, 0], [2, 1], [2, 2]])
        y = list(range(X.shape[0]))
        clr = DecisionTreeClassifier(max_depth=4)
        clr.fit(X, y)

        nei = tree_leave_neighbors(clr)

        import pprint
        pprint.pprint(nei)
    """
    tree = _get_tree(model)

    # creates the coordinates of the grid
    features = {}
    for i in range(tree.node_count):
        fe = tree.feature[i]
        if fe < 0:
            # leaf
            continue
        th = tree.threshold[i]
        if fe not in features:
            features[fe] = []
        features[fe].append(th)
    for fe in features:
        features[fe] = list(sorted(set(features[fe])))
    for fe, v in features.items():
        if len(v) == 1:
            d = abs(v[0]) / 10
            if d == v[0]:
                d = 1
            v.insert(0, v[0] - d)
            v.append(v[-1] + d)
        else:
            diff = [v[i + 1] - v[i] for i in range(len(v) - 1)]
            mdiff = min(diff)
            v.append(v[-1] + mdiff)
            v.insert(0, v[0] - mdiff)

    # predictions
    keys = list(sorted(features))
    pos = [0 for k in keys]
    shape = [len(features[k]) - 1 for k in keys]
    cells = numpy.full(shape, 0, numpy.int32)
    while pos[0] < len(features[keys[0]]) - 1:
        # evaluate
        xy = numpy.zeros((1, model.n_features_))
        for p, k in zip(pos, keys):
            xy[0, k] = (features[k][p] + features[k][p + 1]) / 2
        leave = predict_leaves(model, xy)
        cells[tuple(pos)] = leave[0]
        # next
        ind = len(pos) - 1
        pos[ind] += 1
        while ind > 0 and pos[ind] >= len(features[keys[ind]]) - 1:
            pos[ind] = 0
            ind -= 1
            pos[ind] += 1

    # neighbors
    neighbors = {}
    pos = [0 for k in keys]
    while pos[0] <= len(features[keys[0]]) - 1:
        # neighbors
        try:
            cl = cells[tuple(pos)]
        except IndexError:
            # outside the cube
            cl = None
        if cl is not None:
            for k in range(len(pos)):  # pylint: disable=C0200
                pos[k] += 1
                try:
                    cl2 = cells[tuple(pos)]
                except IndexError:
                    # outside the cube
                    pos[k] -= 1
                    continue
                if cl != cl2:
                    edge = (cl, cl2) if cl < cl2 else (cl2, cl)
                    if edge not in neighbors:
                        neighbors[edge] = []
                    xy = numpy.zeros((model.n_features_))
                    for p, f in zip(pos, keys):
                        xy[f] = (features[f][p] + features[f][p + 1]) / 2
                    x2 = tuple(xy)
                    pos[k] -= 1
                    p = pos[k]
                    key = keys[k]
                    xy[key] = (features[key][p] + features[key][p + 1]) / 2
                    x1 = tuple(xy)
                    neighbors[edge].append((key, x1, x2))
                else:
                    pos[k] -= 1
        # next
        ind = len(pos) - 1
        pos[ind] += 1
        while ind > 0 and pos[ind] >= len(features[keys[ind]]) - 1:
            pos[ind] = 0
            ind -= 1
            pos[ind] += 1

    return neighbors
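
Because the grid grows with the number of distinct thresholds per feature,
it can be worth estimating its size before calling the function. The helper
below is hypothetical, it only mirrors the threshold extraction done above::

    from mlinsights.mltree.tree_structure import _get_tree

    def grid_cells(model):
        # one cell per interval between consecutive thresholds (with padding),
        # so the count is the product over features of (#thresholds + 1)
        tree = _get_tree(model)
        thresholds = {}
        for i in range(tree.node_count):
            fe = tree.feature[i]
            if fe >= 0:  # skip leaves
                thresholds.setdefault(fe, set()).add(tree.threshold[i])
        n = 1
        for v in thresholds.values():
            n *= len(v) + 1
        return n
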