Source code for mlinsights.mltree.tree_structure

"""
Helpers to investigate a tree structure.


:githublink:`%|py|5`
"""
import numpy
from sklearn.tree._tree import TREE_LEAF  # pylint: disable=E0611


def _get_tree(obj):
    """
    Returns the tree object.
    """
    if hasattr(obj, "children_left"):
        return obj
    if hasattr(obj, "tree_"):
        return obj.tree_
    raise AttributeError("obj is no tree: {}".format(type(obj)))
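
A minimal sketch of how this private helper behaves; the classifier and
data below are only illustrative::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import _get_tree

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])

    # both calls resolve to the same low-level Tree object
    assert _get_tree(clr) is _get_tree(clr.tree_)
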
def tree_leave_index(model):
    """
    Returns the indices of every leaf in a tree.

    :param model: something which has a member ``tree_``
    :return: leaf indices
    """
    tree = _get_tree(model)
    res = []
    for i in range(tree.node_count):
        if tree.children_left[i] == TREE_LEAF:
            res.append(i)
    return res
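
A short usage sketch (illustrative data; the exact indices depend on the
fitted tree)::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree import tree_leave_index

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    print(tree_leave_index(clr))  # e.g. [1, 2]
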
def tree_find_path_to_root(tree, i, parents=None):
    """
    Lists the nodes involved in the path from the root to node *i*.

    :param tree: tree
    :param i: node index (``tree.nodes[i]``)
    :param parents: precomputed parents (None -> calls
        :func:`tree_node_parents
        <mlinsights.mltree.tree_structure.tree_node_parents>`)
    :return: the list of node indices from the root down to node *i*
    """
    tree = _get_tree(tree)
    if parents is None:
        parents = tree_node_parents(tree)
    path_i = [i]
    current_i = i
    while current_i in parents:
        current_i = parents[current_i]
        if current_i < 0:
            # right children are stored with a negative parent id
            current_i = -current_i
        path_i.append(current_i)
    return list(reversed(path_i))
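
For instance, on a small fitted tree, the returned path always starts at
the root (node 0); the data is only illustrative::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import (
        tree_leave_index, tree_find_path_to_root, tree_node_parents)

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    parents = tree_node_parents(clr)
    leaf = tree_leave_index(clr)[-1]
    print(tree_find_path_to_root(clr, leaf, parents))  # e.g. [0, 2]
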
def tree_find_common_node(tree, i, j, parents=None):
    """
    Finds the common node to nodes *i* and *j*.

    :param tree: tree
    :param i: node index (``tree.nodes[i]``)
    :param j: node index (``tree.nodes[j]``)
    :param parents: precomputed parents (None -> calls
        :func:`tree_node_parents
        <mlinsights.mltree.tree_structure.tree_node_parents>`)
    :return: common root, remaining path to *i*, remaining path to *j*
    """
    tree = _get_tree(tree)
    if parents is None:
        parents = tree_node_parents(tree)
    path_i = tree_find_path_to_root(tree, i, parents)
    path_j = tree_find_path_to_root(tree, j, parents)
    for pos, (a, b) in enumerate(zip(path_i, path_j)):
        if a != b:
            return a, path_i[pos:], path_j[pos:]
    pi = parents.get(i, None)
    pj = parents.get(j, None)
    pos = min(len(path_i), len(path_j))
    if pi is not None and pi == j:
        return j, path_i[pos:], path_j[pos:]
    if pj is not None and pj == i:
        return i, path_i[pos:], path_j[pos:]
    raise RuntimeError(  # pragma: no cover
        "Paths are equal, i={} and j={} must be different.".format(i, j))
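
A short usage sketch with two leaves of the same small tree as above; the
printed values depend on the fitted structure::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import (
        tree_leave_index, tree_find_common_node)

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    leaves = tree_leave_index(clr)
    node, path_i, path_j = tree_find_common_node(clr, leaves[0], leaves[-1])
    print(node, path_i, path_j)
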
def tree_node_parents(tree):
    """
    Returns a dictionary ``{node_id: parent_id}``.
    A right child is stored with the opposite of its parent id
    so that the side can be recovered from the sign.

    :param tree: tree
    :return: parents
    """
    tree = _get_tree(tree)
    parents = {}
    for i in range(tree.node_count):
        if tree.children_left[i] == TREE_LEAF:
            continue
        parents[tree.children_left[i]] = i
        parents[tree.children_right[i]] = -i
    return parents
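
The sign convention can be checked on a small tree; note that the sign is
lost for children of the root since ``-0 == 0`` (illustrative data)::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import tree_node_parents

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    print(tree_node_parents(clr))  # e.g. {1: 0, 2: 0}
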
def tree_node_range(tree, i, parents=None):
    """
    Determines the ranges of a node over all dimensions.
    ``nan`` means infinity.

    :param tree: tree
    :param i: node index (``tree.nodes[i]``)
    :param parents: precomputed parents (None -> calls
        :func:`tree_node_parents
        <mlinsights.mltree.tree_structure.tree_node_parents>`)
    :return: one array of size *(D, 2)* where *D* is the number of dimensions

    The following example shows what the function returns
    in case of a simple grid in two dimensions.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import DecisionTreeClassifier
        from mlinsights.mltree import tree_leave_index, tree_node_range

        X = numpy.array([[0, 0], [0, 1], [0, 2],
                         [1, 0], [1, 1], [1, 2],
                         [2, 0], [2, 1], [2, 2]])
        y = list(range(X.shape[0]))
        clr = DecisionTreeClassifier(max_depth=4)
        clr.fit(X, y)

        leaves = tree_leave_index(clr)
        ra = tree_node_range(clr, leaves[0])
        print(ra)
    """
    tree = _get_tree(tree)
    if parents is None:
        parents = tree_node_parents(tree)
    path = tree_find_path_to_root(tree, i, parents)
    mx = max([tree.feature[p] for p in path])
    res = numpy.full((mx + 1, 2), numpy.nan)
    for ind, p in enumerate(path):
        if p == i:
            break
        fn = tree.feature[p]
        lr = tree.children_left[p] == path[ind + 1]
        th = tree.threshold[p]
        if lr:
            res[fn, 1] = (min(res[fn, 1], th)
                          if not numpy.isnan(res[fn, 1]) else th)
        else:
            res[fn, 0] = (max(res[fn, 0], th)
                          if not numpy.isnan(res[fn, 0]) else th)
    return res
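
Since ``nan`` encodes an unbounded side, a caller may prefer explicit
intervals. The small helper below is a possible post-processing, not part
of the library::

    import numpy

    def range_to_intervals(ra):
        # replace nan by -inf / +inf so every dimension gets explicit bounds
        low = numpy.where(numpy.isnan(ra[:, 0]), -numpy.inf, ra[:, 0])
        high = numpy.where(numpy.isnan(ra[:, 1]), numpy.inf, ra[:, 1])
        return list(zip(low, high))
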
def predict_leaves(model, X):
    """
    Returns the leaf every observation of *X* falls into.

    :param model: a decision tree
    :param X: observations
    :return: array of leaves
    """
    if hasattr(model, 'get_leaves_index'):
        leaves_index = model.get_leaves_index()
    else:
        leaves_index = [i for i in range(len(model.tree_.children_left))
                        if model.tree_.children_left[i] == TREE_LEAF]
    leaves = model.decision_path(X)
    leaves = leaves[:, leaves_index]
    mat = numpy.argmax(leaves, 1)
    res = numpy.asarray(mat).ravel()
    res = numpy.array([leaves_index[r] for r in res])
    return res
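
A short usage sketch (illustrative data)::

    import numpy
    from sklearn.tree import DecisionTreeClassifier
    from mlinsights.mltree.tree_structure import predict_leaves

    X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    clr = DecisionTreeClassifier(max_depth=2).fit(X, [0, 0, 1, 1])
    print(predict_leaves(clr, X))  # one leaf index per row of X
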
def tree_leave_neighbors(model):
    """
    Determines which leaves are neighbors.
    The method uses some memory as it creates a grid of the
    feature space: each split multiplies the number of cells by two.

    :param model: a :epkg:`sklearn:tree:DecisionTreeRegressor`,
        a :epkg:`sklearn:tree:DecisionTreeClassifier`,
        a model which has a member ``tree_``
    :return: a dictionary ``{(i, j): [(dimension, x1, x2), ...]}``,
        *i*, *j* are node indices with *i < j*; if
        :math:`X_d * sign < th * sign`, the observation goes to
        node *i*, to node *j* otherwise. The border is somewhere
        in the segment ``[x1, x2]``.

    The following example shows what the function returns
    in case of a simple grid in two dimensions.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import DecisionTreeClassifier
        from mlinsights.mltree import tree_leave_neighbors

        X = numpy.array([[0, 0], [0, 1], [0, 2],
                         [1, 0], [1, 1], [1, 2],
                         [2, 0], [2, 1], [2, 2]])
        y = list(range(X.shape[0]))
        clr = DecisionTreeClassifier(max_depth=4)
        clr.fit(X, y)

        nei = tree_leave_neighbors(clr)

        import pprint
        pprint.pprint(nei)
    """
    tree = _get_tree(model)

    # creates the coordinates of the grid
    features = {}
    for i in range(tree.node_count):
        fe = tree.feature[i]
        if fe < 0:
            # leaf
            continue
        th = tree.threshold[i]
        if fe not in features:
            features[fe] = []
        features[fe].append(th)
    for fe in features:
        features[fe] = list(sorted(set(features[fe])))
    for fe, v in features.items():
        if len(v) == 1:
            d = abs(v[0]) / 10
            if d == v[0]:
                d = 1
            v.insert(0, v[0] - d)
            v.append(v[-1] + d)
        else:
            diff = [v[i + 1] - v[i] for i in range(len(v) - 1)]
            mdiff = min(diff)
            v.append(v[-1] + mdiff)
            v.insert(0, v[0] - mdiff)

    # predictions
    keys = list(sorted(features))
    pos = [0 for k in keys]
    shape = [len(features[k]) - 1 for k in keys]
    cells = numpy.full(shape, 0, numpy.int32)
    while pos[0] < len(features[keys[0]]) - 1:
        # evaluate
        xy = numpy.zeros((1, model.n_features_))
        for p, k in zip(pos, keys):
            xy[0, k] = (features[k][p] + features[k][p + 1]) / 2
        leave = predict_leaves(model, xy)
        cells[tuple(pos)] = leave[0]
        # next
        ind = len(pos) - 1
        pos[ind] += 1
        while ind > 0 and pos[ind] >= len(features[keys[ind]]) - 1:
            pos[ind] = 0
            ind -= 1
            pos[ind] += 1

    # neighbors
    neighbors = {}
    pos = [0 for k in keys]
    while pos[0] <= len(features[keys[0]]) - 1:
        # neighbors
        try:
            cl = cells[tuple(pos)]
        except IndexError:
            # outside the cube
            cl = None
        if cl is not None:
            for k in range(len(pos)):  # pylint: disable=C0200
                pos[k] += 1
                try:
                    cl2 = cells[tuple(pos)]
                except IndexError:
                    # outside the cube
                    pos[k] -= 1
                    continue
                if cl != cl2:
                    edge = (cl, cl2) if cl < cl2 else (cl2, cl)
                    if edge not in neighbors:
                        neighbors[edge] = []
                    xy = numpy.zeros((model.n_features_))
                    for p, f in zip(pos, keys):
                        xy[f] = (features[f][p] + features[f][p + 1]) / 2
                    x2 = tuple(xy)
                    pos[k] -= 1
                    p = pos[k]
                    key = keys[k]
                    xy[key] = (features[key][p] + features[key][p + 1]) / 2
                    x1 = tuple(xy)
                    neighbors[edge].append((key, x1, x2))
                else:
                    pos[k] -= 1
        # next
        ind = len(pos) - 1
        pos[ind] += 1
        while ind > 0 and pos[ind] >= len(features[keys[ind]]) - 1:
            pos[ind] = 0
            ind -= 1
            pos[ind] += 1

    return neighbors
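
Because the grid grows with the number of distinct thresholds per feature,
it can be worth estimating its size before calling the function. The helper
below is hypothetical, it only mirrors the threshold extraction done above::

    from mlinsights.mltree.tree_structure import _get_tree

    def grid_cells(model):
        # one cell per interval between consecutive thresholds (with padding),
        # so the count is the product over features of (#thresholds + 1)
        tree = _get_tree(model)
        thresholds = {}
        for i in range(tree.node_count):
            fe = tree.feature[i]
            if fe >= 0:  # skip leaves
                thresholds.setdefault(fe, set()).add(tree.threshold[i])
        n = 1
        for v in thresholds.values():
            n *= len(v) + 1
        return n
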