Piecewise classification with scikit-learn predictors#

Links: notebook, html, PDF, python, slides, GitHub

Piecewise regression is easier to understand but the concept can be extended to classification. That’s what this notebook explores.

from jyquickhelper import add_notebook_menu
%matplotlib inline
import warnings

Iris dataset and first logistic regression#

from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
# Only two features are kept so that the decision regions can be drawn in 2D.
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target
# NOTE(review): no random_state is given, so the train/test split (and every
# figure and table computed from it) presumably changes from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, Y)
import numpy
import matplotlib.pyplot as plt

def graph(X, Y, model):
    """
    Draws the decision regions of *model* together with the points *X*.

    :param X: array with at least two columns; the first one is plotted on
        the horizontal axis, the second one on the vertical axis
    :param Y: labels used to colour the points
    :param model: object with a ``predict`` method accepting two-column inputs
    :return: the matplotlib axis holding the plot
    """
    margin, step = .5, .02  # padding around the data, step size in the mesh
    first, second = X[:, 0], X[:, 1]

    # Regular grid covering the data range, padded on every side.
    grid_x = numpy.arange(first.min() - margin, first.max() + margin, step)
    grid_y = numpy.arange(second.min() - margin, second.max() + margin, step)
    xx, yy = numpy.meshgrid(grid_x, grid_y)

    # One prediction per grid node, put back into the shape of the grid.
    mesh_points = numpy.c_[xx.ravel(), yy.ravel()]
    regions = model.predict(mesh_points).reshape(xx.shape)

    # Predicted regions as a coloured background.
    _, ax = plt.subplots(1, 1, figsize=(4, 3))
    palette = plt.cm.Paired
    ax.pcolormesh(xx, yy, regions, cmap=palette)

    # The points themselves, coloured by their label, drawn on top.
    ax.scatter(first, second, c=Y, edgecolors='k', cmap=palette)
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    return ax

from sklearn.linear_model import LogisticRegression
# Baseline: a logistic regression with default parameters, fitted on the
# training split and drawn over the test points.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
ax = graph(X_test, y_test, logreg)

Piecewise classification#

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import KBinsDiscretizer
from mlinsights.mlmodel import PiecewiseClassifier

# Model fitted inside each bucket: it always predicts the most frequent class.
dummy = DummyClassifier(strategy='most_frequent')
# The buckets come from a KBinsDiscretizer with 2 bins; with the two features
# kept in X this yields the 4 buckets reported by the verbose output.
piece4 = PiecewiseClassifier(KBinsDiscretizer(n_bins=2),
                            estimator=dummy, verbose=True)
piece4.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s finished

We look into the bucket given to each point.

import pandas

# Bucket given by the fitted piecewise model to every test point.
bucket = piece4.transform_bins(X_test)
# One row per test point: both features, the bucket and the expected label.
df = pandas.DataFrame(X_test, columns=("x1", "x2"))
df["bucket"] = bucket
df["label"] = y_test
# The bucket also becomes the index; the "bucket" column is kept as well.
df = df.set_index(bucket)
x1 x2 bucket label
0.0 6.2 3.4 0.0 2
0.0 6.7 3.1 0.0 1
2.0 5.1 3.8 2.0 0
2.0 4.8 3.0 2.0 0
3.0 5.5 2.3 3.0 1
import seaborn
# Large markers show the bucket of every test point, the smaller markers drawn
# on top of them (same axis) show its expected label.
# x, y and hue are passed by keyword: recent seaborn releases only accept
# `data` as a positional argument and raise a TypeError otherwise.
ax = seaborn.scatterplot(x="x1", y="x2", hue="bucket", data=df, palette='Set1', s=400)
seaborn.scatterplot(x="x1", y="x2", hue="label", data=df, palette='Set1', marker="o", ax=ax, s=100)

We see there are four buckets. Two buckets contain only one label. The dummy classifier maps every bucket to the most frequent class in that bucket.

# Decision regions of the 4-bucket model, drawn over the test points.
ax = graph(X_test, y_test, piece4)
# The trailing semicolon keeps the notebook from echoing the returned object.
ax.set_title("Piecewise Classification\n4 buckets");

We can increase the number of buckets.

# Same construction as before with 3 bins instead of 2; with the two features
# kept in X this yields the 9 buckets reported by the verbose output.
dummy = DummyClassifier(strategy='most_frequent')
piece9 = PiecewiseClassifier(KBinsDiscretizer(n_bins=3),
                             estimator=dummy, verbose=True)
piece9.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s finished
# Decision regions of the 9-bucket model, drawn over the test points.
ax = graph(X_test, y_test, piece9)
ax.set_title("Piecewise Classification\n9 buckets");

Let’s compute the ROC curve.

from sklearn.metrics import roc_curve, auc

def plot_roc_curve(models, X, y):
    """
    Draws one ROC curve per class (the class against all the others)
    for one or several fitted classifiers, one subplot per class.

    :param models: a fitted classifier or a dictionary
        ``{name: fitted classifier}``; every classifier must expose
        ``classes_`` and ``predict_proba``
    :param X: features given to ``predict_proba``
    :param y: expected labels for *X*
    :return: array of matplotlib axes, one per class of the first model
    :raises ValueError: if *models* is an empty dictionary
    """
    if not isinstance(models, dict):
        return plot_roc_curve({models.__class__.__name__: models}, X, y)
    if not models:
        raise ValueError("models must contain at least one classifier.")

    # Labels are compared to each class element-wise, which needs an array.
    y = numpy.asarray(y)
    ax = None
    colors = 'bgrcmyk'
    lw = 2
    for ic, (name, model) in enumerate(models.items()):
        fpr, tpr, roc_auc = {}, {}, {}
        nb = len(model.classes_)
        y_score = model.predict_proba(X)
        for i, c in enumerate(model.classes_):
            # Column i of predict_proba is the score of class c; the curve is
            # computed against the labels received as argument.
            fpr[i], tpr[i], _ = roc_curve(y == c, y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        if ax is None:
            # The figure is created once, sized after the first model, with
            # the diagonal as a visual reference in every subplot.
            _, ax = plt.subplots(1, nb, figsize=(4 * nb, 4))
            for i in range(nb):
                ax[i].plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

        # Short legend name: capitals and digits of the model name, or the
        # full name when it contains none of them.
        plotname = "".join(
            ch for ch in name if "A" <= ch <= "Z" or "0" <= ch <= "9") or name
        # Colours are reused cyclically when there are more models than colours.
        color = colors[ic % len(colors)]
        for i in range(nb):
            ax[i].plot(fpr[i], tpr[i], color=color,
                       lw=lw, label='%0.2f %s' % (roc_auc[i], plotname))
            ax[i].set_title("class {}".format(model.classes_[i]))

    # The labels (AUC and model name) only show up once a legend is drawn.
    for k in range(ax.shape[0]):
        ax[k].legend()
    return ax

# Per-class ROC curves of the three fitted models on the test split.
plot_roc_curve({'LR': logreg, 'P4': piece4, 'P9': piece9}, X_test, y_test);

Let’s use a decision tree to create the buckets.

dummy = DummyClassifier(strategy='most_frequent')
# "tree" replaces the KBinsDiscretizer used above; per the accompanying text,
# the buckets are then built by a decision tree (14 of them in the verbose
# output shown for this run).
pieceT = PiecewiseClassifier("tree", estimator=dummy, verbose=True)
pieceT.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.0s finished
# Decision regions of the tree-based model; the title reports the number of
# fitted estimators as the number of buckets.
ax = graph(X_test, y_test, pieceT)
ax.set_title("Piecewise Classification\n%d buckets (tree)" % len(pieceT.estimators_));
# ROC curves of all four models on the test split.
plot_roc_curve({'LR': logreg, 'P4': piece4, 'P9': piece9, "DT": pieceT},
               X_test, y_test);