Visualize a scikit-learn pipeline


Pipelines can become quite big with scikit-learn; let’s dig into a visual way to look at them.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline

Simple model

Let’s visualize a simple pipeline: a single model, not even trained.

import pandas
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X)
df.columns = ["X1", "X2", "X3", "X4"]
clf = LogisticRegression()
clf
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

The trick consists in converting the pipeline into a graph expressed in the DOT language.

from mlinsights.plotting import pipeline2dot
dot = pipeline2dot(clf, df)
print(dot)
digraph{
  orientation=portrait;
  ranksep=0.25;
  nodesep=0.05;
  sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8];
  node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12];
  sch0:f0 -> node1;
  sch0:f1 -> node1;
  sch0:f2 -> node1;
  sch0:f3 -> node1;
  sch1[label="<f0> -v-0",shape=record,fontsize=8];
  node1 -> sch1:f0;
  node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12];
  sch1:f0 -> node2;
  sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8];
  node2 -> sch2:f0;
  node2 -> sch2:f1;
}

It is a lot better as an image.

dot_file = "graph.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
# might be needed on Windows if Graphviz is not on the PATH
import sys
import os
if sys.platform.startswith("win") and "Graphviz" not in os.environ["PATH"]:
    os.environ['PATH'] = os.environ['PATH'] + r';C:\Program Files (x86)\Graphviz2.38\bin'
from pyquickhelper.loghelper import run_cmd
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True, fLOG=print);
[run_cmd] execute dot -G=300 -Tpng graph.dot -ograph.dot.png
end of execution dot -G=300 -Tpng graph.dot -ograph.dot.png
from PIL import Image
img = Image.open("graph.dot.png")
img
[image: graph.dot.png, the logistic regression pipeline drawn as a graph]
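
Shelling out to dot is one way; if the graphviz Python package is installed, the same image can be produced without leaving Python. A minimal sketch, assuming both the package and the dot binary are available:

# a sketch, assuming the graphviz package is installed (pip install graphviz);
# Source wraps the DOT string and render calls the dot binary
from graphviz import Source
Source(dot).render("graph_alt", format="png", cleanup=True)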

Complex pipeline

scikit-learn introduced a couple of transforms to play with features within a single pipeline. The following example is taken from Column Transformer with Mixed Types.

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
columns = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
           'cabin', 'embarked', 'boat', 'body', 'home.dest']

numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])
clf
Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipe...cept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))])

Let’s first see it as a simplified text representation.

from mlinsights.plotting import pipeline2str
print(pipeline2str(clf))
Pipeline
   ColumnTransformer
      Pipeline(age,fare)
         SimpleImputer
         StandardScaler
      Pipeline(embarked,sex,pclass)
         SimpleImputer
         OneHotEncoder
   LogisticRegression
dot = pipeline2dot(clf, columns)
dot_file = "graph2.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True, fLOG=print);
[run_cmd] execute dot -G=300 -Tpng graph2.dot -ograph2.dot.png
end of execution dot -G=300 -Tpng graph2.dot -ograph2.dot.png
img = Image.open("graph2.dot.png")
img
[image: graph2.dot.png, the column transformer pipeline drawn as a graph]
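
Since this pipeline comes from the scikit-learn example Column Transformer with Mixed Types, it can also be fitted on the same data before being drawn. A sketch, assuming network access to OpenML and a scikit-learn recent enough to support as_frame:

# a sketch: fetch the titanic dataset used by the original example
from sklearn.datasets import fetch_openml
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
clf.fit(X, y)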

With JavaScript

from jyquickhelper import RenderJsDot
RenderJsDot(dot)
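
RenderJsDot draws the DOT graph client-side in the notebook (through viz.js), so no local Graphviz install is needed. Without jyquickhelper, the graphviz package offers a similar inline display; a sketch, assuming it is installed:

# a sketch: a graphviz.Source object renders as SVG in a Jupyter notebook
from graphviz import Source
Source(dot)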

Example with FeatureUnion

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

model = Pipeline([('poly', PolynomialFeatures()),
                  ('union', FeatureUnion([
                      ('scaler2', MinMaxScaler()),
                      ('scaler3', StandardScaler())]))])
dot = pipeline2dot(model, columns)
RenderJsDot(dot)
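
The simplified text view from above works on this model as well:

print(pipeline2str(model))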

Compute intermediate outputs

It is difficult to access intermediate outputs in a scikit-learn pipeline, yet it may be interesting to inspect them. The function alter_pipeline_for_debugging modifies the pipeline in place to intercept intermediate inputs and outputs.

from numpy.random import randn

model = Pipeline([('scaler1', StandardScaler()),
                  ('union', FeatureUnion([
                      ('scaler2', StandardScaler()),
                      ('scaler3', MinMaxScaler())])),
                  ('lr', LinearRegression())])

X = randn(4, 5)
y = randn(4)
model.fit(X, y)
Pipeline(memory=None,
         steps=[('scaler1',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('scaler2',
                                                 StandardScaler(copy=True,
                                                                with_mean=True,
                                                                with_std=True)),
                                                ('scaler3',
                                                 MinMaxScaler(copy=True,
                                                              feature_range=(0,
                                                                             1)))],
                              transformer_weights=None)),
                ('lr',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))])
print(pipeline2str(model))
Pipeline
   StandardScaler
   FeatureUnion
      StandardScaler
      MinMaxScaler
   LinearRegression

Let’s now modify the pipeline to get the intermediate outputs.

from mlinsights.helpers.pipeline import alter_pipeline_for_debugging
alter_pipeline_for_debugging(model)

The function adds a member _debug to every piece of the pipeline; it stores the inputs and outputs going through it.

model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
model.predict(X)
array([ 1.0871708 ,  0.21408905,  0.67145575, -0.20147965])

The member was populated with inputs and outputs.

model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5)
   [[ 0.71061578  1.42059668  0.37088807  0.10064102  1.08834826]
    [ 0.81697275 -0.32169094  0.0506303   0.47284371 -0.45626977]
    [ 0.04326168  0.78471957 -0.13197047 -0.32399437  0.95757499]
    [ 1.28025757 -0.56532314 -1.81051696 -0.65783714 -0.13205629]]
  ) -> (
   shape=(4, 5)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665]
    [ 1.28414724 -1.10521862 -1.69231505 -1.3011406  -0.74091136]]
  )

Every piece behaves the same way.

from mlinsights.helpers.pipeline import enumerate_pipeline_models
# avoid shadowing the fitted pipeline and the builtin vars
for coor, m, _ in enumerate_pipeline_models(model):
    print(coor)
    print(m._debug)
(0,)
BaseEstimatorDebugInformation(Pipeline)
  predict(
   shape=(4, 5)
   [[ 0.71061578  1.42059668  0.37088807  0.10064102  1.08834826]
    [ 0.81697275 -0.32169094  0.0506303   0.47284371 -0.45626977]
    [ 0.04326168  0.78471957 -0.13197047 -0.32399437  0.95757499]
    [ 1.28025757 -0.56532314 -1.81051696 -0.65783714 -0.13205629]]
  ) -> (
   shape=(4,)
   [ 1.0871708   0.21408905  0.67145575 -0.20147965]
  )
(0, 0)
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5)
   [[ 0.71061578  1.42059668  0.37088807  0.10064102  1.08834826]
    [ 0.81697275 -0.32169094  0.0506303   0.47284371 -0.45626977]
    [ 0.04326168  0.78471957 -0.13197047 -0.32399437  0.95757499]
    [ 1.28025757 -0.56532314 -1.81051696 -0.65783714 -0.13205629]]
  ) -> (
   shape=(4, 5)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665]
    [ 1.28414724 -1.10521862 -1.69231505 -1.3011406  -0.74091136]]
  )
(0, 1)
BaseEstimatorDebugInformation(FeatureUnion)
  transform(
   shape=(4, 5)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665]
    [ 1.28414724 -1.10521862 -1.69231505 -1.3011406  -0.74091136]]
  ) -> (
   shape=(4, 10)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296  0.53949581
      1.          1.          0.67081543  1.        ]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825  0.62547586
      0.12267978  0.85318739  1.          0.        ]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665  0.
   ...
  )
(0, 1, 0)
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665]
    [ 1.28414724 -1.10521862 -1.69231505 -1.3011406  -0.74091136]]
  ) -> (
   shape=(4, 5)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665]
    [ 1.28414724 -1.10521862 -1.69231505 -1.3011406  -0.74091136]]
  )
(0, 1, 1)
BaseEstimatorDebugInformation(MinMaxScaler)
  transform(
   shape=(4, 5)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665]
    [ 1.28414724 -1.10521862 -1.69231505 -1.3011406  -0.74091136]]
  ) -> (
   shape=(4, 5)
   [[0.53949581 1.         1.         0.67081543 1.        ]
    [0.62547586 0.12267978 0.85318739 1.         0.        ]
    [0.         0.67980726 0.76947952 0.29525818 0.91533617]
    [1.         0.         0.         0.         0.20989881]]
  )
(0, 2)
BaseEstimatorDebugInformation(LinearRegression)
  predict(
   shape=(4, 10)
   [[-0.00489048  1.34743397  0.88874478  0.47463257  1.08042296  0.53949581
      1.          1.          0.67081543  1.        ]
    [ 0.23578382 -0.80432775  0.50981265  1.34604534 -1.22476825  0.62547586
      0.12267978  0.85318739  1.          0.        ]
    [-1.51504058  0.56211241  0.29375762 -0.51953731  0.88525665  0.
   ...
  ) -> (
   shape=(4,)
   [ 1.0871708   0.21408905  0.67145575 -0.20147965]
  )
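
The textual display truncates long arrays, but the captured objects can be read back programmatically. A sketch, assuming the debug object stores inputs and outputs as dictionaries keyed by method name, which is how mlinsights' BaseEstimatorDebugInformation keeps them:

# a sketch: read the captured output of the FeatureUnion step
# (the inputs/outputs attribute names are assumed from mlinsights)
union_debug = model.steps[1][1]._debug
print(union_debug.outputs['transform'].shape)  # expected: (4, 10)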