Visualize a scikit-learn pipeline

Links: notebook, html, PDF, python, slides, GitHub

Pipeline can be big with scikit-learn, let’s dig into a visual way to look a them.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline
import warnings
warnings.simplefilter("ignore")

Simple model

Let’s vizualize a simple pipeline, a single model not even trained.

import pandas
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X)
df.columns = ["X1", "X2", "X3", "X4"]
clf = LogisticRegression()
clf
LogisticRegression()

The trick consists in converting the pipeline in a graph through the DOT language.

from mlinsights.plotting import pipeline2dot
dot = pipeline2dot(clf, df)
print(dot)
digraph{
  orientation=portrait;
  nodesep=0.05;
  ranksep=0.25;
  sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8];
  node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12];
  sch0:f0 -> node1;
  sch0:f1 -> node1;
  sch0:f2 -> node1;
  sch0:f3 -> node1;
  sch1[label="<f0> -v-0",shape=record,fontsize=8];
  node1 -> sch1:f0;
  node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12];
  sch1:f0 -> node2;
  sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8];
  node2 -> sch2:f0;
  node2 -> sch2:f1;
}

It is lot better with an image.

dot_file = "graph.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
# might be needed on windows
import sys
import os
if sys.platform.startswith("win") and "Graphviz" not in os.environ["PATH"]:
    os.environ['PATH'] = os.environ['PATH'] + r';C:\Program Files (x86)\Graphviz2.38\bin'
from pyquickhelper.loghelper import run_cmd
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True, fLOG=print);
[run_cmd] execute dot -G=300 -Tpng graph.dot -ograph.dot.png
end of execution dot -G=300 -Tpng graph.dot -ograph.dot.png
from PIL import Image
img = Image.open("graph.dot.png")
img
../_images/visualize_pipeline_12_0.png

Complex pipeline

scikit-learn instroduced a couple of transform to play with features in a single pipeline. The following example is taken from Column Transformer with Mixed Types.

from sklearn import datasets
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
columns = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
           'cabin', 'embarked', 'boat', 'body', 'home.dest']

numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])
clf
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['embarked', 'sex',
                                                   'pclass'])])),
                ('classifier', LogisticRegression())])

Let’s see it first as a simplified text.

from mlinsights.plotting import pipeline2str
print(pipeline2str(clf))
Pipeline
   ColumnTransformer
      Pipeline(age,fare)
         SimpleImputer
         StandardScaler
      Pipeline(embarked,sex,pclass)
         SimpleImputer
         OneHotEncoder
   LogisticRegression
dot = pipeline2dot(clf, columns)
dot_file = "graph2.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True, fLOG=print);
[run_cmd] execute dot -G=300 -Tpng graph2.dot -ograph2.dot.png
end of execution dot -G=300 -Tpng graph2.dot -ograph2.dot.png
img = Image.open("graph2.dot.png")
img
../_images/visualize_pipeline_21_0.png

With javascript

from jyquickhelper import RenderJsDot
RenderJsDot(dot)

Example with FeatureUnion

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

model = Pipeline([('poly', PolynomialFeatures()),
                          ('union', FeatureUnion([
                                        ('scaler2', MinMaxScaler()),
                                        ('scaler3', StandardScaler())]))])
dot = pipeline2dot(model, columns)
RenderJsDot(dot)

Compute intermediate outputs

It is difficult to access intermediate outputs with scikit-learn but it may be interesting to do so. The method alter_pipeline_for_debugging modifies the pipeline to intercept intermediate outputs.

from numpy.random import randn

model = Pipeline([('scaler1', StandardScaler()),
                  ('union', FeatureUnion([
                      ('scaler2', StandardScaler()),
                      ('scaler3', MinMaxScaler())])),
                  ('lr', LinearRegression())])

X = randn(4, 5)
y = randn(4)
model.fit(X, y)
Pipeline(steps=[('scaler1', StandardScaler()),
                ('union',
                 FeatureUnion(transformer_list=[('scaler2', StandardScaler()),
                                                ('scaler3', MinMaxScaler())])),
                ('lr', LinearRegression())])
print(pipeline2str(model))
Pipeline
   StandardScaler
   FeatureUnion
      StandardScaler
      MinMaxScaler
   LinearRegression

Let’s now modify the pipeline to get the intermediate outputs.

from mlinsights.helpers.pipeline import alter_pipeline_for_debugging
alter_pipeline_for_debugging(model)

The function adds a member _debug which stores inputs and outputs in every piece of the pipeline.

model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
model.predict(X)
array([ 0.73619378,  0.87936142, -0.56528874, -0.2675163 ])

The member was populated with inputs and outputs.

model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 1.22836841  2.35164607 -0.37367786  0.61490475 -0.45377634]
    [-0.77187962  0.43540786  0.20465106  0.8910651  -0.23104796]
    [-0.36750208  0.35154324  1.78609517 -1.59325463  1.51595267]
    [ 1.37547609  1.59470748 -0.5932628   0.57822003  0.56034736]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565 ]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065]
    [ 1.06462242  0.49297719 -0.91287374  0.45636275  0.27503446]]
  )

Every piece behaves the same way.

from mlinsights.helpers.pipeline import enumerate_pipeline_models
for coor, model, vars in enumerate_pipeline_models(model):
    print(coor)
    print(model._debug)
(0,)
BaseEstimatorDebugInformation(Pipeline)
  predict(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 1.22836841  2.35164607 -0.37367786  0.61490475 -0.45377634]
    [-0.77187962  0.43540786  0.20465106  0.8910651  -0.23104796]
    [-0.36750208  0.35154324  1.78609517 -1.59325463  1.51595267]
    [ 1.37547609  1.59470748 -0.5932628   0.57822003  0.56034736]]
  ) -> (
   shape=(4,) type=<class 'numpy.ndarray'>
   [ 0.73619378  0.87936142 -0.56528874 -0.2675163 ]
  )
(0, 0)
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 1.22836841  2.35164607 -0.37367786  0.61490475 -0.45377634]
    [-0.77187962  0.43540786  0.20465106  0.8910651  -0.23104796]
    [-0.36750208  0.35154324  1.78609517 -1.59325463  1.51595267]
    [ 1.37547609  1.59470748 -0.5932628   0.57822003  0.56034736]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565 ]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065]
    [ 1.06462242  0.49297719 -0.91287374  0.45636275  0.27503446]]
  )
(0, 1)
BaseEstimatorDebugInformation(FeatureUnion)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565 ]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065]
    [ 1.06462242  0.49297719 -0.91287374  0.45636275  0.27503446]]
  ) -> (
   shape=(4, 10) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861  0.93149357
      1.          0.09228748  0.88883864  0.        ]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565   0.
      0.04193015  0.33534839  1.          0.11307564]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065  0.18831419
   ...
  )
(0, 1, 0)
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565 ]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065]
    [ 1.06462242  0.49297719 -0.91287374  0.45636275  0.27503446]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565 ]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065]
    [ 1.06462242  0.49297719 -0.91287374  0.45636275  0.27503446]]
  )
(0, 1, 1)
BaseEstimatorDebugInformation(MinMaxScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565 ]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065]
    [ 1.06462242  0.49297719 -0.91287374  0.45636275  0.27503446]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[0.93149357 1.         0.09228748 0.88883864 0.        ]
    [0.         0.04193015 0.33534839 1.         0.11307564]
    [0.18831419 0.         1.         0.         1.        ]
    [1.         0.62155016 0.         0.87407214 0.51485443]]
  )
(0, 2)
BaseEstimatorDebugInformation(LinearRegression)
  predict(
   shape=(4, 10) type=<class 'numpy.ndarray'>
   [[ 0.90946066  1.4000516  -0.67682808  0.49311806 -1.03765861  0.93149357
      1.          0.09228748  0.88883864  0.        ]
    [-1.20030006 -0.89626498 -0.05514595  0.76980985 -0.7493565   0.
      0.04193015  0.33534839  1.          0.11307564]
    [-0.77378303 -0.99676381  1.64484777 -1.71929067  1.51198065  0.18831419
   ...
  ) -> (
   shape=(4,) type=<class 'numpy.ndarray'>
   [ 0.73619378  0.87936142 -0.56528874 -0.2675163 ]
  )