from jyquickhelper import add_notebook_menu
add_notebook_menu()


%matplotlib inline


from sklearn.datasets import load_diabetes
# Load the scikit-learn diabetes dataset and split it into the feature
# matrix X and the target y.
data = load_diabetes()
X, y = data.data, data.target


from sklearn.model_selection import train_test_split
# Hold out a test set. No random_state is passed, so the partition (and every
# number printed further down) changes from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y)


import numpy
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

# Train a random forest
clr = RandomForestRegressor()
clr.fit(X_train, y_train)

# Collect the prediction of every individual tree: one column per tree
X_train_2 = numpy.zeros((X_train.shape[0], len(clr.estimators_)))
estimators = numpy.array(clr.estimators_).ravel()
for i, est in enumerate(estimators):
    pred = est.predict(X_train)
    X_train_2[:, i] = pred

# Train a Lasso regression on the per-tree predictions
lrs = Lasso(max_iter=10000)
lrs.fit(X_train_2, y_train)
# Display the learned coefficients (one per tree)
lrs.coef_

array([0.00516931, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.05150952, 0.        ,
       0.0114454 , 0.00778913, 0.        , 0.04239907, 0.01882099,
       0.02956967, 0.        , 0.04699227, 0.        , 0.04588009,
       0.00476672, 0.05276899, 0.        , 0.        , 0.00719994,
       0.        , 0.02817731, 0.        , 0.        , 0.03606261,
       0.00228349, 0.01204062, 0.02018557, 0.        , 0.        ,
       0.03759611, 0.04608785, 0.        , 0.00316996, 0.        ,
       0.        , 0.        , 0.01678394, 0.        , 0.        ,
       0.        , 0.00801926, 0.07006079, 0.03263025, 0.        ,
       0.00770145, 0.        , 0.00351302, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00183299, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.02545205, 0.05789703, 0.        , 0.        ,
       0.        , 0.0065516 , 0.        , 0.        , 0.        ,
       0.07234827, 0.        , 0.03547108, 0.        , 0.        ,
       0.03080198, 0.00930293, 0.04231454, 0.        , 0.01124574,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00108674, 0.02485889, 0.01839299, 0.        , 0.        ,
       0.03118312, 0.        , 0.        , 0.        , 0.        ])


from sklearn.pipeline import Pipeline

# First attempt at chaining forest -> per-tree predictions -> Lasso.
# The middle step (its name means "a function that does not exist yet")
# refers to `fct`, which is not defined at this point, so building the list
# of steps raises; the error is caught and printed on purpose.
try:
    pipe = Pipeline(steps=[
        ('rf', RandomForestRegressor()),
        ("une fonction qui n'existe pas encore", fct),
        ("lasso", Lasso()),
    ])
except Exception as e:
    print(e)

name 'fct' is not defined


from sklearn.preprocessing import FunctionTransformer

def random_forest_tree_prediction(rf, X):
    """Return the prediction of every tree of a fitted forest.

    :param rf: fitted ensemble exposing its trees in ``estimators_``
    :param X: samples, passed unchanged to each tree's ``predict``
    :return: array of shape ``(X.shape[0], len(rf.estimators_))`` whose
        column *j* holds the prediction of the *j*-th tree
    """
    per_tree = numpy.zeros((X.shape[0], len(rf.estimators_)))
    trees = numpy.array(rf.estimators_).ravel()
    # Fill the result column by column, one tree at a time.
    for column, tree in enumerate(trees):
        per_tree[:, column] = tree.predict(X)
    return per_tree
    

# Try the helper with the forest fitted earlier, on the whole dataset.
random_forest_tree_prediction(clr, X)

array([[23.8, 31.2, 32. , ..., 32. , 29.9, 28. ],
       [21.6, 22.9, 21.6, ..., 21.6, 22. , 24.4],
       [33.8, 37.2, 34.7, ..., 34.9, 34.7, 34.7],
       ...,
       [23.9, 23.9, 31.5, ..., 29.9, 23.9, 23.9],
       [23.9, 22. , 22. , ..., 29.9, 23.9, 22. ],
       [11.9, 11.9, 11.9, ..., 11.9, 20.6, 11.9]])


# Wrap the helper in a FunctionTransformer. The already fitted forest `clr`
# is bound through the lambda's default argument, so the wrapped function
# only needs to be given X.
fct = FunctionTransformer(lambda X, rf=clr: random_forest_tree_prediction(rf, X) )

# Per-tree predictions, this time obtained through the transformer interface.
fct.transform(X_train)

array([[ 8.5,  8.5,  5.6, ...,  5. ,  5. ,  8.5],
       [14.8, 15. , 13.4, ..., 19.5, 21.7, 16.6],
       [13.1, 13.1, 15.1, ..., 13.1, 11.3, 12.6],
       ...,
       [21.4, 22. , 21.4, ..., 20. , 22.6, 21.4],
       [25.1, 29.9, 25.1, ..., 25.1, 25.1, 25.1],
       [28.4, 28.4, 28.4, ..., 28.4, 28.4, 28.4]])


# Second attempt: every step now exists, but the run recorded below still
# fails. The printed message says intermediate steps must implement fit and
# transform, which RandomForestRegressor does not. The error is caught and
# printed on purpose.
try:
    pipe = Pipeline(steps=[
        ('rf', RandomForestRegressor()),
        ("tree_pred", fct),
        ("lasso", Lasso()),
    ])
except Exception as e:
    print(e)

All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)' (type <class 'sklearn.ensemble.forest.RandomForestRegressor'>) doesn't


# Check whether the fitted forest exposes a transform method.
hasattr(clr, 'transform')

False


from jyquickhelper import RenderJsDot


# Graph of the design attempted above: the pipeline's forest passes X to the
# FunctionTransformer, which also depends on a separately pretrained forest
# (edge "rf"), and the transformer output X2 feeds the Lasso.
RenderJsDot("""digraph {
  A [label="RandomForestRegressor pipline"];
  A2 [label="RandomForestRegressor - pretrained"];
  B [label="FunctionTransformer"]; C [label="Lasso"];
  A -> B [label="X"]; B -> C [label="X2"]; A2 -> B [label="rf"]; }""")


class RandomForestRegressorAsTransformer:
    """Expose a random forest through a ``fit`` / ``transform`` interface.

    ``fit`` trains the wrapped forest and ``transform`` returns the
    prediction of each of its trees, one column per tree.
    """

    def __init__(self, **kwargs):
        # Every keyword argument is forwarded to RandomForestRegressor.
        self.rf = RandomForestRegressor(**kwargs)

    def fit(self, X, y):
        """Train the wrapped forest on ``(X, y)`` and return ``self``."""
        self.rf.fit(X, y)
        return self

    def transform(self, X):
        """Return an array of shape ``(X.shape[0], number of trees)``."""
        per_tree = numpy.zeros((X.shape[0], len(self.rf.estimators_)))
        trees = numpy.array(self.rf.estimators_).ravel()
        # Fill the result column by column, one tree at a time.
        for column, tree in enumerate(trees):
            per_tree[:, column] = tree.predict(X)
        return per_tree


# Check the wrapper on its own: fit it, then display the per-tree predictions.
trrf = RandomForestRegressorAsTransformer()
trrf.fit(X_train, y_train)
trrf.transform(X_train)

array([[ 8.5,  8.5, 10.4, ...,  8.5,  8.5,  5.6],
       [13.4, 15. , 21.7, ..., 21.7, 21.7, 20. ],
       [ 8.3, 13.1, 15.1, ..., 13.6, 13.1, 17.1],
       ...,
       [21.4, 21.2, 21.4, ..., 21.4, 21.4, 21.4],
       [23.8, 25.1, 25.1, ..., 24.3, 25.1, 22. ],
       [28.4, 28.4, 28.4, ..., 28.1, 28.4, 28.4]])


# The wrapper implements fit and transform, so it can be used as the
# intermediate step in front of the Lasso.
pipe = Pipeline(steps=[('trrf', RandomForestRegressorAsTransformer()),
                       ("lasso", Lasso())])

# Lasso is created with its default arguments here (no max_iter override as
# in the earlier manual version).
pipe.fit(X_train, y_train)

C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 15.605865570498736, tolerance: 3.566623625329815
  positive)

Pipeline(memory=None,
         steps=[('trrf',
                 <__main__.RandomForestRegressorAsTransformer object at 0x000002D7FBD6B0F0>),
                ('lasso',
                 Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)


# Coefficients of the Lasso step (second entry of pipe.steps).
pipe.steps[1][1].coef_

array([0.00000000e+00, 8.18725785e-03, 2.57107281e-02, 2.64260468e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.91496034e-02, 0.00000000e+00, 2.59224355e-03,
       0.00000000e+00, 0.00000000e+00, 1.11199737e-02, 2.25351658e-02,
       0.00000000e+00, 1.89481812e-02, 1.02779896e-01, 0.00000000e+00,
       6.25993012e-03, 2.88645052e-02, 2.26525053e-02, 0.00000000e+00,
       1.58723695e-02, 2.17116677e-02, 5.73111769e-02, 4.07723945e-02,
       3.07676159e-02, 0.00000000e+00, 0.00000000e+00, 2.96368833e-02,
       6.31627239e-03, 3.05513736e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.61832331e-02, 0.00000000e+00, 0.00000000e+00,
       1.95009449e-02, 3.88476951e-02, 1.12862592e-02, 1.97136005e-02,
       0.00000000e+00, 5.67052346e-02, 9.39029327e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 6.86248078e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 5.22709050e-02, 1.56786096e-02,
       0.00000000e+00, 1.06189159e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 7.99152616e-02, 0.00000000e+00, 1.05299329e-02,
       0.00000000e+00, 0.00000000e+00, 4.70392340e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.04728058e-03, 3.60665273e-02, 0.00000000e+00, 0.00000000e+00,
       1.21597852e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 4.73818504e-02, 1.70113005e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.29949689e-06, 0.00000000e+00])


from sklearn.model_selection import GridSearchCV

# Grid over the forest size and the Lasso penalty; each key is written as
# `<step name>__<parameter name>`.
param_grid = {'trrf__n_estimators': [30, 50, 80, 100],
              'lasso__alpha': [0.5, 1.0, 1.5]}

# This run fails because the wrapper class has no set_params method at this
# stage (see the printed message); the error is caught and printed on purpose.
try:
    grid = GridSearchCV(pipe, cv=5, verbose=1, param_grid=param_grid)
    grid.fit(X_train, y_train)
except Exception as e:
    print(e)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
'RandomForestRegressorAsTransformer' object has no attribute 'set_params'

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


class RandomForestRegressorAsTransformer:
    """Expose a random forest through a ``fit`` / ``transform`` interface.

    ``fit`` trains the wrapped forest and ``transform`` returns the
    prediction of each of its trees, one column per tree. ``set_params``
    forwards parameters to the wrapped forest so that they can be changed
    from outside, for instance by a grid search addressing
    ``<step>__<parameter>``.
    """

    def __init__(self, **kwargs):
        # Every keyword argument is forwarded to RandomForestRegressor.
        self.rf = RandomForestRegressor(**kwargs)

    def fit(self, X, y):
        """Train the wrapped forest on ``(X, y)`` and return ``self``."""
        self.rf.fit(X, y)
        return self

    def transform(self, X):
        """Return an array of shape ``(X.shape[0], number of trees)``."""
        preds = numpy.zeros((X.shape[0], len(self.rf.estimators_)))
        estimators = numpy.array(self.rf.estimators_).ravel()
        for i, est in enumerate(estimators):
            preds[:, i] = est.predict(X)
        return preds

    def set_params(self, **params):
        """Forward ``params`` to the wrapped forest and return ``self``.

        Returning the instance follows the scikit-learn ``set_params``
        convention and allows calls to be chained.
        """
        self.rf.set_params(**params)
        return self


import warnings
from sklearn.exceptions import ConvergenceWarning

# Rebuild the pipeline so that it uses the wrapper class defined just above,
# which implements set_params.
pipe = Pipeline(steps=[('trrf', RandomForestRegressorAsTransformer()),
                       ("lasso", Lasso())])
    
# Smaller grid than in the failed attempt: two forest sizes, three penalties.
param_grid = {'trrf__n_estimators': [50, 100],
              'lasso__alpha': [0.5, 1.0, 1.5]}

grid = GridSearchCV(pipe, cv=5, verbose=2, param_grid=param_grid)

with warnings.catch_warnings(record=False) as w:
    # Convergence warnings are ignored because there are many of them.
    warnings.simplefilter("ignore", ConvergenceWarning)
    grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] lasso__alpha=0.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=0.5, trrf__n_estimators=50, total=   0.3s
[CV] lasso__alpha=0.5, trrf__n_estimators=50 .........................

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s

[CV] .......... lasso__alpha=0.5, trrf__n_estimators=50, total=   0.3s
[CV] lasso__alpha=0.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=0.5, trrf__n_estimators=50, total=   0.3s
[CV] lasso__alpha=0.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=0.5, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=0.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=0.5, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=0.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=0.5, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=0.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=0.5, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=0.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=0.5, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=0.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=0.5, trrf__n_estimators=100, total=   0.5s
[CV] lasso__alpha=0.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=0.5, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=1.0, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.0, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.0, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.0, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.0, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.0, trrf__n_estimators=50, total=   0.3s
[CV] lasso__alpha=1.0, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.0, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.0, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.0, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.0, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.0, trrf__n_estimators=100, total=   0.5s
[CV] lasso__alpha=1.0, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.0, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=1.0, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.0, trrf__n_estimators=100, total=   0.5s
[CV] lasso__alpha=1.0, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.0, trrf__n_estimators=100, total=   0.5s
[CV] lasso__alpha=1.0, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.0, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=1.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.5, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.5, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.5, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.5, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.5, trrf__n_estimators=50 .........................
[CV] .......... lasso__alpha=1.5, trrf__n_estimators=50, total=   0.2s
[CV] lasso__alpha=1.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.5, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=1.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.5, trrf__n_estimators=100, total=   0.4s
[CV] lasso__alpha=1.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.5, trrf__n_estimators=100, total=   0.5s
[CV] lasso__alpha=1.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.5, trrf__n_estimators=100, total=   0.7s
[CV] lasso__alpha=1.5, trrf__n_estimators=100 ........................
[CV] ......... lasso__alpha=1.5, trrf__n_estimators=100, total=   0.5s

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   10.6s finished


# Parameter combination selected by the grid search.
grid.best_params_

{'lasso__alpha': 0.5, 'trrf__n_estimators': 50}


# Lasso coefficients of the best pipeline (second entry of its steps).
grid.best_estimator_.steps[1][1].coef_

array([ 0.01661755,  0.09608553, -0.        ,  0.04337892,  0.00256722,
        0.04875441,  0.0022436 ,  0.00757652,  0.        ,  0.        ,
        0.        ,  0.04868714,  0.00878259,  0.        ,  0.01989812,
        0.0123234 ,  0.        ,  0.06432313,  0.00565488,  0.        ,
        0.00119269,  0.        ,  0.00611262,  0.        ,  0.        ,
        0.        ,  0.01786513,  0.        ,  0.026654  ,  0.        ,
        0.        ,  0.        ,  0.09583967,  0.00722895,  0.05395944,
        0.063898  ,  0.0586511 ,  0.        ,  0.        ,  0.1290402 ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.00750308,  0.        ,  0.08633491,  0.03593556,  0.05771344])


# Cross-validation score reached by the selected parameter combination.
grid.best_score_

0.8321710116268228


# Score of the grid search's best pipeline on the held-out test set.
grid.score(X_test, y_test)

0.8772908724536076


# Compare the number of coefficients (one per tree) with the number of
# non-zero ones, i.e. the trees the Lasso actually keeps.
coef = grid.best_estimator_.steps[1][1].coef_
coef.shape, sum(coef != 0)

((50,), 27)

2A.ml - Pipeline pour une réduction d'une forêt aléatoire - correction¶

Datasets¶

Forêt aléatoire suivie de Lasso¶

Premier pipeline¶

Second pipeline¶

A quoi ça sert : GridSearchCV¶