2018-09-18 - API de scikit-learn

Links: notebook, html, PDF, python, slides, GitHub

Présentation de l’API de scikit-learn et implémentation d’un prédicteur fait maison. On utilise le jeu du Titanic qu’on peut récupérer sur opendatasoft ou awesome-public-datasets.

# Load the Titanic dataset (the CSV sits inside a same-named folder,
# as extracted from the downloaded archive).
import pandas
df = pandas.read_csv("titanic.csv/titanic.csv")
# Peek at the first two rows to inspect the available columns.
df.head(n=2)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
# Features: Age and Fare; target: Survived (0/1).
X, y = df[["Age", "Fare"]], df['Survived']
from sklearn.model_selection import train_test_split
# Default split: 75% train / 25% test.
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.linear_model import LogisticRegression
cls = LogisticRegression()
try:
    # This fails: the features contain missing values (NaN),
    # which LogisticRegression rejects — see the error printed below.
    cls.fit(X_train, y_train)
except Exception as e:
    print(e)
Input contains NaN, infinity or a value too large for dtype('float64').
# Replace missing values before fitting; Imputer's default strategy
# fills NaN with the column mean.
# NOTE(review): sklearn.preprocessing.Imputer is deprecated and removed
# in modern scikit-learn — newer code uses sklearn.impute.SimpleImputer.
from sklearn.preprocessing import Imputer
imp = Imputer()
# Fit the imputer on the training set only, then transform it.
imp.fit(X_train)
X_train_nomiss = imp.transform(X_train)
cls = LogisticRegression()
cls.fit(X_train_nomiss, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
cls.score(imp.transform(X_test), y_test)
0.6681614349775785
# Chain imputation and classification so a single fit/score call
# applies both steps in order.
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(Imputer(), LogisticRegression())
pipe.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
pipe.score(X_test, y_test)
0.6681614349775785
from sklearn.model_selection import GridSearchCV
# Grid-search over the whole pipeline; step parameters are addressed
# with the <step_name>__<param_name> convention.
grid = GridSearchCV(pipe, {"imputer__strategy": ['mean', 'most_frequent'],
                           "logisticregression__max_iter": [5, 10, 50]})
grid.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'imputer__strategy': ['mean', 'most_frequent'], 'logisticregression__max_iter': [5, 10, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
# Summarize the grid search: keep only the parameter columns and the
# per-split / aggregated test scores, transposed for readability.
res = pandas.DataFrame(grid.cv_results_)
col = [name for name in res.columns
       if ('param_' in name) or ("test_score" in name)]
res[col].T
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
0 1 2 3 4 5
param_imputer__strategy mean mean mean most_frequent most_frequent most_frequent
param_logisticregression__max_iter 5 10 50 5 10 50
split0_test_score 0.659193 0.659193 0.659193 0.668161 0.668161 0.668161
split1_test_score 0.681614 0.681614 0.681614 0.681614 0.681614 0.681614
split2_test_score 0.612613 0.626126 0.626126 0.612613 0.630631 0.630631
mean_test_score 0.651198 0.655689 0.655689 0.654192 0.66018 0.66018
std_test_score 0.0287224 0.0227799 0.0227799 0.0298453 0.0215598 0.0215598
rank_test_score 6 3 3 5 1 1
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy

class MeanPredictor(BaseEstimator, ClassifierMixin):
    """Baseline classifier predicting a single constant class.

    The constant is ``int(alpha + mean(y))``: with 0/1 labels and the
    default ``alpha=0.5`` this is the rounded label mean, i.e. the
    majority class.

    Parameters
    ----------
    alpha : float, default=0.5
        Offset added to the label mean before truncation to int.
    """

    def __init__(self, alpha=0.5):
        # Per scikit-learn convention, __init__ only stores parameters.
        self.alpha = alpha

    def fit(self, X, y):
        """Memorize the thresholded label mean.

        Fix: returns ``self``, as the scikit-learn estimator API
        requires (the original returned None, which breaks chained
        calls such as ``MeanPredictor().fit(X, y).predict(X)``).
        """
        self.mean_ = int(self.alpha + numpy.mean(y))
        return self

    def predict(self, X):
        """Return the learned constant class, one entry per row of X."""
        return numpy.full(X.shape[0], self.mean_)
# Same pipeline pattern, now with the hand-made predictor as final step.
pipe_mean = make_pipeline(Imputer(), MeanPredictor())
pipe_mean.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('meanpredictor', MeanPredictor(alpha=0.5))])
pipe_mean.score(X_test, y_test)
0.6098654708520179
from sklearn.model_selection import GridSearchCV
# The custom estimator exposes its parameters through BaseEstimator,
# so GridSearchCV can tune alpha like any built-in hyperparameter.
grid = GridSearchCV(pipe_mean, {"imputer__strategy": ['mean', 'most_frequent'],
                                "meanpredictor__alpha": [0.2, 0.5, 0.8]})
grid.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('meanpredictor', MeanPredictor(alpha=0.5))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'imputer__strategy': ['mean', 'most_frequent'], 'meanpredictor__alpha': [0.2, 0.5, 0.8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
# Same summary as before: parameters and test scores only, transposed.
res = pandas.DataFrame(grid.cv_results_)
col = [name for name in res.columns
       if ('param_' in name) or ("test_score" in name)]
res[col].T
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
c:\python370_x64\lib\site-packages\sklearn\utils\deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
0 1 2 3 4 5
param_imputer__strategy mean mean mean most_frequent most_frequent most_frequent
param_meanpredictor__alpha 0.2 0.5 0.8 0.2 0.5 0.8
split0_test_score 0.618834 0.618834 0.381166 0.618834 0.618834 0.381166
split1_test_score 0.618834 0.618834 0.381166 0.618834 0.618834 0.381166
split2_test_score 0.617117 0.617117 0.382883 0.617117 0.617117 0.382883
mean_test_score 0.618263 0.618263 0.381737 0.618263 0.618263 0.381737
std_test_score 0.000808777 0.000808777 0.000808777 0.000808777 0.000808777 0.000808777
rank_test_score 1 1 5 1 1 5
# Keep the best pipeline found by the grid search.
best = grid.best_estimator_
import pickle
# Serialize the fitted model to disk...
with open("model.pkl", "wb") as f:
    pickle.dump(best, f)
# ...and reload it to verify the round-trip.
# SECURITY NOTE: pickle.load executes arbitrary code on load; only
# unpickle files you trust.
with open("model.pkl", "rb") as f:
    model = pickle.load(f)
# The reloaded model must produce exactly the same predictions.
model.predict(X_test) == best.predict(X_test)
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])