
.. _2018-09-18sklearnapirst:

================================
2018-09-18 - API de scikit-learn
================================

.. only:: html

    **Links:** :download:`notebook <2018-09-18_sklearn_api.ipynb>`, :downloadlink:`html <2018-09-18_sklearn_api2html.html>`, :download:`python <2018-09-18_sklearn_api.py>`, :downloadlink:`slides <2018-09-18_sklearn_api.slides.html>`, :githublink:`GitHub|_doc/notebooks/notebook_eleves/2018-2019/2018-09-18_sklearn_api.ipynb|*`

Présentation de l’API de *scikit-learn* et implémentation d’un
prédicteur fait maison. On utilise le jeu du Titanic qu’on peut
récupérer sur
`opendatasoft <https://public.opendatasoft.com/explore/dataset/titanic-passengers/table/>`__
ou
`awesome-public-datasets <https://github.com/awesomedata/awesome-public-datasets>`__.

.. code:: ipython3

    import pandas
    df = pandas.read_csv("titanic.csv/titanic.csv")
    df.head(n=2)

.. raw:: html

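    <div>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;"><th></th><th>PassengerId</th><th>Survived</th><th>Pclass</th><th>Name</th><th>Sex</th><th>Age</th><th>SibSp</th><th>Parch</th><th>Ticket</th><th>Fare</th><th>Cabin</th><th>Embarked</th></tr>
      </thead>
      <tbody>
        <tr><th>0</th><td>1</td><td>0</td><td>3</td><td>Braund, Mr. Owen Harris</td><td>male</td><td>22.0</td><td>1</td><td>0</td><td>A/5 21171</td><td>7.2500</td><td>NaN</td><td>S</td></tr>
        <tr><th>1</th><td>2</td><td>1</td><td>1</td><td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td><td>female</td><td>38.0</td><td>1</td><td>0</td><td>PC 17599</td><td>71.2833</td><td>C85</td><td>C</td></tr>
      </tbody>
    </table>
    </div>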

.. code:: ipython3

    # Features: the Age and Fare columns; target: the Survived column.
    X, y = df[["Age", "Fare"]], df['Survived']

.. code:: ipython3

    from sklearn.model_selection import train_test_split
    # Split into train and test parts with train_test_split's defaults
    # (only X and y are passed).
    X_train, X_test, y_train, y_test = train_test_split(X, y)

.. code:: ipython3

    from sklearn.linear_model import LogisticRegression
    cls = LogisticRegression()
    # Fitting directly on X_train raises; the exception is caught and its
    # message printed (it reports NaN / infinite / too large input, see below).
    try:
        cls.fit(X_train, y_train)
    except Exception as e:
        print(e)

.. parsed-literal::

    Input contains NaN, infinity or a value too large for dtype('float64').

.. code:: ipython3

    # SimpleImputer is used under the alias Imputer; if sklearn.impute cannot
    # be imported, fall back to sklearn.preprocessing.Imputer.
    try:
        from sklearn.impute import SimpleImputer as Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer
    imp = Imputer()
    # Fit the imputer on the training features, then transform those same
    # features into X_train_nomiss.
    imp.fit(X_train)
    X_train_nomiss = imp.transform(X_train)

.. code:: ipython3

    # Same model as before, now fitted on the imputed training features.
    cls = LogisticRegression()
    cls.fit(X_train_nomiss, y_train)

.. parsed-literal::

    LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                       intercept_scaling=1, l1_ratio=None, max_iter=100,
                       multi_class='auto', n_jobs=None, penalty='l2',
                       random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                       warm_start=False)

.. code:: ipython3

    # Score on the test set, transformed with the imputer fitted on X_train.
    cls.score(imp.transform(X_test), y_test)

.. parsed-literal::

    0.6502242152466368

.. code:: ipython3

    from sklearn.pipeline import Pipeline
    # Chain the imputer and the logistic regression as two named steps so
    # that a single fit/score call runs both.
    pipe = Pipeline([("imputer", Imputer()), ("lr", LogisticRegression())])
    pipe.fit(X_train, y_train)

.. parsed-literal::

    Pipeline(memory=None,
             steps=[('imputer',
                     SimpleImputer(add_indicator=False, copy=True,
                                   fill_value=None, missing_values=nan,
                                   strategy='mean', verbose=0)),
                    ('lr',
                     LogisticRegression(C=1.0, class_weight=None, dual=False,
                                        fit_intercept=True, intercept_scaling=1,
                                        l1_ratio=None, max_iter=100,
                                        multi_class='auto', n_jobs=None,
                                        penalty='l2', random_state=None,
                                        solver='lbfgs', tol=0.0001, verbose=0,
                                        warm_start=False))],
             verbose=False)

.. code:: ipython3

    # The pipeline receives the raw test features (no manual imputation).
    pipe.score(X_test, y_test)

.. parsed-literal::

    0.6502242152466368

.. code:: ipython3

    from sklearn.model_selection import GridSearchCV
    # Grid keys use the "<step name>__<parameter>" form:
    # 2 imputation strategies x 3 values of max_iter.
    grid = GridSearchCV(pipe, {"imputer__strategy": ['mean', 'most_frequent'],
                               "lr__max_iter": [5, 10, 50]})
    grid.fit(X_train, y_train)

.. parsed-literal::

    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)
    C:\xavierdupre\__home_\github_fork\scikit-learn\sklearn\linear_model\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
      "of iterations.", ConvergenceWarning)

.. parsed-literal::

    GridSearchCV(cv=None, error_score=nan,
                 estimator=Pipeline(memory=None,
                                    steps=[('imputer',
                                            SimpleImputer(add_indicator=False,
                                                          copy=True,
                                                          fill_value=None,
                                                          missing_values=nan,
                                                          strategy='mean',
                                                          verbose=0)),
                                           ('lr',
                                            LogisticRegression(C=1.0,
                                                               class_weight=None,
                                                               dual=False,
                                                               fit_intercept=True,
                                                               intercept_scaling=1,
                                                               l1_ratio=None,
                                                               max_iter=100,
                                                               multi_class='auto',
                                                               n_jobs=None,
                                                               penalty='l2',
                                                               random_state=None,
                                                               solver='lbfgs',
                                                               tol=0.0001,
                                                               verbose=0,
                                                               warm_start=False))],
                                    verbose=False),
                 iid='deprecated', n_jobs=None,
                 param_grid={'imputer__strategy': ['mean', 'most_frequent'],
                             'lr__max_iter': [5, 10, 50]},
                 pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
                 scoring=None, verbose=0)

.. code:: ipython3

    res = pandas.DataFrame(grid.cv_results_)
    # Keep only the parameter columns and the test-score columns, and
    # transpose so that each parameter combination becomes one column.
    col = [_ for _ in res.columns if 'param_' in _ or "test_score" in _]
    res[col].T

.. raw:: html

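    <div>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;"><th></th><th>0</th><th>1</th><th>2</th><th>3</th><th>4</th><th>5</th></tr>
      </thead>
      <tbody>
        <tr><th>param_imputer__strategy</th><td>mean</td><td>mean</td><td>mean</td><td>most_frequent</td><td>most_frequent</td><td>most_frequent</td></tr>
        <tr><th>param_lr__max_iter</th><td>5</td><td>10</td><td>50</td><td>5</td><td>10</td><td>50</td></tr>
        <tr><th>split0_test_score</th><td>0.686567</td><td>0.69403</td><td>0.656716</td><td>0.686567</td><td>0.69403</td><td>0.656716</td></tr>
        <tr><th>split1_test_score</th><td>0.619403</td><td>0.604478</td><td>0.597015</td><td>0.61194</td><td>0.626866</td><td>0.61194</td></tr>
        <tr><th>split2_test_score</th><td>0.679104</td><td>0.679104</td><td>0.671642</td><td>0.664179</td><td>0.671642</td><td>0.656716</td></tr>
        <tr><th>split3_test_score</th><td>0.706767</td><td>0.699248</td><td>0.684211</td><td>0.706767</td><td>0.714286</td><td>0.684211</td></tr>
        <tr><th>split4_test_score</th><td>0.676692</td><td>0.699248</td><td>0.699248</td><td>0.676692</td><td>0.706767</td><td>0.691729</td></tr>
        <tr><th>mean_test_score</th><td>0.673707</td><td>0.675222</td><td>0.661766</td><td>0.669229</td><td>0.682718</td><td>0.660263</td></tr>
        <tr><th>std_test_score</th><td>0.0291387</td><td>0.0361333</td><td>0.0352828</td><td>0.0318525</td><td>0.0314484</td><td>0.0280138</td></tr>
        <tr><th>rank_test_score</th><td>3</td><td>2</td><td>5</td><td>4</td><td>1</td><td>6</td></tr>
      </tbody>
    </table>
    </div>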

.. code:: ipython3

    from sklearn.base import BaseEstimator, ClassifierMixin
    import numpy

    class MeanPredictor(BaseEstimator, ClassifierMixin):
        """Constant classifier predicting ``int(alpha + mean(y))``.

        The value is computed once in :meth:`fit` from the training labels
        and returned for every row in :meth:`predict`; the features are
        only used for their number of rows.

        :param alpha: offset added to the mean of the labels before the
            sum is truncated to an integer (default 0.5)
        """

        def __init__(self, alpha=0.5):
            self.alpha = alpha

        def fit(self, X, y):
            """Store the constant prediction in ``mean_``.

            :param X: training features (not used)
            :param y: training labels
            :return: self, so that calls can be chained as with any
                scikit-learn estimator
            """
            self.mean_ = int(self.alpha + numpy.mean(y))
            return self

        def predict(self, X):
            """Return the constant ``mean_`` once per row of *X*.

            :param X: features, only ``X.shape[0]`` is read
            :return: array of length ``X.shape[0]`` filled with ``mean_``
            """
            return numpy.full(X.shape[0], self.mean_)

.. code:: ipython3

    # The home-made predictor is used as the last step of a pipeline,
    # exactly like the logistic regression was.
    pipe_mean = Pipeline([('imputer', Imputer()), ('meanpredictor', MeanPredictor())])
    pipe_mean.fit(X_train, y_train)

.. parsed-literal::

    Pipeline(memory=None,
             steps=[('imputer',
                     SimpleImputer(add_indicator=False, copy=True,
                                   fill_value=None, missing_values=nan,
                                   strategy='mean', verbose=0)),
                    ('meanpredictor', MeanPredictor(alpha=0.5))],
             verbose=False)

.. code:: ipython3

    # score() is not defined in MeanPredictor: it comes from its base classes.
    pipe_mean.score(X_test, y_test)

.. parsed-literal::

    0.6322869955156951

.. code:: ipython3

    from sklearn.model_selection import GridSearchCV
    # alpha is a constructor argument stored unchanged on the instance, so it
    # can be searched with the "<step name>__<parameter>" syntax.
    grid = GridSearchCV(pipe_mean, {"imputer__strategy": ['mean', 'most_frequent'],
                                    "meanpredictor__alpha": [0.2, 0.5, 0.8]})
    grid.fit(X_train, y_train)

.. parsed-literal::

    GridSearchCV(cv=None, error_score=nan,
                 estimator=Pipeline(memory=None,
                                    steps=[('imputer',
                                            SimpleImputer(add_indicator=False,
                                                          copy=True,
                                                          fill_value=None,
                                                          missing_values=nan,
                                                          strategy='mean',
                                                          verbose=0)),
                                           ('meanpredictor',
                                            MeanPredictor(alpha=0.5))],
                                    verbose=False),
                 iid='deprecated', n_jobs=None,
                 param_grid={'imputer__strategy': ['mean', 'most_frequent'],
                             'meanpredictor__alpha': [0.2, 0.5, 0.8]},
                 pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
                 scoring=None, verbose=0)

.. code:: ipython3

    res = pandas.DataFrame(grid.cv_results_)
    # Keep only the parameter columns and the test-score columns, and
    # transpose so that each parameter combination becomes one column.
    col = [_ for _ in res.columns if 'param_' in _ or "test_score" in _]
    res[col].T

.. raw:: html

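    <div>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;"><th></th><th>0</th><th>1</th><th>2</th><th>3</th><th>4</th><th>5</th></tr>
      </thead>
      <tbody>
        <tr><th>param_imputer__strategy</th><td>mean</td><td>mean</td><td>mean</td><td>most_frequent</td><td>most_frequent</td><td>most_frequent</td></tr>
        <tr><th>param_meanpredictor__alpha</th><td>0.2</td><td>0.5</td><td>0.8</td><td>0.2</td><td>0.5</td><td>0.8</td></tr>
        <tr><th>split0_test_score</th><td>0.61194</td><td>0.61194</td><td>0.38806</td><td>0.61194</td><td>0.61194</td><td>0.38806</td></tr>
        <tr><th>split1_test_score</th><td>0.61194</td><td>0.61194</td><td>0.38806</td><td>0.61194</td><td>0.61194</td><td>0.38806</td></tr>
        <tr><th>split2_test_score</th><td>0.61194</td><td>0.61194</td><td>0.38806</td><td>0.61194</td><td>0.61194</td><td>0.38806</td></tr>
        <tr><th>split3_test_score</th><td>0.609023</td><td>0.609023</td><td>0.390977</td><td>0.609023</td><td>0.609023</td><td>0.390977</td></tr>
        <tr><th>split4_test_score</th><td>0.609023</td><td>0.609023</td><td>0.390977</td><td>0.609023</td><td>0.609023</td><td>0.390977</td></tr>
        <tr><th>mean_test_score</th><td>0.610773</td><td>0.610773</td><td>0.389227</td><td>0.610773</td><td>0.610773</td><td>0.389227</td></tr>
        <tr><th>std_test_score</th><td>0.0014294</td><td>0.0014294</td><td>0.0014294</td><td>0.0014294</td><td>0.0014294</td><td>0.0014294</td></tr>
        <tr><th>rank_test_score</th><td>1</td><td>1</td><td>5</td><td>1</td><td>1</td><td>5</td></tr>
      </tbody>
    </table>
    </div>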

.. code:: ipython3

    # Pipeline retained by the grid search (the search ran with refit=True).
    best = grid.best_estimator_

.. code:: ipython3

    import pickle
    # Serialize the whole fitted pipeline into model.pkl.
    with open("model.pkl", "wb") as f:
        pickle.dump(best, f)

.. code:: ipython3

    # Load it back. NOTE(review): pickle.load should only be used on files
    # from a trusted source; here the file was written by the cell above.
    with open("model.pkl", "rb") as f:
        model = pickle.load(f)

.. code:: ipython3

    # Element-wise comparison of the predictions of the restored model and
    # of the original one on the test set.
    model.predict(X_test) == best.predict(X_test)

.. parsed-literal::

    array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True])