{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 2018-09-18 - API de scikit-learn\n", "\n", "Pr\u00e9sentation de l'API de *scikit-learn* et impl\u00e9mentation d'un pr\u00e9dicteur fait maison. On utilise le jeu du Titanic qu'on peut r\u00e9cup\u00e9rer sur [opendatasoft](https://public.opendatasoft.com/explore/dataset/titanic-passengers/?flg=fr) ou [awesome-public-datasets](https://github.com/awesomedata/awesome-public-datasets/tree/master/Datasets)."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "
\n", " \n", " \n", " | \n", " PassengerId | \n", " Survived | \n", " Pclass | \n", " Name | \n", " Sex | \n", " Age | \n", " SibSp | \n", " Parch | \n", " Ticket | \n", " Fare | \n", " Cabin | \n", " Embarked | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 1 | \n", " 0 | \n", " 3 | \n", " Braund, Mr. Owen Harris | \n", " male | \n", " 22.0 | \n", " 1 | \n", " 0 | \n", " A/5 21171 | \n", " 7.2500 | \n", " NaN | \n", " S | \n", "
\n", " \n", " 1 | \n", " 2 | \n", " 1 | \n", " 1 | \n", " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", " female | \n", " 38.0 | \n", " 1 | \n", " 0 | \n", " PC 17599 | \n", " 71.2833 | \n", " C85 | \n", " C | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C "]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "df = pandas.read_csv(\"titanic.csv/titanic.csv\")\n", "df.head(n=2)"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["X, y = df[[\"Age\", \"Fare\"]], df['Survived']"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y)"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Input contains NaN, infinity or a value too large for dtype('float64').\n"]}], "source": ["from sklearn.linear_model import LogisticRegression\n", "cls = LogisticRegression()\n", "try:\n", " cls.fit(X_train, y_train)\n", "except Exception as e:\n", " print(e)"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["try:\n", " from sklearn.impute import SimpleImputer as Imputer\n", "except ImportError:\n", " from sklearn.preprocessing import Imputer\n", "imp = Imputer()\n", "imp.fit(X_train)\n", "X_train_nomiss = imp.transform(X_train)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/plain": ["LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", " multi_class='auto', n_jobs=None, penalty='l2',\n", " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)"]}, "execution_count": 7, "metadata": {}, 
"output_type": "execute_result"}], "source": ["cls = LogisticRegression()\n", "cls.fit(X_train_nomiss, y_train)"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/plain": ["0.6502242152466368"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["cls.score(imp.transform(X_test), y_test)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/plain": ["Pipeline(memory=None,\n", " steps=[('imputer',\n", " SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n", " missing_values=nan, strategy='mean',\n", " verbose=0)),\n", " ('lr',\n", " LogisticRegression(C=1.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1,\n", " l1_ratio=None, max_iter=100,\n", " multi_class='auto', n_jobs=None,\n", " penalty='l2', random_state=None,\n", " solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False))],\n", " verbose=False)"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["from sklearn.pipeline import Pipeline\n", "pipe = Pipeline([(\"imputer\", Imputer()), \n", " (\"lr\", LogisticRegression())])\n", "pipe.fit(X_train, y_train)"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/plain": ["0.6502242152466368"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["pipe.score(X_test, y_test)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {"scrolled": false}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. 
Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. 
Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. 
Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n", "C:\\xavierdupre\\__home_\\github_fork\\scikit-learn\\sklearn\\linear_model\\logistic.py:935: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", " \"of iterations.\", ConvergenceWarning)\n"]}, {"data": {"text/plain": ["GridSearchCV(cv=None, error_score=nan,\n", " estimator=Pipeline(memory=None,\n", " steps=[('imputer',\n", " SimpleImputer(add_indicator=False,\n", " copy=True,\n", " fill_value=None,\n", " missing_values=nan,\n", " strategy='mean',\n", " verbose=0)),\n", " ('lr',\n", " LogisticRegression(C=1.0,\n", " class_weight=None,\n", " dual=False,\n", " fit_intercept=True,\n", " intercept_scaling=1,\n", " l1_ratio=None,\n", " max_iter=100,\n", " multi_class='auto',\n", " n_jobs=None,\n", " penalty='l2',\n", " random_state=None,\n", " solver='lbfgs',\n", " tol=0.0001,\n", " verbose=0,\n", " warm_start=False))],\n", " verbose=False),\n", " iid='deprecated', n_jobs=None,\n", " param_grid={'imputer__strategy': ['mean', 'most_frequent'],\n", " 'lr__max_iter': [5, 10, 50]},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", " scoring=None, verbose=0)"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["from sklearn.model_selection import GridSearchCV\n", "grid = GridSearchCV(pipe, {\"imputer__strategy\": ['mean', 'most_frequent'],\n", " \"lr__max_iter\": [5, 10, 50]})\n", "grid.fit(X_train, y_train)"]}, {"cell_type": "code", "execution_count": 11, 
"metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " 0 | \n", " 1 | \n", " 2 | \n", " 3 | \n", " 4 | \n", " 5 | \n", "
\n", " \n", " \n", " \n", " param_imputer__strategy | \n", " mean | \n", " mean | \n", " mean | \n", " most_frequent | \n", " most_frequent | \n", " most_frequent | \n", "
\n", " \n", " param_lr__max_iter | \n", " 5 | \n", " 10 | \n", " 50 | \n", " 5 | \n", " 10 | \n", " 50 | \n", "
\n", " \n", " split0_test_score | \n", " 0.686567 | \n", " 0.69403 | \n", " 0.656716 | \n", " 0.686567 | \n", " 0.69403 | \n", " 0.656716 | \n", "
\n", " \n", " split1_test_score | \n", " 0.619403 | \n", " 0.604478 | \n", " 0.597015 | \n", " 0.61194 | \n", " 0.626866 | \n", " 0.61194 | \n", "
\n", " \n", " split2_test_score | \n", " 0.679104 | \n", " 0.679104 | \n", " 0.671642 | \n", " 0.664179 | \n", " 0.671642 | \n", " 0.656716 | \n", "
\n", " \n", " split3_test_score | \n", " 0.706767 | \n", " 0.699248 | \n", " 0.684211 | \n", " 0.706767 | \n", " 0.714286 | \n", " 0.684211 | \n", "
\n", " \n", " split4_test_score | \n", " 0.676692 | \n", " 0.699248 | \n", " 0.699248 | \n", " 0.676692 | \n", " 0.706767 | \n", " 0.691729 | \n", "
\n", " \n", " mean_test_score | \n", " 0.673707 | \n", " 0.675222 | \n", " 0.661766 | \n", " 0.669229 | \n", " 0.682718 | \n", " 0.660263 | \n", "
\n", " \n", " std_test_score | \n", " 0.0291387 | \n", " 0.0361333 | \n", " 0.0352828 | \n", " 0.0318525 | \n", " 0.0314484 | \n", " 0.0280138 | \n", "
\n", " \n", " rank_test_score | \n", " 3 | \n", " 2 | \n", " 5 | \n", " 4 | \n", " 1 | \n", " 6 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" 0 1 2 3 \\\n", "param_imputer__strategy mean mean mean most_frequent \n", "param_lr__max_iter 5 10 50 5 \n", "split0_test_score 0.686567 0.69403 0.656716 0.686567 \n", "split1_test_score 0.619403 0.604478 0.597015 0.61194 \n", "split2_test_score 0.679104 0.679104 0.671642 0.664179 \n", "split3_test_score 0.706767 0.699248 0.684211 0.706767 \n", "split4_test_score 0.676692 0.699248 0.699248 0.676692 \n", "mean_test_score 0.673707 0.675222 0.661766 0.669229 \n", "std_test_score 0.0291387 0.0361333 0.0352828 0.0318525 \n", "rank_test_score 3 2 5 4 \n", "\n", " 4 5 \n", "param_imputer__strategy most_frequent most_frequent \n", "param_lr__max_iter 10 50 \n", "split0_test_score 0.69403 0.656716 \n", "split1_test_score 0.626866 0.61194 \n", "split2_test_score 0.671642 0.656716 \n", "split3_test_score 0.714286 0.684211 \n", "split4_test_score 0.706767 0.691729 \n", "mean_test_score 0.682718 0.660263 \n", "std_test_score 0.0314484 0.0280138 \n", "rank_test_score 1 6 "]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["res = pandas.DataFrame(grid.cv_results_)\n", "col = [_ for _ in res.columns if 'param_' in _ or \"test_score\" in _]\n", "res[col].T"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["from sklearn.base import BaseEstimator, ClassifierMixin\n", "import numpy\n", "\n", "class MeanPredictor(BaseEstimator, ClassifierMixin):\n", " # Constant classifier: always predicts int(alpha + mean(y)),\n", " # i.e. 1 when mean(y) >= 1 - alpha, else 0.\n", " def __init__(self, alpha=0.5):\n", " self.alpha = alpha\n", " \n", " def fit(self, X, y):\n", " self.mean_ = int(self.alpha + numpy.mean(y))\n", " # scikit-learn estimator contract: fit must return self\n", " # so that calls can be chained (est.fit(X, y).predict(X))\n", " return self\n", " \n", " def predict(self, X):\n", " # one constant prediction per row of X\n", " return numpy.full(X.shape[0], self.mean_)"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["Pipeline(memory=None,\n", " steps=[('imputer',\n", " SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n", " missing_values=nan, strategy='mean',\n", " verbose=0)),\n", " 
('meanpredictor', MeanPredictor(alpha=0.5))],\n", " verbose=False)"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["pipe_mean = Pipeline([('imputer', Imputer()), \n", " ('meanpredictor', MeanPredictor())])\n", "pipe_mean.fit(X_train, y_train)"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [{"data": {"text/plain": ["0.6322869955156951"]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["pipe_mean.score(X_test, y_test)"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [{"data": {"text/plain": ["GridSearchCV(cv=None, error_score=nan,\n", " estimator=Pipeline(memory=None,\n", " steps=[('imputer',\n", " SimpleImputer(add_indicator=False,\n", " copy=True,\n", " fill_value=None,\n", " missing_values=nan,\n", " strategy='mean',\n", " verbose=0)),\n", " ('meanpredictor',\n", " MeanPredictor(alpha=0.5))],\n", " verbose=False),\n", " iid='deprecated', n_jobs=None,\n", " param_grid={'imputer__strategy': ['mean', 'most_frequent'],\n", " 'meanpredictor__alpha': [0.2, 0.5, 0.8]},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", " scoring=None, verbose=0)"]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["from sklearn.model_selection import GridSearchCV\n", "grid = GridSearchCV(pipe_mean, {\"imputer__strategy\": ['mean', 'most_frequent'],\n", " \"meanpredictor__alpha\": [0.2, 0.5, 0.8]})\n", "grid.fit(X_train, y_train)"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " 0 | \n", " 1 | \n", " 2 | \n", " 3 | \n", " 4 | \n", " 5 | \n", "
\n", " \n", " \n", " \n", " param_imputer__strategy | \n", " mean | \n", " mean | \n", " mean | \n", " most_frequent | \n", " most_frequent | \n", " most_frequent | \n", "
\n", " \n", " param_meanpredictor__alpha | \n", " 0.2 | \n", " 0.5 | \n", " 0.8 | \n", " 0.2 | \n", " 0.5 | \n", " 0.8 | \n", "
\n", " \n", " split0_test_score | \n", " 0.61194 | \n", " 0.61194 | \n", " 0.38806 | \n", " 0.61194 | \n", " 0.61194 | \n", " 0.38806 | \n", "
\n", " \n", " split1_test_score | \n", " 0.61194 | \n", " 0.61194 | \n", " 0.38806 | \n", " 0.61194 | \n", " 0.61194 | \n", " 0.38806 | \n", "
\n", " \n", " split2_test_score | \n", " 0.61194 | \n", " 0.61194 | \n", " 0.38806 | \n", " 0.61194 | \n", " 0.61194 | \n", " 0.38806 | \n", "
\n", " \n", " split3_test_score | \n", " 0.609023 | \n", " 0.609023 | \n", " 0.390977 | \n", " 0.609023 | \n", " 0.609023 | \n", " 0.390977 | \n", "
\n", " \n", " split4_test_score | \n", " 0.609023 | \n", " 0.609023 | \n", " 0.390977 | \n", " 0.609023 | \n", " 0.609023 | \n", " 0.390977 | \n", "
\n", " \n", " mean_test_score | \n", " 0.610773 | \n", " 0.610773 | \n", " 0.389227 | \n", " 0.610773 | \n", " 0.610773 | \n", " 0.389227 | \n", "
\n", " \n", " std_test_score | \n", " 0.0014294 | \n", " 0.0014294 | \n", " 0.0014294 | \n", " 0.0014294 | \n", " 0.0014294 | \n", " 0.0014294 | \n", "
\n", " \n", " rank_test_score | \n", " 1 | \n", " 1 | \n", " 5 | \n", " 1 | \n", " 1 | \n", " 5 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" 0 1 2 3 \\\n", "param_imputer__strategy mean mean mean most_frequent \n", "param_meanpredictor__alpha 0.2 0.5 0.8 0.2 \n", "split0_test_score 0.61194 0.61194 0.38806 0.61194 \n", "split1_test_score 0.61194 0.61194 0.38806 0.61194 \n", "split2_test_score 0.61194 0.61194 0.38806 0.61194 \n", "split3_test_score 0.609023 0.609023 0.390977 0.609023 \n", "split4_test_score 0.609023 0.609023 0.390977 0.609023 \n", "mean_test_score 0.610773 0.610773 0.389227 0.610773 \n", "std_test_score 0.0014294 0.0014294 0.0014294 0.0014294 \n", "rank_test_score 1 1 5 1 \n", "\n", " 4 5 \n", "param_imputer__strategy most_frequent most_frequent \n", "param_meanpredictor__alpha 0.5 0.8 \n", "split0_test_score 0.61194 0.38806 \n", "split1_test_score 0.61194 0.38806 \n", "split2_test_score 0.61194 0.38806 \n", "split3_test_score 0.609023 0.390977 \n", "split4_test_score 0.609023 0.390977 \n", "mean_test_score 0.610773 0.389227 \n", "std_test_score 0.0014294 0.0014294 \n", "rank_test_score 1 5 "]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["res = pandas.DataFrame(grid.cv_results_)\n", "col = [_ for _ in res.columns if 'param_' in _ or \"test_score\" in _]\n", "res[col].T"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["best = grid.best_estimator_"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": ["import pickle\n", "with open(\"model.pkl\", \"wb\") as f:\n", " pickle.dump(best, f)"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": ["with open(\"model.pkl\", \"rb\") as f:\n", " model = pickle.load(f)"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/plain": ["array([ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, 
True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True])"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["model.predict(X_test) == best.predict(X_test)"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2"}}, "nbformat": 4, "nbformat_minor": 2}