{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 2018-10-09 Ensemble, Gradient, Boosting...\n", "\n", "Le notebook explore quelques particularit\u00e9s des algorithmes d'apprentissage pour expliquer certains r\u00e9sultats num\u00e9riques. L'algorithme [AdaBoost](https://fr.wikipedia.org/wiki/AdaBoost) surpond\u00e8re les exemples sur lesquels un mod\u00e8le fait des erreurs."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
run previous cell, wait for 2 seconds
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["%matplotlib inline"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Skewed split train test\n", "\n", "Lorsqu'une classe est sous repr\u00e9sent\u00e9e, il est difficile de pr\u00e9dire les r\u00e9sultats d'un mod\u00e8le de machine learning."]}, {"cell_type": "code", "execution_count": 3, "metadata": {"scrolled": false}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["n= 1\n", "n= 2\n", "n= 5\n", "n= 10\n", "n= 20\n", "n= 50\n", "n= 80\n", "n= 90\n", "n= 100\n", "n= 110\n"]}], "source": ["import numpy, numpy.random\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.neural_network import MLPClassifier\n", "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", "from sklearn.metrics import confusion_matrix\n", "\n", "N = 1000\n", "\n", "res = []\n", "\n", "for n in [1, 2, 5, 10, 20, 50, 80, 90, 100, 110]:\n", " print(\"n=\", n)\n", " for k in range(10):\n", "\n", " X = numpy.zeros((N, 2))\n", " X[:, 0] = numpy.random.randint(0, 2, (N,))\n", " X[:, 1] = numpy.random.randint(0, n+1, (N,))\n", " Y = X[:, 0] + X[:, 1] + numpy.random.normal(size=(N,)) / 2\n", " Y[Y < 1.5] = 0\n", " Y[Y >= 1.5] = 1\n", "\n", " X_train, X_test, y_train, y_test = train_test_split(X, Y)\n", "\n", " stat = dict(N=N, n=n, ratio_train=y_train.sum()/y_train.shape[0],\n", " k=k, ratio_test=y_test.sum()/y_test.shape[0])\n", " \n", " for model in [LogisticRegression(solver=\"liblinear\"),\n", " MLPClassifier(max_iter=500),\n", " RandomForestClassifier(n_estimators=10),\n", " AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=10)]:\n", " 
obs = stat.copy()\n", " obs[\"model\"] = model.__class__.__name__\n", " if obs[\"model\"] == \"AdaBoostClassifier\":\n", " obs[\"model\"] = \"AdaB-\" + model.base_estimator.__class__.__name__\n", " try:\n", " model.fit(X_train, y_train)\n", " except ValueError as e:\n", " obs[\"erreur\"] = str(e)\n", " res.append(obs)\n", " continue\n", " sc = model.score(X_test, y_test)\n", " obs[\"accuracy\"] = sc\n", " conf = confusion_matrix(y_test, model.predict(X_test))\n", " try:\n", " obs[\"Error-0|1\"] = conf[0, 1] / conf[0, :].sum()\n", " obs[\"Error-1|0\"] = conf[1, 0] / conf[1, :].sum()\n", " except Exception:\n", " pass\n", " res.append(obs)"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Nnratio_trainkratio_testmodelaccuracyError-0|1Error-1|0diff_ratio
0100010.27333300.300AdaB-DecisionTreeClassifier0.8600.0628570.3200000.026667
1100010.27466710.328AdaB-DecisionTreeClassifier0.9160.0297620.1951220.053333
2100010.30400020.284AdaB-DecisionTreeClassifier0.8600.0726260.3098590.020000
3100010.28533330.268AdaB-DecisionTreeClassifier0.8960.0273220.3134330.017333
4100010.29733340.256AdaB-DecisionTreeClassifier0.8880.0537630.2812500.041333
\n", "
"], "text/plain": [" N n ratio_train k ratio_test model accuracy \\\n", "0 1000 1 0.273333 0 0.300 AdaB-DecisionTreeClassifier 0.860 \n", "1 1000 1 0.274667 1 0.328 AdaB-DecisionTreeClassifier 0.916 \n", "2 1000 1 0.304000 2 0.284 AdaB-DecisionTreeClassifier 0.860 \n", "3 1000 1 0.285333 3 0.268 AdaB-DecisionTreeClassifier 0.896 \n", "4 1000 1 0.297333 4 0.256 AdaB-DecisionTreeClassifier 0.888 \n", "\n", " Error-0|1 Error-1|0 diff_ratio \n", "0 0.062857 0.320000 0.026667 \n", "1 0.029762 0.195122 0.053333 \n", "2 0.072626 0.309859 0.020000 \n", "3 0.027322 0.313433 0.017333 \n", "4 0.053763 0.281250 0.041333 "]}, "execution_count": 5, "metadata": {}, "output_type": "execute_result"}], "source": ["from pandas import DataFrame\n", "df = DataFrame(res)\n", "df = df.sort_values(['n', 'model', 'model', \"k\"]).reset_index(drop=True)\n", "df[\"diff_ratio\"] = (df[\"ratio_test\"] - df[\"ratio_train\"]).abs()\n", "df.head(n=5)"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Nnratio_trainkratio_testmodelaccuracyError-0|1Error-1|0diff_ratio
39510001100.98266750.996RandomForestClassifier0.9960.00.0040160.013333
39610001100.99066760.980RandomForestClassifier0.9960.20.0000000.010667
39710001100.98533370.988RandomForestClassifier1.0000.00.0000000.002667
39810001100.98533380.992RandomForestClassifier1.0000.00.0000000.006667
39910001100.98533390.992RandomForestClassifier0.9960.50.0000000.006667
\n", "
"], "text/plain": [" N n ratio_train k ratio_test model accuracy \\\n", "395 1000 110 0.982667 5 0.996 RandomForestClassifier 0.996 \n", "396 1000 110 0.990667 6 0.980 RandomForestClassifier 0.996 \n", "397 1000 110 0.985333 7 0.988 RandomForestClassifier 1.000 \n", "398 1000 110 0.985333 8 0.992 RandomForestClassifier 1.000 \n", "399 1000 110 0.985333 9 0.992 RandomForestClassifier 0.996 \n", "\n", " Error-0|1 Error-1|0 diff_ratio \n", "395 0.0 0.004016 0.013333 \n", "396 0.2 0.000000 0.010667 \n", "397 0.0 0.000000 0.002667 \n", "398 0.0 0.000000 0.006667 \n", "399 0.5 0.000000 0.006667 "]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["df.tail(n=5)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["La r\u00e9partition train/test est loin d'\u00eatre statisfaisante lorsqu'il existe une classe sous repr\u00e9sent\u00e9e."]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nratio_testratio_train
3201000.9800.992000
3211000.9840.980000
3221000.9880.984000
3231000.9880.986667
3241000.9760.986667
3251000.9840.985333
3261000.9840.981333
3271000.9880.982667
3281000.9840.989333
3291000.9920.989333
\n", "
"], "text/plain": [" n ratio_test ratio_train\n", "320 100 0.980 0.992000\n", "321 100 0.984 0.980000\n", "322 100 0.988 0.984000\n", "323 100 0.988 0.986667\n", "324 100 0.976 0.986667\n", "325 100 0.984 0.985333\n", "326 100 0.984 0.981333\n", "327 100 0.988 0.982667\n", "328 100 0.984 0.989333\n", "329 100 0.992 0.989333"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["df[df.n==100][[\"n\", \"ratio_test\", \"ratio_train\"]].head(n=10)"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["#df.to_excel(\"data.xlsx\")"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nNmodelratio_trainkratio_testaccuracyError-0|1Error-1|0diff_ratio
351001000RandomForestClassifier0.9857334.50.98480.99560.1850000.0012160.004933
361101000AdaB-DecisionTreeClassifier0.9865334.50.99000.99720.1300000.0008100.007200
371101000LogisticRegression0.9865334.50.99000.99600.3466670.0004020.007200
381101000MLPClassifier0.9865334.50.99000.99560.3466670.0008100.007200
391101000RandomForestClassifier0.9865334.50.99000.99800.0900000.0008100.007200
\n", "
"], "text/plain": [" n N model ratio_train k ratio_test \\\n", "35 100 1000 RandomForestClassifier 0.985733 4.5 0.9848 \n", "36 110 1000 AdaB-DecisionTreeClassifier 0.986533 4.5 0.9900 \n", "37 110 1000 LogisticRegression 0.986533 4.5 0.9900 \n", "38 110 1000 MLPClassifier 0.986533 4.5 0.9900 \n", "39 110 1000 RandomForestClassifier 0.986533 4.5 0.9900 \n", "\n", " accuracy Error-0|1 Error-1|0 diff_ratio \n", "35 0.9956 0.185000 0.001216 0.004933 \n", "36 0.9972 0.130000 0.000810 0.007200 \n", "37 0.9960 0.346667 0.000402 0.007200 \n", "38 0.9956 0.346667 0.000810 0.007200 \n", "39 0.9980 0.090000 0.000810 0.007200 "]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["columns = [\"n\", \"N\", \"model\"]\n", "agg = df.groupby(columns, as_index=False).mean().sort_values([\"n\", \"model\"]).reset_index(drop=True)\n", "agg.tail()"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"image/png": "", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["import matplotlib.pyplot as plt\n", "fig, ax = plt.subplots(1, 2, figsize=(10,4))\n", "agg.plot(x=\"n\", y=\"diff_ratio\", ax=ax[0])\n", "agg.plot(x=\"n\", y=\"ratio_train\", ax=ax[1])\n", "agg.plot(x=\"n\", y=\"ratio_test\", ax=ax[1])\n", "ax[0].set_title(\"Maximum difference between\\nratio of first class on train and test\")\n", "ax[1].set_title(\"Ratio of first class on train and test\")\n", "ax[0].legend();"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Une astuce pour \u00e9viter les doublons avant d'effecturer un pivot."]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["agg2 = agg.copy()\n", "agg2[\"ratio_test2\"] = agg2[\"ratio_test\"] + agg2[\"n\"] / 100000"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"data": {"image/png": "", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["import matplotlib.pyplot as plt\n", "fig, ax = plt.subplots(1, 3, figsize=(14,4))\n", "agg2.pivot(\"ratio_test2\", \"model\", \"accuracy\").plot(ax=ax[0])\n", "agg2.pivot(\"ratio_test2\", \"model\", \"Error-0|1\").plot(ax=ax[1])\n", "agg2.pivot(\"ratio_test2\", \"model\", \"Error-1|0\").plot(ax=ax[2])\n", "ax[0].plot([0.5, 1.0], [0.5, 1.0], '--', label=\"constant\")\n", "ax[0].set_title(\"Accuracy\")\n", "ax[1].set_title(\"Error-0|1\")\n", "ax[2].set_title(\"Error-1|0\")\n", "ax[0].legend();"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelAdaB-DecisionTreeClassifierLogisticRegressionMLPClassifierRandomForestClassifier
ratio_test2
0.297210.0522490.0522490.0522490.052249
0.506820.1106860.1106860.1106860.110686
0.755250.1195780.1195780.1195780.119578
0.866900.0993330.0993330.0993330.099333
0.929000.0880950.1130950.1130950.088095
0.969700.1063490.2539680.2206350.163492
0.981200.1250000.3100000.2000000.175000
0.984900.1100000.1550000.1550000.170000
0.985800.1850000.3350000.2683330.185000
0.991100.1300000.3466670.3466670.090000
\n", "
"], "text/plain": ["model AdaB-DecisionTreeClassifier LogisticRegression MLPClassifier \\\n", "ratio_test2 \n", "0.29721 0.052249 0.052249 0.052249 \n", "0.50682 0.110686 0.110686 0.110686 \n", "0.75525 0.119578 0.119578 0.119578 \n", "0.86690 0.099333 0.099333 0.099333 \n", "0.92900 0.088095 0.113095 0.113095 \n", "0.96970 0.106349 0.253968 0.220635 \n", "0.98120 0.125000 0.310000 0.200000 \n", "0.98490 0.110000 0.155000 0.155000 \n", "0.98580 0.185000 0.335000 0.268333 \n", "0.99110 0.130000 0.346667 0.346667 \n", "\n", "model RandomForestClassifier \n", "ratio_test2 \n", "0.29721 0.052249 \n", "0.50682 0.110686 \n", "0.75525 0.119578 \n", "0.86690 0.099333 \n", "0.92900 0.088095 \n", "0.96970 0.163492 \n", "0.98120 0.175000 \n", "0.98490 0.170000 \n", "0.98580 0.185000 \n", "0.99110 0.090000 "]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["agg2.pivot(\"ratio_test2\", \"model\", \"Error-0|1\")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Le mod\u00e8le [AdaBoost](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html) construit 10 arbres tout comme la for\u00eat al\u00e9atoire \u00e0 ceci pr\u00e8s que le poids associ\u00e9 \u00e0 chacun des arbres des diff\u00e9rents et non uniforme."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Apprentissage continu\n", "\n", "Apprendre une for\u00eat al\u00e9atoire, puis ajouter un arbre, encore un tout en gardant le r\u00e9sultat des apprentissages pr\u00e9c\u00e9dents."]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["from sklearn.datasets import load_diabetes\n", "data = load_diabetes()\n", "X, y = data.data, data.target"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["X_train, X_test, y_train, y_test = train_test_split(X, y)"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["from sklearn.ensemble 
import RandomForestRegressor\n", "\n", "model = None\n", "res = []\n", "for i in range(0, 20):\n", " if model is None:\n", " model = RandomForestRegressor(n_estimators=1, warm_start=True)\n", " else:\n", " model.set_params(**dict(n_estimators=model.n_estimators+1))\n", " model.fit(X_train, y_train)\n", " score = model.score(X_test, y_test)\n", " res.append(dict(n_estimators=model.n_estimators, score=score))"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_estimatorsscore
010.128906
120.323854
230.352876
340.389476
450.429992
\n", "
"], "text/plain": [" n_estimators score\n", "0 1 0.128906\n", "1 2 0.323854\n", "2 3 0.352876\n", "3 4 0.389476\n", "4 5 0.429992"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["df = DataFrame(res)\n", "df.head()"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"data": {"image/png": "", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["ax = df.plot(x=\"n_estimators\", y=\"score\")\n", "ax.set_title(\"Apprentissage continu\\nmesure de la performance \u00e0 chaque it\u00e9ration\");"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": []}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5"}}, "nbformat": 4, "nbformat_minor": 2}