{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 2A.data - Pandas et it\u00e9rateurs - correction\n", "\n", "[pandas](http://pandas.pydata.org/) a tendance a prendre beaucoup d'espace m\u00e9moire pour charger les donn\u00e9es, environ trois fois plus que sa taille sur disque. Quand la m\u00e9moire n'est pas assez grande, que peut-on faire ?"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123target
05.13.51.40.20
14.93.01.40.20
\n", "
"], "text/plain": [" 0 1 2 3 target\n", "0 5.1 3.5 1.4 0.2 0\n", "1 4.9 3.0 1.4 0.2 0"]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["from sklearn.datasets import load_iris\n", "data = load_iris()\n", "import pandas\n", "df = pandas.DataFrame(data.data)\n", "df.column = \"X1 X2 X3 X4\".split()\n", "df[\"target\"] = data.target\n", "df.head(n=2)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On m\u00e9lange les lignes car le dataframe est tri\u00e9 et cela masque quelques effets al\u00e9atoires."]}, {"cell_type": "code", "execution_count": 3, "metadata": {"collapsed": true}, "outputs": [], "source": ["import sklearn.utils\n", "df = sklearn.utils.shuffle(df)"]}, {"cell_type": "code", "execution_count": 4, "metadata": {"collapsed": true}, "outputs": [], "source": ["df.to_csv(\"iris.txt\", sep=\"\\t\", index=False)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 1 : it\u00e9rer sur un grand fichier\n", "\n", "Le param\u00e8tre *iterator* de la fonction [read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html) sert \u00e0 parcourir un fichier par blocs dont la taille est d\u00e9finie par le param\u00e8tres *chunksize*. La fonction [read_csv](http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/pandas_streaming/df/dataframe.html#pandas_streaming.df.dataframe.StreamingDataFrame.read_csv) impl\u00e9mente ce m\u00e9canisme."]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["(60, 5)\n", "(60, 5)\n", "(30, 5)\n"]}], "source": ["for df in pandas.read_csv(\"iris.txt\", sep=\"\\t\", iterator=True, chunksize=60):\n", " print(df.shape)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 2 : split train test\n", "\n", "La solution propos\u00e9e est impl\u00e9ment\u00e9e par [train_test_split](http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/pandas_streaming/df/dataframe.html#pandas_streaming.df.dataframe.StreamingDataFrame.train_test_split)."]}, {"cell_type": "code", "execution_count": 6, "metadata": {"collapsed": true}, "outputs": [], "source": ["from sklearn.model_selection import train_test_split\n", "\n", "df_full_it = pandas.read_csv('iris.txt', sep='\\t', chunksize=10, encoding='utf-8',\n", " engine='python')\n", "\n", "first_exec = True\n", "\n", "for df_full_chunk in df_full_it:\n", " X_train_chunk, X_test_chunk = train_test_split(df_full_chunk)\n", " if first_exec:\n", " X_train_chunk.to_csv(\"X_train.csv\", sep=\"\\t\", index=False)\n", " X_test_chunk.to_csv(\"X_test.csv\", sep=\"\\t\", index=False)\n", " first_exec = False\n", " else:\n", " X_train_chunk.to_csv(\"X_train.csv\", sep=\"\\t\", index=False, mode='a', header=False)\n", " X_test_chunk.to_csv(\"X_test.csv\", sep=\"\\t\", index=False, mode='a', header=False)"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123target
05.72.94.21.31
16.13.04.61.41
\n", "
"], "text/plain": [" 0 1 2 3 target\n", "0 5.7 2.9 4.2 1.3 1\n", "1 6.1 3.0 4.6 1.4 1"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["X_train = pandas.read_csv(\"X_train.csv\", sep=\"\\t\")\n", "X_train.head(n=2)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/plain": ["(105, 5)"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["X_train.shape"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123target
07.03.24.71.41
15.63.04.51.51
\n", "
"], "text/plain": [" 0 1 2 3 target\n", "0 7.0 3.2 4.7 1.4 1\n", "1 5.6 3.0 4.5 1.5 1"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["X_test = pandas.read_csv(\"X_test.csv\", sep=\"\\t\")\n", "X_test.head(n=2)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/plain": ["(45, 5)"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["X_test.shape"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
target
030303030
136363636
239393939
\n", "
"], "text/plain": [" 0 1 2 3\n", "target \n", "0 30 30 30 30\n", "1 36 36 36 36\n", "2 39 39 39 39"]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["X_train.groupby(\"target\").count()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["La r\u00e9partition des classes n'est pas uniforme. Lorsque les classes sont bien repr\u00e9sent\u00e9es, cela ne nuit pas aux r\u00e9sultats. En revanche, des classes sous-repr\u00e9sent\u00e9es pourraient dispara\u00eetre de l'une des deux parties."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 3 : stratify ?\n", "\n", "Le param\u00e8tre *stratify* est int\u00e9ressant pour un probl\u00e8me de classification et quand une classes et sous-repr\u00e9sent\u00e9e. Il est fort probable que cette classe ne soit pas assez repr\u00e9sent\u00e9e dans l'un des deux jeux et c'est pourquoi il existe une option pour imposer un nombre d'exemples de cette dans chaque des deux jeux (train, test). La qualit\u00e9 des mod\u00e8les est accrue tout comme la qualit\u00e9 des sondages sur un [\u00e9chantillonnage stratifi\u00e9](https://en.wikipedia.org/wiki/Stratified_sampling).\n", "\n", "Si jamais tout ces exemples sont plac\u00e9s au d\u00e9but du gros fichier \u00e0 lire, le programme commence \u00e0 avoir une fausse id\u00e9e de la r\u00e9partition des classes. La seule fa\u00e7on de faire est de faire d'abord une division train/test par classe (indiqu\u00e9 par la variable de stratification) puis de recomposer les bases d'apprentissage et de tests en imposant les proportions voulues."]}, {"cell_type": "code", "execution_count": 12, "metadata": {"collapsed": true}, "outputs": [], "source": ["from sklearn.model_selection import train_test_split\n", "\n", "strat_name = 'target'\n", "\n", "df_full_it = pandas.read_csv('iris.txt', sep='\\t', chunksize=10, encoding='utf-8',\n", " dtype=object, engine='python')\n", "\n", "strat_list = []\n", "\n", "for df_full_chunk in df_full_it:\n", " for current_strat in df_full_chunk[strat_name].unique():\n", " if str(current_strat) in strat_list:\n", " selection = df_full_chunk[df_full_chunk[strat_name] == current_strat]\n", " selection.to_csv(\"strat_{}.csv\".format(current_strat), sep=\"\\t\", index=False,\n", " encoding='utf-8', mode='a', header=False)\n", " else:\n", " strat_list.append(str(current_strat))\n", " selection = df_full_chunk[df_full_chunk[strat_name] == current_strat]\n", " selection.to_csv(\"strat_{}.csv\".format(current_strat), sep=\"\\t\", index=False, encoding='utf-8')\n", "\n", "first_exec = True\n", "\n", "for current_strat in strat_list:\n", " df_strat_it = pandas.read_csv(\"strat_{}.csv\".format(current_strat), sep='\\t', chunksize=1000,\n", " encoding='utf-8', dtype=object, engine='python')\n", " for df_strat_chunk in df_strat_it:\n", " X_train_chunk, X_test_chunk = train_test_split(df_strat_chunk)\n", " if first_exec:\n", " X_train_chunk.to_csv(\"X_train_strat.csv\", sep=\"\\t\", index=False, encoding='utf-8')\n", " X_test_chunk.to_csv(\"X_test_strat.csv\", sep=\"\\t\", index=False, encoding='utf-8')\n", " first_exec = False\n", " else:\n", " X_train_chunk.to_csv(\"X_train_strat.csv\", sep=\"\\t\", index=False, encoding='utf-8', mode='a', header=False)\n", " X_test_chunk.to_csv(\"X_test_strat.csv\", sep=\"\\t\", index=False, encoding='utf-8', mode='a', header=False)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On v\u00e9rifie que l'\u00e9chantillon est stratifi\u00e9e."]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, 
"outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123target
05.13.81.60.20
14.73.21.30.20
\n", "
"], "text/plain": [" 0 1 2 3 target\n", "0 5.1 3.8 1.6 0.2 0\n", "1 4.7 3.2 1.3 0.2 0"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["X_train = pandas.read_csv(\"X_train_strat.csv\", sep=\"\\t\")\n", "X_train.head(n=2)"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
target
037373737
137373737
237373737
\n", "
"], "text/plain": [" 0 1 2 3\n", "target \n", "0 37 37 37 37\n", "1 37 37 37 37\n", "2 37 37 37 37"]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["X_train.groupby(\"target\").count()"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123target
05.73.81.70.30
14.83.41.90.20
\n", "
"], "text/plain": [" 0 1 2 3 target\n", "0 5.7 3.8 1.7 0.3 0\n", "1 4.8 3.4 1.9 0.2 0"]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["X_test = pandas.read_csv(\"X_test_strat.csv\", sep=\"\\t\")\n", "X_test.head(n=2)"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
target
013131313
113131313
213131313
\n", "
"], "text/plain": [" 0 1 2 3\n", "target \n", "0 13 13 13 13\n", "1 13 13 13 13\n", "2 13 13 13 13"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["X_test.groupby(\"target\").count()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Les classes sont bien r\u00e9parties."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 4 : quelques id\u00e9es pour un group by ?\n", "\n", "La fonction [groupby](http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/pandas_streaming/df/dataframe.html#pandas_streaming.df.dataframe.StreamingDataFrame.groupby) impl\u00e9mente une fa\u00e7on de faire. Il faut distinguer deux cas possibles. Premier cas, l'agr\u00e9gation aboutit \u00e0 un r\u00e9sultat qui tient en m\u00e9moire auquel on peut s'en sortir ais\u00e9ment. Dans le second cas, l'agr\u00e9gation ne tient pas en m\u00e9moire et il faudra probablement passer par un fichier interm\u00e9diaire ou comme le sugg\u00e8re la solution propos\u00e9 par [StreamingDataFrame.groupby](http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/pandas_streaming/df/dataframe.html#pandas_streaming.df.dataframe.StreamingDataFrame.groupby), on peut agr\u00e9ger un cours de route \u00e0 condition que l'agr\u00e9gration impl\u00e9ment\u00e9e soit compatible avec ce type de m\u00e9thode."]}, {"cell_type": "code", "execution_count": 17, "metadata": {"collapsed": true}, "outputs": [], "source": []}, {"cell_type": "code", "execution_count": 18, "metadata": {"collapsed": true}, "outputs": [], "source": []}, {"cell_type": "markdown", "metadata": {}, "source": ["## Notes\n", "\n", "*Notebook en grande partie issue de la contribution des \u00e9tudiants.*"]}, {"cell_type": "code", "execution_count": 19, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1"}, "varInspector": {"cols": {"lenName": 16, "lenType": 16, "lenVar": 40}, "kernels_config": {"python": {"delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())"}, "r": {"delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) "}}, "types_to_exclude": ["module", "function", "builtin_function_or_method", "instance", "_Feature"], "window_display": false}}, "nbformat": 4, "nbformat_minor": 2}