{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 2A.i - S\u00e9rialisation - correction\n", "\n", "S\u00e9rialisation d'objets, en particulier de dataframes. Mesures de vitesse."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 1 : s\u00e9rialisation d'un gros dataframe\n", "\n", "**Etape 1 :** construction d'un gros dataframe compos\u00e9 de nombres al\u00e9atoires"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import random\n", "values = [ [random.random() for i in range(0,20)] for _ in range(0,100000) ]\n", "col = [ \"col%d\" % i for i in range(0,20) ]"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["import pandas\n", "df = pandas.DataFrame( values, columns = col )"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**Etape 2 :** on sauve ce dataframe sous deux formats texte et s\u00e9rialis\u00e9 (binaire)"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["df.to_csv(\"df_text.txt\", sep=\"\\t\")"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["df.to_pickle(\"df_text.bin\")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**Etape 3 :** on mesure le temps de chargement"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["499 ms \u00b1 8.82 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)\n"]}], "source": ["%timeit pandas.read_csv(\"df_text.txt\", sep=\"\\t\")"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["10.1 ms \u00b1 1.05 ms per loop (mean \u00b1 std. dev. of 7 runs, 100 loops each)\n"]}], "source": ["%timeit pandas.read_pickle(\"df_text.bin\")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 2 : json\n", "\n", "Un premier essai."]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/plain": ["'{\"a\": [50, \"r\"], \"gg\": {\"py/tuple\": [5, \"t\"]}}'"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["obj = dict(a=[50, \"r\"], gg=(5, 't'))\n", "\n", "import jsonpickle\n", "frozen = jsonpickle.encode(obj)\n", "frozen"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Ce module est \u00e9quivalent au module [json](https://docs.python.org/3/library/json.html) sur les types standard du langage Python (liste, dictionnaires, nombres, ...). 
{"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": ["frozen = jsonpickle.encode(df)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/plain": ["(22025124, str, '{\"py/object\": \"pandas.core.frame.DataFrame\", \"py/state\"')"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["len(frozen), type(frozen), frozen[:55]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["The [to_json](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html) method also gives a satisfactory result, but it cannot be applied to a machine learning model produced by [scikit-learn](http://scikit-learn.org/)."]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["def to_json(obj, filename):\n", "    # serialize any object to a JSON file with jsonpickle\n", "    frozen = jsonpickle.encode(obj)\n", "    with open(filename, \"w\", encoding=\"utf-8\") as f:\n", "        f.write(frozen)\n", "\n", "\n", "def read_json(filename):\n", "    # read the JSON file back and rebuild the object\n", "    with open(filename, \"r\", encoding=\"utf-8\") as f:\n", "        enc = f.read()\n", "    return jsonpickle.decode(enc)"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["to_json(df, \"df_text.json\")"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["all inputs must be Index\n"]}], "source": ["try:\n", "    df = read_json(\"df_text.json\")\n", "except Exception as e:\n", "    print(e)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Clearly, this does not work on DataFrames. One would have to take inspiration from the [numpyson](https://github.com/hpk42/numpyson) module."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## json + scikit-learn\n", "\n", "Read issue [147](https://github.com/jsonpickle/jsonpickle/issues/147) to understand why the next two lines matter."]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["import jsonpickle.ext.numpy as jsonpickle_numpy\n", "jsonpickle_numpy.register_handlers()"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["from sklearn import datasets\n", "iris = datasets.load_iris()\n", "X = iris.data[:, :2]  # we only take the first two features.\n", "y = iris.target"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"data": {"text/plain": ["LogisticRegression()"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["from sklearn.linear_model import LogisticRegression\n", "clf = LogisticRegression()\n", "clf.fit(X, y)"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"data": {"text/plain": ["array([[9.98521017e-01, 1.47896452e-03, 1.84545577e-08]])"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["clf.predict_proba([[0.1, 0.2]])"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": ["to_json(clf, \"logreg.json\")"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["'list' object has no attribute 'flags'\n"]}], "source": ["try:\n", "    clf2 = read_json(\"logreg.json\")\n", "except AttributeError as e:\n", "    # For an unknown reason, probably a bug, the code does not work.\n", "    print(e)"]}, 
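{"cell_type": "markdown", "metadata": {}, "source": ["As a point of comparison, a small sketch (not executed here): the standard [pickle](https://docs.python.org/3/library/pickle.html) module can serialize the same model directly, so the difficulty is specific to the JSON route."]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["import pickle\n", "\n", "# binary round trip of the model with the standard library\n", "data = pickle.dumps(clf)\n", "clf_pickle = pickle.loads(data)\n", "clf_pickle.predict_proba([[0.1, 0.2]])"]}, 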
{"cell_type": "markdown", "metadata": {}, "source": ["So we try another way. If the previous code does not work but the following code does, it is a bug in [jsonpickle](https://github.com/jsonpickle/jsonpickle)."]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": ["class EncapsulateLogisticRegression:\n", "    # wrapper whose state is the model's state as a plain sorted dict\n", "    def __init__(self, obj):\n", "        self.obj = obj\n", "    def __getstate__(self):\n", "        return {k: v for k, v in sorted(self.obj.__getstate__().items())}\n", "    def __setstate__(self, data):\n", "        self.obj = LogisticRegression()\n", "        self.obj.__setstate__(data)\n", "\n", "\n", "enc = EncapsulateLogisticRegression(clf)\n", "to_json(enc, \"logreg.json\")"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": ["enc2 = read_json(\"logreg.json\")\n", "clf2 = enc2.obj"]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [{"data": {"text/plain": ["array([[9.98521017e-01, 1.47896452e-03, 1.84545577e-08]])"]}, "execution_count": 23, "metadata": {}, "output_type": "execute_result"}], "source": ["clf2.predict_proba([[0.1, 0.2]])"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [{"data": {"text/plain": ["'{\"py/object\": \"__main__.EncapsulateLogisticRegression\", \"py/state\": {\"C\": 1.0, \"_sklearn_version\": \"1.0.dev0\", \"class_weight\": null, \"classes_\": {\"py/object\": \"numpy.ndarray\", \"dtype\": \"int32\", \"values\": [0, 1, 2]}, \"coef_\": {\"py/object\": \"numpy.ndarray\", \"base\": {\"py/object\": \"numpy.ndarray\", \"dtype\": \"float64\", \"values\": [[[-2.7089024902680983, 2.3240237755859914, 7.913221292541044], [0.6127325890163979, -1.5705880338943812, 1.8450471421510946], [2.0961699012517387, -0.7534357416910977, -9.758268434691205]]]}, \"strides\": [24, 8], \"shape\": [3, 2], \"dtype\": \"float64\", \"values\": [[-2.7089024902680983, 2.3240237755859914], [0.6127325890163979, -1.5705880338943812], [2.0961699012517387, -0.7534357416910977]]}, \"dual\": false, \"fit_intercept\": true, \"intercept_\": {\"py/object\": \"numpy.ndarray\", \"base\": {\"py/id\": 4}, \"offset\": 16, \"strides\": [24], \"shape\": [3], \"dtype\": \"float64\", \"values\": [7.913221292541044, 1.8450471421510946, -9.758268434691205]}, \"intercept_scaling\": 1, \"l1_ratio\": null, \"max_iter\": 100, \"multi_class\": \"auto\", \"n_features_in_\": 2, \"n_iter_\": {\"py/object\": \"numpy.ndarray\", \"base\": {\"py/object\": \"numpy.ndarray\", \"dtype\": \"int32\", \"values\": [[50]]}, \"shape\": [1], \"dtype\": \"int32\", \"values\": [50]}, \"n_jobs\": null, \"penalty\": \"l2\", \"random_state\": null, \"solver\": \"lbfgs\", \"tol\": 0.0001, \"verbose\": 0, \"warm_start\": false}}'"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["with open(\"logreg.json\", \"r\") as f:\n", "    content = f.read()\n", "content"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.7"}}, "nbformat": 4, "nbformat_minor": 2}