{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Uncommon operation with dataframes\n", "\n", "Cheat sheet on uncommand operation with pandas such as reading a big file."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Pointer on notebooks\n", "\n", "* [Rappel de ce que vous savez d\u00e9j\u00e0 mais avez peut-\u00eatre oubli\u00e9](http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx/notebooks/td2_eco_rappels_1a.html)\n", "* [Python pour un Data Scientist / Economiste](http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx/td_2a.html)\n", "* [Exercices Pratiques](http://www.xavierdupre.fr/app/actuariat_python/helpsphinx/i_seances_base.html)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## List of strings into binaries features"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " features | \n", " target | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " [a, b, c] | \n", " 0 | \n", "
\n", " \n", " 1 | \n", " [a, b] | \n", " 1 | \n", "
\n", " \n", " 2 | \n", " [c, b] | \n", " 2 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" features target\n", "0 [a, b, c] 0\n", "1 [a, b] 1\n", "2 [c, b] 2"]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "df = pandas.DataFrame([{\"target\":0, \"features\":[\"a\", \"b\", \"c\"]},\n", " {\"target\":1, \"features\":[\"a\", \"b\"]},\n", " {\"target\":2, \"features\":[\"c\", \"b\"]}])\n", "df"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " a | \n", " b | \n", " c | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 1 | \n", " 1 | \n", " 1 | \n", "
\n", " \n", " 1 | \n", " 1 | \n", " 1 | \n", " 0 | \n", "
\n", " \n", " 2 | \n", " 0 | \n", " 1 | \n", " 1 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" a b c\n", "0 1 1 1\n", "1 1 1 0\n", "2 0 1 1"]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["df.features.str.join(\"*\").str.get_dummies(\"*\")"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## Big files\n", "\n", "Let's save some data first."]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["from sklearn.datasets import load_breast_cancer\n", "data = load_breast_cancer()\n", "import pandas\n", "df = pandas.DataFrame(data.data, columns=data.feature_names)\n", "df.to_csv(\"cancer.txt\", sep=\"\\t\", encoding=\"utf-8\", index=False)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### first lines : nrows"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " mean radius\tmean texture\tmean perimeter\tmean area\tmean smoothness\tmean compactness\tmean concavity\tmean concave points\tmean symmetry\tmean fractal dimension\tradius error\ttexture error\tperimeter error\tarea error\tsmoothness error\tcompactness error\tconcavity error\tconcave points error\tsymmetry error\tfractal dimension error\tworst radius\tworst texture\tworst perimeter\tworst area\tworst smoothness\tworst compactness\tworst concavity\tworst concave points\tworst symmetry\tworst fractal dimension | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 17.99\\t10.38\\t122.8\\t1001.0\\t0.1184\\t0.2776\\t0... | \n", "
\n", " \n", " 1 | \n", " 20.57\\t17.77\\t132.9\\t1326.0\\t0.08474\\t0.07864\\... | \n", "
\n", " \n", " 2 | \n", " 19.69\\t21.25\\t130.0\\t1203.0\\t0.1096\\t0.1599\\t0... | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" mean radius\\tmean texture\\tmean perimeter\\tmean area\\tmean smoothness\\tmean compactness\\tmean concavity\\tmean concave points\\tmean symmetry\\tmean fractal dimension\\tradius error\\ttexture error\\tperimeter error\\tarea error\\tsmoothness error\\tcompactness error\\tconcavity error\\tconcave points error\\tsymmetry error\\tfractal dimension error\\tworst radius\\tworst texture\\tworst perimeter\\tworst area\\tworst smoothness\\tworst compactness\\tworst concavity\\tworst concave points\\tworst symmetry\\tworst fractal dimension\n", "0 17.99\\t10.38\\t122.8\\t1001.0\\t0.1184\\t0.2776\\t0... \n", "1 20.57\\t17.77\\t132.9\\t1326.0\\t0.08474\\t0.07864\\... \n", "2 19.69\\t21.25\\t130.0\\t1203.0\\t0.1096\\t0.1599\\t0... "]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3)\n", "df"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " mean radius | \n", " mean texture | \n", " mean perimeter | \n", " mean area | \n", " mean smoothness | \n", " mean compactness | \n", " mean concavity | \n", " mean concave points | \n", " mean symmetry | \n", " mean fractal dimension | \n", " ... | \n", " worst radius | \n", " worst texture | \n", " worst perimeter | \n", " worst area | \n", " worst smoothness | \n", " worst compactness | \n", " worst concavity | \n", " worst concave points | \n", " worst symmetry | \n", " worst fractal dimension | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 17.99 | \n", " 10.38 | \n", " 122.8 | \n", " 1001.0 | \n", " 0.11840 | \n", " 0.27760 | \n", " 0.3001 | \n", " 0.14710 | \n", " 0.2419 | \n", " 0.07871 | \n", " ... | \n", " 25.38 | \n", " 17.33 | \n", " 184.6 | \n", " 2019.0 | \n", " 0.1622 | \n", " 0.6656 | \n", " 0.7119 | \n", " 0.2654 | \n", " 0.4601 | \n", " 0.11890 | \n", "
\n", " \n", " 1 | \n", " 20.57 | \n", " 17.77 | \n", " 132.9 | \n", " 1326.0 | \n", " 0.08474 | \n", " 0.07864 | \n", " 0.0869 | \n", " 0.07017 | \n", " 0.1812 | \n", " 0.05667 | \n", " ... | \n", " 24.99 | \n", " 23.41 | \n", " 158.8 | \n", " 1956.0 | \n", " 0.1238 | \n", " 0.1866 | \n", " 0.2416 | \n", " 0.1860 | \n", " 0.2750 | \n", " 0.08902 | \n", "
\n", " \n", " 2 | \n", " 19.69 | \n", " 21.25 | \n", " 130.0 | \n", " 1203.0 | \n", " 0.10960 | \n", " 0.15990 | \n", " 0.1974 | \n", " 0.12790 | \n", " 0.2069 | \n", " 0.05999 | \n", " ... | \n", " 23.57 | \n", " 25.53 | \n", " 152.5 | \n", " 1709.0 | \n", " 0.1444 | \n", " 0.4245 | \n", " 0.4504 | \n", " 0.2430 | \n", " 0.3613 | \n", " 0.08758 | \n", "
\n", " \n", "
\n", "
3 rows \u00d7 30 columns
\n", "
"], "text/plain": [" mean radius mean texture mean perimeter mean area mean smoothness \\\n", "0 17.99 10.38 122.8 1001.0 0.11840 \n", "1 20.57 17.77 132.9 1326.0 0.08474 \n", "2 19.69 21.25 130.0 1203.0 0.10960 \n", "\n", " mean compactness mean concavity mean concave points mean symmetry \\\n", "0 0.27760 0.3001 0.14710 0.2419 \n", "1 0.07864 0.0869 0.07017 0.1812 \n", "2 0.15990 0.1974 0.12790 0.2069 \n", "\n", " mean fractal dimension ... worst radius \\\n", "0 0.07871 ... 25.38 \n", "1 0.05667 ... 24.99 \n", "2 0.05999 ... 23.57 \n", "\n", " worst texture worst perimeter worst area worst smoothness \\\n", "0 17.33 184.6 2019.0 0.1622 \n", "1 23.41 158.8 1956.0 0.1238 \n", "2 25.53 152.5 1709.0 0.1444 \n", "\n", " worst compactness worst concavity worst concave points worst symmetry \\\n", "0 0.6656 0.7119 0.2654 0.4601 \n", "1 0.1866 0.2416 0.1860 0.2750 \n", "2 0.4245 0.4504 0.2430 0.3613 \n", "\n", " worst fractal dimension \n", "0 0.11890 \n", "1 0.08902 \n", "2 0.08758 \n", "\n", "[3 rows x 30 columns]"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3, sep=\"\\t\")\n", "df"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### middle lines : nrows + skiprows"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " 0 | \n", " 1 | \n", " 2 | \n", " 3 | \n", " 4 | \n", " 5 | \n", " 6 | \n", " 7 | \n", " 8 | \n", " 9 | \n", " ... | \n", " 20 | \n", " 21 | \n", " 22 | \n", " 23 | \n", " 24 | \n", " 25 | \n", " 26 | \n", " 27 | \n", " 28 | \n", " 29 | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 14.420 | \n", " 19.77 | \n", " 94.48 | \n", " 642.5 | \n", " 0.09752 | \n", " 0.11410 | \n", " 0.09388 | \n", " 0.05839 | \n", " 0.1879 | \n", " 0.06390 | \n", " ... | \n", " 16.33 | \n", " 30.86 | \n", " 109.50 | \n", " 826.4 | \n", " 0.1431 | \n", " 0.3026 | \n", " 0.3194 | \n", " 0.1565 | \n", " 0.2718 | \n", " 0.09353 | \n", "
\n", " \n", " 1 | \n", " 13.610 | \n", " 24.98 | \n", " 88.05 | \n", " 582.7 | \n", " 0.09488 | \n", " 0.08511 | \n", " 0.08625 | \n", " 0.04489 | \n", " 0.1609 | \n", " 0.05871 | \n", " ... | \n", " 16.99 | \n", " 35.27 | \n", " 108.60 | \n", " 906.5 | \n", " 0.1265 | \n", " 0.1943 | \n", " 0.3169 | \n", " 0.1184 | \n", " 0.2651 | \n", " 0.07397 | \n", "
\n", " \n", " 2 | \n", " 6.981 | \n", " 13.43 | \n", " 43.79 | \n", " 143.5 | \n", " 0.11700 | \n", " 0.07568 | \n", " 0.00000 | \n", " 0.00000 | \n", " 0.1930 | \n", " 0.07818 | \n", " ... | \n", " 7.93 | \n", " 19.54 | \n", " 50.41 | \n", " 185.2 | \n", " 0.1584 | \n", " 0.1202 | \n", " 0.0000 | \n", " 0.0000 | \n", " 0.2932 | \n", " 0.09382 | \n", "
\n", " \n", "
\n", "
3 rows \u00d7 30 columns
\n", "
"], "text/plain": [" 0 1 2 3 4 5 6 7 8 \\\n", "0 14.420 19.77 94.48 642.5 0.09752 0.11410 0.09388 0.05839 0.1879 \n", "1 13.610 24.98 88.05 582.7 0.09488 0.08511 0.08625 0.04489 0.1609 \n", "2 6.981 13.43 43.79 143.5 0.11700 0.07568 0.00000 0.00000 0.1930 \n", "\n", " 9 ... 20 21 22 23 24 25 26 \\\n", "0 0.06390 ... 16.33 30.86 109.50 826.4 0.1431 0.3026 0.3194 \n", "1 0.05871 ... 16.99 35.27 108.60 906.5 0.1265 0.1943 0.3169 \n", "2 0.07818 ... 7.93 19.54 50.41 185.2 0.1584 0.1202 0.0000 \n", "\n", " 27 28 29 \n", "0 0.1565 0.2718 0.09353 \n", "1 0.1184 0.2651 0.07397 \n", "2 0.0000 0.2932 0.09382 \n", "\n", "[3 rows x 30 columns]"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3, skiprows=100, sep=\"\\t\", header=None)\n", "df"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### big files : iterator"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["0 (3, 30)\n", "1 (3, 30)\n", "2 (3, 30)\n", "3 (3, 30)\n"]}], "source": ["for piece, df in enumerate(pandas.read_csv(\"cancer.txt\", iterator=True, sep=\"\\t\", chunksize=3)):\n", " print(piece, df.shape)\n", " if piece > 2:\n", " break"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### sample on big files : iterator + concat"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/plain": ["(57, 30)"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["samples = []\n", "for df in pandas.read_csv(\"cancer.txt\", iterator=True, sep=\"\\t\", chunksize=30):\n", " sample = df.sample(3)\n", " samples.append(sample)\n", "dfsample = pandas.concat(samples)\n", "dfsample.shape"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0"}}, "nbformat": 4, "nbformat_minor": 2}