{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 1A.algo - Arbre et Trie (correction)\n", "\n", "Correction."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
run previous cell, wait for 2 seconds
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 1\n"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["10000\n"]}], "source": ["import random\n", "def mot_alea (l) :\n", " l = [ chr(97+random.randint(0,25)) for i in range(l) ]\n", " return \"\".join(l)\n", "\n", "taille = 20\n", "N = 10000\n", "mots = [ mot_alea(taille) for _ in range (N) ]\n", "print(len(mots))"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 2"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["recherche simple 1.4759376217423303\n"]}], "source": ["import time\n", "debut = time.perf_counter()\n", "for k in mots :\n", " i = mots.index(k)\n", "fin = time.perf_counter()\n", "print (\"recherche simple\",fin - debut)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Avec ``%timeit`` :"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["1 loop, best of 3: 1.41 s per loop\n"]}], "source": ["%timeit for k in mots : i = mots.index(k)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 3 : recherche dichotomique"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["dichotomie 0.07686653042216562\n"]}], "source": ["def dicho (mots, x) :\n", " a = 0\n", " b = len(mots)-1\n", " while a < b :\n", " m = (a+b)//2\n", " t = mots[m] \n", " if t < x : \n", " b = m-1 \n", " elif t == x :\n", " return m\n", " else :\n", " a = m+1\n", " return a\n", "\n", "mots.sort()\n", "\n", "debut = time.perf_counter()\n", "for k in mots :\n", " i = dicho(mots, k)\n", "fin = time.perf_counter()\n", "print (\"dichotomie\",fin - debut)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["10 loops, best of 3: 57.4 ms per loop\n"]}], "source": ["%timeit for k in mots : i = dicho(mots, k)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 4"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["10 simple 0.00046013522672971874 dicho 0.0013958005390772854 ratio 0.32965686274480016 ratio th\u00e9orique 0.1003433318879937\n", "100 simple 0.0017960668701348936 dicho 0.0025906126426864517 ratio 0.6932981181904454 ratio th\u00e9orique 0.5017166594399687\n", "1000 simple 0.014991171476061993 dicho 0.00428319185030368 ratio 3.4999999999997926 ratio th\u00e9orique 3.3447777295997914\n", "10000 simple 0.139653179487377 dicho 0.005854322726703387 ratio 23.854711468224224 ratio th\u00e9orique 25.085832971998425\n", "100000 simple 5.177541637747854 dicho 0.007615751164344431 ratio 679.8464821156667 ratio th\u00e9orique 200.68666377598748\n"]}], "source": ["import math\n", "\n", "for N in [10, 100, 1000, 10000, 100000] :\n", " mots = [ mot_alea(taille) for _ in range (N) ]\n", " tolook = [ mots[ random.randint(0,len(mots)-1) ] for i in range(0,1000) ]\n", " mots.sort()\n", " \n", " debut = time.perf_counter()\n", " for k in tolook :\n", " i = mots.index(k)\n", " fin = time.perf_counter() \n", " ds = fin-debut\n", " \n", " debut = time.perf_counter()\n", " for k in tolook :\n", " i = dicho(mots, k)\n", " fin = time.perf_counter() \n", " dd = fin-debut\n", " \n", " print(N, \"simple\",ds, \"dicho\",dd, \"ratio\", ds / max(dd, 1), \" ratio th\u00e9orique \",\n", " len(mots)/math.log(len(mots)) * math.log(2)/30)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["N= 10\n", "1000 loops, best of 3: 320 \u00b5s per loop\n", "1000 loops, best of 3: 1.29 ms per loop\n", "N= 100\n", "1000 loops, best of 3: 1.6 ms per loop\n", "100 loops, best of 3: 2.63 ms per loop\n", "N= 1000\n", "100 loops, best of 3: 14.1 ms per loop\n", "100 loops, best of 3: 4.03 ms per loop\n", "N= 10000\n", "10 loops, best of 3: 144 ms per loop\n", "100 loops, best of 3: 5.63 ms per loop\n", "N= 100000\n", "1 loop, best of 3: 5.02 s per loop\n", "100 loops, best of 3: 7.12 ms per loop\n"]}], "source": ["for N in [10, 100, 1000, 10000, 100000] :\n", " print(\"N=\",N)\n", " mots = [ mot_alea(taille) for _ in range (N) ]\n", " tolook = [ mots[ random.randint(0,len(mots)-1) ] for i in range(0,1000) ]\n", " mots.sort()\n", " \n", " %timeit for k in tolook : i = mots.index(k)\n", " %timeit for k in tolook : i = dicho(mots, k)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Soit $N$ le nombre de mots dans la liste :\n", "\n", "* co\u00fbt de la recherche simple : $O(N)$ \n", "* co\u00fbt de la recherche dichotomique : $O(\\ln N)$ \n", "\n", "Le ratio $N/\\ln N$ mesur\u00e9 en pratique devrait \u00eatre sensiblement \u00e9gal au ratio th\u00e9orique \u00e0 un facteur multiplication pr\u00e8s. Le trie du tableau qui pr\u00e9c\u00e8de la recherche dichotomique n'est pas pris en compte. Plus on effectue de recherche, plus son co\u00fbt devient marginal. Ce co\u00fbt explique n\u00e9anmoins pourquoi on ne fait pas toujours une recherche dichotomique."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 5 : trie"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'a': {'a': {'a': {}, 'b': {}}, 'b': {'a': {}}}, 'b': {'a': {'a': {}, 'b': {}}, 'b': {'a': {}, 'b': {}}}}\n"]}], "source": ["def build_trie(mots) :\n", " trie = { }\n", " for m in mots :\n", " r = trie\n", " for c in m :\n", " if c not in r : r[c] = { }\n", " r = r[c]\n", " return trie\n", "\n", "mots = [ \"aaa\", \"aba\", \"aab\", \"baa\", \"bbb\", \"bba\", \"bab\" ]\n", "\n", "trie = build_trie(mots)\n", "print(trie)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Quelques explications suppl\u00e9mentaires concernant cette correction :\n", "\n", "* [Question \u00e0 propos de trie](http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx/questions/question_2014.html#question-a-propos-de-trie)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 6 : recherche dans un trie"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["aaa True\n", "aba True\n", "aab True\n", "baa True\n", "bbb True\n", "bba True\n", "bab True\n", "bcc False\n"]}], "source": ["def lookup(trie, m) :\n", " r = trie\n", " for c in m :\n", " if c in r :\n", " r = r[c]\n", " else :\n", " return False\n", " return True\n", "\n", "for k in mots :\n", " print(k, lookup(trie, k))\n", "print(\"bcc\", lookup(trie, \"bcc\")) "]}, {"cell_type": "markdown", "metadata": {}, "source": ["Pour illustrer la structure d'arbre du trie, on l'affiche avec la fonction suivante : "]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["a\n", " a\n", " a\n", " b\n", " b\n", " a\n", "b\n", " a\n", " a\n", " b\n", " b\n", " a\n", " b\n"]}], "source": ["def print_trie(trie, niveau = 0):\n", " for k,v in sorted(trie.items()):\n", " print(\" \" * niveau + k)\n", " if len(v) > 0 :\n", " print_trie(v, niveau+1)\n", "print_trie(trie)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Il reste un inconv\u00e9nient \u00e0 cette repr\u00e9sentation. Si on construire le trie pour le mot ``[\"aaa\"]`` ou les mots ``[\"aa\",\"aaa]\"``, on obtient le m\u00eame trie :"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["a\n", " a\n", " a\n"]}], "source": ["print_trie (build_trie( [\"aaa\"]) )"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["a\n", " a\n", " a\n"]}], "source": ["print_trie (build_trie( [\"aaa\", \"aa\"]) )"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Pour \u00e9viter cela, le plus simple est de repr\u00e9senter la fin d'un mot comme un caract\u00e8re \u00e0 part enti\u00e8re."]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["a\n", " a\n", " a\n", " *\n"]}], "source": ["print_trie (build_trie( [\"aaa*\"]) )"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["a\n", " a\n", " *\n", " a\n", " *\n"]}], "source": ["print_trie (build_trie( [\"aaa*\", \"aa*\"]) )"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 7\n", "\n", "Soit $L$ la longueur maximale des mots et $C$ le nombre de lettres distinctes, avec un trie, le co\u00fbt de la recherche est major\u00e9 par : $O(L \\ln C)$. On reprend le code de l'exercice 5 et on ajoute le code associ\u00e9 au trie. On effectue 10000 recherches au lieu de 1000 pour avoir une meilleure estimation de la diff\u00e9rence (pour vous en convaincre, il suffit comparer les temps obtenus par deux ex\u00e9cution de ce m\u00eame code)."]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["10 dicho 0.014463042289250438 trie 0.020240390721525614\n", "100 dicho 0.03220732769624135 trie 0.019780683129752674\n", "1000 dicho 0.059169712496739635 trie 0.04887183480633439\n", "10000 dicho 0.05923257483719624 trie 0.04443469436714054\n", "100000 dicho 0.07402190260785346 trie 0.0492733840422801\n", "200000 dicho 0.07787318313937419 trie 0.05416595572353344\n", "400000 dicho 0.0827054582899791 trie 0.06291493955595229\n"]}], "source": ["for N in [10, 100, 1000, 10000, 100000, 200000, 400000] :\n", " mots = [ mot_alea(taille) for _ in range (N) ]\n", " tolook = [ mots[ random.randint(0,len(mots)-1) ] for i in range(0,10000) ]\n", " trie = build_trie(mots)\n", " mots.sort()\n", " \n", " debut = time.perf_counter()\n", " for k in tolook :\n", " i = dicho(mots, k)\n", " fin = time.perf_counter() \n", " dd = fin-debut\n", " \n", " debut = time.perf_counter()\n", " for k in tolook :\n", " i = lookup(trie, k)\n", " fin = time.perf_counter()\n", " dt = fin - debut\n", "\n", " print(N, \"dicho\",dd, \"trie\", dt)"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["N= 10\n", "100 loops, best of 3: 12.7 ms per loop\n", "100 loops, best of 3: 17.7 ms per loop\n", "N= 100\n", "10 loops, best of 3: 26.1 ms per loop\n", "100 loops, best of 3: 17.9 ms per loop\n", "N= 1000\n", "10 loops, best of 3: 40.1 ms per loop\n", "10 loops, best of 3: 32.4 ms per loop\n", "N= 10000\n", "10 loops, best of 3: 57.1 ms per loop\n", "10 loops, best of 3: 44.5 ms per loop\n", "N= 100000\n", "10 loops, best of 3: 97.9 ms per loop\n", "10 loops, best of 3: 54 ms per loop\n", "N= 200000\n", "10 loops, best of 3: 77 ms per loop\n", "10 loops, best of 3: 50 ms per loop\n", "N= 400000\n", "10 loops, best of 3: 80.8 ms per loop\n", "10 loops, best of 3: 52.7 ms per loop\n"]}], "source": ["for N in [10, 100, 1000, 10000, 100000, 200000, 400000] :\n", " print(\"N=\",N)\n", " mots = [ mot_alea(taille) for _ in range (N) ]\n", " tolook = [ mots[ random.randint(0,len(mots)-1) ] for i in range(0,10000) ]\n", " trie = build_trie(mots)\n", " mots.sort()\n", " \n", " %timeit for k in tolook : i = dicho(mots, k)\n", " %timeit for k in tolook : i = lookup(trie, k)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Encore une fois, le temps de construction du trie n'est pas pris en compte. Plus il y a de recherche \u00e0 effectuer, plus il devient n\u00e9gligeable.\n", "\n", "Le dictionnaire est un object courant dans la plupart des languages. En python, celui-ci utilise une [table de hachage](http://fr.wikipedia.org/wiki/Table_de_hachage) et le co\u00fbt d'acc\u00e8s \u00e0 un \u00e9l\u00e9ment n'est pas en $O(\\ln n)$ mais en $O(n)$ (voir [time complexity](https://wiki.python.org/moin/TimeComplexity)). En C++, le dictionnaire (ou [map](http://www.cplusplus.com/reference/map/map/)) utilise un arbre binaire et l'acc\u00e8s \u00e0 un \u00e9l\u00e9ment a un co\u00fbt logarithmique : [Standard C++ Containers](http://www.cs.northwestern.edu/~riesbeck/programming/c++/stl-summary.html)."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Plus en d\u00e9tails"]}, {"cell_type": "markdown", "metadata": {}, "source": ["La recherche dichotomique est \u00e9quivalente \u00e0 celle op\u00e9r\u00e9e avec un [arbre binaire de recherche](http://fr.wikipedia.org/wiki/Arbre_binaire_de_recherche) (si ce dernier est [\u00e9quilibr\u00e9](http://fr.wikipedia.org/wiki/Arbre_B) ou [arbre rouge/noir](http://fr.wikipedia.org/wiki/Arbre_bicolore)). Ce dernier consiste \u00e0 classer les \u00e9l\u00e9ments par ordre alphab\u00e9tique. Un arbre est souvent repr\u00e9sent\u00e9 par une classe et non par un dictionnaire comme la derni\u00e8re partie de cette session le laissait supposer."]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["value=racine\n", " value=child 1\n", " value=child 2\n", " value=child 1000\n", " value=child 3\n"]}], "source": ["class Arbre:\n", " def __init__(self, value):\n", " self.value = value\n", " self.children = [ ]\n", " \n", " def add_child(self, child):\n", " self.children.append(child)\n", " \n", " def __str__(self):\n", " rows = [ \"value={0}\".format(self.value) ]\n", " for c in self.children:\n", " s = str(c)\n", " lines = [ \" \" + l for l in s.split(\"\\n\") ]\n", " rows.extend(lines)\n", " return \"\\n\".join(rows)\n", " \n", "root = Arbre(\"racine\")\n", "child1 = Arbre(\"child 1\")\n", "child1.add_child ( Arbre(\"child 2\") )\n", "child1.add_child ( Arbre(\"child 1000\") )\n", "root.add_child(child1)\n", "root.add_child( Arbre (\"child 3\") )\n", "print(root) "]}, {"cell_type": "markdown", "metadata": {}, "source": ["Les arbres sont des graphes particuliers car il ne contiennent pas de cycles. Il est possible de parcourir les noeuds, de les num\u00e9roter. Ils sont tr\u00e8s utilis\u00e9s en machine learning avec les [arbres de d\u00e9cisions](http://fr.wikipedia.org/wiki/Arbre_de_d%C3%A9cision) ou les [random forests](http://en.wikipedia.org/wiki/Random_forest). Ils sont parfois cach\u00e9s comme dans le cas de la recherche dichotomique qui peut \u00eatre impl\u00e9ment\u00e9e \u00e0 partir d'une structure d'arbre.\n", "\n", "Dans le cas de le recherche dichotomique, on suppose que le nombre de noeuds fils est toujours 2. L'ordre alphab\u00e9tique est le suivant : noeuds fils 1, noeud courant, noeud fils 2. Les deux noeuds fils pourraient \u00eatre nuls. L'impl\u00e9mentation de l'arbre serait la suivante :"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["['alphabet', 'avant', 'avant apr\u00e8s', 'milieu', 'zillion']\n", "value=alphabet\n", "value=avant\n", "value=avant apr\u00e8s\n", "value=milieu\n", "value=zillion\n"]}], "source": ["class ArbreDicho:\n", " def __init__(self, value):\n", " self.value = value\n", " self.before = None\n", " self.after = None\n", " \n", " def __str__(self):\n", " return \"value={0}\".format(self.value)\n", " \n", " def add_before(self, child):\n", " self.before = child\n", "\n", " def add_after(self, child):\n", " self.after = child\n", " \n", " def find(self, word):\n", " if self.value == word : return self\n", " elif word < self.value : \n", " if self.before is None : return None\n", " else : return self.before.find(word)\n", " else : \n", " if self.after is None : return None\n", " else : return self.after.find(word)\n", " \n", " def sorted_list(self):\n", " res = [ ]\n", " if self.before is not None: res.extend ( self.before.sorted_list() )\n", " res.append(self.value)\n", " if self.after is not None: res.extend ( self.after.sorted_list() )\n", " return res\n", "\n", "# on cr\u00e9e un arbre dont les noeuds v\u00e9rifient la propri\u00e9t\u00e9 \u00e9nonc\u00e9 plus haut (les mots apparaissent dans le bon ordre)\n", "root = ArbreDicho(\"milieu\")\n", "root.add_before(ArbreDicho(\"avant\"))\n", "root.add_after(ArbreDicho(\"zillion\"))\n", "root.before.add_before(ArbreDicho(\"alphabet\"))\n", "root.before.add_after(ArbreDicho(\"avant apr\u00e8s\"))\n", "\n", "# on v\u00e9rifie que c'est bien le cas\n", "all = root.sorted_list()\n", "assert all == sorted(all)\n", "print(all)\n", "\n", "# on effectue la recherche\n", "for a in all:\n", " f = root.find(a)\n", " print(f)"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1"}}, "nbformat": 4, "nbformat_minor": 2}