{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Example with re2\n", "\n", "*wrapclib* wraps the library [re2](https://github.com/google/re2) using the wrapper [pyre2](https://github.com/facebook/pyre2)."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["run previous cell, wait for 2 seconds\n", ""], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["from wrapclib import re2"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Example with HTML"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["('<h1>mot</h1>',)\n"]}], "source": ["import re\n", "s = \"<h1>mot</h1>\"\n", "print(re.compile(\"(<.*>)\").match(s).groups())"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["('<h1>mot</h1>',)\n"]}], "source": ["s = \"<h1>mot</h1>\"\n", "print(re2.compile(\"(<.*>)\").match(s).groups())"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Group, Span"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/plain": ["('14/9/2000', '20')"]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["s = \"\"\"date 0 : 14/9/2000\n", "date 1 : 20/04/1971 date 2 : 14/09/1913 date 3 : 2/3/1978\n", "date 4 : 1/7/1986 date 5 : 7/3/47 date 6 : 15/10/1914\n", "date 7 : 08/03/1941 date 8 : 8/1/1980 date 9 : 30/6/1976\"\"\"\n", "\n", "expression = re2.compile(\n", " \"([0-3]?[0-9]/[0-1]?[0-9]/([0-2][0-9])?[0-9][0-9])[^\\d]\")\n", "expression.search(s).group(1, 2)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/plain": ["'14/9/2000'"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["c = expression.search(s).span(1)\n", "s[c[0]:c[1]]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Names"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'aa': '2010', 'jj': '05', 'mm': '22'}\n"]}], "source": ["date = \"05/22/2010\"\n", "exp = \"(?P<jj>[0-9]{1,2})/(?P<mm>[0-9]{1,2})/(?P<aa>((19)|(20))[0-9]{2})\"\n", "com = re2.compile(exp)\n", "print(com.search(date).groupdict())"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## findall\n", "\n", "*findall* is not natively implemented in *re2*. It was added."]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/plain": ["[('14/9/2000', '20'),\n", " ('20/04/1971', '19'),\n", " ('14/09/1913', '19'),\n", " ('2/3/1978', '19'),\n", " ('1/7/1986', '19'),\n", " ('7/3/47', None),\n", " ('15/10/1914', '19'),\n", " ('08/03/1941', '19'),\n", " ('8/1/1980', '19')]"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["s = \"\"\"date 0 : 14/9/2000\n", "date 1 : 20/04/1971 date 2 : 14/09/1913 date 3 : 2/3/1978\n", "date 4 : 1/7/1986 date 5 : 7/3/47 date 6 : 15/10/1914\n", "date 7 : 08/03/1941 date 8 : 8/1/1980 date 9 : 30/6/1976\"\"\"\n", "\n", "expression = re2.compile(\n", " \"([0-3]?[0-9]/[0-1]?[0-9]/([0-2][0-9])?[0-9][0-9])[^\\d]\")\n", "\n", "re2.findall(expression, s)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## benchmark"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["10.5 \u00b5s \u00b1 296 ns per loop (mean \u00b1 std. dev. of 7 runs, 100000 loops each)\n"]}], "source": ["s = \"\"\"date 0 : 14/9/2000\n", "date 1 : 20/04/1971 date 2 : 14/09/1913 date 3 : 2/3/1978\n", "date 4 : 1/7/1986 date 5 : 7/3/47 date 6 : 15/10/1914\n", "date 7 : 08/03/1941 date 8 : 8/1/1980 date 9 : 30/6/1976\"\"\"\n", "\n", "expression = re.compile(\n", " \"([0-3]?[0-9]/[0-1]?[0-9]/([0-2][0-9])?[0-9][0-9])[^\\d]\")\n", "\n", "%timeit expression.findall(s)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["18.4 \u00b5s \u00b1 1.51 \u00b5s per loop (mean \u00b1 std. dev. of 7 runs, 10000 loops each)\n"]}], "source": ["%timeit re2.findall(expression, s)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["That's expected as method *findall* is implemented in Python and not C."]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2"}}, "nbformat": 4, "nbformat_minor": 2}