Source code for pyquickhelper.ipythonhelper.run_notebook

"""
Functions to run notebooks.


:githublink:`%|py|5`
"""
import time
import os
import warnings
import re
from io import StringIO
import urllib.request as urllib_request
from datetime import datetime, timedelta

from ..loghelper.flog import noLOG
from ..filehelper import explore_folder
from .notebook_runner import NotebookRunner, NotebookKernelError
from .notebook_exception import NotebookException
from .notebook_helper import writes


try:
    from nbformat.reader import reads
    from nbformat.reader import NotJSONError
except ImportError:  # pragma: no cover
    from IPython.nbformat.reader import reads
    from IPython.nbformat.reader import NotJSONError


def _cache_url_to_file(cache_urls, folder, fLOG=noLOG):
    """
    Downloads the files corresponding to the urls stored in *cache_urls*.

    :param cache_urls: list of urls
    :param folder: where to store the cached files
    :param fLOG: logging function
    :return: dictionary ``{ url: file }``

    The function detects if the file was already downloaded.
    In that case, it does not do it a second time.

    :githublink:`%|py|39`
    """
    if cache_urls is None:
        return None
    if folder is None:
        raise FileNotFoundError(  # pragma: no cover
            "folder cannot be None")
    res = {}
    for url in cache_urls:
        local_file = "__cached__" + url.split("/")[-1]
        local_file = local_file.replace(":", "_").replace("%", "_")
        local_file = os.path.abspath(os.path.join(folder, local_file))
        if not os.path.exists(local_file):
            fLOG("download", url, "to", local_file)
            with open(local_file, "wb") as f:
                fu = urllib_request.urlopen(url)
                c = fu.read(2 ** 21)
                while len(c) > 0:
                    f.write(c)
                    f.flush()
                    c = fu.read(2 ** 21)
                fu.close()
        # to avoid having backslashes inside strings
        res[url] = "file:///" + local_file.replace("\\", "/")
    return res
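
# Illustrative sketch (not part of the module): what the returned mapping looks
# like. The url and the folder "temp" below are assumptions.
def _example_cache_url_to_file():
    mapping = _cache_url_to_file(
        ["https://example.com/data.csv"], folder="temp", fLOG=print)
    # mapping is something like:
    # {"https://example.com/data.csv": "file:///.../temp/__cached__data.csv"}
    return mapping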
def run_notebook(filename, profile_dir=None, working_dir=None,
                 skip_exceptions=False, outfilename=None, encoding="utf8",
                 additional_path=None, valid=None, clean_function=None,
                 code_init=None, fLOG=noLOG, kernel_name="python",
                 log_level="30", extended_args=None, cache_urls=None,
                 replacements=None, detailed_log=None, startup_timeout=300):
    """
    Runs a notebook end to end,
    it is inspired from module `runipy <https://github.com/paulgb/runipy/>`_.

    :param filename: notebook filename
    :param profile_dir: profile directory
    :param working_dir: working directory
    :param skip_exceptions: skip exceptions
    :param outfilename: if not None, saves the output in this notebook
    :param encoding: encoding for the notebooks
    :param additional_path: additional paths for import
    :param valid: if not None, *valid* is a function which tells whether
        a cell should be executed; if the function returns None,
        the execution of the notebook stops and the remaining cells
        are skipped
    :param clean_function: function which cleans a cell's code
        before executing it (None to leave the code unchanged)
    :param code_init: code to run before the execution
        of the notebook as if it was a cell
    :param fLOG: logging function
    :param kernel_name: kernel name, it can be None
    :param log_level: Choices: (0, 10, 20, 30=default, 40, 50,
        'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    :param extended_args: other arguments to pass to the command line
        (``--KernelManager.autorestart=True`` for example),
        see :ref:`l-ipython_notebook_args` for a full list
    :param cache_urls: list of urls to cache
    :param replacements: list of additional replacements, list of tuple
    :param detailed_log: a second function to log more information
        when executing the notebook, this should be a function with
        the same signature as ``print`` or None
    :param startup_timeout: wait for this long for the kernel to be ready,
        see `wait_for_ready
        <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    :return: tuple (statistics, output)

    .. warning::

        The function calls `basicConfig
        <https://docs.python.org/3/library/logging.html#logging.basicConfig>`_.

    .. exref::
        :title: Run a notebook end to end

        ::

            from pyquickhelper.ipythonhelper import run_notebook
            run_notebook("source.ipynb", working_dir="temp",
                         outfilename="modified.ipynb",
                         additional_path=["custom_path"])

    The function adds the local variable ``theNotebook`` with
    the absolute file name of the notebook.
    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *working_dir* (must not be None). The url string is replaced by
    the absolute path to the file.

    .. versionchanged:: 1.8
        Parameters *detailed_log*, *startup_timeout* were added.

    :githublink:`%|py|126`
    """
    cached_rep = _cache_url_to_file(cache_urls, working_dir, fLOG=fLOG)
    if replacements is None:
        replacements = cached_rep
    elif cached_rep is not None:
        cached_rep.update(replacements)
    else:
        cached_rep = replacements

    with open(filename, "r", encoding=encoding) as payload:
        try:
            nbc = payload.read()
        except UnicodeDecodeError as e:  # pragma: no cover
            raise NotebookException(
                "(2) Unable to read file '{0}' encoding='{1}'.".format(
                    filename, encoding)) from e
    try:
        nb = reads(nbc)
    except NotJSONError as e:  # pragma: no cover
        raise NotebookException(
            "(1) Unable to read file '{0}' encoding='{1}'.".format(
                filename, encoding)) from e

    out = StringIO()

    def flogging(*args, **kwargs):
        if len(args) > 0:
            out.write(" ".join(args))
        if len(kwargs) > 0:
            out.write(str(kwargs))
        out.write("\n")
        fLOG(*args, **kwargs)

    try:
        nb_runner = NotebookRunner(
            nb, profile_dir, working_dir, fLOG=flogging, filename=filename,
            theNotebook=os.path.abspath(filename),
            code_init=code_init, log_level=log_level,
            extended_args=extended_args, kernel_name=kernel_name,
            replacements=cached_rep, kernel=True, detailed_log=detailed_log,
            startup_timeout=startup_timeout)
    except NotebookKernelError:  # pragma: no cover
        # It fails. We try again once.
        nb_runner = NotebookRunner(
            nb, profile_dir, working_dir, fLOG=flogging, filename=filename,
            theNotebook=os.path.abspath(filename),
            code_init=code_init, log_level=log_level,
            extended_args=extended_args, kernel_name=kernel_name,
            replacements=cached_rep, kernel=True, detailed_log=detailed_log,
            startup_timeout=startup_timeout)

    try:
        stat = nb_runner.run_notebook(skip_exceptions=skip_exceptions,
                                      additional_path=additional_path,
                                      valid=valid,
                                      clean_function=clean_function)

        if outfilename is not None:
            with open(outfilename, 'w', encoding=encoding) as f:
                try:
                    s = writes(nb_runner.nb)
                except NotebookException as e:  # pragma: no cover
                    raise NotebookException(
                        "issue with notebook: '{}'".format(filename)) from e
                if isinstance(s, bytes):
                    s = s.decode('utf8')
                f.write(s)
    finally:
        nb_runner.shutdown_kernel()

    return stat, out.getvalue()
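
# Illustrative sketch (not part of the module): customising run_notebook with
# *valid* and *clean_function*. The notebook names and the "# SKIP" / "# STOP"
# markers are assumptions; only the documented semantics of the two callbacks
# (True/False/None, returning the cleaned code) come from the docstring above.
def _example_run_notebook_with_filters():

    def valid(cell):
        # None stops the notebook before this cell, False skips the cell,
        # True executes it.
        if "# STOP" in cell:
            return None
        return "# SKIP" not in cell

    def clean_function(cell):
        # Drop magic commands before execution.
        return "\n".join(line for line in cell.split("\n")
                         if not line.startswith("%"))

    return run_notebook("source.ipynb", working_dir="temp",
                        outfilename="modified.ipynb",
                        valid=valid, clean_function=clean_function)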
def execute_notebook_list(folder, notebooks, clean_function=None, valid=None,
                          fLOG=noLOG, additional_path=None, deepfLOG=noLOG,
                          kernel_name="python", log_level="30",
                          extended_args=None, cache_urls=None,
                          replacements=None, detailed_log=None,
                          startup_timeout=300):
    """
    Executes a list of notebooks.

    :param folder: folder (where to execute the notebook,
        current folder for the notebook)
    :param notebooks: list of notebooks to execute (or a list of
        tuple(notebook, code which initializes the notebook))
    :param clean_function: function which transforms the code before running it
    :param valid: if not None, *valid* is a function which tells whether
        a cell should be executed; if the function returns None,
        the execution of the notebook stops and the remaining cells
        are skipped
    :param fLOG: logging function
    :param deepfLOG: logging function used to run the notebook
    :param additional_path: path to add to *sys.path* before running
        the notebook
    :param kernel_name: kernel name, it can be None
    :param log_level: Choices: (0, 10, 20, 30=default, 40, 50,
        'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    :param extended_args: other arguments to pass to the command line
        (``--KernelManager.autorestart=True`` for example),
        see :ref:`l-ipython_notebook_args` for a full list
    :param cache_urls: list of urls to cache
    :param replacements: additional replacements
    :param detailed_log: detailed log
    :param startup_timeout: wait for this long for the kernel to be ready,
        see `wait_for_ready
        <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    :return: dictionary of dictionaries ``{ notebook_name: { } }``

    If *isSuccess* is False, *statistics* contains the execution time,
    *output* is the exception raised during the execution.

    The signature of function ``valid_cell`` is::

        def valid_cell(cell):
            return True or False or None to stop execution of the notebook before this cell

    The signature of function ``clean_function`` is::

        def clean_function(cell):
            return new_cell_content

    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *folder* (must not be None). The url string is replaced by
    the absolute path to the file.

    .. versionchanged:: 1.8
        Parameters *detailed_log*, *startup_timeout* were added.

    :githublink:`%|py|243`
    """
    if additional_path is None:
        additional_path = []

    # we cache urls before running through the list of notebooks
    _cache_url_to_file(cache_urls, folder, fLOG=fLOG)

    results = {}
    for i, note in enumerate(notebooks):
        if isinstance(note, tuple):
            note, code_init = note
        else:
            code_init = None
        fLOG("[execute_notebook_list] {0}/{1} - {2}".format(
            i + 1, len(notebooks), os.path.split(note)[-1]))
        outfile = os.path.join(folder, "out_" + os.path.split(note)[-1])
        cl = time.perf_counter()
        try:
            stat, out = run_notebook(
                note, working_dir=folder, outfilename=outfile,
                additional_path=additional_path, valid=valid,
                clean_function=clean_function, fLOG=deepfLOG,
                code_init=code_init, kernel_name=kernel_name,
                log_level=log_level, extended_args=extended_args,
                cache_urls=cache_urls, replacements=replacements,
                detailed_log=detailed_log, startup_timeout=startup_timeout)
            if not os.path.exists(outfile):
                raise FileNotFoundError(outfile)  # pragma: no cover
            etime = time.perf_counter() - cl
            results[note] = dict(success=True, output=out, name=note,
                                 etime=etime, date=datetime.now())
            results[note].update(stat)
        except Exception as e:
            etime = time.perf_counter() - cl
            results[note] = dict(success=False, etime=etime, error=e,
                                 name=note, date=datetime.now())
    return results
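
# Illustrative sketch (not part of the module): executing every notebook of a
# folder and inspecting the returned dictionary. The folders "temp" and
# "_doc/notebooks" are assumptions.
def _example_execute_notebook_list():
    notebooks = retrieve_notebooks_in_folder("_doc/notebooks")
    res = execute_notebook_list("temp", notebooks, fLOG=print)
    for name, stat in sorted(res.items()):
        # success is False when the notebook raised an exception,
        # etime is the execution time in seconds.
        print(name, stat["success"], stat.get("etime"))
    return res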
def _get_dump_default_path(dump):
    """
    Proposes a default location to dump results about notebooks execution.

    :param dump: location of the dump or module.
    :return: location of the dump

    The result might be equal to the input if *dump* is already a path.

    :githublink:`%|py|289`
    """
    if hasattr(dump, '__file__') and hasattr(dump, '__name__'):
        # Default value. We check it is neither travis nor appveyor.
        from ..pycode import is_travis_or_appveyor
        if is_travis_or_appveyor():
            dump = None
        if dump is not None:
            # We guess the package name.
            name = dump.__name__.split('.')[-1]
            loc = os.path.dirname(dump.__file__)
            src_loc = os.path.split(loc)
            if src_loc[-1] == 'src':
                # We choose a path for the dumps in a way
                fold = os.path.join(loc, "..", "..", "..", "_notebook_dumps")
            else:
                src_loc_loc = os.path.split(src_loc[0])
                if src_loc_loc[-1] == 'src':
                    # We choose a path for the dumps in a way
                    fold = os.path.join(
                        loc, "..", "..", "..", "_notebook_dumps")
                else:
                    # This should be a parameter.
                    fold = os.path.join(loc, "..", "..", "_notebook_dumps")
            if not os.path.exists(fold):
                os.mkdir(fold)
            dump = os.path.join(fold, "notebook.{0}.txt".format(name))
            return dump
    return dump
def _existing_dump(dump):
    """
    Loads an existing dump.

    :param dump: filename
    :return: :epkg:`pandas:DataFrame`

    :githublink:`%|py|325`
    """
    import pandas
    from pandas.errors import ParserError

    def read_file(dump):
        try:
            df = pandas.read_csv(dump, sep="\t", encoding="utf-8")
        except ParserError:  # pragma: no cover
            df = pandas.read_csv(
                dump, sep="\t", encoding="utf-8",
                error_bad_lines=False, warn_bad_lines=True)
        return df

    if os.path.exists(dump):
        # There might be some risk here to see another process writing the
        # file at the same time.
        try:
            df = read_file(dump)
        except PermissionError:  # pragma: no cover
            # We try again once.
            time.sleep(10)
            try:
                df = read_file(dump)
            except Exception as e:
                raise RuntimeError(
                    "Unable to read '{0}' due to '{1}'".format(dump, e)) from e
        except Exception as e:  # pragma: no cover
            raise RuntimeError(
                "Unable to read '{0}' due to '{1}'".format(dump, e)) from e
    else:
        df = None
    return df
def execute_notebook_list_finalize_ut(res, dump=None, fLOG=noLOG):
    """
    Checks the list of results and raises an exception if one failed.
    This is meant to be used in unit tests.

    :param res: output of :func:`execute_notebook_list
        <pyquickhelper.ipythonhelper.run_notebook.execute_notebook_list>`
    :param dump: if not None, dump the results of the execution in a flat file
    :param fLOG: logging function

    The dump relies on :epkg:`pandas` and appends the results to a previous
    dump. If *dump* is a module, the function stores the output of the
    execution in a default location only if the process does not run on
    :epkg:`travis` or :epkg:`appveyor`. The default location is something like:

    .. runpython::

        from pyquickhelper.ipythonhelper.run_notebook import _get_dump_default_path
        import pyquickhelper
        print(_get_dump_default_path(pyquickhelper))

    :githublink:`%|py|378`
    """
    if len(res) == 0:
        raise RuntimeError("No notebook was run.")  # pragma: no cover

    def fail_note(v):
        return "error" in v

    fails = [(os.path.split(k)[-1], v)
             for k, v in sorted(res.items()) if fail_note(v)]
    for f in fails:
        fLOG(f)
    for k, v in sorted(res.items()):
        name = os.path.split(k)[-1]
        fLOG(name, v.get("success", None), v.get("etime", None))
    if len(fails) > 0:
        raise fails[0][1]["error"]

    dump = _get_dump_default_path(dump)
    if dump is not None:
        import pandas
        df = _existing_dump(dump)
        new_df = pandas.DataFrame(data=list(res.values()))

        # We replace every EOL.
        def eol_replace(t):
            return t.replace("\r", "").replace("\n", "\\n")

        subdf = new_df.select_dtypes(include=['object']).applymap(eol_replace)
        for c in subdf.columns:
            new_df[c] = subdf[c]

        if df is None:
            df = new_df
        else:
            df = pandas.concat([df, new_df]).copy()

        # There could be a conflict while several
        # processes in parallel could overwrite the same file.
        if not os.path.exists(dump):
            df.to_csv(dump, sep="\t", encoding="utf-8", index=False)
        else:
            # There might be some risk here to see another process
            # writing or reading the file at the same time.
            # Module filelock does not work in this case.
            # locket (https://github.com/mwilliamson/locket.py) was not tried.
            try:
                df.to_csv(dump, sep="\t", encoding="utf-8", index=False)
            except PermissionError:  # pragma: no cover
                time.sleep(7)
                df.to_csv(dump, sep="\t", encoding="utf-8", index=False)
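
# Illustrative sketch (not part of the module): typical use in a unit test,
# dumping the statistics next to the tested module. *mymodule*, "temp" and
# "_doc/notebooks" are assumptions.
def _example_finalize_unit_test():
    import mymodule
    notebooks = retrieve_notebooks_in_folder("_doc/notebooks")
    res = execute_notebook_list("temp", notebooks, fLOG=print)
    # Raises the first encountered error and appends statistics to the dump.
    execute_notebook_list_finalize_ut(res, dump=mymodule, fLOG=print)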
def notebook_coverage(module_or_path, dump=None, too_old=30):
    """
    Extracts a list of notebooks and merges it with a list of runs dumped by
    function :func:`execute_notebook_list_finalize_ut
    <pyquickhelper.ipythonhelper.run_notebook.execute_notebook_list_finalize_ut>`.

    :param module_or_path: a module or a path
    :param dump: dump (or None to get the location by default)
    :param too_old: drop executions older than *too_old* days from now
    :return: dataframe

    If *module_or_path* is a module, the function will get a list of notebooks
    assuming it follows the same design as :epkg:`pyquickhelper`.

    :githublink:`%|py|441`
    """
    if dump is None:
        dump = _get_dump_default_path(module_or_path)
    else:
        dump = _get_dump_default_path(dump)

    # Create the list of existing notebooks.
    if isinstance(module_or_path, list):
        nbs = [_[1] if isinstance(_, tuple) else _ for _ in module_or_path]
    elif hasattr(module_or_path, '__file__') and hasattr(module_or_path, '__name__'):
        fold = os.path.dirname(module_or_path.__file__)
        _doc = os.path.join(fold, "..", "..", "_doc")
        if not os.path.exists(_doc):
            raise FileNotFoundError(  # pragma: no cover
                "Unable to find path '{0}' for module '{1}'".format(
                    _doc, module_or_path))
        nbpath = os.path.join(_doc, "notebooks")
        if not os.path.exists(nbpath):
            raise FileNotFoundError(  # pragma: no cover
                "Unable to find path '{0}' for module '{1}'".format(
                    nbpath, module_or_path))
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]
    else:
        nbpath = module_or_path
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]

    import pandas
    dfnb = pandas.DataFrame(data=dict(notebooks=nbs))
    dfnb["notebooks"] = dfnb["notebooks"].apply(lambda x: os.path.normpath(x))
    dfnb = dfnb[~dfnb.notebooks.str.contains(".ipynb_checkpoints")].copy()
    dfnb["key"] = dfnb["notebooks"].apply(lambda x: "/".join(os.path.normpath(
        x).replace("\\", "/").split("/")[-3:]) if isinstance(x, str) else x)
    dfnb["key"] = dfnb["key"].apply(
        lambda x: x.lower() if isinstance(x, str) else x)

    # There might be some risk here to see another process writing the
    # file at the same time.
    try:
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")
    except PermissionError:  # pragma: no cover
        # We try again once.
        time.sleep(10)
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")

    # We drop executions which are too old.
    old = datetime.now() - timedelta(too_old)
    old = "%04d-%02d-%02d" % (old.year, old.month, old.day)
    dfall = dfall[dfall.date > old].copy()

    # We add a key to merge.
    dfall["name"] = dfall["name"].apply(lambda x: os.path.normpath(x))
    dfall["key"] = dfall["name"].apply(lambda x: "/".join(os.path.normpath(
        x).replace("\\", "/").split("/")[-3:]) if isinstance(x, str) else x)
    dfall["key"] = dfall["key"].apply(
        lambda x: x.lower() if isinstance(x, str) else x)

    # We keep the last execution.
    gr = dfall.sort_values("date", ascending=False).groupby(
        "key", as_index=False).first().reset_index(drop=True).copy()
    gr = gr.drop("name", axis=1)

    # Folders might be different so we merge on the last part of the path.
    merged = dfnb.merge(gr, left_on="key", right_on="key", how="outer")
    merged = merged[merged.notebooks.notnull()]
    merged = merged.sort_values("key").reset_index(drop=True).copy()

    if "last_name" not in merged.columns:
        merged["last_name"] = merged["key"].apply(
            lambda x: os.path.split(x)[-1])

    # We check there are no duplicates in merged.
    for c in ["key", "last_name"]:
        names = [_ for _ in merged[c] if isinstance(_, str)]
        if len(names) > len(set(names)):
            raise ValueError(  # pragma: no cover
                "Unexpected duplicated names in column '{1}'\n{0}".format(
                    "\n".join(sorted(names)), c))

    return merged
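
# Illustrative sketch (not part of the module): computing the coverage table
# for a module following the pyquickhelper layout. *mymodule* is an assumption.
def _example_notebook_coverage():
    import mymodule
    # One row per notebook, merged with the statistics of its last execution
    # found in the dump written by execute_notebook_list_finalize_ut.
    df = notebook_coverage(mymodule)
    return df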
def badge_notebook_coverage(df, image_name):
    """
    Builds a badge reporting on the notebook coverage.
    It gives the proportion of run cells.

    :param df: output of :func:`notebook_coverage
        <pyquickhelper.ipythonhelper.run_notebook.notebook_coverage>`
    :param image_name: image to produce
    :return: coverage estimation

    The function relies on module :epkg:`Pillow`.

    :githublink:`%|py|532`
    """
    cell = df["nbcell"].sum()
    run = df["nbrun"].sum()
    valid = df["nbvalid"].sum()
    cov = run * 100.0 / cell if cell > 0 else 1.0
    dcov = min(100., cov)
    val = valid * 100.0 / cell if cell > 0 else 1.0
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from PIL import Image, ImageFont, ImageDraw
    if cov <= 60:
        color = (200, 87, 51)
    elif cov <= 70:
        color = (200, 156, 18)
    elif cov <= 75:
        color = (140, 140, 140)
    elif cov <= 80:
        color = (88, 171, 171)
    elif cov <= 85:
        color = (88, 140, 86)
    elif cov <= 90:
        color = (80, 155, 86)
    elif cov <= 95:
        color = (80, 190, 73)
    else:
        color = (20, 190, 50)
    img = Image.new(mode='RGB', size=(70, 20), color=color)
    im = ImageDraw.Draw(img)
    font = ImageFont.load_default()
    try:
        cov = int(cov)
        cov = min(cov, 100)
    except ValueError:  # pragma: no cover
        cov = "?"
    try:
        val = int(val)
        val = min(val, 100)
    except ValueError:  # pragma: no cover
        val = "?"
    if cov != val:
        im.text((3, 4), "NB:{0}%-{1}% ".format(cov, val),
                (255, 255, 255), font=font)
    else:
        im.text((3, 4), "NB: {0}% ".format(cov),
                (255, 255, 255), font=font)
    img.save(image_name)
    return dcov
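
# Illustrative sketch (not part of the module): turning the coverage table from
# the previous sketch into a small PNG badge; the file name "nbcov.png" is an
# assumption.
def _example_badge_notebook_coverage():
    df = _example_notebook_coverage()
    cov = badge_notebook_coverage(df, "nbcov.png")
    print("notebook coverage: %1.1f%%" % cov)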
def get_additional_paths(modules):
    """
    Returns a list of paths to add before running the notebooks
    for a given list of modules.

    :param modules: list of modules
    :return: list of paths

    :githublink:`%|py|587`
    """
    addpath = [os.path.dirname(mod.__file__) for mod in modules]
    addpath = [os.path.normpath(os.path.join(_, "..")) for _ in addpath]
    return addpath
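
# Illustrative sketch (not part of the module): building the *additional_path*
# argument from a couple of imported modules; numpy and pandas are only
# examples.
def _example_get_additional_paths():
    import numpy
    import pandas
    return get_additional_paths([numpy, pandas])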
def retrieve_notebooks_in_folder(folder, posreg=".*[.]ipynb$", negreg=None):
    """
    Retrieves notebooks in a test folder.

    :param folder: folder
    :param posreg: regular expression a notebook name must match
    :param negreg: regular expression a notebook name must not match (or None)
    :return: list of found notebooks

    :githublink:`%|py|600`
    """
    pos = re.compile(posreg)
    neg = re.compile(negreg) if negreg is not None else None
    res = []
    for name in os.listdir(folder):
        if pos.search(name):
            if neg is None or not neg.search(name):
                res.append(os.path.join(folder, name))
    if len(res) == 0:
        raise FileNotFoundError(  # pragma: no cover
            "No notebook found in '{0}'.".format(folder))
    return res
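
# Illustrative sketch (not part of the module): listing the notebooks of a test
# folder while excluding some of them; the folder name and the "long_" prefix
# are assumptions.
def _example_retrieve_notebooks():
    return retrieve_notebooks_in_folder(
        "_doc/notebooks", posreg=".*[.]ipynb$", negreg=".*long_.*")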