Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Functions to run notebooks.
4"""
5import time
6import os
7import warnings
8import re
9from io import StringIO
10import urllib.request as urllib_request
11from datetime import datetime, timedelta
13from ..loghelper.flog import noLOG
14from ..filehelper import explore_folder
15from .notebook_runner import NotebookRunner, NotebookKernelError
16from .notebook_exception import NotebookException
17from .notebook_helper import writes
20try:
21 from nbformat.reader import reads
22 from nbformat.reader import NotJSONError
23except ImportError: # pragma: no cover
24 from IPython.nbformat.reader import reads
25 from IPython.nbformat.reader import NotJSONError
def _cache_url_to_file(cache_urls, folder, fLOG=noLOG):
    """
    Downloads the files corresponding to the urls stored in *cache_urls*.

    @param      cache_urls      list of urls (or None)
    @param      folder          where to store the cached files
    @param      fLOG            logging function
    @return                     dictionary ``{ url: file }`` or None
                                if *cache_urls* is None

    The function detects if the file was already downloaded.
    In that case, it does not do it a second time.
    The content is first written into a temporary ``.part`` file which is
    renamed once the download is complete: an interrupted or failed
    download never leaves a truncated file which would be considered
    as a valid cached file by the next call.
    """
    if cache_urls is None:
        return None
    if folder is None:
        raise FileNotFoundError(  # pragma: no cover
            "folder cannot be None")
    res = {}
    for url in cache_urls:
        local_file = "__cached__" + url.split("/")[-1]
        local_file = local_file.replace(":", "_").replace("%", "_")
        local_file = os.path.abspath(os.path.join(folder, local_file))
        if not os.path.exists(local_file):
            fLOG("download", url, "to", local_file)
            partial = local_file + ".part"
            try:
                # The url is opened first: nothing is created on disk
                # if the connection fails.
                with urllib_request.urlopen(url) as fu:
                    with open(partial, "wb") as f:
                        chunk = fu.read(2 ** 21)
                        while chunk:
                            f.write(chunk)
                            chunk = fu.read(2 ** 21)
                # Only a complete download gets the final name.
                os.replace(partial, local_file)
            finally:
                # After a successful rename, the partial file is gone.
                if os.path.exists(partial):
                    os.remove(partial)

        # to avoid having backslashes inside strings
        res[url] = "file:///" + local_file.replace("\\", "/")
    return res
def run_notebook(filename, profile_dir=None, working_dir=None, skip_exceptions=False,
                 outfilename=None, encoding="utf8", additional_path=None,
                 valid=None, clean_function=None, code_init=None,
                 fLOG=noLOG, kernel_name="python", log_level="30",
                 extended_args=None, cache_urls=None, replacements=None,
                 detailed_log=None, startup_timeout=300):
    """
    Runs a notebook end to end,
    it is inspired from module `runipy <https://github.com/paulgb/runipy/>`_.

    @param      filename            notebook filename
    @param      profile_dir         profile directory
    @param      working_dir         working directory
    @param      skip_exceptions     skip exceptions
    @param      outfilename         if not None, saves the output in this notebook
    @param      encoding            encoding for the notebooks
    @param      additional_path     additional paths for import
    @param      valid               if not None, valid is a function which returns whether
                                    or not the cell should be executed or not, if the function
                                    returns None, the execution of the notebooks stops and
                                    the other cells are skipped
    @param      clean_function      function which cleans a cell's code before executing it (None for None)
    @param      code_init           code to run before the execution of the notebook as if it was a cell
    @param      fLOG                logging function
    @param      kernel_name         kernel name, it can be None
    @param      log_level           Choices: (0, 10, 20, 30=default, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    @param      extended_args       others arguments to pass to the command line ('--KernelManager.autorestart=True' for example),
                                    see :ref:`l-ipython_notebook_args` for a full list
    @param      cache_urls          list of urls to cache
    @param      replacements        list of additional replacements, list of tuple
    @param      detailed_log        a second function to log more information when executing the notebook,
                                    this should be a function with the same signature as ``print`` or None
    @param      startup_timeout     wait for this long for the kernel to be ready,
                                    see `wait_for_ready
                                    <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    @return                         tuple (statistics, output)

    @warning The function calls `basicConfig
    <https://docs.python.org/3/library/logging.html#logging.basicConfig>`_.

    .. exref::
        :title: Run a notebook end to end

        ::

            from pyquickhelper.ipythonhelper import run_notebook
            run_notebook("source.ipynb", working_dir="temp",
                        outfilename="modified.ipynb",
                        additional_path=["custom_path"] )

    The function passes ``theNotebook``, the absolute file name
    of the notebook, to the runner.
    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *working_dir* (must not be None). The url string is replaced by
    the absolute path to the file.
    Everything logged through the runner is also accumulated and
    returned as the second element of the result.
    """
    # Cached urls and user replacements are merged, user replacements win.
    cached_rep = _cache_url_to_file(cache_urls, working_dir, fLOG=fLOG)
    if replacements is not None:
        if cached_rep is not None:
            cached_rep.update(replacements)
        else:
            cached_rep = replacements

    with open(filename, "r", encoding=encoding) as payload:
        try:
            nbc = payload.read()
        except UnicodeDecodeError as e:  # pragma: no cover
            raise NotebookException(
                "(2) Unable to read file '{0}' encoding='{1}'.".format(filename, encoding)) from e
    try:
        nb = reads(nbc)
    except NotJSONError as e:  # pragma: no cover
        raise NotebookException(
            "(1) Unable to read file '{0}' encoding='{1}'.".format(filename, encoding)) from e

    out = StringIO()

    def flogging(*args, **kwargs):
        # Logged values are not necessarily strings,
        # they are converted before being joined.
        if len(args) > 0:
            out.write(" ".join(str(a) for a in args))
        if len(kwargs) > 0:
            out.write(str(kwargs))
        out.write("\n")
        fLOG(*args, **kwargs)

    runner_kwargs = dict(fLOG=flogging, filename=filename,
                         theNotebook=os.path.abspath(filename),
                         code_init=code_init, log_level=log_level,
                         extended_args=extended_args, kernel_name=kernel_name,
                         replacements=cached_rep, kernel=True,
                         detailed_log=detailed_log,
                         startup_timeout=startup_timeout)
    try:
        nb_runner = NotebookRunner(
            nb, profile_dir, working_dir, **runner_kwargs)
    except NotebookKernelError:  # pragma: no cover
        # It fails. We try again once.
        nb_runner = NotebookRunner(
            nb, profile_dir, working_dir, **runner_kwargs)

    try:
        stat = nb_runner.run_notebook(skip_exceptions=skip_exceptions, additional_path=additional_path,
                                      valid=valid, clean_function=clean_function)

        if outfilename is not None:
            # The notebook is serialized before the output file is opened:
            # a failure does not leave an empty file behind.
            try:
                s = writes(nb_runner.nb)
            except NotebookException as e:  # pragma: no cover
                raise NotebookException(
                    "issue with notebook: '{}'".format(filename)) from e
            if isinstance(s, bytes):
                s = s.decode('utf8')
            with open(outfilename, 'w', encoding=encoding) as f:
                f.write(s)

    finally:
        # The kernel is always stopped, even if the execution failed.
        nb_runner.shutdown_kernel()

    return stat, out.getvalue()
def execute_notebook_list(folder, notebooks, clean_function=None, valid=None, fLOG=noLOG,
                          additional_path=None, deepfLOG=noLOG, kernel_name="python",
                          log_level="30", extended_args=None, cache_urls=None,
                          replacements=None, detailed_log=None, startup_timeout=300):
    """
    Executes a list of notebooks.

    @param      folder              folder (where to execute the notebook, current folder for the notebook)
    @param      notebooks           list of notebooks to execute (or a list of tuple(notebook, code which initializes the notebook))
    @param      clean_function      function which transform the code before running it
    @param      valid               if not None, valid is a function which returns whether
                                    or not the cell should be executed or not, if the function
                                    returns None, the execution of the notebooks stops and
                                    the other cells are skipped
    @param      fLOG                logging function
    @param      deepfLOG            logging function used to run the notebook
    @param      additional_path     path to add to *sys.path* before running the notebook
    @param      kernel_name         kernel name, it can be None
    @param      log_level           Choices: (0, 10, 20, 30=default, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    @param      extended_args       others arguments to pass to the command line ('--KernelManager.autorestart=True' for example),
                                    see :ref:`l-ipython_notebook_args` for a full list
    @param      cache_urls          list of urls to cache
    @param      replacements        additional replacements
    @param      detailed_log        detailed log
    @param      startup_timeout     wait for this long for the kernel to be ready,
                                    see `wait_for_ready
                                    <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    @return                         dictionary of dictionaries ``{ notebook_name: { } }``

    For a successful notebook, the inner dictionary contains *success=True*,
    *output*, *name*, *etime*, *date* and the statistics returned by
    @see fn run_notebook. For a failed one, it contains *success=False*,
    *etime*, *name*, *date* and *error*, the exception
    raised during the execution.

    The signature of function ``valid_cell`` is::

        def valid_cell(cell):
            return True or False or None to stop execution of the notebook before this cell

    The signature of function ``clean_function`` is::

        def clean_function(cell):
            return new_cell_content

    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *folder* (must not be None if *cache_urls* is specified).
    The modified notebook is saved in *folder* with prefix ``out_``.
    """
    additional_path = [] if additional_path is None else additional_path

    # we cache urls before running through the list of notebooks
    _cache_url_to_file(cache_urls, folder, fLOG=fLOG)

    results = {}
    for position, entry in enumerate(notebooks, start=1):
        # An item is either a notebook or a couple (notebook, init code).
        note, code_init = entry if isinstance(entry, tuple) else (entry, None)
        short_name = os.path.split(note)[-1]
        fLOG("[execute_notebook_list] {0}/{1} - {2}".format(
            position, len(notebooks), short_name))
        outfile = os.path.join(folder, "out_" + short_name)
        begin = time.perf_counter()
        try:
            stat, out = run_notebook(note, working_dir=folder, outfilename=outfile,
                                     additional_path=additional_path, valid=valid,
                                     clean_function=clean_function, fLOG=deepfLOG,
                                     code_init=code_init, kernel_name=kernel_name,
                                     log_level=log_level, extended_args=extended_args,
                                     cache_urls=cache_urls, replacements=replacements,
                                     detailed_log=detailed_log, startup_timeout=startup_timeout)
            if not os.path.exists(outfile):
                raise FileNotFoundError(outfile)  # pragma: no cover
            elapsed = time.perf_counter() - begin
            report = dict(success=True, output=out, name=note, etime=elapsed,
                          date=datetime.now())
            report.update(stat)
        except Exception as e:
            # Any failure is stored, the other notebooks are still executed.
            elapsed = time.perf_counter() - begin
            report = dict(success=False, etime=elapsed, error=e, name=note,
                          date=datetime.now())
        results[note] = report
    return results
274def _get_dump_default_path(dump):
275 """
276 Proposes a default location to dump results about notebooks execution.
278 @param dump location of the dump or module.
279 @return location of the dump
281 The result might be equal to the input if *dump* is already path.
282 """
283 if hasattr(dump, '__file__') and hasattr(dump, '__name__'):
284 # Default value. We check it is none travis or appveyor.
285 from ..pycode import is_travis_or_appveyor
286 if is_travis_or_appveyor():
287 dump = None
288 if dump is not None:
289 # We guess the package name.
290 name = dump.__name__.split('.')[-1]
291 loc = os.path.dirname(dump.__file__)
292 src_loc = os.path.split(loc)
293 if src_loc[-1] == 'src':
294 # We choose a path for the dumps in a way
295 fold = os.path.join(loc, "..", "..", "..", "_notebook_dumps")
296 else:
297 src_loc_loc = os.path.split(src_loc[0])
298 if src_loc_loc[-1] == 'src':
299 # We choose a path for the dumps in a way
300 fold = os.path.join(
301 loc, "..", "..", "..", "_notebook_dumps")
302 else:
303 # This should be a parameter.
304 fold = os.path.join(loc, "..", "..", "_notebook_dumps")
305 if not os.path.exists(fold):
306 os.mkdir(fold)
307 dump = os.path.join(fold, "notebook.{0}.txt".format(name))
308 return dump
309 return dump
312def _existing_dump(dump):
313 """
314 Loads an existing dump.
316 @param dump filename
317 @return :epkg:`pandas:DataFrame`
318 """
319 import pandas
320 from pandas.errors import ParserError
322 def read_file(dump):
323 try:
324 df = pandas.read_csv(dump, sep="\t", encoding="utf-8")
325 except ParserError: # pragma: no cover
326 df = pandas.read_csv(
327 dump, sep="\t", encoding="utf-8", error_bad_lines=False, warn_bad_lines=True)
328 return df
330 if os.path.exists(dump):
331 # There might be some risk here to see another process writing the
332 # file at the same time.
333 try:
334 df = read_file(dump)
335 except PermissionError: # pragma: no cover
336 # We try again once.
337 time.sleep(10)
338 try:
339 df = read_file(dump)
340 except Exception as e:
341 raise RuntimeError(
342 "Unable to read '{0}' due to '{1}'".format(dump, e)) from e
343 except Exception as e: # pragma: no cover
344 raise RuntimeError(
345 "Unable to read '{0}' due to '{1}'".format(dump, e)) from e
346 else:
347 df = None
349 return df
def execute_notebook_list_finalize_ut(res, dump=None, fLOG=noLOG):
    """
    Checks the list of results and raises an exception if one failed.
    This is meant to be used in unit tests.

    @param      res     output of @see fn execute_notebook_list
    @param      dump    if not None, dump the results of the execution in a flat file
    @param      fLOG    logging function

    The function raises ``RuntimeError`` if *res* is empty and
    the exception stored in key ``'error'`` of the first failing
    notebook (sorted by name) if there is one.
    The dump relies on :epkg:`pandas` and appends the results to a previous dump.
    If *dump* is a module, the function stores the output of the execution in a default
    location only if the process does not run on :epkg:`travis` or :epkg:`appveyor`.
    The default location is something like:

    .. runpython::

        from pyquickhelper.ipythonhelper.run_notebook import _get_dump_default_path
        import pyquickhelper
        print(_get_dump_default_path(pyquickhelper))
    """
    if len(res) == 0:
        raise RuntimeError("No notebook was run.")  # pragma: no cover

    ordered = sorted(res.items())
    fails = [(os.path.split(k)[-1], v) for k, v in ordered if "error" in v]
    for f in fails:
        fLOG(f)
    for k, v in ordered:
        name = os.path.split(k)[-1]
        fLOG(name, v.get("success", None), v.get("etime", None))
    if len(fails) > 0:
        raise fails[0][1]["error"]

    dump = _get_dump_default_path(dump)
    if dump is not None:
        import pandas
        df = _existing_dump(dump)
        new_df = pandas.DataFrame(data=list(res.values()))

        # We replace every EOL. Object columns may hold values which are
        # not strings (missing values for example), they are left untouched.
        def eol_replace(t):
            if isinstance(t, str):
                return t.replace("\r", "").replace("\n", "\\n")
            return t

        for c in new_df.select_dtypes(include=['object']).columns:
            new_df[c] = new_df[c].apply(eol_replace)

        if df is None:
            df = new_df
        else:
            df = pandas.concat([df, new_df]).copy()

        # There could be a conflict while several
        # processes in parallel could overwrite the same file.
        if not os.path.exists(dump):
            df.to_csv(dump, sep="\t", encoding="utf-8", index=False)
        else:
            # There might be some risk here to see another process
            # writing or reading the file at the same time.
            # Module filelock does not work in this case.
            # locket (https://github.com/mwilliamson/locket.py) was not tried.
            try:
                df.to_csv(dump, sep="\t", encoding="utf-8",  # pylint: disable=E1101
                          index=False)
            except PermissionError:  # pragma: no cover
                # We try again once.
                time.sleep(7)
                df.to_csv(dump, sep="\t", encoding="utf-8",  # pylint: disable=E1101
                          index=False)
def notebook_coverage(module_or_path, dump=None, too_old=30):
    """
    Extracts a list of notebooks and merges with a list of runs dumped by
    function @see fn execute_notebook_list_finalize_ut.

    @param      module_or_path      a module, a path or a list of notebooks
                                    (or tuples whose second element is the notebook)
    @param      dump                dump (or None to get the location by default)
    @param      too_old             drop executions older than *too_old* days from now
    @return                         dataframe

    If *module_or_path* is a module, the function will get a list notebooks
    assuming it follows the same design as :epkg:`pyquickhelper`
    (notebooks in ``_doc/notebooks`` two levels above the module folder).
    Notebooks and executions are matched on the last three parts
    of their path, in lower case.
    """
    if dump is None:
        dump = _get_dump_default_path(module_or_path)
    else:
        dump = _get_dump_default_path(dump)

    # Create the list of existing notebooks.
    if isinstance(module_or_path, list):
        nbs = [_[1] if isinstance(_, tuple) else _ for _ in module_or_path]
    elif hasattr(module_or_path, '__file__') and hasattr(module_or_path, '__name__'):
        fold = os.path.dirname(module_or_path.__file__)
        _doc = os.path.join(fold, "..", "..", "_doc")
        if not os.path.exists(_doc):
            raise FileNotFoundError(  # pragma: no cover
                "Unable to find path '{0}' for module '{1}'".format(
                    _doc, module_or_path))
        nbpath = os.path.join(_doc, "notebooks")
        if not os.path.exists(nbpath):
            raise FileNotFoundError(  # pragma: no cover
                "Unable to find path '{0}' for module '{1}'".format(
                    nbpath, module_or_path))
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]
    else:
        nbpath = module_or_path
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]

    import pandas

    def merge_key(x):
        # Last three parts of the path, '/' as a separator, lower case.
        if not isinstance(x, str):
            return x
        parts = os.path.normpath(x).replace("\\", "/").split("/")
        return "/".join(parts[-3:]).lower()

    dfnb = pandas.DataFrame(data=dict(notebooks=nbs))
    dfnb["notebooks"] = dfnb["notebooks"].apply(os.path.normpath)
    # The pattern is a plain string, not a regular expression:
    # the dot must only match a dot.
    dfnb = dfnb[~dfnb.notebooks.str.contains(
        ".ipynb_checkpoints", regex=False)].copy()
    dfnb["key"] = dfnb["notebooks"].apply(merge_key)

    # There might be some risk here to see another process writing the
    # file at the same time.
    try:
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")
    except PermissionError:  # pragma: no cover
        # We try again once.
        time.sleep(10)
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")

    # We drop too old execution. Dates are compared as strings.
    old = datetime.now() - timedelta(too_old)
    old = "%04d-%02d-%02d" % (old.year, old.month, old.day)
    dfall = dfall[dfall.date > old].copy()

    # We add a key to merge.
    dfall["name"] = dfall["name"].apply(os.path.normpath)
    dfall["key"] = dfall["name"].apply(merge_key)

    # We keep the last execution.
    gr = dfall.sort_values("date", ascending=False).groupby(
        "key", as_index=False).first().reset_index(drop=True).copy()
    gr = gr.drop("name", axis=1)

    # Folders might be different so we merge on the last part of the path.
    merged = dfnb.merge(gr, left_on="key", right_on="key", how="outer")
    merged = merged[merged.notebooks.notnull()]
    merged = merged.sort_values("key").reset_index(drop=True).copy()

    if "last_name" not in merged.columns:
        merged["last_name"] = merged["key"].apply(
            lambda x: os.path.split(x)[-1])

    # We check there is no duplicates in merged.
    for c in ["key", "last_name"]:
        names = [_ for _ in merged[c] if isinstance(_, str)]
        if len(names) > len(set(names)):
            raise ValueError(  # pragma: no cover
                "Unexpected duplicated names in column '{1}'\n{0}".format(
                    "\n".join(sorted(names)), c))

    return merged
def badge_notebook_coverage(df, image_name):
    """
    Builds a badge reporting on the notebook coverage.
    It gives the proportion of run cells.

    @param      df          output of @see fn notebook_coverage
    @param      image_name  image to produce
    @return                 coverage estimation (capped to 100)

    The function sums columns ``nbcell``, ``nbrun``, ``nbvalid``,
    the background color of the badge depends on the ratio
    of run cells. The function relies on module :epkg:`Pillow`.
    """
    cell = df["nbcell"].sum()
    run = df["nbrun"].sum()
    valid = df["nbvalid"].sum()
    if cell > 0:
        cov = run * 100.0 / cell
        val = valid * 100.0 / cell
    else:
        cov = 1.0
        val = 1.0
    dcov = min(100., cov)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from PIL import Image, ImageFont, ImageDraw

    # Upper bound of the coverage (inclusive) and associated color,
    # the last color is used above every bound.
    palette = [
        (60, (200, 87, 51)),
        (70, (200, 156, 18)),
        (75, (140, 140, 140)),
        (80, (88, 171, 171)),
        (85, (88, 140, 86)),
        (90, (80, 155, 86)),
        (95, (80, 190, 73)),
    ]
    color = (20, 190, 50)
    for bound, rgb in palette:
        if cov <= bound:
            color = rgb
            break

    img = Image.new(mode='RGB', size=(70, 20), color=color)
    im = ImageDraw.Draw(img)
    font = ImageFont.load_default()

    def as_percent(value):
        # Integer percentage capped to 100, '?' if it cannot be converted.
        try:
            return min(int(value), 100)
        except ValueError:  # pragma: no cover
            return "?"

    cov = as_percent(cov)
    val = as_percent(val)
    if cov != val:
        label = "NB:{0}%-{1}% ".format(cov, val)
    else:
        label = "NB: {0}% ".format(cov)
    im.text((3, 4), label, (255, 255, 255), font=font)
    img.save(image_name)
    return dcov
def get_additional_paths(modules):
    """
    Returns a list of paths to add before running the notebooks
    for a given a list of modules.

    @param      modules     list of modules (they must define ``__file__``)
    @return                 list of paths, the normalized parent folder
                            of the folder containing each module file
    """
    return [
        os.path.normpath(os.path.join(os.path.dirname(mod.__file__), ".."))
        for mod in modules
    ]
def retrieve_notebooks_in_folder(folder, posreg=".*[.]ipynb$", negreg=None):
    """
    Retrieves notebooks in a test folder.

    @param      folder      folder
    @param      posreg      regular expression a file name must match to be kept
    @param      negreg      regular expression a file name must not match
                            to be kept (None to keep everything)
    @return                 list of found notebooks (*folder* joined with the name)

    The function only looks into *folder* itself and raises
    ``FileNotFoundError`` if no file is selected.
    """
    keep = re.compile(posreg)
    reject = None if negreg is None else re.compile(negreg)

    def selected(name):
        if not keep.search(name):
            return False
        return reject is None or not reject.search(name)

    found = [os.path.join(folder, name)
             for name in os.listdir(folder) if selected(name)]
    if not found:
        raise FileNotFoundError(  # pragma: no cover
            "No notebook found in '{0}'.".format(folder))
    return found