Coverage for pyquickhelper/ipythonhelper/run_notebook.py: 89%
245 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
1"""
2@file
3@brief Functions to run notebooks.
4"""
5import time
6import os
7import warnings
8import re
9from io import StringIO
10import urllib.request as urllib_request
11from datetime import datetime, timedelta
13from ..loghelper.flog import noLOG
14from ..filehelper import explore_folder
15from .notebook_runner import NotebookRunner, NotebookKernelError
16from .notebook_exception import NotebookException
17from .notebook_helper import writes
20try:
21 from nbformat.reader import reads
22 from nbformat.reader import NotJSONError
23except ImportError: # pragma: no cover
24 from IPython.nbformat.reader import reads
25 from IPython.nbformat.reader import NotJSONError
def _cache_url_to_file(cache_urls, folder, fLOG=noLOG):
    """
    Downloads the files corresponding to the urls stored in *cache_urls*.

    @param      cache_urls      list of urls (None to do nothing)
    @param      folder          where to store the cached files
    @param      fLOG            logging function
    @return                     dictionary ``{ url: file }`` or None
                                if *cache_urls* is None

    The function detects if the file was already downloaded.
    In that case, it does not do it a second time.
    The content is first written into a temporary file
    (suffix ``.part``) and renamed once the download is complete,
    an interrupted download is therefore never mistaken
    for a cached file by a later call.
    The local name is built from the last part of the url:
    two urls ending with the same name share the same cached file.
    """
    if cache_urls is None:
        return None
    if folder is None:
        raise FileNotFoundError(  # pragma: no cover
            "folder cannot be None")
    res = {}
    for url in cache_urls:
        local_file = "__cached__" + url.split("/")[-1]
        local_file = local_file.replace(":", "_").replace("%", "_")
        local_file = os.path.abspath(os.path.join(folder, local_file))
        if not os.path.exists(local_file):
            fLOG("download", url, "to", local_file)
            partial = local_file + ".part"
            try:
                # Both handles are closed even if the download fails.
                with urllib_request.urlopen(url) as fu, open(partial, "wb") as f:
                    c = fu.read(2 ** 21)
                    while len(c) > 0:
                        f.write(c)
                        c = fu.read(2 ** 21)
                # The final name only appears once the content is complete.
                os.replace(partial, local_file)
            finally:
                # Only true after a failure: os.replace moved the file otherwise.
                if os.path.exists(partial):
                    os.remove(partial)

        # to avoid having backslashes inside strings
        res[url] = "file:///" + local_file.replace("\\", "/")
    return res
def run_notebook(filename, profile_dir=None, working_dir=None, skip_exceptions=False,
                 outfilename=None, encoding="utf8", additional_path=None,
                 valid=None, clean_function=None, code_init=None,
                 fLOG=noLOG, kernel_name="python", log_level="30",
                 extended_args=None, cache_urls=None, replacements=None,
                 detailed_log=None, startup_timeout=30, raise_exception=False):
    """
    Runs a notebook end to end,
    it is inspired from module `runipy <https://github.com/paulgb/runipy/>`_.

    @param      filename            notebook filename
    @param      profile_dir         profile directory
    @param      working_dir         working directory
    @param      skip_exceptions     skip exceptions
    @param      outfilename         if not None, saves the output in this notebook
    @param      encoding            encoding for the notebooks
    @param      additional_path     additional paths for import
    @param      valid               if not None, valid is a function which returns whether
                                    or not the cell should be executed or not, if the function
                                    returns None, the execution of the notebook stops and
                                    the remaining cells are skipped
    @param      clean_function      function which cleans a cell's code before executing it (None for None)
    @param      code_init           code to run before the execution of the notebook as if it was a cell
    @param      fLOG                logging function
    @param      kernel_name         kernel name, it can be None
    @param      log_level           Choices: (0, 10, 20, 30=default, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    @param      extended_args       others arguments to pass to the command line ('--KernelManager.autorestart=True' for example),
                                    see :ref:`l-ipython_notebook_args` for a full list
    @param      cache_urls          list of urls to cache
    @param      replacements        list of additional replacements, list of tuple
    @param      detailed_log        a second function to log more information when executing the notebook,
                                    this should be a function with the same signature as ``print`` or None
    @param      startup_timeout     wait for this long for the kernel to be ready,
                                    see `wait_for_ready
                                    <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    @param      raise_exception     raise an exception if a cell raises one
    @return                         tuple (statistics, output), *output* is the text
                                    sent to the logging function during the execution

    @warning The function calls `basicConfig
    <https://docs.python.org/3/library/logging.html#logging.basicConfig>`_.

    .. exref::
        :title: Run a notebook end to end

        ::

            from pyquickhelper.ipythonhelper import run_notebook
            run_notebook("source.ipynb", working_dir="temp",
                        outfilename="modified.ipynb",
                        additional_path=["custom_path"] )

    The function adds the local variable ``theNotebook`` with
    the absolute file name of the notebook.
    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *working_dir* (must not be None). The url string is replaced by
    the absolute path to the file.
    The kernel is always shut down before the function returns,
    even if the execution fails.
    """
    # Cached urls and user replacements are merged,
    # user replacements take precedence.
    cached_rep = _cache_url_to_file(cache_urls, working_dir, fLOG=fLOG)
    if cached_rep is None:
        cached_rep = replacements
    elif replacements is not None:
        cached_rep.update(replacements)

    with open(filename, "r", encoding=encoding) as payload:
        try:
            nbc = payload.read()
        except UnicodeDecodeError as e:  # pragma: no cover
            raise NotebookException(
                f"(2) Unable to read file '{filename}' encoding='{encoding}'.") from e
    try:
        nb = reads(nbc)
    except NotJSONError as e:  # pragma: no cover
        raise NotebookException(
            f"(1) Unable to read file '{filename}' encoding='{encoding}'.") from e

    out = StringIO()

    def flogging(*args, **kwargs):
        # Keeps a copy of everything which is logged, then forwards it.
        if len(args) > 0:
            # Logged values are not necessarily strings.
            out.write(" ".join(str(a) for a in args))
        if len(kwargs) > 0:
            out.write(str(kwargs))
        out.write("\n")
        fLOG(*args, **kwargs)

    runner_kwargs = dict(
        fLOG=flogging, filename=filename,
        theNotebook=os.path.abspath(filename),
        code_init=code_init, log_level=log_level,
        extended_args=extended_args, kernel_name=kernel_name,
        replacements=cached_rep, kernel=True, detailed_log=detailed_log,
        startup_timeout=startup_timeout, raise_exception=raise_exception)
    try:
        nb_runner = NotebookRunner(
            nb, profile_dir, working_dir, **runner_kwargs)
    except NotebookKernelError:  # pragma: no cover
        # It fails. We try again once.
        nb_runner = NotebookRunner(
            nb, profile_dir, working_dir, **runner_kwargs)

    try:
        stat = nb_runner.run_notebook(
            skip_exceptions=skip_exceptions, additional_path=additional_path,
            valid=valid, clean_function=clean_function)

        if outfilename is not None:
            # The notebook is serialized before the file is opened:
            # a failure does not leave an empty or truncated output file.
            try:
                s = writes(nb_runner.nb)
            except NotebookException as e:  # pragma: no cover
                raise NotebookException(
                    f"issue with notebook: '{filename}'") from e
            if isinstance(s, bytes):
                s = s.decode('utf8')
            with open(outfilename, 'w', encoding=encoding) as f:
                f.write(s)

    finally:
        nb_runner.shutdown_kernel()

    return stat, out.getvalue()
def execute_notebook_list(folder, notebooks, clean_function=None, valid=None, fLOG=noLOG,
                          additional_path=None, deepfLOG=noLOG, kernel_name="python",
                          log_level="30", extended_args=None, cache_urls=None,
                          replacements=None, detailed_log=None, startup_timeout=300):
    """
    Executes a list of notebooks.

    @param      folder              folder (where to execute the notebook, current folder for the notebook)
    @param      notebooks           list of notebooks to execute (or a list of tuple(notebook, code which initializes the notebook))
    @param      clean_function      function which transform the code before running it
    @param      valid               if not None, valid is a function which returns whether
                                    or not the cell should be executed or not, if the function
                                    returns None, the execution of the notebook stops and
                                    the remaining cells are skipped
    @param      fLOG                logging function
    @param      deepfLOG            logging function used to run the notebook
    @param      additional_path     path to add to *sys.path* before running the notebook
    @param      kernel_name         kernel name, it can be None
    @param      log_level           Choices: (0, 10, 20, 30=default, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    @param      extended_args       others arguments to pass to the command line ('--KernelManager.autorestart=True' for example),
                                    see :ref:`l-ipython_notebook_args` for a full list
    @param      cache_urls          list of urls to cache
    @param      replacements        additional replacements
    @param      detailed_log        detailed log
    @param      startup_timeout     wait for this long for the kernel to be ready,
                                    see `wait_for_ready
                                    <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    @return                         dictionary of dictionaries ``{ notebook_name: { } }``

    Every notebook gets a dictionary with keys *success*, *etime*
    (execution time), *name*, *date*. If the execution succeeded,
    the dictionary also contains *output* and the statistics returned
    by @see fn run_notebook. If it failed, key *error* holds
    the exception raised during the execution.
    The executed notebook is saved in *folder* as ``out_<notebook name>``.

    The signature of function ``valid_cell`` is::

        def valid_cell(cell):
            return True or False or None to stop execution of the notebook before this cell

    The signature of function ``clean_function`` is::

        def clean_function(cell):
            return new_cell_content

    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *folder* (must not be None). The url string is replaced by
    the absolute path to the file.
    """
    if additional_path is None:
        additional_path = []

    # Urls are cached once, before going through the list of notebooks.
    _cache_url_to_file(cache_urls, folder, fLOG=fLOG)

    results = {}
    for position, entry in enumerate(notebooks, start=1):
        note, code_init = entry if isinstance(entry, tuple) else (entry, None)
        fLOG(f"[execute_notebook_list] {position}/{len(notebooks)} - "
             f"{os.path.split(note)[-1]}")
        outfile = os.path.join(folder, "out_" + os.path.split(note)[-1])
        begin = time.perf_counter()
        try:
            stat, out = run_notebook(
                note, working_dir=folder, outfilename=outfile,
                additional_path=additional_path, valid=valid,
                clean_function=clean_function, fLOG=deepfLOG,
                code_init=code_init, kernel_name=kernel_name,
                log_level=log_level, extended_args=extended_args,
                cache_urls=cache_urls, replacements=replacements,
                detailed_log=detailed_log, startup_timeout=startup_timeout)
            if not os.path.exists(outfile):
                raise FileNotFoundError(outfile)  # pragma: no cover
            report = {
                "success": True,
                "output": out,
                "name": note,
                "etime": time.perf_counter() - begin,
                "date": datetime.now(),
            }
            report.update(stat)
            results[note] = report
        except Exception as e:
            # Any failure is stored, the other notebooks are still executed.
            results[note] = {
                "success": False,
                "etime": time.perf_counter() - begin,
                "error": e,
                "name": note,
                "date": datetime.now(),
            }
    return results
278def _get_dump_default_path(dump):
279 """
280 Proposes a default location to dump results about notebooks execution.
282 @param dump location of the dump or module.
283 @return location of the dump
285 The result might be equal to the input if *dump* is already path.
286 """
287 if hasattr(dump, '__file__') and hasattr(dump, '__name__'):
288 # Default value. We check it is none travis or appveyor.
289 from ..pycode import is_travis_or_appveyor
290 if is_travis_or_appveyor():
291 dump = None
292 if dump is not None:
293 # We guess the package name.
294 name = dump.__name__.split('.')[-1]
295 loc = os.path.dirname(dump.__file__)
296 src_loc = os.path.split(loc)
297 if src_loc[-1] == 'src':
298 # We choose a path for the dumps in a way
299 fold = os.path.join(loc, "..", "..", "..", "_notebook_dumps")
300 else:
301 src_loc_loc = os.path.split(src_loc[0])
302 if src_loc_loc[-1] == 'src':
303 # We choose a path for the dumps in a way
304 fold = os.path.join(
305 loc, "..", "..", "..", "_notebook_dumps")
306 else:
307 # This should be a parameter.
308 fold = os.path.join(loc, "..", "..", "_notebook_dumps")
309 if not os.path.exists(fold):
310 os.mkdir(fold)
311 dump = os.path.join(fold, f"notebook.{name}.txt")
312 return dump
313 return dump
316def _existing_dump(dump):
317 """
318 Loads an existing dump.
320 @param dump filename
321 @return :epkg:`pandas:DataFrame`
322 """
323 import pandas
324 from pandas.errors import ParserError
326 def read_file(dump):
327 try:
328 df = pandas.read_csv(dump, sep="\t", encoding="utf-8")
329 except ParserError: # pragma: no cover
330 df = pandas.read_csv(
331 dump, sep="\t", encoding="utf-8", error_bad_lines=False, warn_bad_lines=True)
332 return df
334 if os.path.exists(dump):
335 # There might be some risk here to see another process writing the
336 # file at the same time.
337 try:
338 df = read_file(dump)
339 except PermissionError: # pragma: no cover
340 # We try again once.
341 time.sleep(10)
342 try:
343 df = read_file(dump)
344 except Exception as e:
345 raise RuntimeError(
346 f"Unable to read '{dump}' due to '{e}'") from e
347 except Exception as e: # pragma: no cover
348 raise RuntimeError(
349 f"Unable to read '{dump}' due to '{e}'") from e
350 else:
351 df = None
353 return df
def execute_notebook_list_finalize_ut(res, dump=None, fLOG=noLOG):
    """
    Checks the list of results and raises an exception if one failed.
    This is meant to be used in unit tests.

    @param      res     output of @see fn execute_notebook_list
    @param      dump    if not None, dump the results of the execution in a flat file
    @param      fLOG    logging function

    The function raises the exception stored for the first failing notebook
    (notebooks are sorted by name), nothing is dumped in that case.
    The dump relies on :epkg:`pandas` and appends the results to a previous dump.
    End of lines in text values are escaped to keep one row per line.
    If *dump* is a module, the function stores the output of the execution in a default
    location only if the process does not run on :epkg:`travis` or :epkg:`appveyor`.
    The default location is something like:

    .. runpython::

        from pyquickhelper.ipythonhelper.run_notebook import _get_dump_default_path
        import pyquickhelper
        print(_get_dump_default_path(pyquickhelper))
    """
    if len(res) == 0:
        raise RuntimeError("No notebook was run.")  # pragma: no cover

    def fail_note(v):
        return "error" in v

    fails = [(os.path.split(k)[-1], v)
             for k, v in sorted(res.items()) if fail_note(v)]
    for f in fails:
        fLOG(f)
    for k, v in sorted(res.items()):
        name = os.path.split(k)[-1]
        fLOG(name, v.get("success", None), v.get("etime", None))
    if len(fails) > 0:
        raise fails[0][1]["error"]

    dump = _get_dump_default_path(dump)
    if dump is not None:
        import pandas
        df = _existing_dump(dump)
        new_df = pandas.DataFrame(data=list(res.values()))

        # We replace every EOL.
        def eol_replace(t):
            # Object columns may hold values which are not strings
            # (missing values for example), they are left unchanged.
            if isinstance(t, str):
                return t.replace("\r", "").replace("\n", "\\n")
            return t

        # Series.map is used column by column,
        # DataFrame.applymap is deprecated in recent versions of pandas.
        for c in new_df.select_dtypes(include=['object']).columns:
            new_df[c] = new_df[c].map(eol_replace)

        if df is None:
            df = new_df
        else:
            df = pandas.concat([df, new_df]).copy()

        # There could be a conflict while several
        # processes in parallel could overwrite the same file.
        # There might be some risk here to see another process
        # writing or reading the file at the same time.
        # Module filelock does not work in this case.
        # locket (https://github.com/mwilliamson/locket.py) was not tried.
        try:
            df.to_csv(dump, sep="\t", encoding="utf-8",  # pylint: disable=E1101
                      index=False)
        except PermissionError:  # pragma: no cover
            # We try again once.
            time.sleep(7)
            df.to_csv(dump, sep="\t", encoding="utf-8",  # pylint: disable=E1101
                      index=False)
def notebook_coverage(module_or_path, dump=None, too_old=30):
    """
    Extracts a list of notebooks and merges with a list of runs dumped by
    function @see fn execute_notebook_list_finalize_ut.

    @param      module_or_path      a module, a path or a list of notebooks
                                    (or tuples whose second element is the notebook)
    @param      dump                dump (or None to get the location by default)
    @param      too_old             drop executions older than *too_old* days from now
    @return                         dataframe

    If *module_or_path* is a module, the function will get a list notebooks
    assuming it follows the same design as :epkg:`pyquickhelper`:
    notebooks are looked for in ``_doc/notebooks``, two levels above the module.
    Notebooks and executions are merged on the last three parts
    of their path in lower case, only the most recent execution is kept.
    The function raises *ValueError* if this key or the notebook name
    is duplicated.
    """
    if dump is None:
        dump = _get_dump_default_path(module_or_path)
    else:
        dump = _get_dump_default_path(dump)

    # Create the list of existing notebooks.
    if isinstance(module_or_path, list):
        nbs = [_[1] if isinstance(_, tuple) else _ for _ in module_or_path]
    elif hasattr(module_or_path, '__file__') and hasattr(module_or_path, '__name__'):
        fold = os.path.dirname(module_or_path.__file__)
        _doc = os.path.join(fold, "..", "..", "_doc")
        if not os.path.exists(_doc):
            raise FileNotFoundError(  # pragma: no cover
                f"Unable to find path '{_doc}' for module '{module_or_path}'")
        nbpath = os.path.join(_doc, "notebooks")
        if not os.path.exists(nbpath):
            raise FileNotFoundError(  # pragma: no cover
                f"Unable to find path '{nbpath}' for module '{module_or_path}'")
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]
    else:
        nbpath = module_or_path
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]

    def merge_key(x):
        # Last three parts of the path, lower case, '/' as a separator.
        # Values which are not strings are left unchanged.
        if not isinstance(x, str):
            return x
        parts = os.path.normpath(x).replace("\\", "/").split("/")
        return "/".join(parts[-3:]).lower()

    import pandas
    dfnb = pandas.DataFrame(data=dict(notebooks=nbs))
    dfnb["notebooks"] = dfnb["notebooks"].apply(os.path.normpath)
    # regex=False: the dot must be a dot and not any character.
    dfnb = dfnb[~dfnb.notebooks.str.contains(
        ".ipynb_checkpoints", regex=False)].copy()
    dfnb["key"] = dfnb["notebooks"].apply(merge_key)

    # There might be some risk here to see another process writing the
    # file at the same time.
    try:
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")
    except PermissionError:  # pragma: no cover
        # We try again once.
        time.sleep(10)
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")

    # We drop too old execution.
    # Dates are compared as strings, the format year-month-day makes it valid.
    old = datetime.now() - timedelta(too_old)
    old = f"{old.year:04d}-{old.month:02d}-{old.day:02d}"
    dfall = dfall[dfall.date > old].copy()

    # We add a key to merge.
    dfall["name"] = dfall["name"].apply(os.path.normpath)
    dfall["key"] = dfall["name"].apply(merge_key)

    # We keep the last execution.
    gr = dfall.sort_values("date", ascending=False).groupby(
        "key", as_index=False).first().reset_index(drop=True).copy()
    gr = gr.drop("name", axis=1)

    # Folders might be different so we merge on the last part of the path.
    merged = dfnb.merge(gr, left_on="key", right_on="key", how="outer")
    merged = merged[merged.notebooks.notnull()]
    merged = merged.sort_values("key").reset_index(drop=True).copy()

    if "last_name" not in merged.columns:
        merged["last_name"] = merged["key"].apply(
            lambda x: os.path.split(x)[-1])

    # We check there is no duplicates in merged.
    for c in ["key", "last_name"]:
        names = [_ for _ in merged[c] if isinstance(_, str)]
        if len(names) > len(set(names)):
            raise ValueError(  # pragma: no cover
                "Unexpected duplicated names in column '{1}'\n{0}".format(
                    "\n".join(sorted(names)), c))

    return merged
def badge_notebook_coverage(df, image_name):
    """
    Builds a badge reporting on the notebook coverage.
    It gives the proportion of run cells.

    @param      df          output of @see fn notebook_coverage,
                            columns *nbcell*, *nbrun*, *nbvalid* are summed
    @param      image_name  image to produce
    @return                 coverage estimation (percentage, not greater than 100)

    The function relies on module :epkg:`Pillow`.
    The badge shows the percentage of run cells and, if different,
    the percentage of valid cells. Both ratios are set to 1.0
    when the total number of cells is not positive.
    The background color depends on the percentage of run cells.
    """
    cell = df["nbcell"].sum()
    run = df["nbrun"].sum()
    valid = df["nbvalid"].sum()
    if cell > 0:
        cov = run * 100.0 / cell
        val = valid * 100.0 / cell
    else:
        cov = 1.0
        val = 1.0
    dcov = min(100., cov)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from PIL import Image, ImageFont, ImageDraw

    # (upper bound, color), the first bound not below the coverage wins.
    palette = (
        (60, (200, 87, 51)),
        (70, (200, 156, 18)),
        (75, (140, 140, 140)),
        (80, (88, 171, 171)),
        (85, (88, 140, 86)),
        (90, (80, 155, 86)),
        (95, (80, 190, 73)),
    )
    for bound, rgb in palette:
        if cov <= bound:
            color = rgb
            break
    else:
        color = (20, 190, 50)

    img = Image.new(mode='RGB', size=(70, 20), color=color)
    im = ImageDraw.Draw(img)
    font = ImageFont.load_default()

    def as_percent(value):
        # Integer percentage capped at 100, '?' if it cannot be converted.
        try:
            return min(int(value), 100)
        except ValueError:  # pragma: no cover
            return "?"

    cov = as_percent(cov)
    val = as_percent(val)
    white = (255, 255, 255)
    if cov == val:
        im.text((3, 4), f"NB: {cov}% ", white, font=font)
    else:
        im.text((3, 4), f"NB:{cov}%-{val}% ", white, font=font)
    img.save(image_name)
    return dcov
def get_additional_paths(modules):
    """
    Returns a list of paths to add before running the notebooks
    for a given a list of modules.

    @param      modules     list of modules (objects with an attribute ``__file__``)
    @return                 list of paths, for every module, the normalized
                            parent of the folder which contains ``__file__``
    """
    return [
        os.path.normpath(os.path.join(os.path.dirname(mod.__file__), ".."))
        for mod in modules
    ]
def retrieve_notebooks_in_folder(folder, posreg=".*[.]ipynb$", negreg=None):
    """
    Retrieves notebooks in a test folder.

    @param      folder      folder, subfolders are not explored
    @param      posreg      regular expression, a file name is kept
                            if the expression is found in it
    @param      negreg      regular expression, a file name is dropped
                            if the expression is found in it (None to disable)
    @return                 list of found notebooks (*folder* joined with the name)

    The function raises *FileNotFoundError* if no file was selected.
    """
    keep = re.compile(posreg)
    reject = None if negreg is None else re.compile(negreg)
    found = [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if keep.search(entry) and (reject is None or not reject.search(entry))
    ]
    if not found:
        raise FileNotFoundError(  # pragma: no cover
            f"No notebook found in '{folder}'.")
    return found