Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Functions to run notebooks. 

4""" 

5import time 

6import os 

7import warnings 

8import re 

9from io import StringIO 

10import urllib.request as urllib_request 

11from datetime import datetime, timedelta 

12 

13from ..loghelper.flog import noLOG 

14from ..filehelper import explore_folder 

15from .notebook_runner import NotebookRunner, NotebookKernelError 

16from .notebook_exception import NotebookException 

17from .notebook_helper import writes 

18 

19 

20try: 

21 from nbformat.reader import reads 

22 from nbformat.reader import NotJSONError 

23except ImportError: 

24 from IPython.nbformat.reader import reads 

25 from IPython.nbformat.reader import NotJSONError 

26 

27 

def _cache_url_to_file(cache_urls, folder, fLOG=noLOG):
    """
    Downloads the files corresponding to the urls stored in *cache_urls*.

    @param      cache_urls      list of urls, can be None
    @param      folder          where to store the cached files
    @param      fLOG            logging function
    @return                     dictionary ``{ url: file }``,
                                None if *cache_urls* is None

    The function detects if the file was already downloaded.
    In that case, it does not do it a second time.
    The content is first written into a temporary file (suffix ``.part``)
    and renamed once the download is complete. A failed or interrupted
    download is therefore never mistaken for a cached file.

    @raise FileNotFoundError if *folder* is None while *cache_urls* is not
    """
    if cache_urls is None:
        return None
    if folder is None:
        raise FileNotFoundError("folder cannot be None")
    res = {}
    for url in cache_urls:
        # The local name is built from the last part of the url,
        # characters which are not welcome in a filename are replaced.
        local_file = "__cached__" + url.split("/")[-1]
        local_file = local_file.replace(":", "_").replace("%", "_")
        local_file = os.path.abspath(os.path.join(folder, local_file))
        if not os.path.exists(local_file):
            fLOG("download", url, "to", local_file)
            partial = local_file + ".part"
            try:
                # The url is opened first: nothing is created on disk
                # if the connection cannot be established.
                with urllib_request.urlopen(url) as fu, open(partial, "wb") as f:
                    c = fu.read(2 ** 21)
                    while len(c) > 0:
                        f.write(c)
                        c = fu.read(2 ** 21)
                os.replace(partial, local_file)
            except BaseException:
                # We do not leave a truncated file behind.
                if os.path.exists(partial):
                    os.remove(partial)
                raise

        # to avoid having backslashes inside strings
        res[url] = "file:///" + local_file.replace("\\", "/")
    return res

63 

64 

def run_notebook(filename, profile_dir=None, working_dir=None, skip_exceptions=False,
                 outfilename=None, encoding="utf8", additional_path=None,
                 valid=None, clean_function=None, code_init=None,
                 fLOG=noLOG, kernel_name="python", log_level="30",
                 extended_args=None, cache_urls=None, replacements=None,
                 detailed_log=None, startup_timeout=300):
    """
    Runs a notebook end to end,
    it is inspired from module `runipy <https://github.com/paulgb/runipy/>`_.

    @param      filename            notebook filename
    @param      profile_dir         profile directory
    @param      working_dir         working directory
    @param      skip_exceptions     skip exceptions
    @param      outfilename         if not None, saves the output in this notebook
    @param      encoding            encoding for the notebooks
    @param      additional_path     additional paths for import
    @param      valid               if not None, valid is a function which returns whether
                                    or not the cell should be executed, if the function
                                    returns None, the execution of the notebook stops
                                    and the remaining cells are skipped
    @param      clean_function      function which cleans a cell's code before executing it (None for None)
    @param      code_init           code to run before the execution of the notebook as if it was a cell
    @param      fLOG                logging function
    @param      kernel_name         kernel name, it can be None
    @param      log_level           Choices: (0, 10, 20, 30=default, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    @param      extended_args       others arguments to pass to the command line ('--KernelManager.autorestart=True' for example),
                                    see :ref:`l-ipython_notebook_args` for a full list
    @param      cache_urls          list of urls to cache
    @param      replacements        list of additional replacements, list of tuple
    @param      detailed_log        a second function to log more information when executing the notebook,
                                    this should be a function with the same signature as ``print`` or None
    @param      startup_timeout     wait for this long for the kernel to be ready,
                                    see `wait_for_ready
                                    <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    @return                         tuple (statistics, output)

    @warning The function calls `basicConfig
    <https://docs.python.org/3/library/logging.html#logging.basicConfig>`_.

    .. exref::
        :title: Run a notebook end to end

        ::

            from pyquickhelper.ipythonhelper import run_notebook
            run_notebook("source.ipynb", working_dir="temp",
                        outfilename="modified.ipynb",
                        additional_path=["custom_path"] )

    The function adds the local variable ``theNotebook`` with
    the absolute file name of the notebook.

    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *working_dir* (must not be None). The url string is replaced by
    the absolute path to the file.

    .. versionchanged:: 1.8
        Parameters *detailed_log*, *startup_timeout* were added.
    """
    # Cached urls and user replacements are merged,
    # user replacements take precedence.
    cached_rep = _cache_url_to_file(cache_urls, working_dir, fLOG=fLOG)
    if cached_rep is None:
        cached_rep = replacements
    elif replacements is not None:
        cached_rep.update(replacements)

    with open(filename, "r", encoding=encoding) as payload:
        try:
            nbc = payload.read()
        except UnicodeDecodeError as e:
            raise NotebookException(
                "(2) Unable to read file '{0}' encoding='{1}'.".format(filename, encoding)) from e
    try:
        nb = reads(nbc)
    except NotJSONError as e:
        raise NotebookException(
            "(1) Unable to read file '{0}' encoding='{1}'.".format(filename, encoding)) from e

    out = StringIO()

    def flogging(*args, **kwargs):
        # Everything which is logged is also stored in *out*.
        # Logged values are not necessarily strings.
        if len(args) > 0:
            out.write(" ".join(str(a) for a in args))
        if len(kwargs) > 0:
            out.write(str(kwargs))
        out.write("\n")
        fLOG(*args, **kwargs)

    def create_runner():
        # Builds the runner, this starts a kernel.
        return NotebookRunner(nb, profile_dir, working_dir, fLOG=flogging, filename=filename,
                              theNotebook=os.path.abspath(filename),
                              code_init=code_init, log_level=log_level,
                              extended_args=extended_args, kernel_name=kernel_name,
                              replacements=cached_rep, kernel=True, detailed_log=detailed_log,
                              startup_timeout=startup_timeout)

    try:
        nb_runner = create_runner()
    except NotebookKernelError:
        # It fails. We try again once.
        nb_runner = create_runner()

    try:
        stat = nb_runner.run_notebook(skip_exceptions=skip_exceptions, additional_path=additional_path,
                                      valid=valid, clean_function=clean_function)

        if outfilename is not None:
            # The notebook is converted before the file is opened,
            # a failure does not leave an empty file behind.
            try:
                s = writes(nb_runner.nb)
            except NotebookException as e:
                raise NotebookException(
                    "issue with notebook: '{}'".format(filename)) from e
            if isinstance(s, bytes):
                s = s.decode('utf8')
            with open(outfilename, 'w', encoding=encoding) as f:
                f.write(s)

    finally:
        # The kernel is stopped whatever happens.
        nb_runner.shutdown_kernel()

    return stat, out.getvalue()

191 

192 

def execute_notebook_list(folder, notebooks, clean_function=None, valid=None, fLOG=noLOG,
                          additional_path=None, deepfLOG=noLOG, kernel_name="python",
                          log_level="30", extended_args=None, cache_urls=None,
                          replacements=None, detailed_log=None, startup_timeout=300):
    """
    Executes a list of notebooks.

    @param      folder              folder (where to execute the notebook, current folder for the notebook)
    @param      notebooks           list of notebooks to execute (or a list of tuple(notebook, code which initializes the notebook))
    @param      clean_function      function which transform the code before running it
    @param      valid               if not None, valid is a function which returns whether
                                    or not the cell should be executed, if the function
                                    returns None, the execution of the notebook stops
                                    and the remaining cells are skipped
    @param      fLOG                logging function
    @param      deepfLOG            logging function used to run the notebook
    @param      additional_path     path to add to *sys.path* before running the notebook
    @param      kernel_name         kernel name, it can be None
    @param      log_level           Choices: (0, 10, 20, 30=default, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
    @param      extended_args       others arguments to pass to the command line ('--KernelManager.autorestart=True' for example),
                                    see :ref:`l-ipython_notebook_args` for a full list
    @param      cache_urls          list of urls to cache
    @param      replacements        additional replacements
    @param      detailed_log        detailed log
    @param      startup_timeout     wait for this long for the kernel to be ready,
                                    see `wait_for_ready
                                    <https://github.com/jupyter/jupyter_client/blob/master/jupyter_client/blocking/client.py#L84>`_
    @return                         dictionary of dictionaries ``{ notebook_name: { } }``

    Every inner dictionary contains the keys *success*, *etime*
    (execution time), *name* and *date*. If *success* is True, it also
    contains *output* and the statistics returned by @see fn run_notebook.
    If *success* is False, key *error* holds the exception
    raised during the execution.

    The signature of function ``valid_cell`` is::

        def valid_cell(cell):
            return True or False or None to stop execution of the notebook before this cell

    The signature of function ``clean_function`` is::

        def clean_function(cell):
            return new_cell_content

    The execution of a notebook might fail because it relies on remote data
    specified by url. The function downloads the data first and stores it in
    folder *working_dir* (must not be None). The url string is replaced by
    the absolute path to the file.

    .. versionchanged:: 1.8
        Parameters *detailed_log*, *startup_timeout* were added.
    """
    if additional_path is None:
        additional_path = []

    # Urls are cached once before going through the list of notebooks.
    _cache_url_to_file(cache_urls, folder, fLOG=fLOG)

    results = {}
    for position, item in enumerate(notebooks):
        # An item is either a notebook or a couple (notebook, init code).
        if isinstance(item, tuple):
            note, code_init = item
        else:
            note, code_init = item, None

        fLOG("[execute_notebook_list] {0}/{1} - {2}".format(
            position + 1, len(notebooks), os.path.split(note)[-1]))
        outfile = os.path.join(folder, "out_" + os.path.split(note)[-1])
        started = time.perf_counter()
        try:
            stat, out = run_notebook(
                note, working_dir=folder, outfilename=outfile,
                additional_path=additional_path, valid=valid,
                clean_function=clean_function, fLOG=deepfLOG,
                code_init=code_init, kernel_name=kernel_name,
                log_level=log_level, extended_args=extended_args,
                cache_urls=cache_urls, replacements=replacements,
                detailed_log=detailed_log, startup_timeout=startup_timeout)
            if not os.path.exists(outfile):
                raise FileNotFoundError(outfile)
            elapsed = time.perf_counter() - started
            report = dict(success=True, output=out, name=note, etime=elapsed,
                          date=datetime.now())
            report.update(stat)
            results[note] = report
        except Exception as e:
            # A failure is recorded, the next notebooks are still executed.
            elapsed = time.perf_counter() - started
            results[note] = dict(success=False, etime=elapsed, error=e, name=note,
                                 date=datetime.now())
    return results

278 

279 

def _get_dump_default_path(dump):
    """
    Proposes a default location to dump results about notebooks execution.

    @param      dump    location of the dump or module.
    @return             location of the dump

    The result is equal to the input if *dump* is not a module
    (already a path or None). If *dump* is a module, the function returns
    None on :epkg:`travis` or :epkg:`appveyor`, otherwise it returns
    ``<root>/_notebook_dumps/notebook.<module name>.txt`` and creates
    the folder if it does not exist.
    """
    if not (hasattr(dump, '__file__') and hasattr(dump, '__name__')):
        # Not a module, nothing to guess.
        return dump

    # Default value. We check it is none travis or appveyor.
    from ..pycode import is_travis_or_appveyor
    if is_travis_or_appveyor():
        return None

    # We guess the package name.
    name = dump.__name__.split('.')[-1]
    loc = os.path.dirname(dump.__file__)
    parent, last = os.path.split(loc)
    if last == 'src' or os.path.split(parent)[-1] == 'src':
        # The sources are stored in a subfolder 'src',
        # the dumps are placed one level higher.
        fold = os.path.join(loc, "..", "..", "..", "_notebook_dumps")
    else:
        # This should be a parameter.
        fold = os.path.join(loc, "..", "..", "_notebook_dumps")

    # exist_ok avoids a failure when another process
    # creates the folder at the same time.
    os.makedirs(fold, exist_ok=True)
    return os.path.join(fold, "notebook.{0}.txt".format(name))

316 

317 

def _existing_dump(dump):
    """
    Loads an existing dump.

    @param      dump    filename
    @return             :epkg:`pandas:DataFrame` or None if the file does not exist

    The file is expected to be tab separated and encoded in utf-8.
    Lines which cannot be parsed are skipped with a warning.
    The function tries a second time after ten seconds
    if the file cannot be accessed (*PermissionError*).
    """
    import pandas
    from pandas.errors import ParserError

    def read_file(dump):
        try:
            return pandas.read_csv(dump, sep="\t", encoding="utf-8")
        except ParserError:
            # Some lines are broken, we read again and skip them.
            pass
        try:
            return pandas.read_csv(
                dump, sep="\t", encoding="utf-8", on_bad_lines="warn")
        except TypeError:
            # Older versions of pandas do not know on_bad_lines.
            return pandas.read_csv(
                dump, sep="\t", encoding="utf-8", error_bad_lines=False, warn_bad_lines=True)

    if not os.path.exists(dump):
        return None

    # There might be some risk here to see another process writing the
    # file at the same time.
    try:
        return read_file(dump)
    except PermissionError:
        # We try again once.
        time.sleep(10)
        try:
            return read_file(dump)
        except Exception as e:
            raise Exception(
                "Unable to read '{0}' due to '{1}'".format(dump, e)) from e
    except Exception as e:
        raise Exception(
            "Unable to read '{0}' due to '{1}'".format(dump, e)) from e

356 

357 

def execute_notebook_list_finalize_ut(res, dump=None, fLOG=noLOG):
    """
    Checks the list of results and raises an exception if one failed.
    This is meant to be used in unit tests.

    @param      res     output of @see fn execute_notebook_list
    @param      dump    if not None, dump the results of the execution in a flat file
    @param      fLOG    logging function

    The dump relies on :epkg:`pandas` and appends the results to a previous dump.
    If *dump* is a module, the function stores the output of the execution in a default
    location only if the process does not run on :epkg:`travis` or :epkg:`appveyor`.
    The default location is something like:

    .. runpython::

        from pyquickhelper.ipythonhelper.run_notebook import _get_dump_default_path
        import pyquickhelper
        print(_get_dump_default_path(pyquickhelper))

    The function raises the exception stored in the first failing result
    (sorted by notebook name), nothing is dumped in that case.
    """
    if len(res) == 0:
        raise Exception("No notebook was run.")

    # A result with a key 'error' is a failure.
    fails = [(os.path.split(k)[-1], v)
             for k, v in sorted(res.items()) if "error" in v]
    for f in fails:
        fLOG(f)
    for k, v in sorted(res.items()):
        name = os.path.split(k)[-1]
        fLOG(name, v.get("success", None), v.get("etime", None))
    if len(fails) > 0:
        raise fails[0][1]["error"]

    dump = _get_dump_default_path(dump)
    if dump is None:
        return

    import pandas
    df = _existing_dump(dump)
    new_df = pandas.DataFrame(data=list(res.values()))

    # We replace every EOL, the dump stores one execution per line.
    # Object columns may contain values which are not strings
    # (missing values for example), they are left unchanged.
    def eol_replace(t):
        if isinstance(t, str):
            return t.replace("\r", "").replace("\n", "\\n")
        return t

    for c in new_df.select_dtypes(include=['object']).columns:
        new_df[c] = new_df[c].map(eol_replace)

    if df is None:
        df = new_df
    else:
        df = pandas.concat([df, new_df]).copy()

    # There might be some risk here to see another process
    # writing or reading the file at the same time.
    # Module filelock does not work in this case.
    # locket (https://github.com/mwilliamson/locket.py) was not tried.
    try:
        df.to_csv(dump, sep="\t", encoding="utf-8", index=False)
    except PermissionError:
        # We try again once.
        time.sleep(7)
        df.to_csv(dump, sep="\t", encoding="utf-8", index=False)

426 

427 

def notebook_coverage(module_or_path, dump=None, too_old=30):
    """
    Extracts a list of notebooks and merges with a list of runs dumped by
    function @see fn execute_notebook_list_finalize_ut.

    @param      module_or_path      a module or a path (or a list of notebooks)
    @param      dump                dump (or None to get the location by default)
    @param      too_old             drop executions older than *too_old* days from now
    @return                         dataframe

    If *module_or_path* is a module, the function will get a list notebooks
    assuming it follows the same design as :epkg:`pyquickhelper`.
    Notebooks and executions are merged on the last three parts
    of their path (lower case).

    @raise FileNotFoundError if the notebook folder of a module cannot be found
    @raise ValueError if two notebooks share the same key or the same name
    """
    if dump is None:
        dump = _get_dump_default_path(module_or_path)
    else:
        dump = _get_dump_default_path(dump)

    # Create the list of existing notebooks.
    if isinstance(module_or_path, list):
        # NOTE(review): the second item of a tuple is used here whereas
        # execute_notebook_list reads tuples as (notebook, code_init),
        # to be confirmed against the callers.
        nbs = [_[1] if isinstance(_, tuple) else _ for _ in module_or_path]
    elif hasattr(module_or_path, '__file__') and hasattr(module_or_path, '__name__'):
        fold = os.path.dirname(module_or_path.__file__)
        _doc = os.path.join(fold, "..", "..", "_doc")
        if not os.path.exists(_doc):
            raise FileNotFoundError(
                "Unable to find path '{0}' for module '{1}'".format(_doc, module_or_path))
        nbpath = os.path.join(_doc, "notebooks")
        if not os.path.exists(nbpath):
            raise FileNotFoundError(
                "Unable to find path '{0}' for module '{1}'".format(nbpath, module_or_path))
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]
    else:
        nbpath = module_or_path
        nbs = explore_folder(nbpath, ".*[.]ipynb$")[1]

    def merge_key(x):
        # Last three parts of the path, lower case,
        # values which are not strings are left unchanged.
        if not isinstance(x, str):
            return x
        parts = os.path.normpath(x).replace("\\", "/").split("/")
        return "/".join(parts[-3:]).lower()

    import pandas
    dfnb = pandas.DataFrame(data=dict(notebooks=nbs))
    dfnb["notebooks"] = dfnb["notebooks"].apply(os.path.normpath)
    # The folder name is a plain string, not a regular expression.
    dfnb = dfnb[~dfnb.notebooks.str.contains(
        ".ipynb_checkpoints", regex=False)].copy()
    dfnb["key"] = dfnb["notebooks"].apply(merge_key)

    # There might be some risk here to see another process writing the
    # file at the same time.
    try:
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")
    except PermissionError:
        # We try again once.
        time.sleep(10)
        dfall = pandas.read_csv(dump, sep="\t", encoding="utf-8")

    # We drop too old execution.
    # Dates are compared as strings.
    old = datetime.now() - timedelta(too_old)
    old = "%04d-%02d-%02d" % (old.year, old.month, old.day)
    dfall = dfall[dfall.date > old].copy()

    # We add a key to merge.
    dfall["name"] = dfall["name"].apply(os.path.normpath)
    dfall["key"] = dfall["name"].apply(merge_key)

    # We keep the last execution.
    gr = dfall.sort_values("date", ascending=False).groupby(
        "key", as_index=False).first().reset_index(drop=True).copy()
    gr = gr.drop("name", axis=1)

    # Folders might be different so we merge on the last part of the path.
    merged = dfnb.merge(gr, left_on="key", right_on="key", how="outer")
    merged = merged[merged.notebooks.notnull()]
    merged = merged.sort_values("key").reset_index(drop=True).copy()

    if "last_name" not in merged.columns:
        merged["last_name"] = merged["key"].apply(
            lambda x: os.path.split(x)[-1])

    # We check there is no duplicates in merged.
    for c in ["key", "last_name"]:
        names = [_ for _ in merged[c] if isinstance(_, str)]
        if len(names) > len(set(names)):
            raise ValueError("Unexpected duplicated names in column '{1}'\n{0}".format(
                "\n".join(sorted(names)), c))

    return merged

516 

517 

def badge_notebook_coverage(df, image_name):
    """
    Builds a badge reporting on the notebook coverage.
    It gives the proportion of run cells.

    @param      df          output of @see fn notebook_coverage
    @param      image_name  image to produce
    @return                 coverage estimation (capped at 100)

    The function relies on module :epkg:`Pillow`.
    The badge displays the proportion of run cells and, if different,
    the proportion of valid cells. The background color depends
    on the proportion of run cells.
    """
    nb_cell = df["nbcell"].sum()
    nb_run = df["nbrun"].sum()
    nb_valid = df["nbvalid"].sum()
    cov = nb_run * 100.0 / nb_cell if nb_cell > 0 else 1.0
    dcov = min(100., cov)
    val = nb_valid * 100.0 / nb_cell if nb_cell > 0 else 1.0

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from PIL import Image, ImageFont, ImageDraw

    # (upper bound, color): the first bound not below the coverage wins.
    thresholds = [
        (60, (200, 87, 51)),
        (70, (200, 156, 18)),
        (75, (140, 140, 140)),
        (80, (88, 171, 171)),
        (85, (88, 140, 86)),
        (90, (80, 155, 86)),
        (95, (80, 190, 73)),
    ]
    color = (20, 190, 50)
    for bound, rgb in thresholds:
        if cov <= bound:
            color = rgb
            break

    img = Image.new(mode='RGB', size=(70, 20), color=color)
    im = ImageDraw.Draw(img)
    font = ImageFont.load_default()

    def as_percent(value):
        # Integer capped at 100, '?' if the value cannot be converted.
        try:
            return min(int(value), 100)
        except ValueError:
            return "?"

    cov = as_percent(cov)
    val = as_percent(val)
    if cov == val:
        label = "NB: {0}% ".format(cov)
    else:
        label = "NB:{0}%-{1}% ".format(cov, val)
    im.text((3, 4), label, (255, 255, 255), font=font)
    img.save(image_name)
    return dcov

575 

576 

def get_additional_paths(modules):
    """
    Returns a list of paths to add before running the notebooks
    for a given a list of modules.

    @param      modules     list of modules (they must define ``__file__``)
    @return                 list of paths, one per module: the parent
                            of the folder which contains the module file
    """
    folders = []
    for mod in modules:
        folders.append(os.path.dirname(mod.__file__))
    return [os.path.normpath(os.path.join(folder, "..")) for folder in folders]

587 

588 

def retrieve_notebooks_in_folder(folder, posreg=".*[.]ipynb$", negreg=None):
    """
    Retrieves notebooks in a test folder.

    @param      folder      folder
    @param      posreg      regular expression, a file is kept if its name matches it
    @param      negreg      regular expression, a file is excluded if its name
                            matches it (None to exclude nothing)
    @return                 list of found notebooks, sorted by name

    The search is not recursive. Names are sorted to get the same
    order on every platform.

    @raise FileNotFoundError if no notebook was found
    """
    pos = re.compile(posreg)
    neg = re.compile(negreg) if negreg is not None else None
    # os.listdir returns the names in arbitrary order.
    res = [os.path.join(folder, name)
           for name in sorted(os.listdir(folder))
           if pos.search(name) and (neg is None or not neg.search(name))]
    if len(res) == 0:
        raise FileNotFoundError("No notebook found in '{0}'.".format(folder))
    return res