Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Series of functions related to folder, explore, synchronize, remove (recursively).
5"""
6import re
7from .synchelper import explore_folder_iterfile
8from .download_helper import get_urls_content_timeout
def download_urls_in_folder_content(folder, pattern=".+[.]((py)|(ipynb))", neg_pattern=None,
                                    recursive=True, timeout=10, folder_dest=None,
                                    encoding='utf-8', raise_exception=False, chunk=None,
                                    fLOG=None):
    """
    Iterates on files in a folder, parses them, extracts all urls and
    downloads them into a folder.

    :param folder: folder to explore
    :param pattern: if None, get all files, otherwise, a regular expression
        the full filename (subfolders included) must match
    :param neg_pattern: negative pattern to exclude files
    :param recursive: look into subfolders
    :param timeout: in seconds, after this time, the function gives up
        and returns None for that url, -1 for forever
    :param folder_dest: if None, the content is not saved to disk,
        otherwise destination folder for the downloaded content
    :param encoding: encoding used to read the explored files
        (decoding errors are ignored)
    :param raise_exception: True to raise an exception on failure,
        False to send a warning instead
    :param chunk: save data every chunk (only if output is not None)
    :param fLOG: logging function (only applies when chunk is not None)
    :return: list of downloaded content
    """
    # Normalize string values possibly coming from a command line.
    if neg_pattern == '':
        neg_pattern = None  # pragma: no cover
    if chunk == '':
        chunk = None  # pragma: no cover
    if isinstance(chunk, str):
        chunk = int(chunk)  # pragma: no cover
    # URL-matching regular expression (John Gruber's pattern);
    # raw strings keep the regex escapes readable, pattern is unchanged.
    url_pattern = (r"(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+"
                   r"[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+"
                   r"(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:"
                   "'\\\".,<>?]))")
    reg = re.compile(url_pattern)
    res = []
    for name in explore_folder_iterfile(folder, pattern=pattern, neg_pattern=neg_pattern,
                                        fullname=True, recursive=recursive):
        with open(name, "r", encoding=encoding, errors='ignore') as f:
            content = f.read()
        matches = reg.findall(content)
        if not matches:
            continue
        if fLOG is not None:
            fLOG(  # pragma: no cover
                "[download_urls_in_folder_content] explore '{}'".format(name))
        # findall returns tuples of groups; group 0 is the whole url.
        urls = [m[0] for m in matches]
        r = get_urls_content_timeout(urls, folder=folder_dest, timeout=timeout,
                                     raise_exception=raise_exception, chunk=chunk,
                                     fLOG=fLOG)
        res.extend(r)
    return res