Source code for pyquickhelper.filehelper.download_urls_helper
# -*- coding: utf-8 -*-
"""
Series of functions related to folder, explore, synchronize, remove (recursively).
:githublink:`%|py|6`
"""
import re
from .synchelper import explore_folder_iterfile
from .download_helper import get_urls_content_timeout
[docs]def download_urls_in_folder_content(folder, pattern=".+[.]((py)|(ipynb))", neg_pattern=None,
recursive=True, timeout=10, folder_dest=None,
encoding='utf-8', raise_exception=False, chunk=None,
fLOG=None):
"""
Iterates on files in folder, parse them, extracts all urls, download
them in a folder.
:param folder: folder
:param pattern: if None, get all files, otherwise, it is a regular expression,
the filename must verify (with the folder is fullname is True)
:param neg_pattern: negative pattern to exclude files
:param fullname: if True, include the subfolder while checking the regex
:param recursive: look into subfolders
:param urls: urls
:param timeout: in seconds, after this time, the function drops an returns None, -1 for forever
:param folder_dest: if None, the content is stored in that file
:param encoding: None by default, but if it is None, the returned information is binary
:param raise_exception: True to raise an exception, False to send a warnings
:param chunk: save data every chunk (only if output is not None)
:param fLOG: logging function (only applies when chunk is not None)
:return: list of downloaded content
:githublink:`%|py|33`
"""
if neg_pattern == '':
neg_pattern = None # pragma: no cover
if chunk == '':
chunk = None # pragma: no cover
if isinstance(chunk, str):
chunk = int(chunk) # pragma: no cover
res = []
url_pattern = ("(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+"
"[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+"
"(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:"
"'\\\".,<>?]))")
reg = re.compile(url_pattern)
for obj in explore_folder_iterfile(folder, pattern=pattern, neg_pattern=neg_pattern,
fullname=True, recursive=recursive):
with open(obj, "r", encoding=encoding, errors='ignore') as f:
content = f.read()
fall = reg.findall(content)
if len(fall) == 0:
continue
if fLOG is not None:
fLOG("[download_urls_in_folder_content] explore '{}'".format(obj))
urls = [f[0] for f in fall]
r = get_urls_content_timeout(urls, folder=folder_dest, timeout=timeout,
raise_exception=raise_exception, chunk=chunk,
fLOG=fLOG)
res.extend(r)
return res