Source code for pyquickhelper.filehelper.download_urls_helper

# -*- coding: utf-8 -*-
"""
Series of functions which explore files in a folder, extract the urls mentioned in their content and download them.


:githublink:`%|py|6`
"""
import re
from .synchelper import explore_folder_iterfile
from .download_helper import get_urls_content_timeout


def download_urls_in_folder_content(folder, pattern=".+[.]((py)|(ipynb))",
                                    neg_pattern=None, recursive=True,
                                    timeout=10, folder_dest=None,
                                    encoding='utf-8', raise_exception=False,
                                    chunk=None, fLOG=None):
    """
    Iterates on files in a folder, parses them, extracts all urls
    and downloads them into a folder.

    :param folder: folder to explore
    :param pattern: if None, gets all files, otherwise, it is a regular
        expression the filename must match (the full name, subfolders
        included, is checked)
    :param neg_pattern: negative pattern to exclude files
    :param recursive: looks into subfolders
    :param timeout: in seconds, after this time, the function gives up
        and returns None, -1 for forever
    :param folder_dest: if not None, the downloaded content is stored
        in that folder
    :param encoding: encoding used to read the explored files,
        'utf-8' by default
    :param raise_exception: True to raise an exception when a download
        fails, False to issue a warning instead
    :param chunk: saves data every chunk bytes (only if folder_dest
        is not None)
    :param fLOG: logging function (only applies when chunk is not None)
    :return: list of downloaded content

    :githublink:`%|py|33`
    """
    if neg_pattern == '':
        neg_pattern = None  # pragma: no cover
    if chunk == '':
        chunk = None  # pragma: no cover
    if isinstance(chunk, str):
        chunk = int(chunk)  # pragma: no cover
    res = []
    # Regular expression matching most urls appearing in plain text.
    url_pattern = ("(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+"
                   "[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+"
                   "(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:"
                   "'\\\".,<>?]))")
    reg = re.compile(url_pattern)
    for obj in explore_folder_iterfile(folder, pattern=pattern,
                                       neg_pattern=neg_pattern,
                                       fullname=True, recursive=recursive):
        with open(obj, "r", encoding=encoding, errors='ignore') as f:
            content = f.read()
        fall = reg.findall(content)
        if len(fall) == 0:
            continue
        if fLOG is not None:
            fLOG("[download_urls_in_folder_content] explore '{}'".format(obj))
        # findall returns tuples of groups, the first group is the full url.
        urls = [f[0] for f in fall]
        r = get_urls_content_timeout(urls, folder=folder_dest, timeout=timeout,
                                     raise_exception=raise_exception,
                                     chunk=chunk, fLOG=fLOG)
        res.extend(r)
    return res
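A minimal usage sketch, assuming a local folder 'notebooks' holding .py or
.ipynb files and a destination folder 'downloads' (both folder names are
hypothetical, as is the choice of print as the logging function):

    from pyquickhelper.filehelper.download_urls_helper import (
        download_urls_in_folder_content)

    # Scans every .py/.ipynb file under 'notebooks', extracts the urls
    # mentioned in their text and downloads each one into 'downloads',
    # giving up on any single url after 10 seconds.
    downloaded = download_urls_in_folder_content(
        "notebooks", folder_dest="downloads", timeout=10,
        raise_exception=False, fLOG=print)
    print("%d items downloaded" % len(downloaded))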