# Source code for pyensae.datasource.http_retrieve

"""
Various functions to get data from a website, a reference website.


:githublink:`%|py|5`
"""
import os
import sys
import importlib
import re
import time
import urllib.request
from pyquickhelper.loghelper import noLOG


class DownloadDataException(Exception):
    """
    Raised when data cannot be downloaded.
    """
    pass
class RetrieveDataException(Exception):
    """
    Raised when downloaded data cannot be processed (e.g. unzipped).
    """
    pass
def remove_empty_line(file):
    """
    Removes superfluous empty lines from a downloaded text file, in place.

    The file is rewritten only when the heuristic triggers: if the count of
    empty lines plus the count of ``\\n``-terminated lines exceeds a third of
    the total, isolated empty lines (those not within two lines of a
    previously kept empty line) are dropped.

    :param file: local file name
    :raises UnicodeDecodeError: if none of the tried encodings
        (default, latin-1, utf8) can decode the file
    """
    # Try successive encodings; `readlines` is what raises UnicodeDecodeError.
    # `with` guarantees the handle is closed even when decoding fails
    # (the original leaked the handle in that case).
    try:
        with open(file, "r") as f:
            lines = f.readlines()
        encoding = None
    except UnicodeDecodeError:
        try:
            with open(file, "r", encoding="latin-1") as f:
                lines = f.readlines()
            encoding = "latin-1"
        except UnicodeDecodeError:
            with open(file, "r", encoding="utf8") as f:
                lines = f.readlines()
            encoding = "utf8"

    nbrn = len([_ for _ in lines if _.endswith("\n")])
    lines = [_.rstrip(" \n") for _ in lines]
    nbempty = len([_ for _ in lines if len(_) == 0])

    skip = 0
    if nbempty + nbrn > len(lines) / 3:
        res = lines
        lines = []
        last = -1
        for i, line in enumerate(res):
            if len(line) == 0:
                # Keep an empty line only when the last kept empty line
                # is at most two lines back; drop isolated ones.
                if last >= i - 2:
                    last = i
                    lines.append(line)
                else:
                    skip += 1
            else:
                lines.append(line)

    # Rewrite only when at least one line was removed; note the rewritten
    # file has no trailing newline (original behavior, preserved).
    if skip > 0:
        with open(file, "w", encoding=encoding) as f:
            f.write("\n".join(lines))
def download_data(name, moduleName=None, url=None, glo=None,
                  loc=None, whereTo=".", website="xd", timeout=None,
                  retry=2, silent=False, fLOG=noLOG):
    """
    Retrieves a module given its name, a text file or a zip file,
    looks for it on ``http://www.xavierdupre.fr/...`` (website),
    copies the file at this file location and uncompresses it if it
    is a zip file (or a tar.gz file).

    This function can be replaced in most cases by
    `urllib.request.urlretrieve
    <https://docs.python.org/3/library/urllib.request.html#urllib.request.urlretrieve>`_.

    :param name: (str) name of the file to download
    :param moduleName: (str|None) like ``import name as moduleName``
        if *name* is a module
    :param url: (str|list|None) link to the website to use
        (or the websites if list)
    :param glo: (dict|None) if None, it will be replaced by ``globals()``
    :param loc: (dict|None) if None, it will be replaced by ``locals()``
    :param whereTo: specify a folder where downloaded files will be placed
    :param website: website to look for (``'xd'`` and ``'xdtd'`` are
        aliases expanded by *transform_url*)
    :param timeout: timeout (seconds) when establishing the connection
    :param retry: number of retries in case of failure when downloading
    :param silent: if True, convert some exceptions into warnings when
        unzipping a tar file
    :param fLOG: logging function
    :return: modules or list of files

    If the file was already downloaded, the function does not do it again.
    If *url* is a list, the function tries each one until the first which
    contains the file.

    :raises FileExistsError: if *whereTo* does not exist
    :raises DownloadDataException: if the download fails after *retry* attempts
    :raises RetrieveDataException: if a zip file cannot be uncompressed
    """
    from ..filehelper.decompress_helper import (
        decompress_zip, decompress_targz, decompress_gz, decompress_bz2)

    if glo is None:
        glo = globals()
    if loc is None:
        loc = locals()

    def transform_url(w):
        "Expands the 'xd'/'xdtd' aliases, recursively over lists."
        if isinstance(w, list):
            return [transform_url(_) for _ in w]
        if w == "xd":
            w = "http://www.xavierdupre.fr/enseignement/complements/"
        elif w == "xdtd":
            w = "http://www.xavierdupre.fr/site2013/enseignements/tddata/"
        return w

    website = transform_url(website)
    url = transform_url(url)
    if url is None:
        url = website

    if not os.path.exists(whereTo):
        raise FileExistsError("this folder should exists " + whereTo)

    # Multiple downloads: recurse over the lists.
    if isinstance(url, list):
        single = isinstance(name, str)
        if single:
            name = [name] * len(url)
        if not isinstance(name, list):
            raise TypeError("If url is a list, name be a list too.")
        if len(name) != len(url):
            raise ValueError("url and name must be list of the same size.")
        outfiles = []
        for i, u in enumerate(url):
            res = download_data(name[i], moduleName=moduleName, url=u,
                                glo=glo, loc=loc, whereTo=whereTo,
                                website=website, timeout=timeout,
                                retry=retry, silent=silent, fLOG=fLOG)
            if isinstance(res, list):
                outfiles.extend(res)
            else:
                outfiles.append(res)
            # One name, several mirrors: stop at the first website
            # which delivered the file.
            if single and res is not None and os.path.exists(res):
                break
        return outfiles
    elif isinstance(name, list):
        outfiles = []
        for i, n in enumerate(name):
            res = download_data(n, moduleName=moduleName, url=url, glo=glo,
                                loc=loc, whereTo=whereTo, website=website,
                                timeout=timeout, retry=retry, silent=silent,
                                fLOG=fLOG)
            if isinstance(res, list):
                outfiles.extend(res)
            else:
                outfiles.append(res)
        return outfiles

    # Single download.
    origname = name
    if name in sys.modules:
        # Already imported as a module.
        return sys.modules[name]
    elif "." not in name:
        fLOG("[download_data] unable to find module '{0}'".format(name))

    # A name without extension is treated as a module file.
    file = name if "." in name else "%s.py" % name
    outfile = file if whereTo == "." else os.path.join(whereTo, file)

    if url is not None and not os.path.exists(outfile):
        excs = []
        success = False
        alls = None
        # NOTE(review): assumes *url* ends with '/' — TODO confirm callers.
        url += file
        fLOG("[download_data] download '{0}' to '{1}'".format(url, outfile))
        while retry > 0:
            try:
                u = urllib.request.urlopen(url) if timeout is None \
                    else urllib.request.urlopen(url, timeout=timeout)
                alls = u.read()
                u.close()
                success = True
                break
            except ConnectionResetError as ee:
                # Was `retry <= 0`, which is unreachable inside
                # `while retry > 0` and silently lost the last error;
                # aligned with the generic handler below.
                if retry <= 1:
                    exc = DownloadDataException(
                        "Unable (1) to retrieve data from '{0}'. Error: {1}".format(url, ee))
                    excs.append(exc)
                    excs.append(ee)
                    break
                fLOG("[download_data] (1) fail and retry to download '{0}' to '{1}'".format(
                    url, outfile))
                # We wait for 2 seconds.
                time.sleep(2)
            except Exception as e:
                if retry <= 1:
                    exc = DownloadDataException(
                        "Unable (2) to retrieve data from '{0}'. Error: {1}".format(url, e))
                    excs.append(exc)
                    excs.append(e)
                    break
                fLOG("[download_data] (2) fail and retry to download '{0}' to '{1}'".format(
                    url, outfile))
                # We wait for 2 seconds.
                time.sleep(2)
            retry -= 1

        if success and alls is not None:
            with open(outfile, "wb") as u:
                u.write(alls)
        elif len(excs) > 0:
            raise excs[0]
        else:
            raise DownloadDataException(
                "Unable to retrieve data from '{0}'".format(url))

    if name.endswith(".zip"):
        try:
            return decompress_zip(outfile, whereTo, fLOG)
        except RuntimeError as e:
            raise RetrieveDataException(  # pragma: no cover
                "Unable to unzip '{}' to '{}' (url='{}').".format(
                    outfile, whereTo, url)) from e
    elif name.endswith(".tar.gz"):
        return decompress_targz(outfile, whereTo, silent=silent, fLOG=fLOG)
    elif name.endswith(".gz"):
        return decompress_gz(outfile, whereTo, fLOG)
    elif name.endswith(".bz2"):
        return decompress_bz2(outfile, whereTo, fLOG)
    elif "." not in name:
        # The downloaded file is a Python module: import it.
        path, filename = os.path.split(outfile)
        if filename != outfile:
            if path not in sys.path:
                sys.path.append(path)
        remove_empty_line(outfile)
        try:
            temp = importlib.import_module(name)
        except SystemError as e:
            if "Parent module '' not loaded" in str(e):
                # Relative imports cannot be resolved when the module is
                # imported standalone: rewrite them as absolute imports.
                reg1 = re.compile("^(from +[.])[a-zA-Z]")
                reg2 = re.compile("^from +[.]{2}")
                fLOG("[download_data] removing relative import for ", name)
                with open(outfile, "r") as f:
                    lines = f.readlines()
                fil = []
                fir = True
                for li in lines:
                    r1 = reg1.search(li)
                    r2 = reg2.search(li)
                    if r2:
                        ls = ""
                        if fir:
                            ls = "fLOG = print"
                            fir = False
                    elif r1:
                        st = r1.groups()[0]
                        # Was `ls.replace(...)` (stale/undefined value);
                        # the current line is the one to rewrite.
                        ls = li.replace(st, "from ")
                        if fir:
                            ls += "\nfLOG = print"
                            fir = False
                    else:
                        ls = li
                    fil.append(ls.strip("\n\r"))
                if not fir:
                    fLOG("[download_data] end removing relative import for ",
                         name)
                with open(outfile, "w") as f:
                    f.write("\n".join(fil))
            try:
                temp = importlib.import_module(name)
            except Exception as e:
                fLOG("[download_data] issue (3) while importing ",
                     name, " -- ", origname)
                fLOG("[download_data] sys.path ", sys.path)
                for _ in sys.path:
                    fLOG("[download_data] path ", _)
                fLOG("[download_data] sys.modules.keys()",
                     list(sys.modules.keys()))
                for _ in sorted(sys.modules):
                    fLOG("[download_data] modules ", _)
                raise e
        except Exception as e:
            fLOG("[download_data] issue (2) while importing ",
                 name, " -- ", origname)
            fLOG("[download_data] sys.path ", sys.path)
            for _ in sys.path:
                fLOG("[download_data] path ", _)
            fLOG("[download_data] sys.modules.keys()",
                 list(sys.modules.keys()))
            for _ in sorted(sys.modules):
                fLOG("[download_data] modules ", _)
            raise e
        if name not in temp.__name__:
            raise NameError(
                "name should be present in __name__ " +
                name + " ? " + temp.__name__)
        # NOTE(review): moduleName may still be None here — looks like
        # callers are expected to pass it for modules; confirm.
        glo[moduleName] = temp
        sys.modules[moduleName] = temp
        sys.modules[origname] = temp
        return temp
    elif file.split(".")[-1] in ["txt", "csv", "tsv", "xml", "html"]:
        remove_empty_line(outfile)
        return outfile
    else:
        return outfile