# Source code for pyensae.datasource.http_retrieve

"""
Various functions to get data from a website, a reference website.


:githublink:`%|py|5`
"""
import os
import sys
import importlib
import re
import time
import urllib.request
from pyquickhelper.loghelper import noLOG


class DownloadDataException(Exception):
    """
    Raised when a file cannot be retrieved from the remote website,
    either because every download attempt failed or because no attempt
    could be made.
    """


class RetrieveDataException(Exception):
    """
    Raised when a downloaded archive cannot be unpacked
    (for instance when unzipping the retrieved file fails).
    """


def remove_empty_line(file):
    """
    Removes spurious empty lines in an imported file.

    The file is read as text, trailing spaces and line breaks are
    stripped from every line, then empty lines are filtered:
    an empty line is kept only if it stands at most two lines after
    the previously kept empty line (the reference starts just before
    the file, so an empty line among the first two lines is kept),
    any other empty line is dropped. The filter only runs when the number
    of empty lines plus the number of lines ending with a line break
    exceeds a third of the number of lines.

    The file is rewritten, with the encoding used to read it, only if at
    least one line was dropped. Lines are then joined with ``\\n`` and the
    file does not end with a line break.

    :param      file:        local file name
    """
    # The platform default encoding is tried first, then utf8.
    # latin-1 comes last because it maps every byte to a character
    # and therefore never raises UnicodeDecodeError.
    encoding = None
    lines = None
    for candidate in (None, "utf8", "latin-1"):
        try:
            # The context manager closes the file even when decoding fails.
            with open(file, "r", encoding=candidate) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            continue
        encoding = candidate
        break

    nbrn = sum(1 for line in lines if line.endswith("\n"))
    lines = [line.rstrip(" \n") for line in lines]
    nbempty = sum(1 for line in lines if len(line) == 0)

    skip = 0
    if nbempty + nbrn > len(lines) / 3:
        kept = []
        # Index of the last empty line which was kept.
        last = -1
        for i, line in enumerate(lines):
            if len(line) > 0:
                kept.append(line)
            elif last >= i - 2:
                last = i
                kept.append(line)
            else:
                skip += 1
        lines = kept

    # The file is left untouched when nothing was removed.
    if skip > 0:
        with open(file, "w", encoding=encoding) as f:
            f.write("\n".join(lines))


def download_data(name, moduleName=None, url=None, glo=None,
                  loc=None, whereTo=".", website="xd", timeout=None,
                  retry=2, silent=False, fLOG=noLOG):
    """
    Retrieves a module given its name, a text file or a :epkg:`zip` file,
    looks for it on ``http://www.xavierdupre.fr/...`` (website),
    the file is copied at this file location and uncompressed
    if it is a :epkg:`zip` file (or a :epkg:`tar.gz` file).
    This function can be replaced in most cases by function
    `urlretrieve <https://docs.python.org/3/library/urllib.request.html#urllib.request.urlretrieve>`_.

    ::

        import urllib.request
        url = 'https://...'
        dest = "downloaded_file.bin"
        urllib.request.urlretrieve(url, dest)

    :param      name:        (str|list) name of the file to download, a name without
                            any dot is considered as a module name (``<name>.py``)
    :param      moduleName:  (str|None) like import name as moduleName if *name* is a module,
                            if None, the module is only registered under *name*
    :param      url:         (str|list|None) link to the website to use (or the websites if list),
                            if None, *website* is used
    :param      glo:         (dict|None) if None, it will be replaced ``globals()``
    :param      loc:         (dict|None) if None, it will be replaced ``locals()``
    :param      whereTo:     specify a folder where downloaded files will be placed
    :param      website:     website to look for, ``'xd'`` and ``'xdtd'`` are shortcuts
    :param      timeout:     timeout (seconds) when establishing the connection
                            (see `urlopen <https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen>`_)

    :param      retry:       number of attempts made to download the data
    :param      silent:      if True, convert some exception into warnings when unzipping a tar file
    :param      fLOG:        logging function
    :return:                 modules or list of files
    :raises FileExistsError: if folder *whereTo* does not exist
    :raises TypeError:       if *url* is a list and *name* is neither a string nor a list
    :raises ValueError:      if *url* and *name* are lists of different sizes
    :raises DownloadDataException: if the file cannot be downloaded
    :raises RetrieveDataException: if the zip file cannot be uncompressed

    By extension, this function also download various zip files and decompresses it.
    If the file was already downloaded, the function will not do it again.

    .. exref::
        :title: Download data for a practical lesson

        ::

            from pyensae.datasource import download_data
            download_data('voeux.zip', website='xd')

    .. exref::
        :title: Download data from a website

        ::

            download_data("facebook.tar.gz", website="http://snap.stanford.edu/data/")

    If it does not work, I suggest to use standard python:
    `Download a file from Dropbox with Python <http://www.xavierdupre.fr/blog/2015-01-20_nojs.html>`_.

    .. versionchanged:: 1.1
        Parameters *retry*, *silent* were added.

    .. versionchanged:: 1.2
        Parameter *url* can be a list. The function
        tries the first one which contains the file.
    """
    from ..filehelper.decompress_helper import decompress_zip, decompress_targz, decompress_gz, decompress_bz2

    if glo is None:
        glo = globals()
    if loc is None:
        # Only forwarded to the recursive calls below.
        loc = locals()

    def transform_url(w):
        "replaces the shortcuts 'xd' and 'xdtd' by the full url"
        if isinstance(w, list):
            return [transform_url(_) for _ in w]
        if w == "xd":
            w = "http://www.xavierdupre.fr/enseignement/complements/"
        elif w == "xdtd":
            w = "http://www.xavierdupre.fr/site2013/enseignements/tddata/"
        return w

    website = transform_url(website)
    url = transform_url(url)
    if url is None:
        url = website

    if not os.path.exists(whereTo):
        raise FileExistsError("this folder should exists " + whereTo)

    # Multiple downloads.
    if isinstance(url, list):
        single = isinstance(name, str)
        if single:
            # The same file is looked for on every website.
            name = [name] * len(url)
        if not isinstance(name, list):
            raise TypeError("If url is a list, name be a list too.")
        if len(name) != len(url):
            raise ValueError("url and name must be list of the same size.")
        outfiles = []
        for n, u in zip(name, url):
            res = download_data(n, moduleName=moduleName, url=u, glo=glo,
                                loc=loc, whereTo=whereTo, website=website, timeout=timeout,
                                retry=retry, silent=silent, fLOG=fLOG)
            if isinstance(res, list):
                outfiles.extend(res)
            else:
                outfiles.append(res)
                # One file, many websites: stops once the file is there.
                if single and res is not None and os.path.exists(res):
                    break
        return outfiles

    if isinstance(name, list):
        outfiles = []
        for n in name:
            res = download_data(n, moduleName=moduleName, url=url, glo=glo,
                                loc=loc, whereTo=whereTo, website=website, timeout=timeout,
                                retry=retry, silent=silent, fLOG=fLOG)
            if isinstance(res, list):
                outfiles.extend(res)
            else:
                outfiles.append(res)
        return outfiles

    # Single download.
    origname = name
    if name in sys.modules:
        return sys.modules[name]
    if "." not in name:
        fLOG("[download_data] unable to find module '{0}'".format(name))

    # A name without extension is a module stored in <name>.py.
    file = name if "." in name else "%s.py" % name
    outfile = file if whereTo == "." else os.path.join(whereTo, file)

    if url is not None and not os.path.exists(outfile):
        url += file
        fLOG("[download_data]    download '{0}' to '{1}'".format(
            url, outfile))
        alls = None
        attempts = retry
        while attempts > 0:
            try:
                if timeout is None:
                    response = urllib.request.urlopen(url)
                else:
                    response = urllib.request.urlopen(url, timeout=timeout)
                # The connection is closed even if reading fails.
                with response:
                    alls = response.read()
                break
            except Exception as exc:
                # (1) is a connection reset, (2) any other failure.
                step = 1 if isinstance(exc, ConnectionResetError) else 2
                if attempts <= 1:
                    # Last attempt: no need to wait, the error is raised
                    # with the original exception as its cause.
                    raise DownloadDataException(
                        "Unable ({0}) to retrieve data from '{1}'. Error: {2}".format(
                            step, url, exc)) from exc
                fLOG("[download_data] ({0})  fail and retry to download '{1}' to '{2}'".format(
                    step, url, outfile))
                # We wait for 2 seconds.
                time.sleep(2)
            attempts -= 1

        if alls is None:
            # No attempt was made (retry <= 0).
            raise DownloadDataException(
                "Unable to retrieve data from '{0}'".format(url))
        with open(outfile, "wb") as f:
            f.write(alls)

    if name.endswith(".zip"):
        try:
            return decompress_zip(outfile, whereTo, fLOG)
        except RuntimeError as e:
            raise RetrieveDataException(  # pragma: no cover
                "Unable to unzip '{}' to '{}' (url='{}').".format(
                    outfile, whereTo, url)) from e

    if name.endswith(".tar.gz"):
        return decompress_targz(outfile, whereTo, silent=silent, fLOG=fLOG)

    if name.endswith(".gz"):
        return decompress_gz(outfile, whereTo, fLOG)

    if name.endswith(".bz2"):
        return decompress_bz2(outfile, whereTo, fLOG)

    if "." not in name:
        # The downloaded file is a module, it must be importable.
        path, filename = os.path.split(outfile)
        if filename != outfile and path not in sys.path:
            sys.path.append(path)

        remove_empty_line(outfile)

        def log_import_failure(step):
            "logs the import context when the module cannot be imported"
            fLOG("[download_data] issue ({0}) while importing ".format(step),
                 name, " -- ", origname)
            fLOG("[download_data] sys.path ", sys.path)
            for _ in sys.path:
                fLOG("[download_data]     path ", _)
            fLOG("[download_data] sys.modules.keys()",
                 list(sys.modules.keys()))
            for _ in sorted(sys.modules):
                fLOG("[download_data]     modules ", _)

        try:
            temp = importlib.import_module(name)
        except SystemError as e:
            if "Parent module '' not loaded" in str(e):
                # The module uses relative imports which cannot work
                # outside its package: they are rewritten or removed.
                reg1 = re.compile("^(from +[.])[a-zA-Z]")
                reg2 = re.compile("^from +[.]{2}")
                fLOG("[download_data] removing relative import for ", name)
                with open(outfile, "r") as f:
                    lines = f.readlines()
                fil = []
                fir = True
                for li in lines:
                    # By default, a line is kept unchanged.
                    ls = li
                    r1 = reg1.search(li)
                    r2 = reg2.search(li)
                    if r2:
                        # from ..module import ...: the line is removed.
                        ls = ""
                        if fir:
                            ls = "fLOG = print"
                            fir = False
                    elif r1:
                        # from .module import ...: becomes an absolute import.
                        st = r1.groups()[0]
                        ls = li.replace(st, "from ")
                        if fir:
                            ls += "\nfLOG = print"
                            fir = False
                    fil.append(ls.strip("\n\r"))
                if not fir:
                    fLOG("[download_data] end removing relative import for ", name)
                    with open(outfile, "w") as f:
                        f.write("\n".join(fil))

            try:
                temp = importlib.import_module(name)
            except Exception:
                log_import_failure(3)
                raise

        except Exception:
            log_import_failure(2)
            raise

        if name not in temp.__name__:
            raise NameError(
                "name should be present in __name__ " +
                name +
                " ? " +
                temp.__name__)
        if moduleName is not None:
            # Without alias, nothing is stored under key None:
            # such a key would break sorted(sys.modules).
            glo[moduleName] = temp
            sys.modules[moduleName] = temp
        sys.modules[origname] = temp
        return temp

    if file.split(".")[-1] in ["txt", "csv", "tsv", "xml", "html"]:
        remove_empty_line(outfile)
    return outfile
# Source code for pyensae.datasource.http_retrieve

# Links

# Contents

# Information

# Related Topics