Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Various functions to get data from a website, a reference website.
4"""
5import os
6import sys
7import importlib
8import re
9import time
10import urllib.request
11from pyquickhelper.loghelper import noLOG
class DownloadDataException(Exception):
    """
    Exception raised when a file cannot be downloaded.
    """
class RetrieveDataException(Exception):
    """
    Exception raised when downloaded data cannot be retrieved,
    for example when a zip archive cannot be uncompressed.
    """
def remove_empty_line(file):
    """
    Removes spurious empty lines in an imported file.

    The file is read with the platform default encoding first, then
    with ``utf8`` and finally with ``latin-1``. Trailing spaces and
    end of lines are stripped from every line. If the number of empty
    lines plus the number of lines ending with an end of line exceeds
    a third of the number of lines, empty lines are filtered:
    an empty line is kept only if it is one of the first two lines
    or if the previous kept empty line is at most two lines above,
    every other empty line is dropped.
    The file is rewritten in place (with the encoding used to read it,
    lines joined with ``\\n``, no trailing end of line) only if at least
    one line was dropped, otherwise it is left untouched.

    @param      file        local file name
    """
    # ``None`` stands for the platform default encoding.
    # ``utf8`` is tried before ``latin-1`` because ``latin-1`` maps
    # every byte to a character and never raises a decoding error:
    # any encoding placed after it would never be used.
    for encoding in (None, "utf8", "latin-1"):
        try:
            # The context manager closes the file even if decoding fails.
            with open(file, "r", encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            continue
        break

    nbrn = sum(1 for line in lines if line.endswith("\n"))
    lines = [line.rstrip(" \n") for line in lines]
    nbempty = sum(1 for line in lines if not line)

    if nbempty + nbrn <= len(lines) / 3:
        # Not enough empty lines to consider the file as altered.
        return

    kept = []
    skip = 0
    # Index of the last empty line which was kept.
    last = -1
    for i, line in enumerate(lines):
        if line:
            kept.append(line)
        elif last >= i - 2:
            last = i
            kept.append(line)
        else:
            skip += 1

    if skip > 0:
        with open(file, "w", encoding=encoding) as f:
            f.write("\n".join(kept))
def _download_bytes(url, outfile, timeout, retry, fLOG):
    """
    Downloads the content of *url* in memory, tries several times if needed.

    @param      url         full url of the file to download
    @param      outfile     destination file, only used in log messages
    @param      timeout     timeout (seconds) given to ``urlopen`` or None
    @param      retry       maximum number of attempts
    @param      fLOG        logging function
    @return                 downloaded content (bytes)

    The function raises @see cl DownloadDataException if every attempt
    fails or if *retry* is not positive.
    """
    while retry > 0:
        try:
            if timeout is None:
                response = urllib.request.urlopen(url)
            else:
                response = urllib.request.urlopen(url, timeout=timeout)
            # The response is closed even if reading fails.
            with response:
                return response.read()
        except Exception as e:  # pylint: disable=W0703
            step = 1 if isinstance(e, ConnectionResetError) else 2
            if retry <= 1:
                # Last attempt: no need to wait, the original error
                # is kept as the cause of the raised exception.
                raise DownloadDataException(
                    "Unable ({0}) to retrieve data from '{1}'. Error: {2}".format(
                        step, url, e)) from e
            fLOG("[download_data] ({0}) fail and retry to download '{1}' to '{2}'".format(
                step, url, outfile))
            # We wait for 2 seconds.
            time.sleep(2)
        retry -= 1

    raise DownloadDataException(
        "Unable to retrieve data from '{0}'".format(url))


def _strip_relative_imports(lines):
    """
    Rewrites the relative imports found in the lines of a module.

    @param      lines       lines of the module
    @return                 tuple *(new lines, changed)*, *changed* is True
                            if at least one line was modified

    Lines starting with ``from ..`` are removed, lines starting with
    ``from .name`` become ``from name``. Line ``fLOG = print`` is inserted
    where the first relative import was found
    (presumably because the removed imports were providing ``fLOG``).
    Every other line is kept, end of lines are stripped.
    """
    reg1 = re.compile("^(from +[.])[a-zA-Z]")
    reg2 = re.compile("^from +[.]{2}")
    first = True
    res = []
    for li in lines:
        ls = li.strip("\n\r")
        if reg2.search(li):
            ls = ""
            if first:
                ls = "fLOG = print"
                first = False
        else:
            r1 = reg1.search(li)
            if r1:
                ls = ls.replace(r1.groups()[0], "from ")
                if first:
                    ls += "\nfLOG = print"
                    first = False
        res.append(ls)
    return res, not first


def _log_import_failure(step, name, origname, fLOG):
    """
    Logs the import context (``sys.path``, ``sys.modules``) after a failed import.

    @param      step        number identifying the failing import in the logs
    @param      name        name of the module which failed to be imported
    @param      origname    name originally given to @see fn download_data
    @param      fLOG        logging function
    """
    fLOG("[download_data] issue ({0}) while importing ".format(step),
         name, " -- ", origname)
    fLOG("[download_data] sys.path ", sys.path)
    for _ in sys.path:
        fLOG("[download_data] path ", _)
    fLOG("[download_data] sys.modules.keys()", list(sys.modules.keys()))
    # key=str: sorting must not fail if a key is not a string.
    for _ in sorted(sys.modules, key=str):
        fLOG("[download_data] modules ", _)


def _import_downloaded_module(name, origname, moduleName, outfile, glo, fLOG):
    """
    Imports a module which was downloaded as a single python file.

    @param      name        module name
    @param      origname    name originally given to @see fn download_data
    @param      moduleName  alias under which the module is registered or None
    @param      outfile     location of the downloaded python file
    @param      glo         dictionary which receives the module under *moduleName*
    @param      fLOG        logging function
    @return                 imported module
    """
    path, filename = os.path.split(outfile)
    if filename != outfile and path not in sys.path:
        # The module must be reachable by the import machinery.
        sys.path.append(path)

    remove_empty_line(outfile)

    try:
        temp = importlib.import_module(name)
    except SystemError as e:
        if "Parent module '' not loaded" not in str(e):
            # Nothing can be fixed, the module cannot be used.
            _log_import_failure(2, name, origname, fLOG)
            raise
        # The file uses relative imports but is imported as a top-level
        # module: they are rewritten before a second attempt.
        fLOG("[download_data] removing relative import for ", name)
        with open(outfile, "r") as f:
            lines = f.readlines()
        fil, changed = _strip_relative_imports(lines)
        if changed:
            fLOG("[download_data] end removing relative import for ", name)
            with open(outfile, "w") as f:
                f.write("\n".join(fil))

        try:
            temp = importlib.import_module(name)
        except Exception:
            _log_import_failure(3, name, origname, fLOG)
            raise
    except Exception:
        _log_import_failure(2, name, origname, fLOG)
        raise

    if name not in temp.__name__:
        raise NameError(
            "name should be present in __name__ " +
            name +
            " ? " +
            temp.__name__)
    if moduleName is not None:
        # Registering the module under key None would pollute sys.modules.
        glo[moduleName] = temp
        sys.modules[moduleName] = temp
    sys.modules[origname] = temp
    return temp


def download_data(name, moduleName=None, url=None, glo=None,
                  loc=None, whereTo=".", website="xd", timeout=None,
                  retry=2, silent=False, fLOG=noLOG):
    """
    Retrieves a module given its name, a text file or a :epkg:`zip` file,
    looks for it on ``http://www.xavierdupre.fr/...`` (website),
    the file is copied at this file location and uncompressed
    if it is a :epkg:`zip` file (or a :epkg:`tar.gz` file).
    This function can be replaced in most cases by function
    `urlretrieve <https://docs.python.org/3/library/urllib.request.html#urllib.request.urlretrieve>`_.

    ::

        import urllib.request
        url = 'https://...'
        dest = "downloaded_file.bin"
        urllib.request.urlretrieve(url, dest)

    @param      name        (str|list) name of the file to download (or names if list)
    @param      moduleName  (str|None) like import name as moduleName if *name* is a module
    @param      url         (str|list|None) link to the website to use (or the websites if list),
                            the file name is appended to it, if None, *website* is used
    @param      glo         (dict|None) if None, it will be replaced ``globals()``
    @param      loc         (dict|None) if None, it will be replaced ``locals()``
    @param      whereTo     specify a folder where downloaded files will be placed,
                            it must exist
    @param      website     website to look for, ``'xd'`` and ``'xdtd'`` are shortcuts
                            for two locations on ``http://www.xavierdupre.fr/``
    @param      timeout     timeout (seconds) when establishing the connection
                            (see `urlopen <https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen>`_)
    @param      retry       number of retries in case of failure when downloading the data
    @param      silent      if True, convert some exception into warnings when unzipping a tar file
    @param      fLOG        logging function
    @return                 module if *name* is a module, result of the decompression
                            function for an archive, the downloaded file name otherwise,
                            a list of these results if *name* or *url* is a list

    By extension, this function also download various zip files and decompresses it.
    If the file was already downloaded, the function will not do it again.
    The function raises ``FileExistsError`` if folder *whereTo* does not exist,
    @see cl DownloadDataException if the download fails,
    @see cl RetrieveDataException if a zip file cannot be uncompressed.

    .. exref::
        :title: Download data for a practical lesson

        ::

            from pyensae.datasource import download_data
            download_data('voeux.zip', website='xd')

    .. exref::
        :title: Download data from a website

        ::

            download_data("facebook.tar.gz", website="http://snap.stanford.edu/data/")

    If it does not work, I suggest to use standard python:
    `Download a file from Dropbox with Python <http://www.xavierdupre.fr/blog/2015-01-20_nojs.html>`_.

    .. versionchanged:: 1.1
        Parameters *retry*, *silent* were added.

    .. versionchanged:: 1.2
        Parameter *url* can be a list. The function
        tries the first one which contains the file.
    """
    from ..filehelper.decompress_helper import decompress_zip, decompress_targz, decompress_gz, decompress_bz2

    if glo is None:
        glo = globals()
    if loc is None:
        loc = locals()

    def transform_url(w):
        "replaces the shortcuts ``xd``, ``xdtd`` by the corresponding url"
        if isinstance(w, list):
            return [transform_url(_) for _ in w]
        if w == "xd":
            w = "http://www.xavierdupre.fr/enseignement/complements/"
        elif w == "xdtd":
            w = "http://www.xavierdupre.fr/site2013/enseignements/tddata/"
        return w

    website = transform_url(website)
    url = transform_url(url)
    if url is None:
        url = website

    if not os.path.exists(whereTo):
        raise FileExistsError("this folder should exists " + whereTo)

    # Multiple downloads.
    if isinstance(url, list):
        # single: the same file is looked for on several websites,
        # the function stops at the first one which provides it.
        single = isinstance(name, str)
        if single:
            name = [name] * len(url)
        if not isinstance(name, list):
            raise TypeError("If url is a list, name be a list too.")
        if len(name) != len(url):
            raise ValueError("url and name must be list of the same size.")
        outfiles = []
        failure = None
        for n, u in zip(name, url):
            try:
                res = download_data(n, moduleName=moduleName, url=u, glo=glo,
                                    loc=loc, whereTo=whereTo, website=website, timeout=timeout,
                                    retry=retry, silent=silent, fLOG=fLOG)
            except DownloadDataException as e:
                if not single:
                    raise
                # The next website may have the file.
                failure = e
                fLOG("[download_data] unable to download '{0}' from '{1}', trying next url".format(
                    n, u))
                continue
            if isinstance(res, list):
                outfiles.extend(res)
            else:
                outfiles.append(res)
            if single and res is not None:
                # res can be a list of files or a module,
                # only a file name can be checked on disk.
                if not isinstance(res, str) or os.path.exists(res):
                    break
        if single and not outfiles and failure is not None:
            # No website has the file.
            raise failure
        return outfiles

    if isinstance(name, list):
        outfiles = []
        for n in name:
            res = download_data(n, moduleName=moduleName, url=url, glo=glo,
                                loc=loc, whereTo=whereTo, website=website, timeout=timeout,
                                retry=retry, silent=silent, fLOG=fLOG)
            if isinstance(res, list):
                outfiles.extend(res)
            else:
                outfiles.append(res)
        return outfiles

    # Single download.
    origname = name
    if name in sys.modules:
        return sys.modules[name]
    if "." not in name:
        fLOG("[download_data] unable to find module '{0}'".format(name))

    # A name without extension is considered as a python module.
    file = name if "." in name else "%s.py" % name
    outfile = file if whereTo == "." else os.path.join(whereTo, file)

    if url is not None and not os.path.exists(outfile):
        url += file
        fLOG("[download_data] download '{0}' to '{1}'".format(
            url, outfile))
        content = _download_bytes(url, outfile, timeout, retry, fLOG)
        with open(outfile, "wb") as f:
            f.write(content)

    if name.endswith(".zip"):
        try:
            return decompress_zip(outfile, whereTo, fLOG)
        except RuntimeError as e:
            raise RetrieveDataException(  # pragma: no cover
                "Unable to unzip '{}' to '{}' (url='{}').".format(
                    outfile, whereTo, url)) from e

    # .tar.gz must be tested before .gz.
    if name.endswith(".tar.gz"):
        return decompress_targz(outfile, whereTo, silent=silent, fLOG=fLOG)

    if name.endswith(".gz"):
        return decompress_gz(outfile, whereTo, fLOG)

    if name.endswith(".bz2"):
        return decompress_bz2(outfile, whereTo, fLOG)

    if "." not in name:
        return _import_downloaded_module(
            name, origname, moduleName, outfile, glo, fLOG)

    if file.split(".")[-1] in ["txt", "csv", "tsv", "xml", "html"]:
        remove_empty_line(outfile)
    return outfile