Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Various functions to get data from a website, a reference website. 

4""" 

5import os 

6import sys 

7import importlib 

8import re 

9import time 

10import urllib.request 

11from pyquickhelper.loghelper import noLOG 

12 

13 

class DownloadDataException(Exception):
    """
    Exception raised by @see fn download_data when the content
    of a url cannot be fetched.
    """

19 

20 

class RetrieveDataException(Exception):
    """
    Exception raised by @see fn download_data when a downloaded
    file cannot be exploited, for example when a zip archive
    cannot be decompressed.
    """

26 

27 

def _read_lines_guess_encoding(file):
    """
    Reads all the lines of a text file, guessing its encoding.

    @param      file        local file name
    @return                 tuple ``(lines, encoding)`` where *encoding*
                            is the value given to ``open`` (``None`` means
                            the platform default)

    The default encoding is tried first, then ``utf8``. ``latin-1`` comes
    last because it maps every byte to a character and therefore never
    raises ``UnicodeDecodeError``: placed earlier it would hide any
    encoding listed after it.
    """
    for encoding in (None, "utf8"):
        try:
            # The context manager closes the file even when decoding fails.
            with open(file, "r", encoding=encoding) as f:
                return f.readlines(), encoding
        except UnicodeDecodeError:
            continue
    with open(file, "r", encoding="latin-1") as f:
        return f.readlines(), "latin-1"


def remove_empty_line(file):
    """
    Removes empty lines in an imported file.

    @param      file        local file name

    Trailing spaces and end of lines are stripped from every line.
    An empty line is kept only if it is located at most two lines after
    the previously kept empty line (the search starts from a virtual empty
    line at index -1), every other empty line is dropped.
    The file is overwritten, with the encoding used to read it, only if
    at least one line was dropped. The new content does not end with
    an end of line.
    """
    lines, encoding = _read_lines_guess_encoding(file)

    nbrn = sum(1 for line in lines if line.endswith("\n"))
    lines = [line.rstrip(" \n") for line in lines]
    nbempty = sum(1 for line in lines if not line)

    # Nothing to clean if the file does not contain enough
    # empty lines and end of lines.
    if nbempty + nbrn <= len(lines) / 3:
        return

    kept = []
    skip = 0
    last = -1  # index of the last empty line which was kept
    for i, line in enumerate(lines):
        if line:
            kept.append(line)
        elif last >= i - 2:
            last = i
            kept.append(line)
        else:
            skip += 1

    if skip > 0:
        with open(file, "w", encoding=encoding) as f:
            f.write("\n".join(kept))

71 

72 

def _fetch_url_content(url, outfile, timeout, retry, fLOG):
    """
    Downloads the content of *url*, retries in case of failure.

    @param      url         full url of the file to download
    @param      outfile     destination, only used in logged messages
    @param      timeout     timeout given to ``urlopen`` unless it is None
    @param      retry       maximum number of attempts
    @param      fLOG        logging function
    @return                 downloaded content (bytes)

    Raises @see cl DownloadDataException if every attempt failed
    (or if *retry* is not positive), the last error is chained as the cause.
    """
    kwargs = {} if timeout is None else {"timeout": timeout}
    attempts = retry
    while attempts > 0:
        try:
            # The context manager closes the connection even if read fails.
            with urllib.request.urlopen(url, **kwargs) as u:
                return u.read()
        except Exception as e:  # pylint: disable=W0703
            # (1) connection reset by the server, (2) any other failure.
            kind = 1 if isinstance(e, ConnectionResetError) else 2
            if attempts <= 1:
                # Last attempt: no need to wait, the error is reported.
                raise DownloadDataException(
                    "Unable ({0}) to retrieve data from '{1}'. Error: {2}".format(
                        kind, url, e)) from e
            fLOG("[download_data] ({0}) fail and retry to download '{1}' to '{2}'".format(
                kind, url, outfile))
            # We wait for 2 seconds.
            time.sleep(2)
        attempts -= 1

    raise DownloadDataException(
        "Unable to retrieve data from '{0}'".format(url))


def _strip_relative_imports(outfile, name, fLOG):
    """
    Rewrites a downloaded module to get rid of relative imports.

    @param      outfile     local file name of the module
    @param      name        module name, only used in logged messages
    @param      fLOG        logging function

    Lines starting with ``from ..`` are removed, lines starting with
    ``from .name`` become ``from name``. ``fLOG = print`` is inserted
    at the first modified line. The file is overwritten only if at least
    one line was modified.
    """
    reg1 = re.compile("^(from +[.])[a-zA-Z]")
    reg2 = re.compile("^from +[.]{2}")
    fLOG("[download_data] removing relative import for ", name)
    with open(outfile, "r") as f:
        lines = f.readlines()
    fil = []
    fir = True
    for li in lines:
        # By default, the line is kept as it is.
        ls = li
        r1 = reg1.search(li)
        r2 = reg2.search(li)
        if r2:
            ls = ""
            if fir:
                ls = "fLOG = print"
                fir = False
        elif r1:
            st = r1.groups()[0]
            ls = li.replace(st, "from ")
            if fir:
                ls += "\nfLOG = print"
                fir = False
        fil.append(ls.strip("\n\r"))
    if not fir:
        fLOG("[download_data] end removing relative import for ", name)
        with open(outfile, "w") as f:
            f.write("\n".join(fil))


def _log_import_failure(step, name, origname, fLOG):
    """
    Logs the state of ``sys.path`` and ``sys.modules`` after a failed import.

    @param      step        number identifying the failing step in the message
    @param      name        name of the module which could not be imported
    @param      origname    name initially requested by the user
    @param      fLOG        logging function
    """
    fLOG("[download_data] issue ({0}) while importing ".format(step),
         name, " -- ", origname)
    fLOG("[download_data] sys.path ", sys.path)
    for _ in sys.path:
        fLOG("[download_data] path ", _)
    fLOG("[download_data] sys.modules.keys()", list(sys.modules.keys()))
    for _ in sorted(sys.modules):
        fLOG("[download_data] modules ", _)


def _import_downloaded_module(name, origname, outfile, fLOG):
    """
    Imports a module which was just downloaded.

    @param      name        module name
    @param      origname    name initially requested by the user
    @param      outfile     local file name of the module
    @param      fLOG        logging function
    @return                 imported module

    If the import fails because of relative imports, they are removed
    from the file (see ``_strip_relative_imports``) and the import is
    tried a second time. Any other failure is logged and raised again.
    """
    try:
        return importlib.import_module(name)
    except SystemError as e:
        if "Parent module '' not loaded" not in str(e):
            _log_import_failure(2, name, origname, fLOG)
            raise
        _strip_relative_imports(outfile, name, fLOG)
    except Exception:
        _log_import_failure(2, name, origname, fLOG)
        raise

    try:
        return importlib.import_module(name)
    except Exception:
        _log_import_failure(3, name, origname, fLOG)
        raise


def _download_succeeded(res):
    """
    Tells if the result of a single download is usable.

    @param      res     value returned by @see fn download_data
    @return             True for an existing file, a non empty list of files
                        or any other non null object (a module)
    """
    if res is None:
        return False
    if isinstance(res, str):
        return os.path.exists(res)
    if isinstance(res, list):
        return len(res) > 0
    return True


def download_data(name, moduleName=None, url=None, glo=None,
                  loc=None, whereTo=".", website="xd", timeout=None,
                  retry=2, silent=False, fLOG=noLOG):
    """
    Retrieves a module given its name, a text file or a :epkg:`zip` file,
    looks for it on ``http://www.xavierdupre.fr/...`` (website),
    the file is copied at this file location and uncompressed
    if it is a :epkg:`zip` file (or a :epkg:`tar.gz` file).
    This function can be replaced in most cases by function
    `urlretrieve <https://docs.python.org/3/library/urllib.request.html#urllib.request.urlretrieve>`_.

    ::

        import urllib.request
        url = 'https://...'
        dest = "downloaded_file.bin"
        urllib.request.urlretrieve(url, dest)

    @param      name        (str|list) name of the file to download (or a list of names)
    @param      moduleName  (str|None) like import name as moduleName if *name* is a module,
                            if None, no alias is registered
    @param      url         (str|list|None) link to the website to use (or the websites if list),
                            if None, *website* is used
    @param      glo         (dict|None) if None, it will be replaced ``globals()``
    @param      loc         (dict|None) if None, it will be replaced ``locals()``
    @param      whereTo     specify a folder where downloaded files will be placed
    @param      website     website to look for, ``'xd'`` and ``'xdtd'`` are
                            shortcuts for two predefined urls
    @param      timeout     timeout (seconds) when establishing the connection
                            (see `urlopen <https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen>`_)
    @param      retry       number of attempts made to download the data
    @param      silent      if True, convert some exception into warnings when unzipping a tar file
    @param      fLOG        logging function
    @return                 modules or list of files

    By extension, this function also download various zip files and decompresses it.
    If the file was already downloaded, the function will not do it again.

    The function raises ``FileExistsError`` if folder *whereTo* does not exist,
    @see cl DownloadDataException if the data cannot be downloaded,
    @see cl RetrieveDataException if a zip file cannot be decompressed.
    The url is built by concatenating the website and the file name,
    the website should end with ``/``.

    .. exref::
        :title: Download data for a practical lesson

        ::

            from pyensae.datasource import download_data
            download_data('voeux.zip', website='xd')

    .. exref::
        :title: Download data from a website

        ::

            download_data("facebook.tar.gz", website="http://snap.stanford.edu/data/")

    If it does not work, I suggest to use standard python:
    `Download a file from Dropbox with Python <http://www.xavierdupre.fr/blog/2015-01-20_nojs.html>`_.

    .. versionchanged:: 1.1
        Parameters *retry*, *silent* were added.

    .. versionchanged:: 1.2
        Parameter *url* can be a list. The function
        tries the first one which contains the file.
    """
    from ..filehelper.decompress_helper import decompress_zip, decompress_targz, decompress_gz, decompress_bz2

    if glo is None:
        glo = globals()
    if loc is None:
        loc = locals()

    def transform_url(w):
        "replaces shortcuts by the url they stand for"
        if isinstance(w, list):
            return [transform_url(_) for _ in w]
        if w == "xd":
            w = "http://www.xavierdupre.fr/enseignement/complements/"
        elif w == "xdtd":
            w = "http://www.xavierdupre.fr/site2013/enseignements/tddata/"
        return w

    website = transform_url(website)
    url = transform_url(url)
    if url is None:
        url = website

    if not os.path.exists(whereTo):
        # The exception type is kept for backward compatibility.
        raise FileExistsError("this folder should exists " + whereTo)

    # Multiple downloads.
    if isinstance(url, list) or isinstance(name, list):
        common = dict(moduleName=moduleName, glo=glo, loc=loc, whereTo=whereTo,
                      website=website, timeout=timeout, retry=retry,
                      silent=silent, fLOG=fLOG)
        outfiles = []

        if isinstance(url, list):
            # The same name is looked for on every website
            # until one gives a result.
            single = isinstance(name, str)
            if single:
                name = [name] * len(url)
            if not isinstance(name, list):
                raise TypeError("If url is a list, name be a list too.")
            if len(name) != len(url):
                raise ValueError("url and name must be list of the same size.")
            pairs = zip(name, url)
        else:
            single = False
            pairs = ((n, url) for n in name)

        for n, u in pairs:
            res = download_data(n, url=u, **common)
            if isinstance(res, list):
                outfiles.extend(res)
            else:
                outfiles.append(res)
            if single and _download_succeeded(res):
                break
        return outfiles

    # Single download.
    origname = name
    if name in sys.modules:
        return sys.modules[name]
    if "." not in name:
        fLOG("[download_data] unable to find module '{0}'".format(name))

    # A name without extension is considered as a python module.
    file = name if "." in name else "%s.py" % name
    outfile = file if whereTo == "." else os.path.join(whereTo, file)

    if url is not None and not os.path.exists(outfile):
        url += file
        fLOG("[download_data] download '{0}' to '{1}'".format(
            url, outfile))
        content = _fetch_url_content(url, outfile, timeout, retry, fLOG)
        with open(outfile, "wb") as f:
            f.write(content)

    if name.endswith(".zip"):
        try:
            return decompress_zip(outfile, whereTo, fLOG)
        except RuntimeError as e:
            raise RetrieveDataException(  # pragma: no cover
                "Unable to unzip '{}' to '{}' (url='{}').".format(
                    outfile, whereTo, url)) from e

    # '.tar.gz' must be tested before '.gz'.
    if name.endswith(".tar.gz"):
        return decompress_targz(outfile, whereTo, silent=silent, fLOG=fLOG)

    if name.endswith(".gz"):
        return decompress_gz(outfile, whereTo, fLOG)

    if name.endswith(".bz2"):
        return decompress_bz2(outfile, whereTo, fLOG)

    if "." not in name:
        # The file is a module, its folder must be
        # in sys.path to be imported.
        path, filename = os.path.split(outfile)
        if filename != outfile and path not in sys.path:
            sys.path.append(path)

        remove_empty_line(outfile)
        temp = _import_downloaded_module(name, origname, outfile, fLOG)

        if name not in temp.__name__:
            raise NameError(
                "name should be present in __name__ " +
                name +
                " ? " +
                temp.__name__)
        if moduleName is not None:
            # A None key in sys.modules would break sorted(sys.modules).
            glo[moduleName] = temp
            sys.modules[moduleName] = temp
        sys.modules[origname] = temp
        return temp

    if file.split(".")[-1] in ["txt", "csv", "tsv", "xml", "html"]:
        remove_empty_line(outfile)
    return outfile