"""
@brief A function to download the content of a url.
"""


import os
from datetime import datetime
import socket
import gzip
import warnings
import hashlib
import urllib.error as urllib_error
import urllib.request as urllib_request
import http.client as http_client

# InvalidURL is only exposed by some versions of http.client;
# fall back to ValueError so that ``except InvalidURL`` stays valid.
try:
    from http.client import InvalidURL
except ImportError:
    InvalidURL = ValueError



class InternetException(Exception):
    """
    Exception raised by @see fn get_url_content_timeout
    when the content of a url cannot be retrieved.
    """



def get_url_content_timeout(url, timeout=10, output=None, encoding="utf8",
                            raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads a file from internet (by default, it assumes
    it is text information, otherwise, encoding should be None).

    @param      url                 (str) url
    @param      timeout             (int) in seconds, passed to ``urlopen``,
                                    -1 to call ``urlopen`` without any timeout
    @param      output              (str) if not None, the content is stored in that file
    @param      encoding            (str) utf8 by default, but if it is None,
                                    the returned information is binary
    @param      raise_exception     (bool) True to raise an exception, False to send a warning
                                    and return None
    @param      chunk               (int|None) if not None, reads the url by blocks of *chunk*
                                    bytes and writes each of them to *output*
                                    (*output* must not be None in that case)
    @param      fLOG                logging function (only applies when chunk is not None)
    @return                         content of the url (str, or bytes if *encoding* is None),
                                    None if *chunk* is not None (the content is only
                                    written to *output*), None if the download failed
                                    and *raise_exception* is False

    When *chunk* is None and the downloaded data starts with the gzip
    magic number, it is decompressed. No decompression happens
    when *chunk* is not None.

    When *chunk* is None and the data cannot be decoded with *encoding*,
    the function tries ``iso-8859-1`` and ``latin-1`` and raises
    ``ValueError`` if none of them works.

    The function raises the exception @see cl InternetException
    if the download fails and *raise_exception* is True.
    Unexpected exceptions are raised again after a warning
    when *raise_exception* is False.
    """
    import codecs

    def save_content(content, append=False):
        "writes *content* to *output*, text if *encoding* is set, bytes otherwise"
        mode = "a" if append else "w"
        if encoding is not None:
            with open(output, mode, encoding=encoding) as f:
                f.write(content)
        else:
            with open(output, mode + "b") as f:
                f.write(content)

    def stream_to_file(ur):
        "reads *ur* by blocks of *chunk* bytes and saves them into *output*"
        # An incremental decoder keeps the bytes of a character split
        # across two blocks until the character is complete.
        decoder = (codecs.getincrementaldecoder(encoding)()
                   if encoding is not None else None)
        append = False
        size = 0
        while True:
            data = ur.read(chunk)
            size += len(data)
            if fLOG is not None:
                fLOG("[get_url_content_timeout] downloaded", size, "bytes")
            if not data:
                break
            if decoder is not None:
                data = decoder.decode(data)
            # The first block overwrites the file, the next ones are appended.
            save_content(data, append)
            append = True
        if decoder is not None:
            # Raises UnicodeDecodeError if the stream ends in the middle
            # of a character.
            tail = decoder.decode(b"", final=True)
            if tail:
                save_content(tail, append)

    open_kwargs = {} if timeout == -1 else {"timeout": timeout}
    res = None

    try:
        if chunk is not None:
            # Raised inside the try block on purpose: callers receive it
            # as an InternetException (or a warning), as they always did.
            if output is None:
                raise ValueError(
                    "output cannot be None if chunk is not None")
            with urllib_request.urlopen(url, **open_kwargs) as ur:
                stream_to_file(ur)
        else:
            with urllib_request.urlopen(url, **open_kwargs) as ur:
                res = ur.read()
    except (urllib_error.HTTPError, urllib_error.URLError,
            ConnectionRefusedError) as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' exc: {1}".format(url, e), ResourceWarning)
        return None
    except socket.timeout as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn("unable to retrieve content from {0} because of timeout {1}: {2}".format(
            url, timeout, e), ResourceWarning)
        return None
    except ConnectionResetError as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn(
            "unable to retrieve content from {0} because of ConnectionResetError: {1}".format(url, e), ResourceWarning)
        return None
    except http_client.BadStatusLine as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of http.client.BadStatusLine: {1}".format(url, e), ResourceWarning)
        return None
    except http_client.IncompleteRead as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of http.client.IncompleteRead: {1}".format(url, e), ResourceWarning)
        return None
    except (ValueError, InvalidURL) as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of {1}".format(url, e), ResourceWarning)
        return None
    except Exception as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}', exc={1}".format(url, e)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of unknown exception: {1}".format(url, e), ResourceWarning)
        # Unknown failures are never silenced, even with raise_exception=False.
        raise

    if chunk is not None:
        # The content was already written to *output* block by block.
        return None

    if res[:2] == b"\x1f\x8B":
        # gzip format
        res = gzip.decompress(res)

    if encoding is None:
        content = res
    else:
        # The requested encoding goes first, then the fallbacks.
        tried = [encoding, "iso-8859-1", "latin-1"]
        failures = []
        content = None
        for candidate in tried:
            try:
                content = res.decode(candidate)
                break
            except UnicodeDecodeError as exc:
                failures.append(exc)

        if content is None:
            mes = ["Unable to parse text from '{0}'.".format(url)]
            mes.append("tried:" + str(tried))
            mes.append("beginning:\n" + str([res])[:50])
            for exc in failures:
                mes.append("Exception: " + str(exc))
            raise ValueError("\n".join(mes))

    if output is not None:
        save_content(content)

    return content



def _hash_url(url):
    """
    Returns the first 25 hexadecimal characters of the SHA-256
    digest of *url* encoded in UTF-8.

    :param url: url (str)
    :return: 25 characters (str)
    """
    return hashlib.sha256(url.encode('utf-8')).hexdigest()[:25]



def get_urls_content_timeout(urls, timeout=10, folder=None, encoding=None,
                             raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads data from urls into a folder and keeps track of the
    downloads in a file ``summary.csv`` stored in the same folder.

    :param urls: list of urls
    :param timeout: in seconds, given to @see fn get_url_content_timeout,
        -1 for no timeout
    :param folder: folder where the downloaded files
        (``<hash of the url>.bin``) and ``summary.csv`` are written,
        it must not be None
    :param encoding: None by default (binary content),
        given to @see fn get_url_content_timeout
    :param raise_exception: True to raise an exception, False to send a warning
    :param chunk: save data every chunk bytes,
        given to @see fn get_url_content_timeout
    :param fLOG: logging function, called once per downloaded url
        and given to @see fn get_url_content_timeout
    :return: list of dictionaries with keys *url*, *size*, *date*, *dest*,
        one per url recorded in ``summary.csv``
        (previous runs included)

    A url whose hash is already in ``summary.csv`` is not downloaded again.
    A url for which @see fn get_url_content_timeout returns None
    is skipped and not recorded.

    The function raises ``TypeError`` if *urls* is not a list,
    ``ValueError`` if *folder* is None.
    """
    import pandas
    import pandas.errors
    if not isinstance(urls, list):
        raise TypeError("urls must be a list")
    if folder is None:
        raise ValueError("folder should not be None")

    summary = os.path.join(folder, "summary.csv")
    df = None
    if os.path.exists(summary):
        try:
            df = pandas.read_csv(summary)
        except pandas.errors.EmptyDataError:
            # An empty summary is treated like a missing one.
            df = None

    if df is None:
        all_obs = []
    else:
        # Iterating over records touches no column when there is no row.
        all_obs = [dict(url=row['url'], size=row['size'],
                        date=row['date'], dest=row['dest'])
                   for row in df.to_dict('records')]
    done = {obs['dest'] for obs in all_obs}

    for i, url in enumerate(urls):
        dest = _hash_url(url)
        if dest in done:
            continue
        full_dest = os.path.join(folder, dest + '.bin')
        content = get_url_content_timeout(url, timeout=timeout, output=full_dest,
                                          encoding=encoding, chunk=chunk,
                                          raise_exception=raise_exception,
                                          fLOG=fLOG)
        # NOTE(review): get_url_content_timeout returns None when chunk is
        # not None, so chunked downloads are never recorded in the
        # summary -- confirm whether this is intended.
        if content is None:
            continue
        if fLOG is not None:
            fLOG("{}/{} downloaded {} bytes from '{}' to '{}'.".format(
                i + 1, len(urls), len(content), url, dest + '.bin'))

        all_obs.append(dict(url=url, size=len(content), date=datetime.now(),
                            dest=dest))
        done.add(dest)

    pandas.DataFrame(all_obs).to_csv(summary, index=False)
    return all_obs



def local_url(url, folder=None, envvar='REPO_LOCAL_URLS'):
    """
    Replaces the url by a local file in a folder
    or in the folder given by an environment variable
    if *folder* is None.

    :param url: url to replace
    :param folder: local folder
    :param envvar: environment variable read when *folder* is None
    :return: path ``<folder>/<hash of the url>.bin`` if that file exists,
        the url otherwise
    :raises FileNotFoundError: if *folder* is None and
        the environment variable is not set
    """
    if folder is None:
        folder = os.environ.get(envvar, None)  # pragma: no cover
    if folder is None:
        raise FileNotFoundError(
            "Unable to find local folder '{}' or environment variable '{}'.".format(
                folder, envvar))
    name = os.path.join(folder, _hash_url(url) + '.bin')
    return name if os.path.exists(name) else url

282 return url