Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief A function to download the content of a url. 

4""" 

5import os 

6from datetime import datetime 

7import socket 

8import gzip 

9import warnings 

10import hashlib 

11import urllib.error as urllib_error 

12import urllib.request as urllib_request 

13import http.client as http_client 

14try: 

15 from http.client import InvalidURL 

16except ImportError: 

17 InvalidURL = ValueError 

18 

19 

class InternetException(Exception):
    """
    Error raised by @see fn get_url_content_timeout when the content
    of a url cannot be retrieved.
    """

26 

27 

def get_url_content_timeout(url, timeout=10, output=None, encoding="utf8",
                            raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads a file from internet (by default, it assumes
    it is text information, otherwise, encoding should be None).

    @param      url                 (str) url
    @param      timeout             (int) in seconds, given to ``urlopen``,
                                    -1 to call ``urlopen`` without any explicit timeout
    @param      output              (str) if not None, the content is stored in that file
    @param      encoding            (str) utf8 by default, but if it is None,
                                    the returned information is binary
    @param      raise_exception     (bool) True to raise an exception, False to send a warnings
    @param      chunk               (int|None) reads and saves the data *chunk* bytes
                                    at a time (*output* must then be specified)
    @param      fLOG                logging function (only applies when chunk is not None)
    @return                         content of the url, None if the download failed
                                    and *raise_exception* is False, None as well
                                    if *chunk* is not None (the data is only
                                    available in *output* in that case)

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it (only when *chunk* is None).

    The function raises the exception @see cl InternetException
    if the download fails and *raise_exception* is True.
    If *raise_exception* is False, known failures produce a ``ResourceWarning``
    and the function returns None, an unexpected exception produces
    a warning and is raised again.
    A ``ValueError`` is raised if the downloaded data cannot be decoded.
    """
    import codecs  # only needed to decode text downloaded by chunks

    def save_content(content, append=False):
        "writes *content* into *output*, as text or bytes depending on *encoding*"
        mode = "a" if append else "w"
        if encoding is not None:
            with open(output, mode, encoding=encoding) as f:
                f.write(content)
        else:
            with open(output, mode + "b") as f:
                f.write(content)

    def download_by_chunks(ur):
        "reads *ur* chunk by chunk and stores every piece into *output*"
        # An incremental decoder keeps the bytes of a character split
        # between two chunks until the remaining bytes are received.
        decoder = (None if encoding is None
                   else codecs.getincrementaldecoder(encoding)())
        append = False  # the first chunk must overwrite any existing file
        size = 0
        while True:
            data = ur.read(chunk)
            size += len(data)
            if fLOG is not None:
                fLOG("[get_url_content_timeout] downloaded", size, "bytes")
            if not data:
                break
            if decoder is not None:
                data = decoder.decode(data)
            save_content(data, append)
            append = True
        if decoder is not None:
            # Fails if the stream ends in the middle of a character.
            tail = decoder.decode(b"", final=True)
            if tail:
                save_content(tail, append)

    def give_up(exc, error_message, warning_message):
        "raises @see cl InternetException or only warns, based on *raise_exception*"
        if raise_exception:
            raise InternetException(error_message) from exc
        warnings.warn(warning_message, ResourceWarning, stacklevel=2)

    open_kwargs = {} if timeout == -1 else {"timeout": timeout}
    res = None
    try:
        # The check stays inside the try block: callers receive it as an
        # InternetException (or a warning) like any other failure.
        if chunk is not None and output is None:
            raise ValueError(
                "output cannot be None if chunk is not None")
        with urllib_request.urlopen(url, **open_kwargs) as ur:
            if chunk is None:
                res = ur.read()
            else:
                download_by_chunks(ur)
    except (urllib_error.HTTPError, urllib_error.URLError,
            ConnectionRefusedError) as e:
        give_up(e, "Unable to retrieve content, url='{0}'".format(url),
                "Unable to retrieve content from '{0}' exc: {1}".format(url, e))
        return None
    except socket.timeout as e:
        give_up(e, "Unable to retrieve content, url='{0}'".format(url),
                "unable to retrieve content from {0} because of timeout {1}: {2}".format(
                    url, timeout, e))
        return None
    except ConnectionResetError as e:
        give_up(e, "Unable to retrieve content, url='{0}'".format(url),
                "unable to retrieve content from {0} because of ConnectionResetError: {1}".format(url, e))
        return None
    except http_client.BadStatusLine as e:
        give_up(e, "Unable to retrieve content, url='{0}'".format(url),
                "Unable to retrieve content from '{0}' because of http.client.BadStatusLine: {1}".format(url, e))
        return None
    except http_client.IncompleteRead as e:
        give_up(e, "Unable to retrieve content url='{0}'".format(url),
                "Unable to retrieve content from '{0}' because of http.client.IncompleteRead: {1}".format(url, e))
        return None
    except (ValueError, InvalidURL) as e:
        give_up(e, "Unable to retrieve content url='{0}'".format(url),
                "Unable to retrieve content from '{0}' because of {1}".format(url, e))
        return None
    except Exception as e:
        give_up(e, "Unable to retrieve content, url='{0}', exc={1}".format(url, e),
                "Unable to retrieve content from '{0}' because of unknown exception: {1}".format(url, e))
        # Unexpected failures are never silenced, even with raise_exception=False.
        raise

    if chunk is not None:
        # The data was streamed to *output*, nothing was kept in memory.
        return None

    if res.startswith(b"\x1f\x8B"):
        # gzip format
        res = gzip.decompress(res)

    if encoding is None:
        content = res
    else:
        try:
            content = res.decode(encoding)
        except UnicodeDecodeError as first_error:
            # it tries different encoding
            errors = [first_error]
            othenc = ["iso-8859-1", "latin-1"]
            content = None
            for candidate in othenc:
                try:
                    content = res.decode(candidate)
                    break
                except UnicodeDecodeError as err:
                    errors.append(err)

            if content is None:
                mes = ["Unable to parse text from '{0}'.".format(url)]
                mes.append("tried:" + str([encoding] + othenc))
                mes.append("beginning:\n" + str([res])[:50])
                for err in errors:
                    mes.append("Exception: " + str(err))
                raise ValueError("\n".join(mes))

    if output is not None:
        save_content(content)

    return content

185 

186 

187def _hash_url(url): 

188 m = hashlib.sha256() 

189 m.update(url.encode('utf-8')) 

190 return m.hexdigest()[:25] 

191 

192 

def get_urls_content_timeout(urls, timeout=10, folder=None, encoding=None,
                             raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads data from urls (by default, it assumes
    it is binary information, otherwise, encoding should be specified).
    Every url is stored in *folder* in a file named after a hash of the url
    (extension ``.bin``). File ``summary.csv`` in the same folder
    lists what was downloaded, a url already listed in it
    is not downloaded again.

    :param urls: list of urls
    :param timeout: in seconds, given to @see fn get_url_content_timeout,
        -1 for no explicit timeout
    :param folder: folder where the downloaded files and ``summary.csv``
        are stored, it cannot be None
    :param encoding: None by default, the data is then handled as binary
    :param raise_exception: True to raise an exception, False to send a warnings
    :param chunk: save data every chunk
    :param fLOG: logging function
    :return: list of dictionaries (keys ``url``, ``size``, ``date``, ``dest``),
        one per downloaded url, including the ones read from ``summary.csv``

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it.

    The function raises the exception @see cl InternetException.
    It raises ``TypeError`` if *urls* is not a list and ``ValueError``
    if *folder* is None.
    """
    import pandas
    import pandas.errors
    if not isinstance(urls, list):
        raise TypeError("urls must be a list")
    if folder is None:
        raise ValueError("folder should not be None")
    summary = os.path.join(folder, "summary.csv")
    df = None
    if os.path.exists(summary):
        try:
            df = pandas.read_csv(summary)
        except pandas.errors.EmptyDataError:
            # an empty summary means nothing was downloaded yet
            df = None
    if df is None:
        all_obs = []
    else:
        all_obs = df[['url', 'size', 'date', 'dest']].to_dict('records')
    done = {obs['dest'] for obs in all_obs}

    for i, url in enumerate(urls):
        dest = _hash_url(url)
        if dest in done:
            continue
        full_dest = os.path.join(folder, dest + '.bin')
        content = get_url_content_timeout(url, timeout=timeout, output=full_dest,
                                          encoding=encoding, chunk=chunk,
                                          raise_exception=raise_exception,
                                          fLOG=fLOG)
        if content is not None:
            size = len(content)
        elif (chunk is not None and raise_exception and
                os.path.exists(full_dest)):
            # A chunked download returns None even when it succeeds.
            # With raise_exception=True a failure would have raised,
            # so the file on disk is the complete download.
            size = os.path.getsize(full_dest)
        else:
            # Failed download, or a chunked download whose outcome
            # cannot be told apart from a failure (raise_exception=False).
            continue
        if fLOG is not None:
            fLOG("{}/{} downloaded {} bytes from '{}' to '{}'.".format(
                i + 1, len(urls), size, url, dest + '.bin'))

        obs = dict(url=url, size=size, date=datetime.now(),
                   dest=dest)
        all_obs.append(obs)
        done.add(dest)

    new_df = pandas.DataFrame(all_obs)
    new_df.to_csv(summary, index=False)
    return all_obs

259 

260 

def local_url(url, folder=None, envvar='REPO_LOCAL_URLS'):
    """
    Returns the local copy of a url if there is one.
    The copy is searched in *folder* or, if *folder* is None,
    in the folder given by environment variable *envvar*.
    Its name is the hash of the url followed by ``.bin``.

    :param url: url to replace
    :param folder: local folder
    :param envvar: environment variable used when *folder* is None
    :return: local file if it exists, the url otherwise
    :raises FileNotFoundError: if *folder* is None and
        the environment variable is not set
    """
    location = folder
    if location is None:
        location = os.environ.get(envvar, None)  # pragma: no cover
    if location is None:
        raise FileNotFoundError(
            "Unable to find local folder '{}' or environment variable '{}'.".format(
                location, envvar))
    candidate = os.path.join(location, _hash_url(url) + '.bin')
    return candidate if os.path.exists(candidate) else url