Coverage for pyquickhelper/filehelper/download_helper.py: 91%

117 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-03 02:21 +0200

1""" 

2@file 

3@brief A function to download the content of a url. 

4""" 

5import os 

6from datetime import datetime 

7import socket 

8import gzip 

9import warnings 

10import hashlib 

11import urllib.error as urllib_error 

12import urllib.request as urllib_request 

13import http.client as http_client 

14try: 

15 from http.client import InvalidURL 

16except ImportError: # pragma: no cover 

17 InvalidURL = ValueError 

18 

19 

class InternetException(Exception):
    """
    Exception raised by @see fn get_url_content_timeout
    when a url cannot be retrieved.
    """

26 

27 

def get_url_content_timeout(url, timeout=10, output=None, encoding="utf8",
                            raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads a file from internet (by default, it assumes
    it is text information, otherwise, encoding should be None).

    @param      url                 (str) url
    @param      timeout             (int) in seconds, after this time, the function drops and returns None, -1 for forever
    @param      output              (str) if not None, the content is stored in that file
    @param      encoding            (str) utf8 by default, but if it is None, the returned information is binary
    @param      raise_exception     (bool) True to raise an exception, False to send a warnings
    @param      chunk               (int|None) save data every chunk (only if output is not None)
    @param      fLOG                logging function (only applies when chunk is not None)
    @return                         content of the url (None when *chunk* is used, the data
                                    is then only written to *output*)

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it.

    The function raises the exception @see cl InternetException.
    """
    def save_content(content, append=False):
        "Writes *content* into *output*, text or binary mode depending on *encoding*."
        app = "a" if append else "w"
        if encoding is not None:
            with open(output, app, encoding=encoding) as f:
                f.write(content)
        else:
            with open(output, app + "b") as f:
                f.write(content)

    try:
        if chunk is not None:
            if output is None:
                raise ValueError(
                    "output cannot be None if chunk is not None")
            # one-element lists so the nested closure can mutate them
            app = [False]
            size = [0]

            def _local_loop(ur):
                "Reads *ur* chunk by chunk and appends each chunk to *output*."
                while True:
                    res = ur.read(chunk)
                    size[0] += len(res)  # pylint: disable=E1137
                    if fLOG is not None:
                        fLOG("[get_url_content_timeout] downloaded",
                             size, "bytes")
                    if len(res) > 0:
                        if encoding is not None:
                            res = res.decode(encoding=encoding)
                        # Pass the flag value app[0], not the list `app` itself:
                        # a non-empty list is always truthy, which would open the
                        # file in append mode even for the very first chunk and
                        # keep any pre-existing content of *output*.
                        save_content(res, app[0])
                        app[0] = True  # pylint: disable=E1137
                    else:
                        break

            if timeout != -1:
                with urllib_request.urlopen(url, timeout=timeout) as ur:
                    _local_loop(ur)
            else:
                with urllib_request.urlopen(url) as ur:
                    _local_loop(ur)
            app = app[0]
            size = size[0]
        else:
            if timeout != -1:
                with urllib_request.urlopen(url, timeout=timeout) as ur:
                    res = ur.read()
            else:
                with urllib_request.urlopen(url) as ur:
                    res = ur.read()
    except (urllib_error.HTTPError, urllib_error.URLError, ConnectionRefusedError,
            socket.timeout, ConnectionResetError, http_client.BadStatusLine,
            http_client.IncompleteRead, ValueError, InvalidURL) as e:
        if raise_exception:
            raise InternetException(
                f"Unable to retrieve content url='{url}'") from e
        warnings.warn(
            f"Unable to retrieve content from '{url}' because of {e}", ResourceWarning)
        return None
    except Exception as e:
        if raise_exception:  # pragma: no cover
            raise InternetException(
                f"Unable to retrieve content, url='{url}', exc={e}") from e
        warnings.warn(
            f"Unable to retrieve content from '{url}' because of unknown exception: {e}", ResourceWarning)
        # NOTE(review): unknown exceptions are re-raised even when
        # raise_exception is False; kept as-is to preserve behavior.
        raise e

    if chunk is None:
        if len(res) >= 2 and res[:2] == b"\x1f\x8B":
            # gzip magic number: decompress transparently
            res = gzip.decompress(res)

        if encoding is not None:
            try:
                content = res.decode(encoding)
            except UnicodeDecodeError as e:  # pragma: no cover
                # it tries different encodings
                laste = [e]
                othenc = ["iso-8859-1", "latin-1"]

                for encode in othenc:
                    try:
                        content = res.decode(encode)
                        break
                    except UnicodeDecodeError as ee:
                        laste.append(ee)
                        content = None

                if content is None:
                    mes = [f"Unable to parse text from '{url}'."]
                    mes.append("tried:" + str([encoding] + othenc))
                    mes.append("beginning:\n" + str([res])[:50])
                    for e in laste:
                        mes.append("Exception: " + str(e))
                    raise ValueError("\n".join(mes))
        else:
            content = res
    else:
        # chunk mode already wrote everything to *output*
        content = None

    if output is not None and chunk is None:
        save_content(content)

    return content

151 

152 

153def _hash_url(url): 

154 m = hashlib.sha256() 

155 m.update(url.encode('utf-8')) 

156 return m.hexdigest()[:25] 

157 

158 

def get_urls_content_timeout(urls, timeout=10, folder=None, encoding=None,
                             raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads data from a list of urls (by default, it assumes
    it is text information, otherwise, encoding should be None).

    :param urls: urls
    :param timeout: in seconds, after this time, the function drops an returns None, -1 for forever
    :param folder: if None, the content is stored in that file
    :param encoding: None by default, but if it is None, the returned information is binary
    :param raise_exception: True to raise an exception, False to send a warnings
    :param chunk: save data every chunk (only if output is not None)
    :param fLOG: logging function (only applies when chunk is not None)
    :return: list of downloaded content

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it.

    The function raises the exception @see cl InternetException.
    """
    import pandas
    import pandas.errors
    if not isinstance(urls, list):
        raise TypeError("urls must be a list")
    if folder is None:
        raise ValueError("folder should not be None")

    # Reload the summary of previous downloads so already fetched
    # urls are skipped.
    summary = os.path.join(folder, "summary.csv")
    df = None
    if os.path.exists(summary):
        try:
            df = pandas.read_csv(summary)
        except pandas.errors.EmptyDataError:
            df = None

    all_obs = []
    done = set()
    if df is not None:
        for row in range(df.shape[0]):
            obs = dict(url=df.loc[row, 'url'],  # pylint: disable=E1101
                       size=df.loc[row, 'size'],  # pylint: disable=E1101
                       date=df.loc[row, 'date'],  # pylint: disable=E1101
                       dest=df.loc[row, 'dest'])  # pylint: disable=E1101
            all_obs.append(obs)
            done.add(obs['dest'])

    for i, url in enumerate(urls):
        dest = _hash_url(url)
        if dest in done:
            continue
        full_dest = os.path.join(folder, dest + '.bin')
        content = get_url_content_timeout(url, timeout=timeout, output=full_dest,
                                          encoding=encoding, chunk=chunk,
                                          raise_exception=raise_exception)
        if content is None:
            continue
        if fLOG is not None:
            fLOG("{}/{} downloaded {} bytes from '{}' to '{}'.".format(
                i + 1, len(urls), len(content), url, dest + '.bin'))
        all_obs.append(dict(url=url, size=len(content), date=datetime.now(),
                            dest=dest))
        done.add(dest)

    pandas.DataFrame(all_obs).to_csv(summary, index=False)
    return all_obs

225 

226 

def local_url(url, folder=None, envvar='REPO_LOCAL_URLS'):
    """
    Replaces the url by a local file in a folder
    or an environment variable
    if *folder* is None.

    :param url: url to replace
    :param folder: local folder
    :param envvar: environment variable
    :return: local file or url
    """
    if folder is None:
        folder = os.environ.get(envvar, None)  # pragma: no cover
    if folder is None:
        raise FileNotFoundError(
            "Unable to find local folder '{}' or environment variable '{}'.".format(
                folder, envvar))
    # the local copy is stored under a hash of the url
    name = os.path.join(folder, _hash_url(url) + '.bin')
    return name if os.path.exists(name) else url