Coverage for pyquickhelper/filehelper/download_helper.py: 91%

117 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-03 02:21 +0200

1""" 

2@file 

3@brief A function to download the content of a url. 

4""" 

5import os 

6from datetime import datetime 

7import socket 

8import gzip 

9import warnings 

10import hashlib 

11import urllib.error as urllib_error 

12import urllib.request as urllib_request 

13import http.client as http_client 

14try: 

15 from http.client import InvalidURL 

16except ImportError: # pragma: no cover 

17 InvalidURL = ValueError 

18 

19 

class InternetException(Exception):
    """
    Exception raised by @see fn get_url_content_timeout
    when a url cannot be retrieved.
    """

26 

27 

def get_url_content_timeout(url, timeout=10, output=None, encoding="utf8",
                            raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads a file from internet (by default, it assumes
    it is text information, otherwise, encoding should be None).

    @param      url                 (str) url
    @param      timeout             (int) in seconds, after this time, the function drops and returns None, -1 for forever
    @param      output              (str) if not None, the content is stored in that file
    @param      encoding            (str) utf8 by default, but if it is None, the returned information is binary
    @param      raise_exception     (bool) True to raise an exception, False to send a warnings
    @param      chunk               (int|None) save data every chunk (only if output is not None)
    @param      fLOG                logging function (only applies when chunk is not None)
    @return                         content of the url (None when *chunk* is used, the data
                                    is then only written to *output*)

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it.

    The function raises the exception @see cl InternetException.
    """
    def save_content(content, append=False):
        "Writes *content* into *output*, text or binary mode depending on *encoding*."
        app = "a" if append else "w"
        if encoding is not None:
            with open(output, app, encoding=encoding) as f:
                f.write(content)
        else:
            with open(output, app + "b") as f:
                f.write(content)

    try:
        if chunk is not None:
            if output is None:
                raise ValueError(
                    "output cannot be None if chunk is not None")
            # one-element lists so the nested closure can mutate them
            app = [False]
            size = [0]

            def _local_loop(ur):
                "Reads *ur* chunk by chunk and appends each chunk to *output*."
                while True:
                    res = ur.read(chunk)
                    size[0] += len(res)  # pylint: disable=E1137
                    if fLOG is not None:
                        fLOG("[get_url_content_timeout] downloaded",
                             size, "bytes")
                    if len(res) > 0:
                        if encoding is not None:
                            res = res.decode(encoding=encoding)
                        # Pass the flag value app[0], not the list `app` itself:
                        # a non-empty list is always truthy, which would open the
                        # file in append mode even for the very first chunk and
                        # keep any pre-existing content of *output*.
                        save_content(res, app[0])
                        app[0] = True  # pylint: disable=E1137
                    else:
                        break

            if timeout != -1:
                with urllib_request.urlopen(url, timeout=timeout) as ur:
                    _local_loop(ur)
            else:
                with urllib_request.urlopen(url) as ur:
                    _local_loop(ur)
            app = app[0]
            size = size[0]
        else:
            if timeout != -1:
                with urllib_request.urlopen(url, timeout=timeout) as ur:
                    res = ur.read()
            else:
                with urllib_request.urlopen(url) as ur:
                    res = ur.read()
    except (urllib_error.HTTPError, urllib_error.URLError, ConnectionRefusedError,
            socket.timeout, ConnectionResetError, http_client.BadStatusLine,
            http_client.IncompleteRead, ValueError, InvalidURL) as e:
        if raise_exception:
            raise InternetException(
                f"Unable to retrieve content url='{url}'") from e
        warnings.warn(
            f"Unable to retrieve content from '{url}' because of {e}", ResourceWarning)
        return None
    except Exception as e:
        if raise_exception:  # pragma: no cover
            raise InternetException(
                f"Unable to retrieve content, url='{url}', exc={e}") from e
        warnings.warn(
            f"Unable to retrieve content from '{url}' because of unknown exception: {e}", ResourceWarning)
        # NOTE(review): unknown exceptions are re-raised even when
        # raise_exception is False; kept as-is to preserve behavior.
        raise e

    if chunk is None:
        if len(res) >= 2 and res[:2] == b"\x1f\x8B":
            # gzip magic number: decompress transparently
            res = gzip.decompress(res)

        if encoding is not None:
            try:
                content = res.decode(encoding)
            except UnicodeDecodeError as e:  # pragma: no cover
                # it tries different encodings
                laste = [e]
                othenc = ["iso-8859-1", "latin-1"]

                for encode in othenc:
                    try:
                        content = res.decode(encode)
                        break
                    except UnicodeDecodeError as ee:
                        laste.append(ee)
                        content = None

                if content is None:
                    mes = [f"Unable to parse text from '{url}'."]
                    mes.append("tried:" + str([encoding] + othenc))
                    mes.append("beginning:\n" + str([res])[:50])
                    for e in laste:
                        mes.append("Exception: " + str(e))
                    raise ValueError("\n".join(mes))
        else:
            content = res
    else:
        # chunk mode already wrote everything to *output*
        content = None

    if output is not None and chunk is None:
        save_content(content)

    return content

151 

152 

153def _hash_url(url): 

154 m = hashlib.sha256() 

155 m.update(url.encode('utf-8')) 

156 return m.hexdigest()[:25] 

157 

158 

def get_urls_content_timeout(urls, timeout=10, folder=None, encoding=None,
                             raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads data from a list of urls (by default, it assumes
    it is text information, otherwise, encoding should be None).

    :param urls: urls
    :param timeout: in seconds, after this time, the function drops an returns None, -1 for forever
    :param folder: if None, the content is stored in that file
    :param encoding: None by default, but if it is None, the returned information is binary
    :param raise_exception: True to raise an exception, False to send a warnings
    :param chunk: save data every chunk (only if output is not None)
    :param fLOG: logging function (only applies when chunk is not None)
    :return: list of downloaded content

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it.

    The function raises the exception @see cl InternetException.
    """
    import pandas
    import pandas.errors
    if not isinstance(urls, list):
        raise TypeError("urls must be a list")
    if folder is None:
        raise ValueError("folder should not be None")

    # Reload the summary of previous downloads so already fetched
    # urls are skipped.
    summary = os.path.join(folder, "summary.csv")
    df = None
    if os.path.exists(summary):
        try:
            df = pandas.read_csv(summary)
        except pandas.errors.EmptyDataError:
            df = None

    all_obs = []
    done = set()
    if df is not None:
        for row in range(df.shape[0]):
            obs = dict(url=df.loc[row, 'url'],  # pylint: disable=E1101
                       size=df.loc[row, 'size'],  # pylint: disable=E1101
                       date=df.loc[row, 'date'],  # pylint: disable=E1101
                       dest=df.loc[row, 'dest'])  # pylint: disable=E1101
            all_obs.append(obs)
            done.add(obs['dest'])

    for i, url in enumerate(urls):
        dest = _hash_url(url)
        if dest in done:
            continue
        full_dest = os.path.join(folder, dest + '.bin')
        content = get_url_content_timeout(url, timeout=timeout, output=full_dest,
                                          encoding=encoding, chunk=chunk,
                                          raise_exception=raise_exception)
        if content is None:
            continue
        if fLOG is not None:
            fLOG("{}/{} downloaded {} bytes from '{}' to '{}'.".format(
                i + 1, len(urls), len(content), url, dest + '.bin'))
        all_obs.append(dict(url=url, size=len(content), date=datetime.now(),
                            dest=dest))
        done.add(dest)

    pandas.DataFrame(all_obs).to_csv(summary, index=False)
    return all_obs

225 

226 

def local_url(url, folder=None, envvar='REPO_LOCAL_URLS'):
    """
    Replaces the url by a local file in a folder
    or an environment variable
    if *folder* is None.

    :param url: url to replace
    :param folder: local folder
    :param envvar: environment variable
    :return: local file or url
    """
    if folder is None:
        folder = os.environ.get(envvar, None)  # pragma: no cover
    if folder is None:
        raise FileNotFoundError(
            "Unable to find local folder '{}' or environment variable '{}'.".format(
                folder, envvar))
    # the local copy is stored under a hash of the url
    name = os.path.join(folder, _hash_url(url) + '.bin')
    return name if os.path.exists(name) else url