Coverage for src/pymyinstall/installhelper/module_install_page_wheel.py: 63%

145 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-07-19 01:47 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Functions get_page_wheel 

5""" 

6 

7import sys 

8from ssl import SSLEOFError 

9from .install_memoize import install_memoize 

10from .internet_settings import default_user_agent 

11 

12if sys.version_info[0] == 2: 

13 import urllib2 as urllib_request 

14 from codecs import open 

15 from HTMLParser import HTMLParser 

16else: 

17 import urllib.request as urllib_request 

18 from html.parser import HTMLParser 

19 from urllib.error import URLError 

20 

21 

class InternalJsException(RuntimeError):
    """
    Error raised when a javascript url cannot be decoded.
    """

27 

28 

@install_memoize
def get_page_wheel(page, sele=True):
    """
    Downloads a page and returns its cleaned content.

    The page is first requested with *urllib* using ``default_user_agent``.
    If that request fails with ``SSLEOFError`` or ``URLError`` and *sele*
    is true, the page is fetched again through a selenium Chrome driver.

    @param      page        location (url)
    @param      sele        if True, falls back on selenium when *urllib* fails,
                            if False, the *urllib* error is raised again
    @return                 page content, processed by ``_clean_page_wheel``

    Raises ``ValueError`` if Chrome cannot be started or if the page
    retrieved with selenium is shorter than 1000 characters,
    ``RuntimeError`` if *urllib* fails with any other exception.
    """
    req = urllib_request.Request(
        page,
        headers={
            'User-agent': default_user_agent})
    try:
        u = urllib_request.urlopen(req)
    except (SSLEOFError, URLError) as ee:
        # This usually happens on Windows.
        # ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:749)
        if not sele:
            raise
        from ..installcustom.install_custom_chromedriver import install_chromedriver
        import selenium.webdriver
        install_chromedriver(fLOG=None)
        try:
            browser = selenium.webdriver.Chrome()
        except Exception as ex:
            raise ValueError(
                f"Unable to load {page!r} (selenium failed too {ex}).") from ee
        try:
            browser.get(page)
            text = browser.page_source
        finally:
            # The browser window is released even if the page cannot be loaded.
            browser.close()
        if len(text) < 1000:
            raise ValueError(
                "Unable to retrieve information from '{0}' with selenium "
                "len={1}".format(page, len(text)))
        # NOTE(review): dumps the whole page on stdout, looks like
        # debugging residue -- confirm before removing.
        print(text)
    except Exception as e:
        raise RuntimeError(
            "unable to get '{0}' '{1}'".format(page, type(e))) from e
    else:
        # urllib succeeded: the handle is closed even if reading fails.
        try:
            text = u.read()
        finally:
            u.close()
        text = text.decode("utf8")

    return _clean_page_wheel(text)

78 

79 

80def _clean_page_wheel(text): 

81 """ 

82 remove unexpected characters 

83 

84 @param text string 

85 @return string 

86 """ 

87 text = text.replace("&quot;", "'") 

88 text = text.replace("&#8209;", "-") 

89 text = text.replace("&#46;", ".") 

90 text = text.replace(" &middot; ", "-") 

91 text = text.replace("&ndash;", "-") 

92 return text 

93 

94 

def save_page_wheel(filename, content):
    """
    Caches a HTML page: writes *content* into *filename* (utf8).

    @param      filename    filename
    @param      content     content
    @return                 filename
    """
    with open(filename, "w", encoding="utf8") as f:
        f.write(content)
    # The function is documented as returning the filename.
    return filename

105 

106 

def read_page_wheel(filename):
    """
    Reads a cached HTML page (utf8).

    @param      filename    filename
    @return                 content of the file processed by ``_clean_page_wheel``
    """
    with open(filename, "r", encoding="utf8") as cached:
        return _clean_page_wheel(cached.read())

117 

118 

119def _cg_dl1(ml, mi): 

120 ot = "" 

121 for j in range(0, len(mi)): 

122 ot += chr(ml[ord(mi[j]) - 48]) 

123 return ot 

124 

125 

def _cg_dl(ml, mi, fLOG=None):
    """
    Python counterpart of the javascript function ``dl`` shown below:
    a few HTML entities of *mi* are unescaped, then the string is
    decoded by ``_cg_dl1``.

    @param      ml      sequence of character codes
    @param      mi      encoded string
    @param      fLOG    logging function, receives *ml* and *mi* if not None
    @return             decoded string

    compressed::

        if (top.location!=location) top.location.href=location.href;
        function dc(ml,mi){var ot="";for(var j=0;j<mi.length;j++)ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
        document.write(ot);}function dl1(ml,mi){var ot="";for(var j=0;j<mi.length;j++)ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
        location.href=ot;}function dl(ml,mi){mi=mi.replace('&lt;','<');mi=mi.replace('&#62;','>');mi=mi.replace('&#38;','&');
        setTimeout(function(){dl1(ml,mi)},1500);}

    source::

        <script type="text/javascript">
        // <![CDATA[
        if (top.location!=location)
            top.location.href=location.href;
        function dc(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            document.write(ot);
        }
        function dl1(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            location.href=ot;
        }
        function dl(ml,mi)
        {
            mi=mi.replace('&lt;','<');
            mi=mi.replace('&#62;','>');
            mi=mi.replace('&#38;','&');
            setTimeout(function(){dl1(ml,mi)},1500);
        }
        // ]]>
        </script>
    """
    if fLOG:
        for value in (ml, mi):
            fLOG("[pymy] decode", value)
    # Entities are unescaped one after the other, in this order.
    entities = (
        ('&lt;', '<'),
        ('&#62;', '>'),
        ('&gt;', '>'),
        ('&#38;', '&'),
    )
    for escaped, char in entities:
        mi = mi.replace(escaped, char)
    return _cg_dl1(ml, mi)

174 

175 

class HTMLParser4Links(HTMLParser):
    """
    Extracts all links in a HTML page.

    After ``feed``, attribute *links* holds one tuple
    ``(text, [(attribute name, attribute value), ...])`` for every tag
    ``<a>`` whose text is not empty.
    """

    def __init__(self):
        """
        constructor
        """
        if sys.version_info[0] == 2:
            HTMLParser.__init__(self)
        else:
            HTMLParser.__init__(self, convert_charrefs=True)
        # collected links
        self.links = []
        # text of the link being parsed, None outside a tag <a>
        self.current = None
        # attributes of the last opened tag <a>
        self.attrs = []

    @staticmethod
    def _clean_dashes(st):
        """
        Replaces non-breaking hyphens (U+2011) by ``-`` and
        non-breaking spaces (U+00A0) by a space.

        @param      st      string or None
        @return             cleaned string, None if *st* is None
        """
        if st is None:
            # The parser gives None to attributes without any value
            # (``<a href="..." download>``).
            return None
        return st.replace('\u2011', '-').replace('\xa0', ' ')

    def handle_starttag(self, tag, attrs):
        """
        enters a tag
        """
        if tag == "a":
            self.current = ""
            self.attrs = attrs

    def handle_endtag(self, tag):
        """
        ends of a tag, stores the link if its text is not empty
        """
        if tag != "a":
            return
        if self.current:
            clean = self._clean_dashes
            app = (clean(self.current),
                   [(clean(name), clean(link)) for name, link in self.attrs])
            self.links.append(app)
        self.current = None

    def handle_data(self, data):
        """
        stores data if a link
        """
        if self.current is not None:
            self.current += data

222 

223 

def extract_all_links(text):
    """
    Feeds a HTML page to ``HTMLParser4Links`` and returns
    the links it collected.

    @param      text        HTML page
    @return                 list of links
    """
    link_parser = HTMLParser4Links()
    link_parser.feed(text)
    return link_parser.links

234 

235 

def enumerate_links_module(name, alls, version, plat):
    """
    Selects the links for a specific module.

    A link is kept if its text starts with the module name (``-`` may be
    replaced by ``_``) followed by ``-``, contains *plat* and contains
    ``cp<version>`` or ``py<version>``. The javascript stored in attribute
    ``onclick`` is then evaluated with ``_cg_dl`` to decode the url.

    @param      name        module name
    @param      alls        all links from @see fn extract_all_links
    @param      version     python version (major, minor, ...)
    @param      plat        platform
    @return                 iterator on tuples ``(link text, javascript, decoded url)``

    Raises ``InternalJsException`` if a selected link has no javascript
    or if the javascript cannot be decoded, ``SyntaxError`` if the
    javascript expression cannot be evaluated.
    """
    version = "%d%d" % tuple(version[:2])
    lname = name.lower()
    lname_ = lname.replace("-", "_") + "-"
    lname += "-"
    vers = ("cp" + version, "py" + version)
    for a in alls:
        n = a[0]
        ln = n.lower()
        if not (ln.startswith((lname, lname_)) and plat in ln):
            continue
        if not any(v in ln for v in vers):
            continue

        js = None
        for at, val in a[1]:
            if at == "onclick" and val is not None:
                js = val.lstrip()

        # Reset for every link: otherwise a link without javascript
        # would reuse the url decoded for the previous link.
        res = None
        if js:
            js0 = js
            suf = '"javascript:dl("'
            bs = ["javascript:", "javascript :", "javascript :"]
            for b in bs:
                if js.startswith(b):
                    js = js[len(b):]
                    if js.endswith(suf):
                        js = js[:-len(suf) - 2]
                    if "javascript:" in js:
                        # Addition: 207-08-24
                        js = js[:js.index('javascript:')]
                    # The evaluated expression calls ``dl(...)``:
                    # this local name makes it resolve to _cg_dl.
                    dl = _cg_dl
                    js = js.strip('" \t ;\'')
                    try:
                        # WARNING(security): eval runs an expression taken
                        # from the downloaded page, only use with trusted pages.
                        res = eval(js)
                    except SyntaxError as e:
                        raise SyntaxError(
                            "Unable to evaluate '{0}'\njs0='{1}'.".format(js, js0)) from e
                    break
        if res is None:
            raise InternalJsException(
                "Unable to decode js '{0}'".format(js))
        yield n, js, res

291 raise InternalJsException( 

292 "Unable to decode js '{0}'".format(js)) 

293 yield n, js, res