Coverage for src/pymyinstall/installhelper/module_install_page_wheel.py: 63%
145 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-19 01:47 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-19 01:47 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Functions get_page_wheel
5"""
7import sys
8from ssl import SSLEOFError
9from .install_memoize import install_memoize
10from .internet_settings import default_user_agent
12if sys.version_info[0] == 2:
13 import urllib2 as urllib_request
14 from codecs import open
15 from HTMLParser import HTMLParser
16else:
17 import urllib.request as urllib_request
18 from html.parser import HTMLParser
19 from urllib.error import URLError
class InternalJsException(RuntimeError):
    """
    Raised when a javascript url cannot be decrypted.
    """
@install_memoize
def get_page_wheel(page, sele=True):
    """
    Downloads a page and returns its cleaned content.

    The page is first requested with *urllib*. If this request fails with
    ``SSLEOFError`` or ``URLError`` and *sele* is true, the page is
    retrieved a second time through a Chrome browser driven by selenium.

    @param      page        location
    @param      sele        use selenium when the direct request fails,
                            if False, the original exception is raised again
    @return                 page content, cleaned by ``_clean_page_wheel``

    Raises ``ValueError`` if selenium cannot start Chrome or returns a page
    shorter than 1000 characters, ``RuntimeError`` if the direct request
    fails with any exception other than ``SSLEOFError`` or ``URLError``.
    """
    req = urllib_request.Request(
        page, headers={'User-agent': default_user_agent})
    try:
        u = urllib_request.urlopen(req)
    except (SSLEOFError, URLError) as ee:
        # This usually happens on Windows.
        # ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:749)
        if not sele:
            # bare raise keeps the original traceback untouched
            raise
        # imported here so that selenium is only required for the fallback
        from ..installcustom.install_custom_chromedriver import install_chromedriver
        import selenium.webdriver
        install_chromedriver(fLOG=None)
        try:
            browser = selenium.webdriver.Chrome()
        except Exception as ex:
            raise ValueError(
                f"Unable to load {page!r} (selenium failed too {ex}).") from ee
        try:
            browser.get(page)
            text = browser.page_source
        finally:
            # the browser is closed even if the page cannot be loaded
            browser.close()
        if len(text) < 1000:
            raise ValueError(
                "Unable to retrieve information from '{0}' with selenium "
                "len={1}".format(page, len(text)))
    except Exception as e:
        raise RuntimeError(
            "unable to get '{0}' '{1}'".format(page, type(e))) from e
    else:
        # direct request succeeded: read the answer, always release it
        try:
            text = u.read()
        finally:
            u.close()
        text = text.decode("utf8")

    return _clean_page_wheel(text)
80def _clean_page_wheel(text):
81 """
82 remove unexpected characters
84 @param text string
85 @return string
86 """
87 text = text.replace(""", "'")
88 text = text.replace("‑", "-")
89 text = text.replace(".", ".")
90 text = text.replace(" · ", "-")
91 text = text.replace("–", "-")
92 return text
def save_page_wheel(filename, content):
    """
    Caches a HTML page, the file is written with utf-8 encoding.

    @param      filename    filename
    @param      content     content
    @return                 filename
    """
    with open(filename, "w", encoding="utf8") as f:
        f.write(content)
    # the documented return value was missing
    return filename
def read_page_wheel(filename):
    """
    Reads a cached HTML page (utf-8 encoded).

    @param      filename    filename
    @return                 content of the file processed by ``_clean_page_wheel``
    """
    with open(filename, "r", encoding="utf8") as handle:
        return _clean_page_wheel(handle.read())
119def _cg_dl1(ml, mi):
120 ot = ""
121 for j in range(0, len(mi)):
122 ot += chr(ml[ord(mi[j]) - 48])
123 return ot
def _cg_dl(ml, mi, fLOG=None):
    """
    Python version of the javascript function ``dl`` shown below:
    a few HTML character references are decoded in *mi*, then the result
    is decoded with ``_cg_dl1``.

    @param      ml      sequence of code points
    @param      mi      encoded string
    @param      fLOG    logging function, called twice if specified
    @return             decoded string

    compressed::

        if (top.location!=location) top.location.href=location.href;
        function dc(ml,mi){var ot="";for(var j=0;j<mi.length;j++)ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
        document.write(ot);}function dl1(ml,mi){var ot="";for(var j=0;j<mi.length;j++)ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
        location.href=ot;}function dl(ml,mi){mi=mi.replace('&lt;','<');mi=mi.replace('&gt;','>');mi=mi.replace('&amp;','&');
        setTimeout(function(){dl1(ml,mi)},1500);}

    source::

        <script type="text/javascript">
        // <![CDATA[
        if (top.location!=location)
            top.location.href=location.href;
        function dc(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            document.write(ot);
        }
        function dl1(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            location.href=ot;
        }
        function dl(ml,mi)
        {
            mi=mi.replace('&lt;','<');
            mi=mi.replace('&gt;','>');
            mi=mi.replace('&amp;','&');
            setTimeout(function(){dl1(ml,mi)},1500);
        }
        // ]]>
        </script>

    NOTE(review): the character references (``&lt;``, ``&#62;``, ``&gt;``,
    ``&amp;``) were restored from a listing where they appeared already
    decoded, which turned every replacement into a no-op; confirm against
    the repository.
    """
    if fLOG:
        fLOG("[pymy] decode", ml)
        fLOG("[pymy] decode", mi)
    mi = mi.replace('&lt;', '<')
    mi = mi.replace('&#62;', '>')
    mi = mi.replace('&gt;', '>')
    # '&amp;' goes last so that '&amp;lt;' becomes '&lt;' and not '<'
    mi = mi.replace('&amp;', '&')
    return _cg_dl1(ml, mi)
class HTMLParser4Links(HTMLParser):
    """
    Extracts all links in a HTML page.

    Every ``<a>`` element with a non-empty text is stored in ``links``
    as a tuple ``(text, [(attribute name, attribute value), ...])``.
    """

    def __init__(self):
        """
        constructor
        """
        if sys.version_info[0] == 2:
            HTMLParser.__init__(self)
        else:
            HTMLParser.__init__(self, convert_charrefs=True)
        # links collected so far
        self.links = []
        # text of the link being parsed, None outside of a link
        self.current = None
        # attributes of the last opened link
        self.attrs = []

    @staticmethod
    def _clean_dashes(st):
        """
        Replaces non-breaking hyphens (U+2011) by ``-`` and non-breaking
        spaces (U+00A0) by a space.

        @param      st      string or None (attribute without a value)
        @return             cleaned string, None if *st* is None
        """
        if st is None:
            # HTMLParser gives None for attributes such as <a download>
            return None
        return st.replace(u"\u2011", "-").replace(u"\xa0", " ")

    def handle_starttag(self, tag, attrs):
        """
        enters a tag, starts recording the text if it is a link
        """
        if tag == "a":
            self.current = ""
            self.attrs = attrs

    def handle_endtag(self, tag):
        """
        ends of a tag, stores the link if its text is not empty
        """
        if tag != "a":
            return
        if self.current:
            clean = self._clean_dashes
            self.links.append(
                (clean(self.current),
                 [(clean(name), clean(link)) for name, link in self.attrs]))
        # recording stops whether or not the link was kept
        self.current = None

    def handle_data(self, data):
        """
        stores data if inside a link
        """
        if self.current is not None:
            self.current += data
def extract_all_links(text):
    """
    Parses a HTML page with ``HTMLParser4Links`` and returns the links
    it collected.

    @param      text        HTML page
    @return                 list of links
    """
    link_parser = HTMLParser4Links()
    link_parser.feed(text)
    found = link_parser.links
    return found
def enumerate_links_module(name, alls, version, plat):
    """
    Selects the links for a specific module.

    A link is kept if its text starts with the module name followed by
    ``-`` (``-`` in the name may be written ``_``), contains *plat* and
    one of the tags ``cp<version>`` or ``py<version>``, and has a non-empty
    ``onclick`` attribute.

    @param      name        module name
    @param      alls        all links from @see fn extract_all_links
    @param      version     python version
    @param      plat        platform
    @return                 iterator on tuples
                            *(link text, javascript expression, evaluated result)*

    Raises ``InternalJsException`` if the javascript of a selected link
    cannot be decoded, ``SyntaxError`` if it cannot be parsed.
    """
    pyver = "%d%d" % version[:2]
    lowered = name.lower()
    name_prefixes = (lowered + "-", lowered.replace("-", "_") + "-")
    version_tags = ("cp" + pyver, "py" + pyver)
    suf = '"javascript:dl("'
    js_prefixes = ["javascript:", "javascript :", "javascript :"]

    for a in alls:
        n = a[0]
        ln = n.lower()
        # filters on the module name and the platform, then on the version
        if not ln.startswith(name_prefixes) or plat not in ln:
            continue
        if not any(tag in ln for tag in version_tags):
            continue

        # the last onclick attribute wins
        js = None
        for at, val in a[1]:
            if at == "onclick":
                js = val.lstrip()
        if not js:
            continue

        js0 = js
        res = None
        prefix = next((p for p in js_prefixes if js.startswith(p)), None)
        if prefix is not None:
            js = js[len(prefix):]
            if js.endswith(suf):
                js = js[:-len(suf) - 2]
            if "javascript:" in js:
                # keeps what precedes a second 'javascript:'
                # (the original comment dates this addition '207-08-24')
                js = js.partition('javascript:')[0]
            # 'dl' must be a local name: the evaluated expression calls it
            dl = _cg_dl
            js = js.strip('" \t ;\'')
            if dl is not None:
                try:
                    # NOTE(review): eval runs on text extracted from the
                    # links received in 'alls'; if these come from a
                    # downloaded page, this executes untrusted input.
                    res = eval(js)
                except SyntaxError as e:
                    raise SyntaxError(
                        "Unable to evaluate '{0}'\njs0='{1}'.".format(js, js0)) from e
        if res is None:
            raise InternalJsException(
                "Unable to decode js '{0}'".format(js))
        yield n, js, res