Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Functions get_page_wheel
5"""
7import sys
8from ssl import SSLEOFError
9from .install_memoize import install_memoize
10from .internet_settings import default_user_agent
12if sys.version_info[0] == 2:
13 import urllib2 as urllib_request
14 from codecs import open
15 from HTMLParser import HTMLParser
16else:
17 import urllib.request as urllib_request
18 from html.parser import HTMLParser
19 from urllib.error import URLError
class InternalJsException(Exception):
    """
    Raised when the javascript attached to a link cannot be decoded.
    """
@install_memoize
def get_page_wheel(page, sele=True):
    """
    Downloads a page and returns its cleaned content.

    @param      page        location (url) of the page
    @param      sele        if True, falls back on selenium (Chrome) when the
                            plain request fails with *SSLEOFError* or *URLError*,
                            if False, that error is raised again
    @return                 page content

    The content retrieved with *urllib* is decoded as utf-8.
    The function raises *ValueError* if the page returned by selenium
    has fewer than 1000 characters and *Exception* for any other
    failure of the plain request.
    """
    req = urllib_request.Request(
        page, headers={'User-agent': default_user_agent})
    try:
        u = urllib_request.urlopen(req)
    except (SSLEOFError, URLError):
        # This usually happens on Windows.
        # ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:749)
        if not sele:
            raise
        from ..installcustom.install_custom_chromedriver import install_chromedriver
        import selenium.webdriver
        install_chromedriver(fLOG=None)
        browser = selenium.webdriver.Chrome()
        try:
            browser.get(page)
            text = browser.page_source
        finally:
            # the browser is closed even if the page cannot be loaded
            browser.close()
        if len(text) < 1000:
            raise ValueError(
                "Unable to retrieve information from '{0}' with selenium len={1}".format(page, len(text)))
    except Exception as e:
        raise Exception(
            "unable to get '{0}' '{1}'".format(page, type(e))) from e
    else:
        try:
            text = u.read()
        finally:
            # the connection is released even if the reading fails
            u.close()
        text = text.decode("utf8")

    return _clean_page_wheel(text)
74def _clean_page_wheel(text):
75 """
76 remove unexpected characters
78 @param text string
79 @return string
80 """
81 text = text.replace(""", "'")
82 text = text.replace("‑", "-")
83 text = text.replace(".", ".")
84 text = text.replace(" · ", "-")
85 text = text.replace("–", "-")
86 return text
def save_page_wheel(filename, content):
    """
    Caches a HTML page, the file is written with utf-8 encoding.

    @param      filename    filename
    @param      content     content (string)
    @return                 filename
    """
    with open(filename, "w", encoding="utf8") as f:
        f.write(content)
    # the documented contract is to return the filename
    return filename
def read_page_wheel(filename):
    """
    Reads a cached HTML page (utf-8 encoded) and cleans its content.

    @param      filename    filename
    @return                 cleaned content of the file
    """
    with open(filename, "r", encoding="utf8") as handle:
        content = handle.read()
    return _clean_page_wheel(content)
113def _cg_dl1(ml, mi):
114 ot = ""
115 for j in range(0, len(mi)):
116 ot += chr(ml[ord(mi[j]) - 48])
117 return ot
def _cg_dl(ml, mi, fLOG=None):
    """
    Python counterpart of the javascript function *dl* shown below:
    HTML entities are decoded in *mi*, then *mi* is decoded with *ml*
    (see *_cg_dl1*).

    @param      ml      sequence of code points
    @param      mi      encoded string
    @param      fLOG    logging function, called with both arguments if not None
    @return             decoded string

    javascript source::

        <script type="text/javascript">
        // <![CDATA[
        if (top.location!=location)
            top.location.href=location.href;
        function dc(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            document.write(ot);
        }
        function dl1(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            location.href=ot;
        }
        function dl(ml,mi)
        {
            mi=mi.replace('&lt;','<');
            mi=mi.replace('&gt;','>');
            mi=mi.replace('&amp;','&');
            setTimeout(function(){dl1(ml,mi)},1500);
        }
        // ]]>
        </script>
    """
    if fLOG:
        fLOG("[pymy] decode", ml)
        fLOG("[pymy] decode", mi)
    # NOTE(review): the entity spellings were reconstructed, the version of
    # this function under review replaced '<' by '<', '>' by '>' (twice) and
    # '&' by '&', which does nothing -- confirm against the upstream file.
    # '&amp;' is decoded last so that '&amp;lt;' becomes '&lt;' and not '<'.
    entities = (
        ('&lt;', '<'),
        ('&#62;', '>'),
        ('&gt;', '>'),
        ('&amp;', '&'),
    )
    for entity, char in entities:
        mi = mi.replace(entity, char)
    return _cg_dl1(ml, mi)
class HTMLParser4Links(HTMLParser):
    """
    Extracts all links in a HTML page.

    After the page was fed to the parser, attribute *links* contains
    one tuple *(text, attributes)* per tag ``<a>`` with a non empty text,
    *attributes* is a list of tuples *(name, value)*.
    """

    def __init__(self):
        """
        constructor
        """
        if sys.version_info[0] == 2:
            HTMLParser.__init__(self)
        else:
            HTMLParser.__init__(self, convert_charrefs=True)
        self.links = []
        # text of the link being parsed, None outside a tag <a>
        self.current = None
        # attributes of the last opened tag <a>
        self.attrs = []

    @staticmethod
    def _clean_dashes(st):
        """
        Replaces non-breaking hyphens by ``-`` and non-breaking spaces
        by a space, *None* is returned unchanged (an attribute
        without any value such as ``download`` has value *None*).
        """
        if st is None:
            return None
        return st.replace(u'\u2011', u'-').replace(u'\xa0', u' ')

    def handle_starttag(self, tag, attrs):
        """
        enters a tag, starts collecting the text if it is a link
        """
        if tag == "a":
            self.current = ""
            self.attrs = attrs

    def handle_endtag(self, tag):
        """
        ends a tag, stores the link if its text is not empty
        """
        if tag != "a":
            return
        if self.current:
            clean = HTMLParser4Links._clean_dashes
            app = (clean(self.current),
                   [(clean(name), clean(link)) for name, link in self.attrs])
            self.links.append(app)
        self.current = None

    def handle_data(self, data):
        """
        stores data if inside a link
        """
        if self.current is not None:
            self.current += data
def extract_all_links(text):
    """
    Parses a HTML page with @see cl HTMLParser4Links and returns
    every link it collected.

    @param      text        HTML page
    @return                 list of links, tuples *(text, attributes)*
    """
    collector = HTMLParser4Links()
    collector.feed(text)
    return collector.links
def enumerate_links_module(name, alls, version, plat):
    """
    Selects the links for a specific module.

    @param      name        module name
    @param      alls        all links from @see fn extract_all_links
    @param      version     python version (tuple, the first two items are used)
    @param      plat        platform
    @return                 iterator on tuples
                            *(link text, javascript expression, evaluated result)*

    A link is selected if its lower-cased text starts with the module name
    followed by ``-`` (with or without ``-`` replaced by ``_`` in the name),
    contains *plat* and ``cp<version>`` or ``py<version>``.
    Selected links without any attribute *onclick* are skipped.
    The function raises *InternalJsException* if the javascript
    cannot be decoded.
    """
    tag = "%d%d" % version[:2]
    base = name.lower()
    prefixes = (base + "-", base.replace("-", "_") + "-")
    wanted = ("cp" + tag, "py" + tag)
    suf = '"javascript:dl("'
    # NOTE(review): the last two prefixes (and the two blanks stripped
    # further down) look identical here, one of each may be a
    # non-breaking space upstream -- confirm.
    bs = ["javascript:", "javascript :", "javascript :"]

    for link in alls:
        n = link[0]
        ln = n.lower()
        if not ln.startswith(prefixes) or plat not in ln:
            continue
        if not any(v in ln for v in wanted):
            continue

        # the last attribute onclick wins
        js = None
        for key, val in link[1]:
            if key == "onclick":
                js = val.lstrip()
        if not js:
            continue

        js0 = js
        for b in bs:
            if js.startswith(b):
                js = js[len(b):]
                break
        else:
            # the attribute does not start with any known prefix
            raise InternalJsException(
                "Unable to decode js '{0}'".format(js))

        if js.endswith(suf):
            js = js[:-len(suf) - 2]
        # Addition: 207-08-24
        # keeps what stands before a second 'javascript:'
        js = js.partition("javascript:")[0]
        js = js.strip('" \t ;\'')

        # eval resolves the name ``dl`` used by the javascript expression
        # from the local variables of this function
        dl = _cg_dl
        # WARNING: eval runs an expression extracted from a downloaded page,
        # that page must be trusted.
        try:
            res = eval(js)
        except SyntaxError as e:
            raise SyntaxError(
                "Unable to evaluate '{0}'\njs0='{1}'.".format(js, js0)) from e
        if res is None:
            raise InternalJsException(
                "Unable to decode js '{0}'".format(js))
        yield n, js, res