Coverage for src/ensae_teaching_cs/faq/faq_web.py: 57%
100 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief A few functions about scraping
6"""
7import sys
8import os
9import datetime
10import warnings
11from pyquickhelper.loghelper import noLOG
12from pymyinstall.installcustom import where_in_path, install_chromedriver, install_operadriver
15default_driver = "opera"
def webshot(img, url, navigator=default_driver, add_date=False,
            size=None, fLOG=noLOG):
    """
    Uses the module :epkg:`selenium`
    to take a picture of a website.
    If url and img are lists, the function goes
    through all the urls and saves webshots.

    @param      img         image name or list of image names
    @param      url         url or list of urls, as many as image names
    @param      navigator   firefox, chrome, opera, edge, (ie: does not work well)
    @param      add_date    add a date to the image filename
    @param      size        to resize the webshot (if not None),
                            a pair *(width, height)*
    @param      fLOG        logging function
    @return                 list of [ ( url, image name) ]

    The function raises a *RuntimeError* if the number of urls
    differs from the number of images. The check is done before the
    browser is started and the browser is always closed, even if
    loading a page or saving a screenshot fails.

    Check the list of available webdriver at
    `selenium/webdriver <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.

    Chrome requires the `chromedriver <http://chromedriver.storage.googleapis.com/index.html>`_.
    See function `install_chromedriver <http://www.xavierdupre.fr/app/pymyinstall/helpsphinx/pymyinstall/
    installcustom/install_custom_chromedriver.html?highlight=chromedriver
    #pymyinstall.installcustom.install_custom_chromedriver.install_chromedriver>`_.
    """
    # Normalise and validate the inputs first: there is no point in
    # starting a browser which would have to be abandoned right away.
    if not isinstance(url, list):
        url = [url]
    if not isinstance(img, list):
        img = [img]
    if len(url) != len(img):
        raise RuntimeError("different number of urls and images")

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        if size is not None:
            fLOG("set size", size)
            browser.set_window_size(size[0], size[1])

        for u, i in zip(url, img):
            # log the pair being processed, not the whole lists
            fLOG("url", u, " into ", i)
            browser.get(u)
            if add_date:
                # insert the timestamp between the name and its extension,
                # ':' and '/' are replaced by '-'
                dt = datetime.datetime.now()
                a, b = os.path.splitext(i)
                i = f"{a}.{str(dt).replace(':', '-').replace('/', '-')}{b}"
            browser.get_screenshot_as_file(i)
            res.append((u, i))
    finally:
        # release the browser whatever happened above
        browser.quit()
    return res
def _get_selenium_browser(navigator, fLOG=noLOG):
    """
    Returns the associated driver with some custom settings.

    The function automatically gets chromedriver if not present (:epkg:`Windows` only).
    On :epkg:`Linux`, package *chromium-driver* should be installed:
    ``apt-get install chromium-driver``.

    @param      navigator   name of the browser: ``'firefox'``, ``'chrome'``,
                            ``'ie'``, ``'opera'`` or ``'edge'``
    @param      fLOG        logging function
    @return                 the started selenium driver

    The function raises a *FileNotFoundError* on Windows if the driver
    for chrome or opera is still missing after the installation attempt
    and a *RuntimeError* if *navigator* is not one of the names above.

    .. faqref::
        :tag: web
        :title: Issue with Selenium and Firefox
        :lid: faq-web-selenium

        Firefox >= v47 does not work on Windows.
        See `Selenium WebDriver and Firefox 47 <http://www.theautomatedtester.co.uk/blog/2016/selenium-webdriver-and-firefox-47.html>`_.

        See `ChromeDriver download <http://chromedriver.storage.googleapis.com/index.html>`_,
        `Error message: 'chromedriver' executable needs to be available in the path
        <http://stackoverflow.com/questions/29858752/error-message-chromedriver-executable-needs-to-be-available-in-the-path>`_.

        See `Selenium - Remote WebDriver example
        <https://sauceclient.readthedocs.io/en/latest/selenium_on_sauce.html#selenium-remote-webdriver-example>`_,
        see also `Running the remote driver with Selenium and python <https://gist.github.com/alfredo/1962031>`_.
    """
    # selenium is imported lazily, ImportWarning raised while importing
    # it are silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    fLOG("[webshot] navigator=", navigator)
    if navigator == "firefox":
        firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
        firefox_capabilities['marionette'] = True
        # The hard-coded location only makes sense for a 32-bit install on
        # Windows: force it only when the file is really there, otherwise
        # the capability is left unset and the driver uses its own lookup.
        firefox_binary = r"C:\Program Files (x86)\Mozilla Firefox\firefox.exe"
        if os.path.exists(firefox_binary):
            firefox_capabilities['binary'] = firefox_binary
        browser = webdriver.Firefox(capabilities=firefox_capabilities)
    elif navigator == "chrome":
        if sys.platform.startswith("win"):
            # on Windows, the driver is downloaded if it is not in the path
            chromed = where_in_path("chromedriver.exe")
            if chromed is None:
                install_chromedriver(fLOG=fLOG)
                chromed = where_in_path("chromedriver.exe")
                if chromed is None:
                    raise FileNotFoundError(
                        "unable to install 'chromedriver.exe'")
            else:
                fLOG("[_get_selenium_browser] found chromedriver:", chromed)
        else:
            # elsewhere, the driver is expected to be reachable by its name
            chromed = 'chromedriver'

        # Manual switch: True starts a headless Chrome directly,
        # False goes through a chromedriver service and a remote driver.
        start_navi = True
        if start_navi:
            fLOG("[_get_selenium_browser] start", navigator)
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--verbose')
            browser = webdriver.Chrome(executable_path=chromed,
                                       chrome_options=chrome_options)
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", ImportWarning)
                import selenium.webdriver.chrome.service as wservice
            fLOG("[_get_selenium_browser] create service")
            service = wservice.Service(chromed)
            fLOG("[_get_selenium_browser] start service")
            service.start()
            fLOG("[_get_selenium_browser] declare remote")
            capabilities = {'chrome.binary': chromed}
            browser = webdriver.Remote(service.service_url, capabilities)
    elif navigator == "ie":
        browser = webdriver.Ie()
    elif navigator == "opera":
        if sys.platform.startswith("win"):
            # same logic as chrome with the opera driver
            operad = where_in_path("operadriver.exe")
            if operad is None:
                install_operadriver(fLOG=fLOG)
                operad = where_in_path("operadriver.exe")
                if operad is None:
                    raise FileNotFoundError(
                        "unable to install operadriver.exe")
            else:
                fLOG("[_get_selenium_browser] found operadriver:", operad)
        else:
            operad = 'operadriver'
        browser = webdriver.Opera(operad)  # pylint: disable=E1101
    elif navigator == "edge":
        browser = webdriver.Edge()
    else:
        raise RuntimeError(
            f"unable to interpret the navigator '{navigator}'")
    fLOG("[_get_selenium_browser] navigator is started")
    return browser
def webhtml(url, navigator=default_driver, fLOG=noLOG):
    """
    Uses the module `selenium <http://selenium-python.readthedocs.io/>`_
    to retrieve the html content of a website.

    @param      url         url or list of urls
    @param      navigator   firefox, chrome, opera, edge, (ie: does not work well)
    @param      fLOG        logging function
    @return                 list of [ ( url, html) ]

    The browser is always closed, even if a page fails to load.

    Check the list of available webdriver at
    `selenium/webdriver
    <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.
    """
    if not isinstance(url, list):
        url = [url]

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        for u in url:
            # log the url being fetched, not the whole list
            fLOG(f"[webhtml] get url '{u}'")
            browser.get(u)
            res.append((u, browser.page_source))
    finally:
        # release the browser whatever happened above
        browser.quit()
    return res