Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief A few functions about scraping
6"""
7import sys
8import os
9import datetime
10import warnings
11from pyquickhelper.loghelper import noLOG
12from pymyinstall.installcustom import where_in_path, install_chromedriver, install_operadriver
# Browser name used by webshot and webhtml when the caller does not
# pass a *navigator*.
default_driver = "opera"
def webshot(img, url, navigator=default_driver, add_date=False,
            size=None, fLOG=noLOG):
    """
    Uses the module :epkg:`selenium`
    to take a picture of a website.
    If url and img are lists, the function goes
    through all the urls and saves one webshot per url.

    @param      img         image filename or list of image filenames
    @param      url         url or list of urls (as many as image names)
    @param      navigator   firefox, chrome, opera, edge, (ie: does not work well)
    @param      add_date    add a date to the image filename
    @param      size        to resize the webshot (if not None),
                            ``size[0]`` and ``size[1]`` are given to
                            ``set_window_size``
    @param      fLOG        logging function
    @return                 list of [ ( url, image name) ]

    Raises *ValueError* if the number of urls and images differ.

    Check the list of available webdriver at
    `selenium/webdriver <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.

    Chrome requires the `chromedriver <http://chromedriver.storage.googleapis.com/index.html>`_.
    See function `install_chromedriver <http://www.xavierdupre.fr/app/pymyinstall/helpsphinx/pymyinstall/
    installcustom/install_custom_chromedriver.html?highlight=chromedriver
    #pymyinstall.installcustom.install_custom_chromedriver.install_chromedriver>`_.
    """
    if not isinstance(url, list):
        url = [url]
    if not isinstance(img, list):
        img = [img]
    # Validate the arguments before starting a browser, otherwise a bad
    # call would leave a driver process running.
    if len(url) != len(img):
        raise ValueError("different number of urls and images")

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        if size is not None:
            fLOG("set size", size)
            browser.set_window_size(size[0], size[1])

        for u, i in zip(url, img):
            fLOG("url", u, " into ", i)
            browser.get(u)
            if add_date:
                # Inserts the timestamp between the name and the extension,
                # characters ':' and '/' are replaced by '-'.
                dt = datetime.datetime.now()
                a, b = os.path.splitext(i)
                i = "{0}.{1}{2}".format(a, str(dt).replace(
                    ":", "-").replace("/", "-"), b)
            browser.get_screenshot_as_file(i)
            res.append((u, i))
    finally:
        # Always stops the browser, even if a page or a screenshot failed.
        browser.quit()
    return res
def _get_selenium_browser(navigator, fLOG=noLOG):
    """
    Returns the associated driver with some custom settings.

    The function automatically gets chromedriver if not present (:epkg:`Windows` only).
    On :epkg:`Linux`, package *chromium-driver* should be installed:
    ``apt-get install chromium-driver``.

    @param      navigator   firefox, chrome, ie, opera, edge
    @param      fLOG        logging function
    @return                 started selenium webdriver

    Raises *FileNotFoundError* if the chrome or opera driver cannot be
    found nor installed on Windows, *ValueError* if *navigator*
    is not one of the supported names.

    .. faqref::
        :tag: web
        :title: Issue with Selenium and Firefox
        :lid: faq-web-selenium

        Firefox >= v47 does not work on Windows.
        See `Selenium WebDriver and Firefox 47 <http://www.theautomatedtester.co.uk/blog/2016/selenium-webdriver-and-firefox-47.html>`_.

        See `ChromeDriver download <http://chromedriver.storage.googleapis.com/index.html>`_,
        `Error message: 'chromedriver' executable needs to be available in the path
        <http://stackoverflow.com/questions/29858752/error-message-chromedriver-executable-needs-to-be-available-in-the-path>`_.

    See `Selenium - Remote WebDriver example
    <https://sauceclient.readthedocs.io/en/latest/selenium_on_sauce.html#selenium-remote-webdriver-example>`_,
    see also `Running the remote driver with Selenium and python <https://gist.github.com/alfredo/1962031>`_.
    """
    # selenium is imported here, ImportWarning raised while importing
    # are ignored.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    fLOG("[webshot] navigator=", navigator)
    if navigator == "firefox":
        firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
        firefox_capabilities['marionette'] = True
        # The binary location is only forced when this Windows path exists,
        # otherwise the driver is left to locate Firefox on its own.
        firefox_binary = r"C:\Program Files (x86)\Mozilla Firefox\firefox.exe"
        if os.path.exists(firefox_binary):
            firefox_capabilities['binary'] = firefox_binary
        browser = webdriver.Firefox(capabilities=firefox_capabilities)
    elif navigator == "chrome":
        if sys.platform.startswith("win"):
            chromed = where_in_path("chromedriver.exe")
            if chromed is None:
                # Not in the PATH: tries to install it, then looks again.
                install_chromedriver(fLOG=fLOG)
                chromed = where_in_path("chromedriver.exe")
                if chromed is None:
                    raise FileNotFoundError(
                        "unable to install 'chromedriver.exe'")
            fLOG("[_get_selenium_browser] found chromedriver:", chromed)
        else:
            # Relies on the driver being available in the PATH.
            chromed = 'chromedriver'

        fLOG("[_get_selenium_browser] start", navigator)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--verbose')
        browser = webdriver.Chrome(executable_path=chromed,
                                   chrome_options=chrome_options)
    elif navigator == "ie":
        browser = webdriver.Ie()
    elif navigator == "opera":
        if sys.platform.startswith("win"):
            operad = where_in_path("operadriver.exe")
            if operad is None:
                # Not in the PATH: tries to install it, then looks again.
                install_operadriver(fLOG=fLOG)
                operad = where_in_path("operadriver.exe")
                if operad is None:
                    raise FileNotFoundError(
                        "unable to install operadriver.exe")
            fLOG("[_get_selenium_browser] found operadriver:", operad)
        else:
            # Relies on the driver being available in the PATH.
            operad = 'operadriver'
        browser = webdriver.Opera(operad)
    elif navigator == "edge":
        browser = webdriver.Edge()
    else:
        raise ValueError(
            "unable to interpret the navigator '{0}'".format(navigator))
    fLOG("[_get_selenium_browser] navigator is started")
    return browser
def webhtml(url, navigator=default_driver, fLOG=noLOG):
    """
    Uses the module `selenium <http://selenium-python.readthedocs.io/>`_
    to retrieve the html content of a website.

    @param      url         url or list of urls
    @param      navigator   firefox, chrome, opera, edge, (ie: does not work well)
    @param      fLOG        logging function
    @return                 list of [ ( url, html) ]

    Check the list of available webdriver at
    `selenium/webdriver
    <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.
    """
    if not isinstance(url, list):
        url = [url]

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        for u in url:
            fLOG("[webhtml] get url '{0}'".format(u))
            browser.get(u)
            res.append((u, browser.page_source))
    finally:
        # Always stops the browser, even if a page failed to load.
        browser.quit()
    return res