Code source de ensae_teaching_cs.faq.faq_web

# -*- coding: utf-8 -*-
"""
A few functions about web scraping



:githublink:`%|py|7`
"""
import sys
import os
import datetime
import warnings
from pyquickhelper.loghelper import noLOG
from pymyinstall.installcustom import where_in_path, install_chromedriver, install_operadriver


# Navigator used by webshot and webhtml when the caller does not pass one.
default_driver = "opera"


def webshot(img, url, navigator=default_driver, add_date=False,
            size=None, fLOG=noLOG):
    """
    Uses the module :epkg:`selenium`
    to take a picture of a website.
    If url and img are lists, the function goes
    through all the urls and saves one webshot per url.

    :param      img:             image name or list of image names
    :param      url:             url or list of urls (same length as *img*)
    :param      navigator:       firefox, chrome, opera, edge, (ie: does not work well)
    :param      add_date:        add a date to the image filename
    :param      size:            to resize the webshot (if not None),
                                 a pair *(width, height)*
    :param      fLOG:            logging function
    :return:                     list of [ ( url, image name) ]
    :raises Exception:           if *url* and *img* do not have the same length

    Check the list of available webdriver at
    `selenium/webdriver <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.

    Chrome requires the `chromedriver <http://chromedriver.storage.googleapis.com/index.html>`_.
    See function `install_chromedriver <http://www.xavierdupre.fr/app/pymyinstall/helpsphinx/pymyinstall/
    installcustom/install_custom_chromedriver.html?highlight=chromedriver
    #pymyinstall.installcustom.install_custom_chromedriver.install_chromedriver>`_.


    :githublink:`%|py|42`
    """
    # Normalise and validate the inputs before starting the browser so
    # that a mismatch does not leave a driver process running.
    if not isinstance(url, list):
        url = [url]
    if not isinstance(img, list):
        img = [img]
    if len(url) != len(img):
        raise Exception("different number of urls and images")

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        if size is not None:
            fLOG("set size", size)
            browser.set_window_size(size[0], size[1])

        for u, i in zip(url, img):
            # Log the pair being processed, not the whole lists.
            fLOG("url", u, " into ", i)
            browser.get(u)
            if add_date:
                # Insert the timestamp between the base name and the
                # extension; ':' and '/' are replaced because they are
                # not valid in file names on every platform.
                dt = datetime.datetime.now()
                a, b = os.path.splitext(i)
                i = "{0}.{1}{2}".format(a, str(dt).replace(
                    ":", "-").replace("/", "-"), b)
            browser.get_screenshot_as_file(i)
            res.append((u, i))
    finally:
        # Always release the browser, even if a page fails to load.
        browser.quit()
    return res


def _find_windows_driver(name, installer, error_message, fLOG=noLOG):
    """
    Returns the location of ``<name>.exe`` as reported by *where_in_path*,
    calling *installer* first when the executable is not found.

    :param      name:            driver name without extension (``chromedriver``, ``operadriver``)
    :param      installer:       function called as ``installer(fLOG=fLOG)`` when the driver is missing
    :param      error_message:   message of the exception raised if the driver is still missing
    :param      fLOG:            logging function
    :return:                     value returned by *where_in_path* for the executable
    :raises FileNotFoundError:   if the executable cannot be found after the installation
    """
    exe = name + ".exe"
    location = where_in_path(exe)
    if location is not None:
        fLOG("[_get_selenium_browser] found {0}:".format(name), location)
        return location
    installer(fLOG=fLOG)
    location = where_in_path(exe)
    if location is None:
        raise FileNotFoundError(error_message)
    return location


def _get_selenium_browser(navigator, fLOG=noLOG):
    """
    Returns the associated driver with some custom settings.

    The function automatically gets chromedriver or operadriver
    if not present (:epkg:`Windows` only).
    On :epkg:`Linux`, package *chromium-driver* should be installed:
    ``apt-get install chromium-driver``.

    :param      navigator:       firefox, chrome, ie, opera, edge
    :param      fLOG:            logging function
    :return:                     started selenium webdriver
    :raises FileNotFoundError:   if a required driver cannot be installed (Windows)
    :raises Exception:           if *navigator* is not one of the supported names

    .. faqref::
        :tag: web
        :title: Issue with Selenium and Firefox
        :lid: faq-web-selenium

        Firefox >= v47 does not work on Windows.
        See `Selenium WebDriver and Firefox 47 <http://www.theautomatedtester.co.uk/blog/2016/selenium-webdriver-and-firefox-47.html>`_.

        See `ChromeDriver download <http://chromedriver.storage.googleapis.com/index.html>`_,
        `Error message: 'chromedriver' executable needs to be available in the path
        <http://stackoverflow.com/questions/29858752/error-message-chromedriver-executable-needs-to-be-available-in-the-path>`_.

    See `Selenium - Remote WebDriver example
    <https://sauceclient.readthedocs.io/en/latest/selenium_on_sauce.html#selenium-remote-webdriver-example>`_,
    see also `Running the remote driver with Selenium and python <https://gist.github.com/alfredo/1962031>`_.


    :githublink:`%|py|93`
    """
    # selenium is imported lazily, ImportWarning raised while importing
    # it are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    is_windows = sys.platform.startswith("win")

    fLOG("[webshot] navigator=", navigator)
    if navigator == "firefox":
        firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
        firefox_capabilities['marionette'] = True
        # The explicit binary location only makes sense on Windows and
        # only if Firefox is really installed there; otherwise the
        # driver is left to locate the browser by itself.
        firefox_binary = r"C:\Program Files (x86)\Mozilla Firefox\firefox.exe"
        if is_windows and os.path.isfile(firefox_binary):
            firefox_capabilities['binary'] = firefox_binary
        browser = webdriver.Firefox(capabilities=firefox_capabilities)
    elif navigator == "chrome":
        if is_windows:
            chromed = _find_windows_driver(
                "chromedriver", install_chromedriver,
                "unable to install 'chromedriver.exe'", fLOG=fLOG)
        else:
            chromed = 'chromedriver'

        fLOG("[_get_selenium_browser] start", navigator)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--verbose')
        browser = webdriver.Chrome(executable_path=chromed,
                                   chrome_options=chrome_options)
    elif navigator == "ie":
        browser = webdriver.Ie()
    elif navigator == "opera":
        if is_windows:
            operad = _find_windows_driver(
                "operadriver", install_operadriver,
                "unable to install operadriver.exe", fLOG=fLOG)
        else:
            operad = 'operadriver'
        browser = webdriver.Opera(operad)
    elif navigator == "edge":
        browser = webdriver.Edge()
    else:
        raise Exception(
            "unable to interpret the navigator '{0}'".format(navigator))
    fLOG("[_get_selenium_browser] navigator is started")
    return browser


def webhtml(url, navigator=default_driver, fLOG=noLOG):
    """
    Uses the module `selenium <http://selenium-python.readthedocs.io/>`_
    to retrieve the html content of a website.

    :param      url:             url or list of urls
    :param      navigator:       firefox, chrome, opera, edge, (ie: does not work well)
    :param      fLOG:            logging function
    :return:                     list of [ ( url, html) ]

    Check the list of available webdriver at
    `selenium/webdriver
    <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.


    :githublink:`%|py|179`
    """
    if not isinstance(url, list):
        url = [url]

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        for u in url:
            # Log the url being fetched, not the whole list.
            fLOG("[webhtml] get url '{0}'".format(u))
            browser.get(u)
            res.append((u, browser.page_source))
    finally:
        # Always release the browser, even if a page fails to load.
        browser.quit()
    return res
Liens

Contenu

Information

Code source de ensae_teaching_cs.faq.faq_web