Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief A few functions about scraping

5 

6""" 

7import sys 

8import os 

9import datetime 

10import warnings 

11from pyquickhelper.loghelper import noLOG 

12from pymyinstall.installcustom import where_in_path, install_chromedriver, install_operadriver 

13 

14 

# Browser used when the ``navigator`` argument of webshot / webhtml is omitted.
default_driver = "opera"

16 

17 

def webshot(img, url, navigator=default_driver, add_date=False,
            size=None, fLOG=noLOG):
    """
    Uses the module :epkg:`selenium`
    to take a picture of a website.
    If url and img are lists, the function goes
    through all the urls and saves one webshot per url.

    @param      img         image name or list of image names
    @param      url         url or list of urls (as many as image names)
    @param      navigator   opera (default), firefox, chrome, edge, (ie: does not work well)
    @param      add_date    add a date to the image filename
    @param      size        to resize the webshot (if not None),
                            ``size[0]`` and ``size[1]`` are given to ``set_window_size``
    @param      fLOG        logging function
    @return                 list of [ ( url, image name) ]
    @raise      ValueError  if *url* and *img* do not have the same length

    Check the list of available webdriver at
    `selenium/webdriver <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.

    Chrome requires the `chromedriver <http://chromedriver.storage.googleapis.com/index.html>`_.
    See function `install_chromedriver <http://www.xavierdupre.fr/app/pymyinstall/helpsphinx/pymyinstall/
    installcustom/install_custom_chromedriver.html?highlight=chromedriver
    #pymyinstall.installcustom.install_custom_chromedriver.install_chromedriver>`_.
    """
    if not isinstance(url, list):
        url = [url]
    if not isinstance(img, list):
        img = [img]
    # Validate before starting the browser: failing afterwards would leave
    # a driver process running.
    if len(url) != len(img):
        raise ValueError("different number of urls and images")

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        if size is not None:
            fLOG("set size", size)
            browser.set_window_size(size[0], size[1])

        for u, i in zip(url, img):
            fLOG("url", u, " into ", i)
            browser.get(u)
            if add_date:
                # Insert the timestamp between the base name and the
                # extension; ':' and '/' are replaced by '-' in the name.
                dt = datetime.datetime.now()
                a, b = os.path.splitext(i)
                i = "{0}.{1}{2}".format(a, str(dt).replace(
                    ":", "-").replace("/", "-"), b)
            browser.get_screenshot_as_file(i)
            res.append((u, i))
    finally:
        # Always release the browser, even when a page fails to load.
        browser.quit()
    return res

68 

69 

def _locate_driver(exe_name, default, installer, error, fLOG):
    """
    Returns the driver executable to hand over to :epkg:`selenium`.

    @param      exe_name            executable searched with *where_in_path* on Windows
    @param      default             value returned unchanged on every other platform
    @param      installer           function called as ``installer(fLOG=fLOG)``
                                    when the executable is not found
    @param      error               message of the exception raised when the
                                    executable is still missing after installation
    @param      fLOG                logging function
    @return                         result of *where_in_path* on Windows, *default* otherwise
    @raise      FileNotFoundError   if the driver cannot be found nor installed
    """
    if not sys.platform.startswith("win"):
        return default
    found = where_in_path(exe_name)
    if found is not None:
        fLOG("[_get_selenium_browser] found {0}:".format(default), found)
        return found
    installer(fLOG=fLOG)
    found = where_in_path(exe_name)
    if found is None:
        raise FileNotFoundError(error)
    return found


def _get_selenium_browser(navigator, fLOG=noLOG):
    """
    Returns the associated driver with some custom settings.

    The function automatically gets chromedriver if not present (:epkg:`Windows` only).
    On :epkg:`Linux`, package *chromium-driver* should be installed:
    ``apt-get install chromium-driver``.

    @param      navigator   one of ``"firefox"``, ``"chrome"``, ``"ie"``,
                            ``"opera"``, ``"edge"``
    @param      fLOG        logging function
    @return                 started selenium browser
    @raise      ValueError  if *navigator* is not one of the values above
    @raise      FileNotFoundError   if a chrome or opera driver is missing
                                    on Windows and cannot be installed

    .. faqref::
        :tag: web
        :title: Issue with Selenium and Firefox
        :lid: faq-web-selenium

        Firefox >= v47 does not work on Windows.
        See `Selenium WebDriver and Firefox 47 <http://www.theautomatedtester.co.uk/blog/2016/selenium-webdriver-and-firefox-47.html>`_.

        See `ChromeDriver download <http://chromedriver.storage.googleapis.com/index.html>`_,
        `Error message: 'chromedriver' executable needs to be available in the path
        <http://stackoverflow.com/questions/29858752/error-message-chromedriver-executable-needs-to-be-available-in-the-path>`_.

        See `Selenium - Remote WebDriver example
        <https://sauceclient.readthedocs.io/en/latest/selenium_on_sauce.html#selenium-remote-webdriver-example>`_,
        see also `Running the remote driver with Selenium and python <https://gist.github.com/alfredo/1962031>`_.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    fLOG("[webshot] navigator=", navigator)
    if navigator == "firefox":
        firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
        firefox_capabilities['marionette'] = True
        # This location only exists on Windows: forcing it elsewhere (or when
        # Firefox is installed in another folder) points the driver to a
        # missing binary, so the capability is only set when the file is there.
        firefox_exe = r"C:\Program Files (x86)\Mozilla Firefox\firefox.exe"
        if sys.platform.startswith("win") and os.path.exists(firefox_exe):
            firefox_capabilities['binary'] = firefox_exe
        browser = webdriver.Firefox(capabilities=firefox_capabilities)
    elif navigator == "chrome":
        chromed = _locate_driver(
            "chromedriver.exe", 'chromedriver', install_chromedriver,
            "unable to install 'chromedriver.exe'", fLOG)
        fLOG("[_get_selenium_browser] start", navigator)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--verbose')
        browser = webdriver.Chrome(executable_path=chromed,
                                   chrome_options=chrome_options)
    elif navigator == "ie":
        browser = webdriver.Ie()
    elif navigator == "opera":
        operad = _locate_driver(
            "operadriver.exe", 'operadriver', install_operadriver,
            "unable to install operadriver.exe", fLOG)
        browser = webdriver.Opera(operad)
    elif navigator == "edge":
        browser = webdriver.Edge()
    else:
        raise ValueError(
            "unable to interpret the navigator '{0}'".format(navigator))
    fLOG("[_get_selenium_browser] navigator is started")
    return browser

163 

164 

def webhtml(url, navigator=default_driver, fLOG=noLOG):
    """
    Uses the module `selenium <http://selenium-python.readthedocs.io/>`_
    to retrieve the html content of a website.

    @param      url         url or list of urls
    @param      navigator   opera (default), firefox, chrome, edge, (ie: does not work well)
    @param      fLOG        logging function
    @return                 list of [ ( url, html) ]

    Check the list of available webdriver at
    `selenium/webdriver
    <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.
    """
    if not isinstance(url, list):
        url = [url]
    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        for u in url:
            fLOG("[webhtml] get url '{0}'".format(u))
            browser.get(u)
            res.append((u, browser.page_source))
    finally:
        # Always release the browser, even when a page fails to load.
        browser.quit()
    return res
190 return res