Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Helpers for the 2018 hackathon related to searching the internet.

5""" 

6 

7 

8import urllib 

9import urllib.request 

10import urllib.parse 

11import time 

12import random 

13import re 

14import os 

15 

16 

def extract_bing_result(search_page, filter_fct=lambda u: True):
    """
    Extracts the image urls from a search page assuming
    it comes from :epkg:`Bing Image`.

    @param      search_page     content of :epkg:`Bing Image` search page,
                                or the name of a file ending with ``.html``
                                which contains that page
    @param      filter_fct      remove some urls if this function is False
                                ``filter(u) --> True or False``, it receives
                                the url as it appears in the page
                                (before unquoting)
    @return                     a list with the unquoted urls, without
                                duplicates, in the order they appear
                                in the page

    The function first looks for ``mediaurl=...&`` parameters and,
    if none is found, falls back on every ``href="http..."`` link.
    Only urls ending with ``.jpg``, ``.png``, ``.gif``, ``.tif`` are kept.
    """
    if search_page.endswith(".html"):
        with open(search_page, "r", encoding="utf-8") as f:
            search_page = f.read()

    found = re.findall(r"mediaurl=(http.*?)&", search_page)
    if not found:
        # No media url, falls back on regular links.
        found = re.findall(r'href="(http.*?)"', search_page)

    extensions = ('.jpg', '.png', '.gif', '.tif')
    images = [u for u in found if len(u) > 4 and u.endswith(extensions)]

    # Urls are unquoted before removing duplicates so that two different
    # quoted forms of the same url are merged, and dict.fromkeys keeps
    # the order of first appearance (a set would shuffle the results).
    unquoted = (urllib.parse.unquote(u) for u in images if filter_fct(u))
    return list(dict.fromkeys(unquoted))

37 

38 

def query_bing_image(query, folder_cache="cache_search_page",
                     filter_fct=lambda u: True, add_options=False,
                     use_selenium=False, navigator=None, fLOG=None):
    """
    Returns the image urls found in the search page
    :epkg:`Bing Image` returns for a specific query.

    @param      query           search query
    @param      folder_cache    folder used to store the result page or to retrieve
                                a page if the query was already searched for,
                                it is created if it does not exist
    @param      filter_fct      remove some urls if this function is False
                                ``filter(u) --> True or False``
    @param      add_options     add options to the search url
    @param      use_selenium    relies on :epkg:`webhtml`
    @param      navigator       see :epkg:`webhtml`, ``"chrome"`` if None
    @param      fLOG            logging function
    @return                     list of urls, or None if *use_selenium* is True
                                and :epkg:`webhtml` returns nothing

    The page is downloaded only if it is not already in the cache.
    The function waits between one and two seconds before every download.
    """
    # makedirs handles nested folders and exist_ok removes the race
    # between the existence check and the creation.
    os.makedirs(folder_cache, exist_ok=True)

    # A path separator in the query would make the cache file point to
    # a subfolder which does not exist, so separators are replaced too.
    name = query.replace(" ", "_")
    for sep in (os.sep, os.altsep):
        if sep:
            name = name.replace(sep, "_")
    cache = os.path.join(folder_cache, "%s.bing.html" % name)

    if os.path.exists(cache):
        with open(cache, "r", encoding="utf8") as f:
            text = f.read()
    else:
        if fLOG:
            fLOG("[query_bing_image] download results for '{0}'".format(query))
        # Random pause before hitting the search engine.
        x = 1. + random.random()
        time.sleep(x)
        encoded = urllib.parse.quote(query)
        if add_options:
            uopts = "&qs=n&form=QBIR&sp=-1&sc=8-10&sk="
        else:
            uopts = ""

        url = "http://www.bing.com/images/search?q={0}{1}".format(
            encoded, uopts)

        if use_selenium:
            # Optional dependency, only imported when requested.
            from ensae_teaching_cs.faq.faq_web import webhtml
            if navigator is None:
                navigator = "chrome"
            res = webhtml(url, navigator=navigator)
            if len(res) == 0:
                # Kept as None for backward compatibility with callers.
                return None
            text = res[0][1]
        else:
            with urllib.request.urlopen(url, timeout=10) as uur:
                text = uur.read()
            text = text.decode("utf8")

        if fLOG:
            fLOG("[query_bing_image] cache results for '{0}' in '{1}'".format(
                query, cache))
        with open(cache, "w", encoding="utf-8") as f:
            f.write(text)

    urls = extract_bing_result(text, filter_fct)
    return urls

98 return urls