Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Helpers for the hackathon 2018 related to search internet.
5"""
8import urllib
9import urllib.request
10import urllib.parse
11import time
12import random
13import re
14import os
def extract_bing_result(search_page, filter_fct=lambda u: True):
    """
    Extract the image urls found in a search page assuming
    it comes from :epkg:`Bing Image`.

    @param      search_page     content of :epkg:`Bing Image` search page,
                                or the name of a file ending with ``.html``
                                which is then read as utf-8
    @param      filter_fct      remove some urls if this function is False
                                ``filter(u) --> True or False``, it receives
                                the url as written in the page (before unquoting)
    @return                     a list with the unquoted urls, duplicates
                                (compared before unquoting) are removed and
                                the order of first appearance in the page is kept

    Only urls ending with ``.jpg``, ``.png``, ``.gif`` or ``.tif`` are kept.
    """
    if search_page.endswith(".html"):
        with open(search_page, "r", encoding="utf-8") as f:
            search_page = f.read()

    # First look for urls stored in a ``mediaurl=...&`` parameter,
    # then fall back on plain ``href`` links if none was found.
    found = re.findall("mediaurl=(http.*?)&", search_page)
    if not found:
        found = re.findall('href="(http.*?)"', search_page)

    extensions = ('.jpg', '.png', '.gif', '.tif')
    images = [u for u in found if len(u) > 4 and u.endswith(extensions)]

    # dict.fromkeys removes duplicates while keeping insertion order,
    # a set would return the urls in an order changing from one run to another.
    unique = dict.fromkeys(filter(filter_fct, images))
    return [urllib.parse.unquote(u) for u in unique]
def query_bing_image(query, folder_cache="cache_search_page",
                     filter_fct=lambda u: True, add_options=False,
                     use_selenium=False, navigator=None, fLOG=None):
    """
    Returns the image urls extracted from the :epkg:`Bing Image`
    search page for a specific query.

    @param      query           search query
    @param      folder_cache    folder used to stored the result page or to retrieve
                                a page if the query was already searched for,
                                it is created (with its parents) if it does not exist
    @param      filter_fct      remove some urls if this function is False
                                ``filter(u) --> True or False``
    @param      add_options     add options to the search url
    @param      use_selenium    relies on :epkg:`webhtml`
    @param      navigator       see :epkg:`webhtml`, ``"chrome"`` if None
    @param      fLOG            logging function
    @return                     list of urls, or None if *use_selenium* is True
                                and :epkg:`webhtml` returned nothing

    The downloaded page is saved in
    ``<folder_cache>/<query with spaces replaced by _>.bing.html``
    and reused the next time the same query is asked.
    """
    # makedirs handles nested folders and does not fail if the folder
    # appears between a check and the creation.
    os.makedirs(folder_cache, exist_ok=True)
    cache = os.path.join(folder_cache, "%s.bing.html" %
                         query.replace(" ", "_"))

    if os.path.exists(cache):
        with open(cache, "r", encoding="utf8") as f:
            text = f.read()
    else:
        if fLOG:
            fLOG("[query_bing_image] download results for '{0}'".format(query))
        # waits between 1 and 2 seconds before every download
        time.sleep(1. + random.random())

        encoded = urllib.parse.quote(query)
        uopts = "&qs=n&form=QBIR&sp=-1&sc=8-10&sk=" if add_options else ""
        url = "http://www.bing.com/images/search?q={0}{1}".format(
            encoded, uopts)

        if use_selenium:
            # imported here, only needed for this option
            from ensae_teaching_cs.faq.faq_web import webhtml
            if navigator is None:
                navigator = "chrome"
            res = webhtml(url, navigator=navigator)
            if len(res) == 0:
                # nothing is cached in that case
                return None
            text = res[0][1]
        else:
            with urllib.request.urlopen(url, timeout=10) as uur:
                text = uur.read()
            text = text.decode("utf8")

        if fLOG:
            fLOG("[query_bing_image] cache results for '{0}' in '{1}'".format(
                query, cache))
        with open(cache, "w", encoding="utf-8") as f:
            f.write(text)

    return extract_bing_result(text, filter_fct)