Coverage for src/pyrsslocal/helper/search_engine.py: 90%
39 statements
« prev ^ index » next coverage.py v7.1.0, created at 2024-04-30 08:45 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2024-04-30 08:45 +0200
1"""
2@file
4@brief various function to get the content of a page, of a search page...
5"""
7import urllib
8import urllib.request
9import time
10import random
11import re
12import os
14from pyquickhelper.loghelper import noLOG
def extract_bing_result(searchPage, filter_=lambda u: True):
    """
    Extracts the first result urls from a search page assuming it comes
    from `Bing <http://www.bing.com>`_.

    @param      searchPage  content (string) of a `Bing <http://www.bing.com>`_ search page
    @param      filter_     predicate on urls, a url is kept only if ``filter_(url)`` is true
    @return                 list of the kept urls, shortest first,
                            or None if the page contains no result link

    Raises ``ValueError`` if result links were found but *filter_* rejected
    all of them, or if the first kept url is a known bad result.
    """
    reg = re.compile("""<h2><a href="(.*?)" h="ID=SERP,""")
    # findall always returns a list; only the first ten links are considered
    # (slicing is a no-op when fewer were found)
    found = reg.findall(searchPage)[:10]
    if not found:
        return None

    # here I sort by length (ties broken alphabetically), maybe not the best idea
    ranked = sorted((len(url), url) for url in found)
    kept = [pair for pair in ranked if filter_(pair[1])]
    if not kept:
        mes = "\n".join(str(_) for _ in ranked)  # pragma: no cover
        raise ValueError(  # pragma: no cover
            "unable to find a proper url\n" + mes)

    # a known irrelevant first result means the page is not usable
    if kept[0][1] in ("http://chrome.angrybirds.com/",):
        join = "\n".join(str(_) for _ in kept)  # pragma: no cover
        raise ValueError(  # pragma: no cover
            "bad result\n{0}".format(join))
    return [url for _, url in kept]
def query_bing(query, folderCache="cacheSearchPage",
               filter_=lambda u: True, fLOG=noLOG):
    """
    Returns the urls found on the search page from
    `Bing <http://www.bing.com>`_ for a specific query.

    @param      query           search query (raw text, not percent-encoded)
    @param      folderCache     folder used to store the result page or
                                to retrieve a page if the query was already
                                searched for, it is created if missing
    @param      filter_         remove some urls if this function is False
                                ``filter_(u)`` --> True or False
    @param      fLOG            logging function
    @return                     list of urls (see @see fn extract_bing_result,
                                which may also return None or raise ValueError)

    The page is downloaded only if no cached copy exists, the cache file
    is named ``<query>.bing.html`` inside *folderCache*.
    """
    # local import: keeps the block self-contained without touching the
    # module-level imports
    from urllib.parse import quote

    # exist_ok avoids the check-then-create race and supports nested folders
    os.makedirs(folderCache, exist_ok=True)
    cache = os.path.join(folderCache, "%s.bing.html" % query)

    if os.path.exists(cache):
        with open(cache, "r", encoding="utf8") as f:
            text = f.read()
    else:
        fLOG(" downloading results for ", query)
        # random pause of one to two seconds before hitting the search engine
        time.sleep(1. + random.random())
        # percent-encode the whole query: spaces still become %20, and
        # characters such as '&', '#', '+' or non-ASCII letters no longer
        # corrupt the url
        url = "http://www.bing.com/search?q=" + quote(query, safe="")
        with urllib.request.urlopen(url) as uur:
            text = uur.read().decode("utf8")

        fLOG(" caching results for ", query, " in ", cache)
        with open(cache, "w", encoding="utf8") as f:
            f.write(text)

    return extract_bing_result(text, filter_)