Source code for pyrsslocal.helper.search_engine

Various functions to get the content of a page or of a search page.


import urllib
import urllib.request
import time
import random
import re
import os

from pyquickhelper.loghelper import noLOG

def extract_bing_result(searchPage, filter_=lambda u: True):
    """
    Extract the first results from a search page, assuming it comes
    from `Bing <https://www.bing.com/>`_.

    :param searchPage: content (``str``) of a Bing search page
    :param filter_: predicate called as ``filter_(url)``; a url is kept
        only when it returns a true value
    :return: list of the kept urls, shortest first, or ``None`` when the
        page contains no result link at all
    :raises ValueError: if result links were found but none passes
        ``filter_``, or if the first kept url is the empty string
    """
    # Result links are recognised by the markup around them:
    # <h2><a href="URL" h="ID=SERP,...
    reg = re.compile(r'<h2><a href="(.*?)" h="ID=SERP,')

    # re.findall always returns a list (never None), so an emptiness test
    # is enough. Only the first ten hits are considered.
    found = reg.findall(searchPage)[:10]
    if not found:
        return None

    # Candidates are ordered by length (ties broken alphabetically);
    # as the original author noted, this may not be the best ranking.
    ranked = sorted((len(url), url) for url in found)
    kept = [pair for pair in ranked if filter_(pair[1])]

    if not kept:
        mes = "\n".join(str(pair) for pair in ranked)  # pragma: no cover
        raise ValueError(  # pragma: no cover
            "unable to find a proper url\n" + mes)

    # Because of the ordering above, an empty url can only be first.
    if kept[0][1] == "":
        join = "\n".join(str(pair) for pair in kept)  # pragma: no cover
        raise ValueError(  # pragma: no cover
            "bad result\n{0}".format(join))

    return [url for _, url in kept]
def query_bing(query, folderCache="cacheSearchPage",
               filter_=lambda u: True, fLOG=noLOG):
    """
    Return the urls found on the `Bing <https://www.bing.com/>`_ search
    page for a specific query.

    The raw result page is stored in *folderCache*; if a page was already
    stored for the same query, it is read back instead of being
    downloaded again.

    :param query: search query
    :param folderCache: folder used to store the result page or to
        retrieve a page if the query was already searched for (created
        when missing)
    :param filter_: predicate called as ``filter_(url)``; a url is kept
        only when it returns a true value
    :param fLOG: logging function
    :return: list of urls, as returned by :func:`extract_bing_result`
    """
    # Local import: keeps the block self-contained without touching the
    # module-level import list.
    from urllib.parse import quote

    os.makedirs(folderCache, exist_ok=True)

    # NOTE(review): the file name pattern was missing from the extracted
    # source (it read ``"" % query``, which raises TypeError);
    # "%s.bing.html" is a reconstruction -- confirm against upstream.
    cache = os.path.join(folderCache, "%s.bing.html" % query)

    if os.path.exists(cache):
        with open(cache, "r", encoding="utf8") as f:
            text = f.read()
    else:
        fLOG(" downloading results for ", query)
        # Random pause of 1 to 2 seconds before each download.
        time.sleep(1. + random.random())
        # NOTE(review): the URL prefix was missing from the extracted
        # source; this is Bing's public search endpoint -- confirm.
        # The query is percent-encoded as a whole (spaces still become
        # %20) so that characters such as '&' or '#' cannot alter the URL.
        url = "https://www.bing.com/search?q=" + quote(query)
        with urllib.request.urlopen(url) as uur:
            text = uur.read().decode("utf8")
        fLOG(" caching results for ", query, " in ", cache)
        with open(cache, "w", encoding="utf8") as f:
            f.write(text)

    return extract_bing_result(text, filter_)