Source code for ensae_teaching_cs.td_1a.discours_politique

# -*- coding: utf-8 -*-
"""
Retrieves political speeches from the Internet.

:githublink:`%|py|7`
"""

import re
import html.parser
import html.entities as htmlentitydefs
import warnings
from pyquickhelper.loghelper import get_url_content


def xmlParsingLongestDiv(text):
    """
    Extracts the longest div section.

    :param text: text of HTML page
    :return: text

    :githublink:`%|py|21`
    """
    class MyHTMLParser(html.parser.HTMLParser):
        """
        To get rid of paragraphs and bolded text.

        :githublink:`%|py|25`
        """

        def __init__(self):
            html.parser.HTMLParser.__init__(self, convert_charrefs=True)
            self.mtag = []
            self.mvalue = []
            self.mall = []

        def handle_starttag(self, tag, attrs):
            # only <div> opens a new section, any other tag becomes a space
            if tag == "div":
                self.mtag.append(tag)
                self.mvalue.append([])
            elif len(self.mtag) > 0:
                self.mvalue[-1].append(" ")

        def handle_endtag(self, tag):
            if tag == "div":
                # close the current section and store its accumulated text
                self.mall.append((self.mtag[-1], "".join(self.mvalue[-1])))
                self.mtag.pop()
                self.mvalue.pop()
            elif len(self.mtag) > 0:
                if tag == "p" or tag == "br":
                    self.mvalue[-1].append("\n")
                else:
                    self.mvalue[-1].append(" ")

        def handle_data(self, data):
            if len(self.mtag) > 0:
                self.mvalue[-1].append(data)

    parser = MyHTMLParser()
    # remove a stray token observed on some pages
    text = text.replace(" -g8\" ", " ")
    parser.feed(text)

    # keep the content of the longest <div>
    best = ""
    for tag, value in parser.mall:
        if tag == "div" and len(value) > len(best):
            best = value

    endLine = "\n"
    res = best.replace(
        "<p>", "").replace(
        "</p>", endLine).replace(
        "\r", "").replace(
        "<br />", endLine).replace(
        "<br>", endLine)

    # heuristic filter: discard short pages made of '|'-separated chunks
    # (likely a menu) and pages embedding the tracking script below
    exp = re.compile("[|]((.|\n){5,50}) ")
    nb = exp.findall(res)
    if (len(nb) == 0 or len(res) > 10000) and \
            "if (window.xtparam!=null)" not in res:
        return res
    return ""

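# Usage sketch (not part of the original module): a hand-made page, offline.
# The text of the longest <div> is kept, with <p> and <br> mapped to newlines.
demo_page = ("<html><body><div>short</div>"
             "<div>A much longer speech body<p>with a paragraph.</p></div>"
             "</body></html>")
print(xmlParsingLongestDiv(demo_page))
# prints: A much longer speech body with a paragraph.
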
def html_unescape(text):
    """
    Removes :epkg:`HTML` or :epkg:`XML` character references
    and entities from a text string.
    Keeps ``&amp;``, ``&gt;``, ``&lt;`` in the source code.
    From `Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>`_.

    :param text: text
    :return: cleaned text

    :githublink:`%|py|93`
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference, decimal or hexadecimal
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity: &, >, < are re-escaped so they survive
            try:
                if text[1:-1] == "amp":
                    text = "&amp;amp;"
                elif text[1:-1] == "gt":
                    text = "&amp;gt;"
                elif text[1:-1] == "lt":
                    text = "&amp;lt;"
                else:
                    text = chr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub("&#?\\w+;", fixup, text)

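# Usage sketch (hypothetical strings): numeric and named references are
# decoded, while &amp;, &gt; and &lt; are re-escaped so they survive.
print(html_unescape("caf&#233; / caf&eacute;"))
# prints: café / café
print(html_unescape("a &lt; b &amp; c"))
# prints: a &amp;lt; b &amp;amp; c
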
def force_unicode(text):
    """
    Deals with unicode issues.

    :param text: text
    :return: text

    :githublink:`%|py|127`
    """
    exp = re.compile("([0-9]+):")
    turn = 0
    while True:
        try:
            text = text.encode("ascii", errors="ignore")
            break
        except UnicodeDecodeError as e:
            # recover the position of the failure, then strip the most
            # common mojibake sequences and retry
            pos = exp.findall(str(e))
            pos = int(pos[0])
            text = text.replace("Ã´", "o").replace(
                "Ã©", "e").replace("Ã", "a")
            text = text.replace(
                "Â ", " ").replace(
                "â€™", "'").replace(
                "Ã§", "c")
            text = text.replace(
                "Ã¹", "u").replace(
                "Ã¨", "e").replace(
                "Ã‰", "E")
            text = text.replace(
                "Ã¢", "a").replace(
                "Ãª", "e").replace(
                "ÃƒÂ´", "o")
            text = text.replace(
                "ÃƒÂ©", "e").replace(
                "ÃƒÂ", "e").replace(
                "©", "e")
            text = text.replace(
                "Ã»", "u").replace(
                "€", "E").replace(
                "Ã®", "i")
            text = text.replace(
                '\xa0', " ").replace(
                "Å“", "oe").replace(
                "«", " ")
            text = text.replace(
                "»", " ").replace(
                "e¹ ", "ei").replace(
                "‚Â", " ")
            turn += 1
            if turn > 100:
                # too many iterations, give up
                return None
    return text

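# Behaviour sketch (illustrative only, not from the original module): with
# errors="ignore", the encode above never raises on a str, so the function
# returns ASCII bytes with non-ASCII characters dropped; the mojibake
# replacements only run on the legacy error path.
print(force_unicode("déjà vu"))
# prints: b'dj vu'
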
def remove_accent(text):
    """
    Replaces French accents by regular letters.

    :param text: text
    :return: cleaned text

    :githublink:`%|py|201`
    """
    # the first letter of each group is the unaccented replacement
    for c in ["aàâä", "eéèêë", "iîï", "oöô", "uùüû"]:
        for d in c[1:]:
            text = text.replace(d, c[0])
    return text

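# Usage sketch (hypothetical input): only the lowercase accented vowels
# listed above are handled.
print(remove_accent("déjà vu côté forêt"))
# prints: deja vu cote foret
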
def get_elysee_speech_from_elysees(title, url="https://www.elysee.fr/"):
    """
    Retrieves the text of a speech from the :epkg:`Elysees` website.

    :param title: title of the document
    :param url: website
    :return: html page

    The function tries something like::

        url + title.replace(" ", "-")

    :githublink:`%|py|219`
    """
    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise Exception("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(
            " ", "-").replace("'", "-").replace('"', "")
        # url already ends with '/', do not add another one
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        warnings.warn("Unable to retrieve '{0}' - {1}".format(full, e))
        return None
    return xmlParsingLongestDiv(text)

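# Offline sketch (hypothetical title, no network call): this mirrors how the
# function builds the link before fetching it.
title = "Discours du Président"
link = remove_accent(title.lower()).replace(
    " ", "-").replace("'", "-").replace('"', "")
print("https://www.elysee.fr/" + link + "/")
# prints: https://www.elysee.fr/discours-du-president/
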
def enumerate_speeches_from_elysees(url="agenda", skip=0):
    """
    Enumerates speeches published on the :epkg:`Elysees` website.

    :param url: subaddress, the url source will be ``'https://www.elysee.fr/' + url``
    :param skip: skip the first *skip* links in the list
    :return: iterator on dictionaries

    .. exref::
        :title: Retrieve speeches of the President of the Republic
        :tag: Exercise

        ::

            for i, disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    Other links can be used, such as
    ``https://www.elysee.fr/recherche?query=discours``.
    The website changed in 2018 and no longer supports
    xml or json streams.

    :githublink:`%|py|258`
    """
    base = "https://www.elysee.fr/"
    if not url.startswith("http"):
        url = base + url
    xml = get_url_content(url)
    # speech links embed a date, .../YYYY/MM/DD/...
    reg = re.compile(
        "href=\\\"(.+?/[0-9]{4}/[0-9]{2}/[0-9]{2}/.+?)\\\" class=")
    links = reg.findall(xml)
    for i, link in enumerate(links):
        if i < skip:
            continue
        if link.startswith("/"):
            # avoid a double slash, base already ends with '/'
            link = base + link.lstrip("/")
        content = get_elysee_speech_from_elysees(link)
        if content is not None:
            yield dict(link=link, text=content)
    if len(links) == 0:
        raise ValueError(
            "Unable to extract links from url='{0}'\npattern='{1}'\n-----\n{2}".format(
                url, reg, xml))

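# Offline sketch of the link-extraction pattern on a hand-made fragment
# (hypothetical HTML, the real page layout may differ):
fragment = '<a href="/emmanuel-macron/2018/07/03/un-discours" class="link">'
pattern = re.compile(
    "href=\\\"(.+?/[0-9]{4}/[0-9]{2}/[0-9]{2}/.+?)\\\" class=")
print(pattern.findall(fragment))
# prints: ['/emmanuel-macron/2018/07/03/un-discours']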