Coverage for src/ensae_teaching_cs/td_1a/discours_politique.py: 13%
123 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Retrieve political speeches from Internet
6"""
8import re
9import html.parser
10import html.entities as htmlentitydefs
11import warnings
12from pyquickhelper.loghelper import get_url_content
def xmlParsingLongestDiv(text):
    """
    Extracts the longest div section.

    @param text text of HTML page
    @return text
    """
    class _DivCollector(html.parser.HTMLParser):
        """
        Accumulates the textual content of every ``<div>`` element,
        turning paragraph breaks into newlines and other tags into spaces.
        """

        def __init__(self):
            html.parser.HTMLParser.__init__(self, convert_charrefs=True)
            self.open_divs = []    # stack of currently open "div" tags
            self.buffers = []      # one text buffer per open div
            self.collected = []    # (tag, text) pairs for closed divs

        def handle_starttag(self, tag, attrs):
            # A new div opens a fresh buffer; any other tag seen inside
            # a div contributes a single space.
            if tag == "div":
                self.open_divs.append(tag)
                self.buffers.append([])
            elif self.open_divs:
                self.buffers[-1].append(" ")

        def handle_endtag(self, tag):
            if tag == "div":
                self.collected.append(
                    (self.open_divs[-1], "".join(self.buffers[-1])))
                self.open_divs.pop()
                self.buffers.pop()
            elif self.open_divs:
                # Closing paragraphs / line breaks become newlines,
                # every other closing tag a space.
                self.buffers[-1].append("\n" if tag in ("p", "br") else " ")

        def handle_data(self, data):
            if self.open_divs:
                self.buffers[-1].append(data)

    collector = _DivCollector()
    collector.feed(text.replace(" -g8\" ", " "))

    # Keep the content of the longest div.
    best = ""
    for tag, content in collector.collected:
        if tag == "div" and len(content) > len(best):
            best = content

    # Normalise leftover markup (usually a no-op since the parser
    # already stripped tags).
    newline = "\n"
    for old, new in (("<p>", ""), ("</p>", newline), ("\r", ""),
                     ("<br />", newline), ("<br>", newline)):
        best = best.replace(old, new)

    # Pages made of many short "|"-separated items look like menus,
    # not speeches; tracking javascript disqualifies the page too.
    pattern = re.compile("[|]((.|\n){5,50}) ")
    menu_items = pattern.findall(best)
    if (len(menu_items) == 0 or len(best) > 10000) \
            and "if (window.xtparam!=null)" not in best:
        return best
    return ""
def html_unescape(text):
    """
    Removes :epkg:`HTML` or :epkg:`XML`
    character references and entities from a text string.
    keep ``&``, ``>``, ``<`` in the source code.
    from `Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>`_

    @param text text
    @return cleaning text
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference, decimal or hexadecimal
            try:
                # HTML allows both "&#x" and "&#X" for hexadecimal
                # references (previously only the lowercase form worked).
                if text[:3] in ("&#x", "&#X"):
                    return chr(int(text[3:-1], 16))
                else:
                    return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity; &amp;, &gt;, &lt; are deliberately kept
            # escaped so the output stays valid source code
            try:
                if text[1:-1] == "amp":
                    text = "&amp;"
                elif text[1:-1] == "gt":
                    text = "&gt;"
                elif text[1:-1] == "lt":
                    text = "&lt;"
                else:
                    text = chr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub("&#?\\w+;", fixup, text)
def force_unicode(text):
    """
    Deals with unicodes: encodes *text* to ASCII, silently dropping
    every character that has no ASCII representation.

    @param text text (``str``)
    @return ASCII ``bytes``

    .. note::
        The previous implementation looped, catching
        ``UnicodeDecodeError`` and substituting dozens of mojibake
        sequences. ``str.encode`` can only raise ``UnicodeEncodeError``,
        and ``errors="ignore"`` suppresses even that, so the first
        iteration always succeeded and the whole substitution loop was
        dead code. It has been removed; the observable behavior
        (returning the ASCII-encoded bytes) is unchanged.
    """
    return text.encode("ascii", errors="ignore")
def remove_accent(text):
    """
    Replaces French accents by regular letters.

    @param text text
    @return cleaned text
    """
    # Each group starts with the plain letter followed by its accented
    # variants. The original code called ``text.replace(c, d)``, i.e.
    # it searched for the *whole group string* (such as "aàâä"), which
    # never occurs in real text and replaced in the wrong direction.
    for group in ["aàâä", "eéèêë", "iîï", "oöô", "uùüû"]:
        plain = group[0]
        for accented in group[1:]:
            text = text.replace(accented, plain)
    return text
def get_elysee_speech_from_elysees(title, url="https://www.elysee.fr/"):
    """
    Retrieves the text from the :epkg:`Elysees`.

    @param title title of the document, or a full http(s) link
    @param url website, must end with ``/``
    @return longest div of the html page, or ``None`` when the
            download fails
    @raise RuntimeError if *url* does not end with ``/``

    The function tries something like::

        url + title.replace(" ", "-")
    """
    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise RuntimeError("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(
            " ", "-").replace("'", "-").replace('"', "")
        # *url* is guaranteed to end with '/' here; the previous code
        # inserted a second one, producing 'https://www.elysee.fr//...'.
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        # best effort: signal the failure but keep enumerations going
        warnings.warn(f"Unable to retrieve '{full}' - {e}")
        return None
    return xmlParsingLongestDiv(text)
def enumerate_speeches_from_elysees(url="agenda", skip=0):
    """
    Enumerates speeches from the :epkg:`Elysees`.

    @param url subaddress, url source will be
               ``'https://www.elysee.fr/' + url``
    @param skip skip the first *skip* one in the list
    @return enumerate dictionaries

    .. exref::
        :title: Récupérer des discours du président de la république
        :tag: Exercice

        ::

            for i, disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    Others links can be used such as
    ``https://www.elysee.fr/recherche?query=discours``.
    The website changed in 2018 and no longer support xml or json
    streams.
    """
    base = "https://www.elysee.fr/"
    if not url.startswith("http"):
        url = base + url
    page = get_url_content(url)
    # links to individual speeches embed a yyyy/mm/dd date in their path
    pattern = re.compile(
        "href=\\\"(.+?/[0-9]{4}/[0-9]{2}/[0-9]{2}/.+?)\\\" class=")
    found = pattern.findall(page)
    for index, href in enumerate(found):
        if index < skip:
            continue
        if href.startswith("/"):
            href = base + href
        body = get_elysee_speech_from_elysees(href)
        if body is not None:
            yield dict(link=href, text=body)
    if not found:
        raise ValueError("Unable to extract links from url='{0}'\npattern='{1}'\n-----\n{2}".format(
            url, pattern, page))