Coverage for src/ensae_teaching_cs/homeblog/buildkeywords.py: 86%
311 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Contains the main functions to publish my blog (http://www.xavierdupre.fr/blog).
5executed:
6"""
7import re
8import os
9import xml.dom.minidom
10from pyquickhelper.loghelper import fLOG
11from .modifypost import load_and_modify_xml_dom
12from .filefunction import find_all_blogs_function
def removeAccent(s):
    """
    Removes from *s* every character which is not whitelisted.

    The characters kept are the ASCII letters, the digits, the space
    and ``~ + ' . , & ; -``. Everything else, accented letters included,
    is dropped (not transliterated).

    @param      s       string to clean
    @return             *s* without the rejected characters
    """
    rejected = re.compile("([^~+'.0-9,ea-zA-Z&; -])")
    return rejected.sub("", s)
def removeAccent_debug(s):
    """
    Removes from *s* every character which is not whitelisted, using a
    larger whitelist than @see fn removeAccent.

    On top of the ASCII letters, the digits, the space and
    ``~ + ' . , & ; -``, this version also keeps ``#`` and the accented
    letters ``ç ô é à è â û``.

    @param      s       string to clean
    @return             *s* without the rejected characters
    """
    rejected = re.compile("([^~+'.#çôéàèâû0-9,ea-zA-Z&; -])")
    return rejected.sub("", s)
def removeHtmlAccent(s):
    """
    Replaces the HTML entities of a few French accented letters
    by the letters themselves.

    Only the eight entities listed below are converted; any other entity
    (``&amp;``, ``&lt;``, ...) is left untouched so that the markup
    structure of *s* is preserved.

    @param      s       string which may contain HTML entities
    @return             *s* with the known entities replaced
    """
    # Every pair used to map a letter to itself, which made the function a
    # no-op. The left-hand sides are the named entities of the letters.
    entities = (
        ("&eacute;", "é"),
        ("&agrave;", "à"),
        ("&acirc;", "â"),
        ("&ecirc;", "ê"),
        ("&ocirc;", "ô"),
        ("&egrave;", "è"),
        ("&ccedil;", "ç"),
        ("&ucirc;", "û"),
    )
    for entity, letter in entities:
        s = s.replace(entity, letter)
    return s
def FixIssuesWithAccent(text):
    """
    Repairs wrongly decoded accented letters in *text*.

    See http://migo.sixbit.org/more/html-entities.html and
    http://www.thesauruslex.com/typo/eng/enghtml.htm.

    The function walks through an ordered table of pairs
    *(broken, fixed)*. For each pair, *broken* is replaced by *fixed*
    when it is preceded by ``\\xC3``, then when it is preceded by
    ``\\xE3``, and finally when it stands alone. The order of the table
    matters because later pairs are applied on the result of earlier ones.

    Once every pair was applied, a text shorter than 50 characters which
    still contains a character rejected by @see fn removeAccent_debug
    is logged and an exception is raised.

    @param      text        text to repair
    @return                 repaired text
    @raise      ValueError  if a short text still holds an unexpected character
    """
    original = text

    # NOTE(review): several pairs map a string to itself as displayed here;
    # presumably the left-hand sides were mis-decoded byte sequences --
    # confirm against the encoding of the original file before editing.
    replacements = [
        ("ã©", "é"),
        ("ô", "ô"),
        ("â", "â"),
        ("î", "î"),
        ("è", "è"),
        ("ê", "ê"),
        ("â", "â"),
        ("ç", "ç"),
        ("Ã ", "à "),
        ("\xE9", "é"),
        ("\xE0", "à"),
        ("\xA0", "à"),
        ("\xE8", "è"),
        ("\xA8", "è"),
        ("\xF4", "ô"),
        ("\xB4", "ô"),
        ("\xFB", "û"),
        ("\xC3\xAA", "ê"),
        ("\xC3\xAE", "î"),
        ("\xAE", "î"),
        ("\xEE", "î"),
        ("\xEA", "ê"),
        ("\xAA", "ê"),
        ("Ã", "à"),
    ]

    # prefixed forms first, the bare form last
    prefixes = ("\xC3", "\xE3", "")
    for broken, fixed in replacements:
        for prefix in prefixes:
            text = text.replace(prefix + broken, fixed)

    cleaned = removeAccent_debug(text)
    if len(cleaned) == len(text) or len(text) >= 50:
        return text

    # a short text still contains something unexpected: log it and fail
    fLOG("FixIssuesWithAccent", original.encode("utf8"), text.encode("utf8"))
    fLOG("FixIssuesWithAccent", original, text)
    details = [text, [text], cleaned, text.encode("utf8")]
    raise ValueError("unable to deal with " + str(details))
def modify_all_blogs_list_in_place(folder=".",
                                   mainpage=os.path.join(
                                       "blog", "xd_blog.html"),
                                   outmainpage=os.path.join(
                                       "blog", "xd_blog.html"),
                                   allow_temp=False):
    """
    Rewrites the list of blog posts stored in the main page.

    The main page must contain exactly two lines of slashes (the marker);
    everything between them is replaced by the quoted names of the blog
    files (without the ``.html`` extension), one per line, sorted in
    reverse order.

    @param      folder          folder searched by ``find_all_blogs_function``
    @param      mainpage        page to read
    @param      outmainpage     page to write (can be the same as *mainpage*)
    @param      allow_temp      forwarded to ``find_all_blogs_function``
    @raise      AssertionError  if the marker does not appear exactly twice
    """
    marker = "//////////////////////////////////////////"

    found = find_all_blogs_function(folder, allow_temp=allow_temp)
    names = sorted((os.path.split(_)[-1].replace(".html", "") for _ in found),
                   reverse=True)

    with open(mainpage, "r", encoding="utf8") as f:
        cont = f.read()

    trois = cont.split(marker)
    if len(trois) != 3:
        # An explicit raise is not stripped by ``python -O`` and the
        # exception type is the one the former ``assert`` produced.
        raise AssertionError(
            "expected exactly two markers in %r, found %d" % (
                mainpage, len(trois) - 1))

    trois[1] = "\n" + ",\n".join(f"\"{_}\"" for _ in names) + "\n"

    with open(outmainpage, "w", encoding="utf8") as f:
        f.write(marker.join(trois))
def _replace_marked_section(content, marker, rows):
    """
    Replaces what stands between the first two occurrences of *marker*
    in *content* by *rows*, one per line, separated by commas.
    """
    parts = content.split(marker)
    parts[1] = "\n" + ",\n".join(rows) + "\n"
    return marker.join(parts)


def file_all_keywords(folder=".",
                      mainpage=os.path.join("blog", "xd_blog.html"),
                      outmainpage=os.path.join("blog", "xd_blog.html"),
                      exclude=None, allow_temp=False):
    """
    Collects the keywords of every blog post and updates the main page.

    The keywords of a post are read from its ``<meta name="keywords">``
    tag (comma separated). Keywords used by more than one post are written
    into three marked sections of the main page: the tags with their
    frequency, the posts attached to each tag, and the mapping from the
    accent-free tag to its displayed form. The list of all posts is then
    refreshed by @see fn modify_all_blogs_list_in_place.

    @param      folder          folder searched by ``find_all_blogs_function``
    @param      mainpage        page to read
    @param      outmainpage     page to write (can be the same as *mainpage*)
    @param      exclude         forwarded to ``find_all_blogs_function``
    @param      allow_temp      forwarded to ``find_all_blogs_function``
    @return                     dictionary ``{ file: sorted list of keywords }``
    @raise      RuntimeError    if no blog post was found
    """
    keepfile = find_all_blogs_function(folder, exclude, allow_temp=allow_temp)
    if len(keepfile) == 0:
        raise RuntimeError("no found file")

    hist = {}
    store_keywords = {}
    for name in keepfile:
        dom = load_and_modify_xml_dom(name, None)
        meta = dom.documentElement.getElementsByTagName("meta")
        node = [_ for _ in meta
                if "name" in _.attributes and
                _.attributes["name"].value == "keywords"]
        keywords = sorted(
            _.strip() for _ in node[0].attributes["content"].value.split(","))
        store_keywords[name] = keywords
        for k in keywords:
            hist[k] = hist.get(k, 0) + 1

    # (frequency, keyword) for keywords shared by several posts,
    # the most frequent first
    res = sorted(((v, k) for k, v in hist.items() if v > 1), reverse=True)

    with open(mainpage, "r", encoding="utf8") as f:
        cont = f.read()

    # tag
    rows = ["[\"%s (%d)\",\"%s\"]" % (FixIssuesWithAccent(k), v, removeAccent(k))
            for v, k in res]
    cont = _replace_marked_section(cont, "////////////###########", rows)

    # documents
    rows = []
    for _, keyword in res:
        posts = [name for name in keepfile if keyword in store_keywords[name]]
        posts = [os.path.split(_)[-1].replace(".html", "") for _ in posts]
        posts.sort(reverse=True)
        posts = [f'"{_}"' for _ in posts]
        rows.append(f'"{removeAccent(keyword)}":' + f"[ {', '.join(posts)} ] ")
    cont = _replace_marked_section(
        cont, "////////////---------------------", rows)

    # rev keywords
    rows = [f'"{removeAccent(keyword)}":"{FixIssuesWithAccent(keyword)}"'
            for _, keyword in res]
    cont = _replace_marked_section(cont, "////////////+++++++++++++++++", rows)

    with open(outmainpage, "w", encoding="utf8") as f:
        f.write(cont)

    modify_all_blogs_list_in_place(
        folder, outmainpage, outmainpage, allow_temp=allow_temp)
    return store_keywords
def build_bloc_keywords(res, frequence_threshold, rootfile):
    """
    Builds the HTML bloc listing the keywords.

    @param      res                     dictionary ``{ file: list of keywords }``
    @param      frequence_threshold     minimum number of occurrences a keyword needs
                                        to get a line in the bloc
    @param      rootfile                prefix of the page a keyword links to
                                        (``<rootfile>_<keyword>.html``)
    @return                             tuple *(html, keywords)*, *keywords* is the list of
                                        all ``(frequency, keyword)`` sorted in decreasing order
    """
    counts = {}
    for words in res.values():
        for word in words:
            counts[word] = counts.get(word, 0) + 1

    keywords = sorted(((nb, word) for word, nb in counts.items()),
                      reverse=True)

    pattern = '<p class="keywordtitle"><a href="%s_%s.html" target="_parent">%s</a> (%d)</p>'
    lines = [pattern % (rootfile, removeAccent(word), FixIssuesWithAccent(word), nb)
             for nb, word in keywords if nb >= frequence_threshold]
    return "\n".join(lines), keywords
def build_bloc_months(res, rootfile):
    """
    Builds the HTML bloc listing the months, the month of a post
    is given by the first seven characters of its file name
    (expected to look like ``YYYY-MM-DD-something.html``).

    @param      res         dictionary indexed by the blog files
    @param      rootfile    prefix of the page a month links to
                            (``<rootfile>_<month>.html``)
    @return                 tuple *(html, months)*, *months* is the list of
                            ``(number of posts as a string, month)``,
                            the most recent month first
    """
    counts = {}
    for path in res:
        key = os.path.split(path)[-1][:7]
        counts[key] = counts.get(key, 0) + 1

    ordered = sorted(((month, str(nb)) for month, nb in counts.items()),
                     reverse=True)

    pattern = '<p class="monthtitle"><a href="%s_%s.html" target="_parent">%s</a> (%s)</p>'
    lines = []
    previous_year = None
    for month, nb in ordered:
        current_year = month[:4]
        # a separator is inserted every time the year changes
        if previous_year is not None and current_year != previous_year:
            lines.append('<p class="smallspace">.</p>')
        lines.append(pattern % (rootfile, month, month, nb))
        previous_year = current_year

    return "\n".join(lines), [(nb, month) for month, nb in ordered]
def replace_xml_in_template_using_dom_dirty(dom, node, newvalue):
    """
    Replaces a node of a DOM by working on the serialized text:
    the document and the node are both converted into XML, the text of
    the node is replaced by *newvalue* wherever it appears and the result
    is parsed again.

    @param      dom         document
    @param      node        node to replace
    @param      newvalue    XML text which takes the place of the node
    @return                 new document
    @raise      ValueError  if the text of the node is not found in the document
    """
    fragment = node.toxml()
    document = dom.documentElement.toxml()
    if fragment not in document:
        raise ValueError("unable to replace")
    return xml.dom.minidom.parseString(document.replace(fragment, newvalue))
def get_node_div(template, cl):
    """
    Returns the only ``div`` of the document whose attribute ``class``
    is exactly *cl*.

    @param      template    document
    @param      cl          value of the attribute ``class``
    @return                 the matching node
    @raise      ValueError  if zero or several ``div`` match
    """
    matches = [div
               for div in template.documentElement.getElementsByTagName("div")
               if "class" in div.attributes and
               div.attributes["class"].value == cl]
    if len(matches) != 1:
        raise ValueError("issue with HTML format: " +
                         cl + ", " + str(len(matches)))
    return matches[0]
def _aggregated_page_name(aggregatedFile, counter):
    """
    Returns the name of the aggregated page whose first post is the
    *counter*-th one: *aggregatedFile* itself for the first page,
    ``<name>_<counter on 4 digits>.html`` for the following ones.
    """
    if counter > 0:
        return "%s_%04d.html" % (aggregatedFile.replace(".html", ""), counter)
    return aggregatedFile


def generate_html_article(res,
                          templateFile,
                          toFolder,
                          overwrite=False,
                          aggregatedFile=None,
                          maxAggregrate=15,
                          keywordsText=None,
                          otherLayer=None):
    """
    Produces static HTML pages from the blog posts and a template.

    If *aggregatedFile* is None, every post gives one page
    ``<post>_nojs.html``. Otherwise the posts are grouped by
    *maxAggregrate* (most recent first) and every group gives one page:
    *aggregatedFile* for the first group, then
    ``<aggregatedFile>_<index of the first post>.html``.
    A page is written only if its content changed or if *overwrite* is True.

    @param      res             dictionary ``{ blog file: keywords }``
    @param      templateFile    template, it contains the placeholders replaced by this function
                                (``<!-- article here -->``, ``### KEYWORDS ###``, ...)
    @param      toFolder        destination folder, it must exist
    @param      overwrite       writes a page even if its content did not change
    @param      aggregatedFile  name of the aggregated page or None to get one page per post
    @param      maxAggregrate   number of posts per aggregated page
    @param      keywordsText    text put in place of the keywords,
                                None to use the keywords of the last post of the page
    @param      otherLayer      only used by a disabled piece of code (link to the other layer)
    @return                     list of the written files
    @raise      FileNotFoundError   if *toFolder* does not exist
    @raise      ValueError      if a post has an empty body or a title containing ``XD blog``
    @raise      RuntimeError    if the keywords placeholder is still in the final page
    """
    fileToReturn = []

    if not os.path.exists(toFolder):
        raise FileNotFoundError("not found " + toFolder)

    # group files or not
    toprocess = []
    if aggregatedFile is not None:
        counter = 0
        stackFile = []

        for file in sorted(res, reverse=True):
            stackFile.append(file)
            if len(stackFile) == maxAggregrate:
                fileOutName = os.path.join(
                    toFolder, _aggregated_page_name(aggregatedFile, counter))
                toprocess.append((stackFile, fileOutName))
                counter += len(stackFile)
                stackFile = []

        # the last group may be incomplete
        if len(stackFile) > 0:
            fileOutName = os.path.join(
                toFolder, _aggregated_page_name(aggregatedFile, counter))
            toprocess.append((stackFile, fileOutName))
    else:
        # we process all files, each of them gives a file
        for file in sorted(res, reverse=True):
            filename = os.path.split(file)[-1].replace(".html", "_nojs.html")
            filename = os.path.join(toFolder, filename)
            toprocess.append(([file], filename))

    # updating the sidebar
    template = load_and_modify_xml_dom(templateFile, None, False)
    templateText = template.documentElement.toxml()
    title_to_rep = template.documentElement.getElementsByTagName("title")[
        0].toxml()

    # all files to process are now in the list
    for indexProcess, (files, filename) in enumerate(toprocess):
        stackContent = []
        scripthtml = ""
        replacetitle = None

        for file in files:
            dom = load_and_modify_xml_dom(file, None)
            date = os.path.split(file)[-1][:10]

            title = dom.documentElement.getElementsByTagName("title")[
                0].toxml()
            if "XD blog" in title:
                raise ValueError("a blog contains a bad title: " + file)
            if len(files) == 1:
                # in that case, we want to change the page title
                replacetitle = title

            # the title of the post becomes a header starting with a link
            title = title.replace("title>", "h2>")
            link = f'<a href="{date}_nojs.html"><b>{date}</b></a>'
            title = title.replace("<h2>", "<h2>" + link + " ")

            scripts = dom.documentElement.getElementsByTagName("script")
            if len(scripts) > 1:
                scr = [""] + [_.toxml() for _ in scripts]
                scripthtml += "\n".join(scr)

            body = dom.documentElement.getElementsByTagName("body")[
                0].toxml()
            # drops the first 6 and the last 7 characters, presumably the
            # tags <body> and </body> (assumes a tag without attributes)
            body = body[6:-7]

            if len(files) > 1 and '<!-- CUT PAGE HERE -->' in body:
                # here we deal with shortcuts except if we process a single
                # document
                body = body.split('<!-- CUT PAGE HERE -->')[0]
                body += "<br />" + f"<a href=\"{date}_nojs.html\">more...</a>"

            if len(body.strip()) == 0:
                raise ValueError("empty body for " + file)
            stackContent.append(title + "\n" + body)

            # keywords starting with ~ are hidden, the keywords of the
            # last post of the page are the ones which remain
            keywords = res[file]
            uniqueKeys = sorted(
                _ for _ in set(keywords) if not _.startswith("~"))
            keystext = ", ".join(uniqueKeys)

        # links to the previous and next pages; < and > are escaped
        # because the page is written as an XML document
        nextPage = ""
        if indexProcess > 0:
            nextPage += '<a href="%s"><i>&lt;--</i></a> ' % (
                os.path.split(toprocess[indexProcess - 1][1])[-1])
        if indexProcess < len(toprocess) - 1:
            nextPage += '<a href="%s"><i>--&gt;</i></a> ' % (
                os.path.split(toprocess[indexProcess + 1][1])[-1])

        if keywordsText is not None:
            keystext = keywordsText

        # inside
        post = templateText.replace(
            "<!-- article here -->", "\n".join(stackContent))
        post = post.replace(
            '<a href="xd_blog_nojs_DDD.html"><i>suite</i></a>', nextPage)
        post = post.replace("<!-- javascript here -->", scripthtml)
        post = post.replace("<!-- article keywords -->", keystext)
        post = post.replace("### KEYWORDS ###", keystext)
        post = post.replace("### keywords ###", keystext)

        # disabled on purpose: it does not work (pages too big)
        enabled = False
        if enabled:
            olayer = f'<p class="keywordtitle"><a href="xd_blog.html?date={date}">Other Layer</a></p>' \
                if otherLayer is None else \
                f'<p class="keywordtitle"><a href="{otherLayer}">Other Layer</a></p>'
            post = post.replace("<!-- other layer -->", olayer)

        post = '<?xml version="1.0" encoding="utf-8"?>\n' + post
        post = post.replace('type="text/javascript"/>',
                            'type="text/javascript"></script>')

        post = FixIssuesWithAccent(post)

        if replacetitle is not None:
            # there was only one document, we replace it
            post = post.replace(title_to_rep, replacetitle)

        # we save the results
        if os.path.exists(filename):
            try:
                with open(filename, "r", encoding="utf8") as f:
                    hist = f.read()
            except UnicodeDecodeError:
                # logs the existing content before failing
                fLOG("issue with file ", filename)
                with open(filename, "r") as f:
                    content = f.read()
                fLOG(content[170:])
                raise
        else:
            hist = ""

        if post != hist or overwrite:
            if not overwrite:
                fLOG(" writing ", filename)
            if "### keywords ###" in post.lower():
                raise RuntimeError(
                    "unable to release that document with this string ### KEYWORDS ###,\nkeywords should be " + str(keystext))
            with open(filename, "w", encoding="utf8") as f:
                f.write(post)
            fileToReturn.append(filename)

    return fileToReturn
def _write_sidebar_frame(path, opening, content):
    """
    Writes a small standalone HTML page made of a fixed header,
    *opening* (the opening ``div`` and its title), *content*
    and the closing tags.
    """
    with open(path, "w", encoding="utf8") as f:
        f.write("""<?xml version="1.0" encoding="utf-8"?>\n""")
        f.write("<html>\n")
        f.write("<head>\n")
        f.write(
            """<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n""")
        f.write("""<link href="pMenu.css" rel="stylesheet" type="text/css"/>\n""")
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(opening)
        f.write(content)
        f.write("\n</div>\n")
        f.write("\n</body></html>\n")


def build_process_all_pages(res,
                            keywordsHTML="frame_keywords.html",
                            siteFolder="../site/blog",
                            xd_blog_template_nojs=os.path.join(
                                "blog", "xd_blog_template_nojs.html"),
                            xd_blog_nojs="xd_blog_nojs.html",
                            frequence_keywords=3,
                            monthsHTML="frame_months.html"
                            ):
    """
    Builds every static page of the blog: the keywords frame, the months
    frame, the aggregated pages (all posts, per keyword, per month)
    and one page per post.

    @param      res                     output from function file_all_keywords
    @param      keywordsHTML            html template for the keywords, None to skip that frame
    @param      siteFolder              folder the blog (the one to be published)
    @param      xd_blog_template_nojs   template for blog (static text, less javascript)
    @param      xd_blog_nojs            main page (static text, less javascript)
    @param      frequence_keywords      a keyword whose frequency is below that threshold
                                        is not listed in the keywords frame
    @param      monthsHTML              html template for the months, None to skip that frame
    @return                             all created pages
    """
    add = []

    fLOG("processing keywords")
    htmlkey, keywords = build_bloc_keywords(
        res, frequence_keywords, "xd_blog_key")
    if keywordsHTML is not None:
        file = os.path.join(siteFolder, keywordsHTML)
        fLOG("writing ", file)
        _write_sidebar_frame(
            file,
            """<div class="sidebarfull">\n"""
            """<p class="keywordtitle"><b>Keywords</b></p>\n""",
            htmlkey)
        add.append(file)

    fLOG("processing months")
    htmlkeym, monthsp = build_bloc_months(res, "xd_blog_month")
    if monthsHTML is not None:
        file = os.path.join(siteFolder, monthsHTML)
        fLOG("writing ", file)
        _write_sidebar_frame(
            file,
            """<div class="sidebarfullleft">\n<hr />\n"""
            """<p class="monthtitle"><b>Months</b></p>\n""",
            htmlkeym)
        add.append(file)

    # build keyword pages
    fLOG("building aggregated page for keywords")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        overwrite=True,
        aggregatedFile=xd_blog_nojs,
        keywordsText="",
        otherLayer="xd_blog.html")

    # process all pages for each keyword
    for _, b in keywords:
        fLOG("building page for keyword", FixIssuesWithAccent(b))
        bb = removeAccent(b)
        # the posts holding the keyword, their own keywords are not needed
        tempres = {k: "" for k, v in res.items() if b in v}
        add += generate_html_article(
            tempres,
            xd_blog_template_nojs,
            siteFolder,
            overwrite=True,
            aggregatedFile=f"xd_blog_key_{bb}.html",
            keywordsText=FixIssuesWithAccent(b),
            otherLayer=f"xd_blog.html?tag={FixIssuesWithAccent(b)}")

    # build months pages (same call as above, the files are listed twice
    # in the returned list)
    fLOG("building aggregated page for months")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        overwrite=True,
        aggregatedFile=xd_blog_nojs,
        keywordsText="",
        otherLayer="xd_blog.html")

    # process all pages for each month
    for _, b in monthsp:
        fLOG("building page for months", b)
        bb = removeAccent(b)
        # the posts whose file name starts with the month
        tempres = {k: "" for k in res
                   if os.path.split(k)[-1].startswith(b)}
        add += generate_html_article(
            tempres,
            xd_blog_template_nojs,
            siteFolder,
            overwrite=True,
            aggregatedFile=f"xd_blog_month_{bb}.html",
            keywordsText=FixIssuesWithAccent(b),
            otherLayer=f"xd_blog.html?tag={FixIssuesWithAccent(b)}")

    # build all pages (one per blog)
    fLOG("building all pages")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        overwrite=True,
        otherLayer=None)
    return add