Coverage for src/ensae_teaching_cs/homeblog/buildkeywords.py: 86%

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Contains the main functions used to publish my blog (http://www.xavierdupre.fr/blog).

5executed:

6"""

7import re

8import os

9import xml.dom.minidom

10from pyquickhelper.loghelper import fLOG

11from .modifypost import load_and_modify_xml_dom

12from .filefunction import find_all_blogs_function

def removeAccent(s):
    """
    Drops from *s* every character which is not allowed in a simplified keyword.

    Kept characters: ASCII letters, digits, space and ``~ + ' . , & ; -``.
    Accented letters are removed, they are not replaced by their
    unaccented version.

    @param      s       string
    @return             string without the forbidden characters
    """
    forbidden = "([^~+'.0-9,ea-zA-Z&; -])"
    cleaned = re.sub(forbidden, "", s)
    return cleaned

def removeAccent_debug(s):
    """
    Same filter as @see fn removeAccent with a larger whitelist:
    ``#`` and the accented letters ``ç ô é à è â û`` are kept as well.

    @param      s       string
    @return             string without the characters outside the whitelist
    """
    forbidden = "([^~+'.#çôéàèâû0-9,ea-zA-Z&; -])"
    cleaned = re.sub(forbidden, "", s)
    return cleaned

def removeHtmlAccent(s):
    """
    Replaces a few HTML named entities by the accented character they encode.

    Handled entities: ``&eacute; &agrave; &acirc; &ecirc; &ocirc; &egrave;
    &ccedil; &ucirc;``. Any other entity is left untouched.

    @param      s       string which may contain HTML entities
    @return             string where the handled entities are replaced
                        by é, à, â, ê, ô, è, ç, û
    """
    # Each pair must map the entity (not the character itself) to the
    # character, otherwise the replacement does nothing.
    entities = (
        ("&eacute;", "é"),
        ("&agrave;", "à"),
        ("&acirc;", "â"),
        ("&ecirc;", "ê"),
        ("&ocirc;", "ô"),
        ("&egrave;", "è"),
        ("&ccedil;", "ç"),
        ("&ucirc;", "û"),
    )
    for entity, char in entities:
        s = s.replace(entity, char)
    return s

def FixIssuesWithAccent(text):
    """
    Replaces known garbled character sequences (such as ``Ã©``)
    by the accented character they stand for.

    See http://migo.sixbit.org/more/html-entities.html and
    http://www.thesauruslex.com/typo/eng/enghtml.htm.

    Reference table (character = garbled form = expected character),
    the function only handles the pairs listed in its replacement table:

    ::

        é = Ã© = é
        è = Ã¨ = è
        à = Ã  = à
        ï = Ã¯ = ï
        ô = Ã´ = ô
        ç = Ã§ = ç
        ê = Ãª = ê
        ù = Ã¹ = ù
        æ = Ã¦ = æ
        œ = Å“ = &oelig;
        ë = Ã« = ë
        ü = Ã¼ = ü
        â = Ã¢ = â
        € = â‚¬ = €
        © = Â© = ©
        ¤ = Â¤ = ¤

    @param      text        string to fix
    @return                 fixed string
    @raise      ValueError  if the fixed string is shorter than 50 characters
                            and still contains a character rejected by
                            @see fn removeAccent_debug
    """
    original = text

    # The order matters: the pairs are applied one after the other and the
    # last one (a lone "Ã") would break the longer sequences if applied first.
    replacements = [
        ("ã©", "é"),
        ("Ã´", "ô"),
        ("Ã¢", "â"),
        ("Ã®", "î"),
        ("Ã¨", "è"),
        ("Ãª", "ê"),
        ("Ã¢", "â"),
        ("Ã§", "ç"),
        ("Ã ", "à "),
        ("\xE9", "é"),
        ("\xE0", "à"),
        ("\xA0", "à"),
        ("\xE8", "è"),
        ("\xA8", "è"),
        ("\xF4", "ô"),
        ("\xB4", "ô"),
        ("\xFB", "û"),
        ("\xC3\xAA", "ê"),
        ("\xC3\xAE", "î"),
        ("\xAE", "î"),
        ("\xEE", "î"),
        ("\xEA", "ê"),
        ("\xAA", "ê"),
        ("Ã", "à"),
    ]

    # Every sequence is first looked for behind "\xC3", then behind "\xE3",
    # then alone.
    prefixes = ("\xC3", "\xE3", "")
    for garbled, fixed in replacements:
        for prefix in prefixes:
            text = text.replace(prefix + garbled, fixed)

    has_unknown_char = len(removeAccent_debug(text)) != len(text)
    if not (has_unknown_char and len(text) < 50):
        return text

    # A short string still holds an unexpected character: log both versions
    # and fail.
    fLOG("FixIssuesWithAccent", original.encode("utf8"), text.encode("utf8"))
    fLOG("FixIssuesWithAccent", original, text)
    details = [text, [text], removeAccent_debug(text), text.encode("utf8")]
    raise ValueError("unable to deal with " + str(details))

def modify_all_blogs_list_in_place(folder=".",
                                   mainpage=os.path.join(
                                       "blog", "xd_blog.html"),
                                   outmainpage=os.path.join(
                                       "blog", "xd_blog.html"),
                                   allow_temp=False):
    """
    Rewrites the list of blog posts stored in the main page.

    The main page must contain the separator
    ``//////////////////////////////////////////`` exactly twice; the text
    between the two occurrences is replaced by the quoted names of the blog
    posts (file name without folder and without ``.html``), sorted in
    decreasing order, one per line, separated by commas.

    @param      folder          folder given to ``find_all_blogs_function``
    @param      mainpage        page to read
    @param      outmainpage     page to write (can be the same as *mainpage*)
    @param      allow_temp      forwarded to ``find_all_blogs_function``
    @raise      AssertionError  if the separator does not split the page
                                into three parts
    """
    separator = "//////////////////////////////////////////"

    blogs = find_all_blogs_function(folder, allow_temp=allow_temp)
    names = sorted((os.path.split(_)[-1].replace(".html", "") for _ in blogs),
                   reverse=True)

    with open(mainpage, "r", encoding="utf8") as f:
        cont = f.read()

    trois = cont.split(separator)
    if len(trois) != 3:
        # explicit raise: an assert statement would be removed by python -O
        # and the page would be silently corrupted
        raise AssertionError(
            "expecting the separator twice in '%s', found %d occurrence(s)" % (
                mainpage, len(trois) - 1))

    trois[1] = "\n" + ",\n".join(f"\"{_}\"" for _ in names) + "\n"

    with open(outmainpage, "w", encoding="utf8") as f:
        f.write(separator.join(trois))

119

120

def _replace_section(cont, marker, rows):
    """
    Replaces what stands between the first two occurrences of *marker*
    in *cont* by *rows* (one per line, separated by commas).

    @param      cont        page content
    @param      marker      delimiter of the section
    @param      rows        list of strings
    @return                 modified content
    @raise      IndexError  if *marker* is not in *cont*
    """
    parts = cont.split(marker)
    parts[1] = "\n" + ",\n".join(rows) + "\n"
    return marker.join(parts)


def file_all_keywords(folder=".",
                      mainpage=os.path.join("blog", "xd_blog.html"),
                      outmainpage=os.path.join("blog", "xd_blog.html"),
                      exclude=None, allow_temp=False):
    """
    Collects the keywords of every blog post and updates the main page.

    The keywords of a post are read from its tag
    ``<meta name="keywords" content="...">`` (comma separated values).
    Only keywords used by at least two posts are written in the main page,
    in three sections delimited by the following markers:

    * ``////////////###########``: list of ``["keyword (count)","key"]``,
    * ``////////////---------------------``: ``"key":[ posts ]``,
    * ``////////////+++++++++++++++++``: ``"key":"keyword"``.

    The function finally calls @see fn modify_all_blogs_list_in_place
    on the written page.

    @param      folder          folder given to ``find_all_blogs_function``
    @param      mainpage        page to read
    @param      outmainpage     page to write
    @param      exclude         forwarded to ``find_all_blogs_function``
    @param      allow_temp      forwarded to ``find_all_blogs_function``
    @return                     dictionary ``{ blog file: sorted keywords }``
    @raise      RuntimeError    if no blog post was found
    @raise      IndexError      if a post has no keywords or if a marker
                                is missing from the main page
    """
    keepfile = find_all_blogs_function(folder, exclude, allow_temp=allow_temp)
    if len(keepfile) == 0:
        raise RuntimeError("no found file")

    hist = {}
    store_keywords = {}

    for f in keepfile:
        dom = load_and_modify_xml_dom(f, None)
        meta = dom.documentElement.getElementsByTagName("meta")
        node = [_ for _ in meta if "name" in _.attributes and _.attributes[
            "name"].value == "keywords"]
        if not node:
            # same exception type as node[0] on an empty list,
            # with the name of the faulty file
            raise IndexError("no meta tag 'keywords' in '%s'" % f)
        keywords = sorted(
            _.strip() for _ in node[0].attributes["content"].value.split(","))
        store_keywords[f] = keywords
        for keyword in keywords:
            hist[keyword] = hist.get(keyword, 0) + 1

    # (count, keyword), most frequent first, keywords used only once are
    # dropped
    res = sorted(((count, keyword)
                  for keyword, count in hist.items() if count > 1),
                 reverse=True)

    with open(mainpage, "r", encoding="utf8") as f:
        cont = f.read()

    # tag
    rows = ["[\"%s (%d)\",\"%s\"]" % (FixIssuesWithAccent(keyword), count,
                                        removeAccent(keyword))
            for count, keyword in res]
    cont = _replace_section(cont, "////////////###########", rows)

    # documents
    rows = []
    for _count, keyword in res:
        posts = [f for f in keepfile if keyword in store_keywords[f]]
        posts = [os.path.split(_)[-1].replace(".html", "") for _ in posts]
        posts.sort(reverse=True)
        posts = [f'"{_}"' for _ in posts]
        text = f'"{removeAccent(keyword)}":'
        text += f"[ {', '.join(posts)} ] "
        rows.append(text)
    cont = _replace_section(cont, "////////////---------------------", rows)

    # rev keywords
    rows = [f'"{removeAccent(keyword)}":"{FixIssuesWithAccent(keyword)}"'
            for _count, keyword in res]
    cont = _replace_section(cont, "////////////+++++++++++++++++", rows)

    with open(outmainpage, "w", encoding="utf8") as f:
        f.write(cont)

    modify_all_blogs_list_in_place(
        folder, outmainpage, outmainpage, allow_temp=allow_temp)
    return store_keywords

191

192

def build_bloc_keywords(res, frequence_threshold, rootfile):
    """
    Builds the HTML bloc which lists the keywords.

    @param      res                     dictionary whose values are lists of
                                        keywords (one list per blog post)
    @param      frequence_threshold     number of times a keyword needs to
                                        appear before getting a line in the bloc
    @param      rootfile                prefix of the linked pages
                                        (``<rootfile>_<keyword>.html``)
    @return                             tuple (HTML text,
                                        list of (count, keyword) sorted in
                                        decreasing order)
    """
    counts = {}
    for _post, tags in res.items():
        for tag in tags:
            counts[tag] = counts.get(tag, 0) + 1

    ranked = sorted(((nb, tag) for tag, nb in counts.items()), reverse=True)

    pattern = '<p class="keywordtitle"><a href="%s_%s.html" target="_parent">%s</a> (%d)</p>'
    lines = [pattern % (rootfile, removeAccent(tag), FixIssuesWithAccent(tag), nb)
             for nb, tag in ranked
             if nb >= frequence_threshold]
    return "\n".join(lines), ranked

213

214

def build_bloc_months(res, rootfile):
    """
    Builds the HTML bloc which lists the months
    (we assume the page name is ``YYYY-MM-DD-something-.html``,
    the month is its first seven characters).

    A line ``<p class="smallspace">.</p>`` is inserted every time
    the year changes.

    @param      res         dictionary indexed by the blog post files
    @param      rootfile    prefix of the linked pages
                            (``<rootfile>_<month>.html``)
    @return                 tuple (HTML text, list of (count as a string,
                            month), most recent month first)
    """
    counts = {}
    for page, _keywords in res.items():
        key = os.path.split(page)[-1][:7]
        counts[key] = counts.get(key, 0) + 1

    ordered = sorted(((month, str(nb)) for month, nb in counts.items()),
                     reverse=True)

    pattern = '<p class="monthtitle"><a href="%s_%s.html" target="_parent">%s</a> (%s)</p>'
    lines = []
    previous_year = None
    for month, nb in ordered:
        current_year = month[:4]
        if previous_year is not None and current_year != previous_year:
            lines.append('<p class="smallspace">.</p>')
        lines.append(pattern % (rootfile, month, month, nb))
        previous_year = current_year

    return "\n".join(lines), [(nb, month) for month, nb in ordered]

239

240

def replace_xml_in_template_using_dom_dirty(dom, node, newvalue):
    """
    Replaces a node by a piece of XML, working on the text and not on the DOM:
    the document is serialized, every occurrence of the serialized node
    is replaced by *newvalue* and the result is parsed again.

    @param      dom         DOM document
    @param      node        node to replace
    @param      newvalue    XML text to put instead of the node
    @return                 new DOM document
    @raise      ValueError  if the serialized node is not found in the
                            serialized document
    """
    fragment = node.toxml()
    document = dom.documentElement.toxml()
    if fragment not in document:
        raise ValueError("unable to replace")
    return xml.dom.minidom.parseString(document.replace(fragment, newvalue))

250

251

def get_node_div(template, cl):
    """
    Returns the only ``div`` of the document whose attribute ``class``
    is exactly *cl*.

    @param      template    DOM document
    @param      cl          expected value of the attribute ``class``
    @return                 the ``div`` node
    @raise      ValueError  if zero or more than one ``div`` match
    """
    matches = [div
               for div in template.documentElement.getElementsByTagName("div")
               if "class" in div.attributes
               and div.attributes["class"].value == cl]
    if len(matches) != 1:
        raise ValueError("issue with HTML format: " +
                         cl + ", " + str(len(matches)))
    return matches[0]

261

262

def generate_html_article(res,
                          templateFile,
                          toFolder,
                          overwrite=False,
                          aggregatedFile=None,
                          maxAggregrate=15,
                          keywordsText=None,
                          otherLayer=None):
    """
    Generates static HTML pages from blog posts and a template.

    If *aggregatedFile* is None, every post gives one page
    ``<post>_nojs.html``. Otherwise the posts (most recent first) are grouped
    by *maxAggregrate*: the first group goes to *aggregatedFile*, the next
    ones to ``<aggregatedFile>_NNNN.html`` where ``NNNN`` is the number of
    posts stored in the previous pages.

    @param      res             dictionary ``{ blog file: keywords }``
    @param      templateFile    template, loaded with ``load_and_modify_xml_dom``
    @param      toFolder        existing folder which receives the pages
    @param      overwrite       write the page even if its content did not change
    @param      aggregatedFile  name of the aggregated page or None
    @param      maxAggregrate   number of posts per aggregated page
    @param      keywordsText    if not None, replaces the keywords of the post
    @param      otherLayer      only used by a piece of code which is disabled
    @return                     list of written files
    @raise      FileNotFoundError   if *toFolder* does not exist
    @raise      ValueError      if a post title contains ``XD blog``
                                or if a post body is empty
    @raise      RuntimeError    if the string ``### keywords ###`` remains
                                in a page
    """
    fileToReturn = []

    if not os.path.exists(toFolder):
        raise FileNotFoundError("not found " + toFolder)

    def aggregated_name(nb_done):
        # the first page keeps the given name, the following ones get the
        # number of posts already stored as a suffix
        if nb_done > 0:
            name = "%s_%04d.html" % (
                aggregatedFile.replace(".html", ""), nb_done)
        else:
            name = aggregatedFile
        return os.path.join(toFolder, name)

    # group files or not
    toprocess = []
    if aggregatedFile is not None:
        counter = 0
        stackFile = []

        for file in sorted(res, reverse=True):
            stackFile.append(file)
            if len(stackFile) == maxAggregrate:
                stackFile.sort(reverse=True)
                toprocess.append((stackFile, aggregated_name(counter)))
                counter += len(stackFile)
                stackFile = []

        if len(stackFile) > 0:
            stackFile.sort(reverse=True)
            toprocess.append((stackFile, aggregated_name(counter)))
    else:
        # we process all files, each of them gives a file
        for file in sorted(res, reverse=True):
            filename = os.path.split(file)[-1].replace(".html", "_nojs.html")
            filename = os.path.join(toFolder, filename)
            toprocess.append(([file], filename))

    # updating the sidebar
    template = load_and_modify_xml_dom(templateFile, None, False)
    templateText = template.documentElement.toxml()
    title_to_rep = template.documentElement.getElementsByTagName("title")[
        0].toxml()

    # all files to process are now in the list
    for indexProcess, couple in enumerate(toprocess):
        files, filename = couple
        stackContent = []
        scripthtml = ""
        replacetitle = None

        for file in files:
            dom = load_and_modify_xml_dom(file, None)
            date = os.path.split(file)[-1][:10]

            title = dom.documentElement.getElementsByTagName("title")[
                0].toxml()
            if "XD blog" in title:
                raise ValueError("a blog contains a bad title: " + file)
            if len(files) == 1:
                # in that case, we want to change the page title
                replacetitle = title

            # the title of the post becomes a header preceded by a link
            title = title.replace("title>", "h2>")
            link = f'<a href="{date}_nojs.html"><b>{date}</b></a>'
            title = title.replace("<h2>", "<h2>" + link + " ")

            scripts = dom.documentElement.getElementsByTagName("script")
            if len(scripts) > 1:
                scr = [""] + [_.toxml() for _ in scripts]
                scripthtml += "\n".join(scr)

            b = dom.documentElement.getElementsByTagName("body")[0]
            body = b.toxml()

            # removes the first 6 and the last 7 characters, which are
            # <body> and </body> when the tag has no attribute
            body = body[6:]
            body = body[:-7]

            # NOTE(review): the marker below is an empty string in this copy
            # of the file ('' is in every string and split('') raises
            # ValueError); presumably a cut marker was lost, to be restored
            # from the repository.
            if len(files) > 1 and '' in body:
                # here we deal with shortcuts except if we process a single
                # document
                body = body.split('')[0]
                body += "<br />" + \
                    f"<a href=\"{date}_nojs.html\">{'more...'}</a>"

            if len(body.strip()) == 0:
                raise ValueError("empty body for " + file)
            stackContent.append(title + "\n" + body)
            keywords = res[file]

        # keywords of the last post of the group, the ones starting with ~
        # are hidden
        uniqueKeys = [_ for _ in set(keywords) if not _.startswith("~")]
        uniqueKeys.sort()
        keystext = ", ".join(uniqueKeys)

        # links to the previous and next generated pages
        nextPage = ""
        if indexProcess > 0:
            nextPage += '<a href="%s"><i><--</i></a> ' % (
                os.path.split(toprocess[indexProcess - 1][1])[-1])
        if indexProcess < len(toprocess) - 1:
            nextPage += '<a href="%s"><i>--></i></a> ' % (
                os.path.split(toprocess[indexProcess + 1][1])[-1])

        if keywordsText is not None:
            keystext = keywordsText

        # inside
        # NOTE(review): the three placeholders replaced with "" below are
        # empty strings in this copy of the file (str.replace("", x) inserts
        # x between every character); presumably the template placeholders
        # were lost, to be restored from the repository.
        post = templateText.replace(
            "", "\n".join(stackContent))
        post = post.replace(
            '<a href="xd_blog_nojs_DDD.html"><i>suite</i></a>', nextPage)
        post = post.replace("", scripthtml)
        post = post.replace("", keystext)
        post = post.replace("### KEYWORDS ###", keystext)
        post = post.replace("### keywords ###", keystext)

        enabled = False
        if enabled:
            olayer = f'<p class="keywordtitle"><a href="xd_blog.html?date={date}">Other Layer</a></p>' \
                if otherLayer is None else \
                f'<p class="keywordtitle"><a href="{otherLayer}">Other Layer</a></p>'
            post = post.replace("", olayer)
            # it does not work (pages too big)

        post = '<?xml version="1.0" encoding="utf-8"?>\n' + post
        post = post.replace('type="text/javascript"/>',
                            'type="text/javascript"></script>')

        post = FixIssuesWithAccent(post)

        if replacetitle is not None:
            # there was only one document, we replace it
            post = post.replace(title_to_rep, replacetitle)

        # we save the results

        if os.path.exists(filename):
            try:
                with open(filename, "r", encoding="utf8") as f:
                    hist = f.read()
            except UnicodeDecodeError:
                # logs the content read with the default encoding
                # before failing
                fLOG("issue with file ", filename)
                with open(filename, "r") as f:
                    content = f.read()
                fLOG(content[170:])
                raise
        else:
            hist = ""

        if post != hist or overwrite:
            # a check on "\xC3" in post used to stand here, it is disabled
            if not overwrite:
                fLOG(" writing ", filename)
            if "### keywords ###" in post.lower():
                raise RuntimeError(
                    "unable to release that document with this string ### KEYWORDS ###,\nkeywords should be " + str(keystext))
            with open(filename, "w", encoding="utf8") as f:
                f.write(post)
            fileToReturn.append(filename)

    return fileToReturn

434

435

def _write_sidebar_frame(filename, div_open, title, content):
    """
    Writes a small HTML page made of a single ``div`` (a sidebar).

    @param      filename    file to write (utf-8)
    @param      div_open    text which opens the ``div``
    @param      title       first line inside the ``div``
    @param      content     HTML text written after the title
    """
    pieces = [
        """<?xml version="1.0" encoding="utf-8"?>\n""",
        "<html>\n",
        "<head>\n",
        """<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n""",
        """<link href="pMenu.css" rel="stylesheet" type="text/css"/>\n""",
        "</head>\n",
        "<body>\n",
        div_open,
        title,
        content,
        "\n</div>\n",
        "\n</body></html>\n",
    ]
    with open(filename, "w", encoding="utf8") as f:
        f.write("".join(pieces))


def build_process_all_pages(res,
                            keywordsHTML="frame_keywords.html",
                            siteFolder="../site/blog",
                            xd_blog_template_nojs=os.path.join(
                                "blog", "xd_blog_template_nojs.html"),
                            xd_blog_nojs="xd_blog_nojs.html",
                            frequence_keywords=3,
                            monthsHTML="frame_months.html"
                            ):
    """
    Builds every static page of the blog: the keywords and months frames,
    the aggregated pages (all posts, per keyword, per month)
    and one page per post.

    @param      res                     output from function file_all_keywords
    @param      keywordsHTML            name of the keywords frame to write
                                        in *siteFolder*, None to skip it
    @param      siteFolder              folder the blog (the one to be published)
    @param      xd_blog_template_nojs   template for blog (static text, less javascript)
    @param      xd_blog_nojs            main page (static text, less javascript)
    @param      frequence_keywords      a keyword whose frequency is below that
                                        threshold does not appear in the
                                        keywords frame
    @param      monthsHTML              name of the months frame to write
                                        in *siteFolder*, None to skip it
    @return                             all created pages
    """
    add = []

    fLOG("processing keywords")
    htmlkey, keywords = build_bloc_keywords(
        res, frequence_keywords, "xd_blog_key")
    if keywordsHTML is not None:
        file = os.path.join(siteFolder, keywordsHTML)
        fLOG("writing ", file)
        _write_sidebar_frame(
            file,
            """<div class="sidebarfull">\n""",
            """<p class="keywordtitle"><b>Keywords</b></p>\n""",
            htmlkey)
        add.append(file)

    fLOG("processing months")
    htmlkeym, monthsp = build_bloc_months(res, "xd_blog_month")
    if monthsHTML is not None:
        file = os.path.join(siteFolder, monthsHTML)
        fLOG("writing ", file)
        _write_sidebar_frame(
            file,
            """<div class="sidebarfullleft">\n<hr />\n""",
            """<p class="monthtitle"><b>Months</b></p>\n""",
            htmlkeym)
        add.append(file)

    # build keyword pages
    fLOG("building aggregated page for keywords")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        True,
        xd_blog_nojs,
        keywordsText="",
        otherLayer="xd_blog.html")

    # process all pages for each keyword
    for _count, keyword in keywords:
        fixed = FixIssuesWithAccent(keyword)
        fLOG("building page for keyword", fixed)
        # only the keys matter, the keywords text is forced below
        tempres = {k: "" for k, v in res.items() if keyword in v}
        add += generate_html_article(
            tempres,
            xd_blog_template_nojs,
            siteFolder,
            True,
            f"xd_blog_key_{removeAccent(keyword)}.html",
            keywordsText=fixed,
            otherLayer=f"xd_blog.html?tag={fixed}")

    # build months pages
    # (same call as for the keywords: the aggregated pages are generated
    # again and appear twice in the returned list, kept as is for
    # backward compatibility)
    fLOG("building aggregated page for months")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        True,
        xd_blog_nojs,
        keywordsText="",
        otherLayer="xd_blog.html")

    # process all pages for each months
    for _count, month in monthsp:
        fLOG("building page for months", month)
        fixed = FixIssuesWithAccent(month)
        tempres = {k: "" for k in res
                   if os.path.split(k)[-1].startswith(month)}
        add += generate_html_article(
            tempres,
            xd_blog_template_nojs,
            siteFolder,
            True,
            f"xd_blog_month_{removeAccent(month)}.html",
            keywordsText=fixed,
            otherLayer=f"xd_blog.html?tag={fixed}")

    # build all pages (one per blog)
    fLOG("building all pages")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        overwrite=True,
        otherLayer=None)
    return add