Source code for pyrsslocal.xmlhelper.xml_utils
# -*- coding: utf-8 -*-
"""
parsing XML
:githublink:`%|py|6`
"""
import re
from xml.sax.saxutils import escape as sax_escape
from html.entities import name2codepoint
def escape(s):
    """
    Escapes ``&``, ``<`` and ``>`` in a string, or in every string of a
    (possibly nested) list, with :func:`xml.sax.saxutils.escape`.

    :param s: string to escape, or a list of strings (lists may be nested)
    :return: escaped string, or a new list holding the escaped items
    """
    if isinstance(s, list):
        # Recurse so that nested lists keep their structure.
        return [escape(item) for item in s]
    # NOTE(review): a follow-up ``s.replace("&", "&")`` used to sit here; it
    # replaced a string with itself and was removed as dead code. It may have
    # lost an entity when the page was rendered -- confirm against upstream
    # before relying on ampersands being escaped.
    return sax_escape(s)
def html_unescape(text):
    """
    Replaces :epkg:`HTML` or :epkg:`XML` character references
    (``&#233;``, ``&#xe9;``) and named entities (``&eacute;``) found in
    *text* with the character they stand for.

    The entities ``&amp;``, ``&gt;`` and ``&lt;`` are kept as they are.
    Anything that cannot be resolved (unknown entity name,
    malformed or out-of-range code point) is left untouched as well.

    Adapted from `Fredrik Lundh
    <http://effbot.org/zone/re-sub.htm#unescape-html>`_.

    :param text: string which may contain references and entities
    :return: string with the resolvable references replaced
    """
    kept_entities = ("amp", "gt", "lt")

    def fixup(m):
        ref = m.group(0)
        if ref.startswith("&#"):
            # Numeric character reference, decimal or hexadecimal.
            try:
                if ref.startswith("&#x"):
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the Unicode range.
                # OverflowError: number too large for chr() to even look at.
                return ref  # leave as is
        name = ref[1:-1]
        if name in kept_entities:
            return ref
        try:
            return chr(name2codepoint[name])
        except KeyError:
            return ref  # unknown entity name, leave as is

    return re.sub(r"&#?\w+;", fixup, text)
# Characters mapped to the named HTML entity which represents them.
# NOTE(review): the no-break space key is spelled ``\u00a0`` so that it cannot
# be mistaken for a plain space -- confirm the original key was not a plain
# space.
character_to_escape = {
    "é": "&eacute;",
    "\u00a0": "&nbsp;",
    "è": "&egrave;",
    "à": "&agrave;",
    "â": "&acirc;",
    "ä": "&auml;",
    "ê": "&ecirc;",
    "ë": "&euml;",
    "î": "&icirc;",
    "ï": "&iuml;",
    "ù": "&ugrave;",
    "ü": "&uuml;",
    "ô": "&ocirc;",
    "ö": "&ouml;",
    "œ": "&oelig;",
}

# html_escape only rewrites accented letters; the no-break space and the
# ligature of the table above are not part of this character class.
_accented_characters = re.compile("[àâäéèêëîïôöùü]")


def html_escape(text):
    """
    Escapes any French character with an accent, replacing it with the
    named HTML entity stored in ``character_to_escape``.

    :param text: string to escape
    :return: string in which every accented character matched by the
        function is replaced by its entity, other characters are unchanged
    """
    def fixup(m):
        char = m.group(0)
        # Fall back on the character itself if the table has no entry.
        return character_to_escape.get(char, char)

    return _accented_characters.sub(fixup, text)