Coverage for src/pyrsslocal/xmlhelper/html_parser_json.py: 94%
77 statements
« prev ^ index » next coverage.py v7.1.0, created at 2024-04-30 08:45 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2024-04-30 08:45 +0200
1"""
2@file
3@brief parsing HTML to convert it into JSON
4"""
5import html.parser
def iterate_on_json(json_structure, prefix="", keep_dictionaries=False,  # pylint: disable=W0102
                    skip=["__parent__"]):  # pylint: disable=W0102
    """
    Walks a :epkg:`JSON` structure depth first and yields every string
    leaf together with its path. Keys are visited in sorted order.

    @param      json_structure      json structure (a dictionary)
    @param      prefix              prefix prepended to every path
    @param      keep_dictionaries   if True, also yields *(path, dict)* for every
                                    dictionary met (directly or inside a list)
                                    before walking into it
    @param      skip                keys which are ignored (not yielded, not entered)
    @return                         iterator of *(path, value)*

    A path is built as ``prefix + "/" + key`` at every level.
    A *TypeError* is raised for a value which is neither
    a string, a dictionary nor a list.
    """
    for key, value in sorted(json_structure.items()):
        if key in skip:
            continue
        path = prefix + "/" + key

        if isinstance(value, str):
            yield path, value
            continue

        # A single dictionary is handled as a list of one dictionary,
        # both cases then share the same loop.
        if isinstance(value, dict):
            children = (value,)
        elif isinstance(value, list):
            children = value
        else:
            raise TypeError(  # pragma: no cover
                "Unexpected type, the json was altered at path '{0}'".format(
                    path))

        for child in children:
            if keep_dictionaries:
                yield path, child
            yield from iterate_on_json(child, path, keep_dictionaries, skip)
class HTMLtoJSONParser(html.parser.HTMLParser):

    """
    Parses :epkg:`HTML` and output a :epkg:`JSON` structure.

    Every element becomes a dictionary stored in its parent under the
    tag name. An attribute ``name`` is stored under the key ``"#name"``,
    the text of the element under the key ``""``. When a tag appears
    more than once in the same parent, the value becomes a list of
    dictionaries. While an element is open, its dictionary holds a
    key ``"__parent__"`` pointing to the parent dictionary, it is removed
    when the element is closed.

    Example:

    ::

        file = ...
        with open(file, "r", encoding="utf8") as f:
            content = f.read()
        parser = HTMLtoJSONParser()
        parser.feed(content)
        js = parser.json

    Or:

    ::

        js = HTMLtoJSONParser.to_json(content)

    To iterate on paths:

    ::

        pairs = [(k, v) for k, v in HTMLtoJSONParser.iterate(js)]
    """

    def __init__(self, raise_exception=True):
        """
        @param      raise_exception     if True, raises an exception if the
                                        HTML is malformed, otherwise does what it can
        """
        html.parser.HTMLParser.__init__(self, convert_charrefs=True)
        self.doc = {}           # root of the JSON structure
        self.path = []          # tags of the currently opened elements
        self.cur = self.doc     # dictionary of the innermost opened element
        self.line = 0           # number of line breaks seen in text data
        self.raise_exception = raise_exception

    @property
    def json(self):
        """
        Returns the :epkg:`JSON` structure.

        @return         json
        """
        return self.doc

    @staticmethod
    def to_json(content, raise_exception=True):
        """
        Converts :epkg:`HTML` into :epkg:`JSON`.

        @param      content             :epkg:`HTML` content to parse
        @param      raise_exception     if True, raises an exception if the HTML
                                        is malformed, otherwise does what it can
        @return                         json structure (a dictionary)
        """
        parser = HTMLtoJSONParser(raise_exception=raise_exception)
        parser.feed(content)
        return parser.json

    @staticmethod
    def iterate(json_structure, prefix="", keep_dictionaries=False,  # pylint: disable=W0102
                skip=["__parent__"]):  # pylint: disable=W0102
        """
        Iterates on every field contained in the :epkg:`JSON` structure,
        it delegates to function *iterate_on_json*.

        @param      json_structure      json structure
        @param      prefix              prefix to add
        @param      keep_dictionaries   if True, add yield k,v where v is a JSON dictionary
        @param      skip                do not enter the following tag
        @return                         iterator of (path, value)
        """
        yield from iterate_on_json(
            json_structure, prefix, keep_dictionaries, skip)

    def handle_starttag(self, tag, attrs):
        """
        Opens a new element: creates its dictionary, attaches it to the
        current one and makes it the current one.

        @param      tag         tag name
        @param      attrs       list of *(name, value)*, *value* is None
                                for an attribute written without a value
        """
        self.path.append(tag)
        node = {"__parent__": self.cur}
        if tag in self.cur:
            # The tag was already seen in this parent: the entry is
            # (or becomes) a list and the new element is appended to it.
            siblings = self.cur[tag]
            if not isinstance(siblings, list):
                siblings = self.cur[tag] = [siblings]
            siblings.append(node)
        else:
            self.cur[tag] = node
        self.cur = node

        for name, value in attrs:
            # An attribute without a value (``<div hidden>``) comes as None,
            # HTML defines its value as the empty string. Keeping None
            # used to break method clean when the element was closed.
            node["#" + name] = "" if value is None else value
        node[""] = ""

    def handle_endtag(self, tag):
        """
        Closes the current element, cleans its dictionary and goes back
        to its parent.

        @param      tag         tag name

        If the tag does not match the innermost opened element or if no
        element is opened, a *ValueError* is raised when
        *raise_exception* is True. Otherwise a mismatching tag closes the
        innermost element anyway and a closing tag with no opened
        element is ignored.
        """
        mismatch = not self.path or tag != self.path[-1]
        if mismatch and self.raise_exception:
            raise ValueError(  # pragma: no cover
                "html is malformed around line: {0} (it might be because "
                "of a tag <br>, <hr>, <img .. > not closed)".format(
                    self.line))
        if not self.path:
            # Nothing to close: the current dictionary is the root,
            # it has no parent to go back to.
            return
        del self.path[-1]
        closed = self.cur
        self.cur = closed["__parent__"]
        self.clean(closed)

    def handle_data(self, data):
        """
        Appends text to the current element and counts line breaks.

        @param      data        text

        Text is dropped if the current dictionary has no key ``""``
        (text outside of any element for example).
        """
        self.line += data.count("\n")
        if "" in self.cur:
            self.cur[""] += data

    def clean(self, values):
        """
        Cleans the dictionary of a closed element, in place.

        @param      values      dictionary of the element

        Strings are stripped from surrounding spaces and line breaks,
        a string which only contains such characters is removed
        (an empty string is left untouched). Empty containers are
        removed. The key ``"__parent__"`` is finally removed.
        """
        for key in list(values):
            value = values[key]
            if isinstance(value, str):
                stripped = value.strip(" \n\r\t")
                if stripped == value:
                    continue
                if stripped:
                    values[key] = stripped
                else:
                    del values[key]
            elif hasattr(value, "__len__") and len(value) == 0:
                # Values without a length (None, numbers) are left as is.
                del values[key]
        del values["__parent__"]