I needed to convert an HTML string into JSON. After searching for an existing solution without success, I decided that writing it myself would be faster than searching any further. As an example:
content = '<html><body><div class="an_example"><p>one paragraph</p></div></body></html>'
js = HTMLtoJSONParser.to_json(content)
print(js)

Will produce this:
{'html': {'body': {'div': {'p': {'': 'one paragraph'}, '#class': 'an_example'}}}}
The implementation of HTMLtoJSONParser follows:
import html.parser


class HTMLtoJSONParser(html.parser.HTMLParser):
    """Convert an HTML string into a nested, JSON-serialisable dictionary.

    Mapping rules:

    * every element becomes a dictionary stored in its parent under the
      tag name;
    * when a tag occurs several times under the same parent, the entry
      becomes a list of dictionaries, in document order;
    * attributes are stored under ``"#" + attribute_name``;
    * the text directly contained in an element is stored under the
      empty key ``""``; it is stripped, and dropped when empty.

    Text located outside of any element is ignored.
    """

    # HTML void elements: they have neither content nor an end tag, so they
    # must never be pushed on the stack of open elements.
    VOID_ELEMENTS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self, raise_exception: bool = True):
        """Create a parser.

        :param raise_exception: when True, malformed HTML (mismatched,
            stray or missing end tags) raises an exception; when False the
            parser recovers as well as it can.
        """
        html.parser.HTMLParser.__init__(self)
        self.doc = {}
        self.path = []          # names of the currently open tags
        self.cur = self.doc     # dictionary of the innermost open element
        self.line = 0           # number of newlines seen in text so far
        self.raise_exception = raise_exception
        # Dictionaries of the open elements, root first. Keeping the parent
        # chain here (instead of inside the result) guarantees the output
        # never contains cyclic references.
        self._open_nodes = [self.doc]

    @property
    def json(self) -> dict:
        """Return the dictionary built so far."""
        return self.doc

    @staticmethod
    def to_json(content: str, raise_exception: bool = True) -> dict:
        """Parse *content* and return the resulting dictionary.

        :param content: HTML source
        :param raise_exception: see :meth:`__init__`
        :raises ValueError: if the HTML is malformed and
            *raise_exception* is True
        """
        parser = HTMLtoJSONParser(raise_exception=raise_exception)
        parser.feed(content)
        # Flush buffered input and finalise elements which are still open.
        parser.close()
        return parser.json

    def close(self):
        """Finish parsing and deal with elements left open.

        :raises ValueError: if elements are still open and
            ``raise_exception`` is True; otherwise they are closed
            implicitly.
        """
        html.parser.HTMLParser.close(self)
        if self.path and self.raise_exception:
            raise self._malformed_error()
        while self.path:
            self._close_current()

    def handle_starttag(self, tag, attrs):
        """Create the dictionary for a new element and make it current."""
        # As with a plain dict, the last value wins for a repeated attribute.
        node = {"#" + name: value for name, value in attrs}

        if tag not in self.cur:
            self.cur[tag] = node
        elif isinstance(self.cur[tag], list):
            self.cur[tag].append(node)
        else:
            # Second occurrence of this tag: switch to a list of siblings.
            self.cur[tag] = [self.cur[tag], node]

        if tag in self.VOID_ELEMENTS:
            # Nothing can be nested in a void element: finalise it now and
            # stay inside the current parent.
            self.clean(node)
            return

        node[""] = ""
        self.path.append(tag)
        self._open_nodes.append(node)
        self.cur = node

    def handle_endtag(self, tag):
        """Close the current element.

        :raises ValueError: if *tag* does not match the innermost open
            element and ``raise_exception`` is True
        """
        if tag in self.VOID_ELEMENTS:
            # Covers ``<br/>`` (reported as start tag + end tag) and a stray
            # ``</br>``: the element was already finalised when it opened.
            return
        if self.path and tag == self.path[-1]:
            self._close_current()
            return
        if self.raise_exception:
            raise self._malformed_error()
        if tag not in self.path:
            # Stray end tag without a matching start tag: ignore it.
            return
        # Implicitly close the elements left open inside *tag*, then *tag*.
        while self.path[-1] != tag:
            self._close_current()
        self._close_current()

    def handle_data(self, data):
        """Append text to the current element and count its newlines."""
        self.line += data.count("\n")
        # The root dictionary has no text slot: top-level text is ignored.
        if "" in self.cur:
            self.cur[""] += data

    def clean(self, values):
        """Normalise the string values of *values* in place.

        Every string value (text and attribute values) is stripped of
        surrounding spaces, tabs and line breaks; values which end up empty
        are removed. A ``"__parent__"`` key, if present, is removed as well.
        """
        for key in list(values):
            value = values[key]
            if isinstance(value, str):
                stripped = value.strip(" \n\r\t")
                if stripped:
                    values[key] = stripped
                else:
                    del values[key]
        values.pop("__parent__", None)

    def _close_current(self):
        """Pop the innermost open element and finalise its dictionary."""
        node = self._open_nodes.pop()
        del self.path[-1]
        self.cur = self._open_nodes[-1]
        self.clean(node)

    def _malformed_error(self):
        """Build the exception raised for malformed HTML."""
        return ValueError("html is malformed around line: {0} (it might be because of a tag <br>, <hr>, <img .. > not closed)".format(self.line))
<-- --> |