from html.parser import HTMLParser

# Parser states: outside any <pre>, inside a <pre> that has not (yet) opened
# a <code>, and inside <pre><code> where text is being collected.
STATE_NONE = 0
STATE_PRE = 1
STATE_CODE = 2


class CodeParser(HTMLParser):
    """Collect the contents of ``<pre><code>`` blocks from an HTML document.

    Feed HTML with ``feed()``; every completed block is appended to
    ``self.blocks`` as a dictionary of the form::

        {'lang': 'ly', 'content': '...'}

    ``lang`` is ``'ly'`` for a ``language-lilypond`` class, ``'tex'`` for
    ``language-tex`` / ``language-latex``, and ``''`` when the ``<code>`` tag
    carries none of these classes.
    """

    # Maps a CSS class token found on <code> to the short language name
    # stored in each block.
    _LANGUAGE_BY_CLASS = {
        "language-lilypond": "ly",
        "language-tex": "tex",
        "language-latex": "tex",
    }

    def __init__(self, **kwargs):
        """Create an empty parser.

        Keyword arguments are accepted for call-site compatibility but are
        not forwarded to ``HTMLParser``; the base class is always initialised
        with its defaults.
        """
        super().__init__()
        # List of dictionaries, in document order:
        # [
        #   {'lang': 'ly', 'content': '...'},
        #   {'lang': 'tex', 'content': '...'},
        #   ...
        # ]
        self.blocks = []
        # The parser is a small state machine driven by the tag handlers.
        self.state = STATE_NONE
        self.language = ''
        self.data = ''

    def _reset_block(self):
        """Return to the idle state and discard any pending language/text."""
        self.state = STATE_NONE
        self.data = ''
        self.language = ''

    def handle_starttag(self, tag: str, attrs) -> None:
        """Advance the state machine on ``<pre>`` and on ``<code>`` inside it.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; ``value`` may be ``None`` for a valueless attribute.
        """
        if self.state == STATE_NONE and tag == "pre":
            self.state = STATE_PRE
        elif self.state == STATE_PRE and tag == "code":
            self.state = STATE_CODE
            for name, value in attrs:
                if name != "class" or not value:
                    continue
                # A class attribute may hold several whitespace-separated
                # tokens (e.g. "hljs language-tex"); check each one.
                for token in value.split():
                    language = self._LANGUAGE_BY_CLASS.get(token)
                    if language is not None:
                        self.language = language

    def handle_endtag(self, tag: str) -> None:
        """Finish the current block (if any) when ``</pre>`` is seen."""
        if tag != "pre":
            return
        if self.state == STATE_CODE:
            self.blocks.append({'lang': self.language, 'content': self.data})
            self._reset_block()
        elif self.state == STATE_PRE:
            # A <pre> without a <code> child: leave the PRE state so that a
            # later, unrelated <code> element is not mistaken for a block.
            self._reset_block()

    def handle_data(self, data: str) -> None:
        """Accumulate text while inside a ``<pre><code>`` block."""
        if self.state == STATE_CODE:
            self.data += data