# Reconstructed from commit 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed
# ("Implement epub support", jvoisin, 2019-02-20), which creates
# libmat2/web.py.
from html import parser
from typing import Dict, Any, List, Tuple
import re
import string

from . import abstract


class CSSParser(abstract.AbstractParser):
    """Parser for CSS files.

    There is no such thing as metadata in CSS files, only comments of the
    form `/* … */`, so we're removing the latter.
    """
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Write the content of `self.filename`, stripped of every
        `/* … */` comment, to `self.output_filename`.

        Always returns True; I/O and decoding errors propagate.
        """
        with open(self.filename, encoding='utf-8') as f:
            # `.*?` and not `.+?`: the latter cannot match the empty
            # comment `/**/`, and with DOTALL it would instead swallow
            # everything up to the end of the *next* comment, deleting
            # real CSS rules along the way.
            cleaned = re.sub(r'/\*.*?\*/', '', f.read(),
                             count=0, flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return what is found in the comments of `self.filename`.

        Every comment line of the form `key: value` is reported as such
        (the key being stripped of whitespace and `*`); any other line,
        including lines with several `:`, is reported under the
        'harmful data' key, later lines overwriting earlier ones.
        """
        metadata = {}  # type: Dict[str, Any]
        with open(self.filename, encoding='utf-8') as f:
            comments = re.findall(r'/\*(.*?)\*/', f.read(), flags=self.flags)
        for comment in comments:
            for line in comment.splitlines():
                try:
                    k, v = line.split(':')
                except ValueError:  # zero, or more than one, `:`
                    metadata['harmful data'] = line.strip()
                else:
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
        return metadata


class HTMLParser(abstract.AbstractParser):
    """Parser for HTML-like files, delegating the actual work to
    `_HTMLParser`, which is fed the whole file at construction time."""
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }

    def __init__(self, filename):
        """Parse `filename` right away.

        A ValueError is raised by the underlying `_HTMLParser` if a
        closing tag doesn't match the last opened one.
        """
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename)
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected while parsing."""
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write the cleaned document to `self.output_filename`."""
        return self.__parser.remove_all(self.output_filename)


class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    The document is re-serialised on the fly, minus the blacklisted tags
    and everything nested inside them.

    NOTE(review): every opening tag is expected to be explicitly closed,
    so void tags written without a trailing slash (like `<br>`) end up
    triggering a ValueError.
    NOTE(review): the base class is initialised with its defaults, which
    presumably means that character references are already converted when
    they reach `handle_data` and are thus written back unescaped; to be
    confirmed.
    """
    tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase

    def __init__(self, filename):
        """`filename` is only used to produce meaningful error messages."""
        super().__init__()
        self.filename = filename
        # Pieces of the cleaned document, joined in `remove_all`.
        self.__fragments = []  # type: List[str]
        self.__meta = {}  # type: Dict[str, Any]
        # Tags that are currently opened, innermost last.
        self.__validation_queue = []  # type: List[str]
        # We're using a counter instead of a boolean to handle nested tags
        self.__in_dangerous_tag = 0

    def __ensure_all_tags_closed(self):
        """Raise a ValueError if some opened tags were never closed."""
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record the opening of `tag`, and copy it verbatim to the
        output unless it is blacklisted or nested in a blacklisted tag."""
        self.__validation_queue.append(tag)
        if tag in self.tag_blacklist:
            self.__in_dangerous_tag += 1
            return

        if self.__in_dangerous_tag == 0:
            self.__fragments.append(self.get_starttag_text())

    def handle_endtag(self, tag: str):
        """Check that `tag` closes the last opened tag, and emit the
        corresponding closing tag if we're not inside a blacklisted one.

        Raises a ValueError if there is nothing to close, or if `tag`
        isn't the last opened tag.
        """
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        previous_tag = self.__validation_queue.pop()
        if tag != previous_tag:
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))
        if tag in self.tag_blacklist:
            self.__in_dangerous_tag -= 1
            return

        if self.__in_dangerous_tag == 0:
            # There is no `get_endtag_text()` method :/
            # so the closing tag is rebuilt by hand; without it, the
            # output would only contain opening tags.
            self.__fragments.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Copy `data` to the output, unless it is made only of
        whitespace or located inside a blacklisted tag."""
        if self.__in_dangerous_tag == 0 and data.strip():
            self.__fragments.append(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Handle self-closing tags (`<tag … />`).

        Blacklisted ones are never copied to the output: their `name`
        and `content` attributes are recorded as metadata instead, with
        placeholders when one of them is missing. Other tags are copied
        verbatim, unless nested in a blacklisted tag.
        """
        if tag in self.tag_blacklist:
            attributes = dict(attrs)
            name = attributes.get('name', 'harmful metadata')
            content = attributes.get('content', 'harmful data')
            self.__meta[name] = content
        elif self.__in_dangerous_tag == 0:
            self.__fragments.append(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the cleaned document to `output_filename`.

        Always returns True; raises a ValueError if some tags were left
        unclosed.
        """
        self.__ensure_all_tags_closed()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(''.join(self.__fragments))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata gathered from self-closing blacklisted
        tags; raises a ValueError if some tags were left unclosed."""
        self.__ensure_all_tags_closed()
        return self.__meta