# Reconstructed from a whitespace-collapsed cgit patch view.
#   Commit:  02ff21b158c76fcd355a74ddb940e1c54fc2d7ed
#   From:    jvoisin
#   Date:    Wed, 20 Feb 2019 16:28:11 -0800
#   Subject: Implement epub support
# In that commit libmat2/html.py (69 lines, blob d0e9a2b) is deleted; the
# module below is the content of that file, with the fixes noted inline.

from html import parser
from typing import Dict, Any, List, Tuple

from . import abstract


class HTMLParser(abstract.AbstractParser):
    """Metadata parser for HTML files.

    The whole file is read and fed to an internal `_HTMLParser` at
    construction time; `get_meta` and `remove_all` simply delegate to it.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        """Parse `filename` eagerly.

        Raises:
            ValueError: if the parser meets a closing tag that does not match
                the most recently opened one.
        """
        super().__init__(filename)
        self.__parser = _HTMLParser()
        # Explicit encoding so that parsing doesn't depend on the locale of
        # the process running us.
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected from self-closing `<meta/>` tags."""
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write the cleaned document to `self.output_filename`.

        NOTE(review): `output_filename` is not set in this module; it is
        presumably provided by `abstract.AbstractParser` -- confirm.
        """
        return self.__parser.remove_all(self.output_filename)


class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    NOTE(review): this class's name differs from its base class's only by a
    leading underscore, so `__name` attributes of both are mangled to
    `_HTMLParser__name`. Keep the private names used here distinct from any
    private attribute of the base class.
    """
    def __init__(self):
        super().__init__()
        # Fragments of the cleaned document, joined once when written out
        # (avoids repeated string concatenation on large files).
        self.__chunks = []  # type: List[str]
        # Metadata found in self-closing <meta/> tags: name -> content.
        self.__meta = {}  # type: Dict[str, Any]
        # Stack of currently opened tags, used to check that they are
        # closed in the right order.
        self.__validation_queue = []  # type: List[str]

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Copy the opening tag verbatim and remember that it is open."""
        self.__chunks.append(self.get_starttag_text())
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Emit the closing tag after checking that it matches the last
        opened one.

        Raises:
            ValueError: if no tag is currently open, or if `tag` is not the
                most recently opened tag.
        """
        if not self.__validation_queue:
            raise ValueError("The closing tag </%s> doesn't have a "
                             "corresponding opening one." % tag)
        expected = self.__validation_queue.pop()
        if tag != expected:
            raise ValueError("The closing tag </%s> doesn't match the "
                             "previously opened tag <%s>." % (tag, expected))
        # There is no `get_endtag_text()` method :/
        # so the closing tag is rebuilt from its name. A bare newline used
        # to be written here, which silently dropped every closing tag from
        # the cleaned document.
        self.__chunks.append('</' + tag + '>')

    def handle_data(self, data: str):
        """Copy text content, skipping whitespace-only runs."""
        if data.strip():
            self.__chunks.append(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Handle self-closing tags.

        `<meta .../>` tags are recorded as metadata and left out of the
        output; every other self-closing tag is copied verbatim.
        """
        if tag != 'meta':
            self.__chunks.append(self.get_starttag_text())
            return
        meta = dict(attrs)
        # A <meta/> lacking `name` or `content` is still reported, under a
        # placeholder, rather than being silently ignored.
        name = meta.get('name', 'harmful metadata')
        content = meta.get('content', 'harmful data')
        self.__meta[name] = content

    def remove_all(self, output_filename: str) -> bool:
        """Write the cleaned document to `output_filename`.

        Returns:
            True once the file has been written.

        Raises:
            ValueError: if some opened tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed." %
                             ', '.join(self.__validation_queue))
        # Same explicit encoding as the one used to read the input.
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(''.join(self.__chunks))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata gathered while parsing.

        Raises:
            ValueError: if some opened tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed." %
                             ', '.join(self.__validation_queue))
        return self.__meta