From 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed Mon Sep 17 00:00:00 2001 From: jvoisin Date: Wed, 20 Feb 2019 16:28:11 -0800 Subject: Implement epub support --- libmat2/epub.py | 47 ++++++++++++++++++ libmat2/html.py | 69 -------------------------- libmat2/parser_factory.py | 9 +++- libmat2/web.py | 122 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 177 insertions(+), 70 deletions(-) create mode 100644 libmat2/epub.py delete mode 100644 libmat2/html.py create mode 100644 libmat2/web.py (limited to 'libmat2') diff --git a/libmat2/epub.py b/libmat2/epub.py new file mode 100644 index 0000000..09b7937 --- /dev/null +++ b/libmat2/epub.py @@ -0,0 +1,47 @@ +import logging +import re +import xml.etree.ElementTree as ET # type: ignore + +from . import archive, office + +class EPUBParser(archive.ArchiveBasedAbstractParser): + mimetypes = {'application/epub+zip', } + + def __init__(self, filename): + super().__init__(filename) + self.files_to_keep = set(map(re.compile, { # type: ignore + 'META-INF/container.xml', + 'mimetype', + 'OEBPS/content.opf', + })) + + def _specific_get_meta(self, full_path, file_path): + if file_path != 'OEBPS/content.opf': + return {} + + with open(full_path, encoding='utf-8') as f: + try: + results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)", + f.read(), re.I|re.M) + return {k:v for (k, v) in results} + except (TypeError, UnicodeDecodeError): + # We didn't manage to parse the xml file + return {file_path: 'harmful content', } + + def _specific_cleanup(self, full_path: str): + if not full_path.endswith('OEBPS/content.opf'): + return True + + try: + tree, namespace = office._parse_xml(full_path) + except ET.ParseError: + logging.error("Unable to parse %s in %s.", full_path, self.filename) + return False + parent_map = {c:p for p in tree.iter() for c in p} + + for item in tree.iterfind('.//', namespace): + if item.tag.strip().lower().endswith('metadata'): + parent_map[item].remove(item) + break # there is only a single block + 
tree.write(full_path, xml_declaration=True) + return True diff --git a/libmat2/html.py b/libmat2/html.py deleted file mode 100644 index d0e9a2b..0000000 --- a/libmat2/html.py +++ /dev/null @@ -1,69 +0,0 @@ -from html import parser -from typing import Dict, Any, List, Tuple - -from . import abstract - - -class HTMLParser(abstract.AbstractParser): - mimetypes = {'text/html', } - def __init__(self, filename): - super().__init__(filename) - self.__parser = _HTMLParser() - with open(filename) as f: - self.__parser.feed(f.read()) - self.__parser.close() - - def get_meta(self) -> Dict[str, Any]: - return self.__parser.get_meta() - - def remove_all(self) -> bool: - return self.__parser.remove_all(self.output_filename) - - -class _HTMLParser(parser.HTMLParser): - """Python doesn't have a validating html parser in its stdlib, so - we're using an internal queue to track all the opening/closing tags, - and hoping for the best. - """ - def __init__(self): - super().__init__() - self.__textrepr = '' - self.__meta = {} - self.__validation_queue = [] - - def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): - self.__textrepr += self.get_starttag_text() - self.__validation_queue.append(tag) - - def handle_endtag(self, tag: str): - if not self.__validation_queue: - raise ValueError - elif tag != self.__validation_queue.pop(): - raise ValueError - # There is no `get_endtag_text()` method :/ - self.__textrepr += '\n' - - def handle_data(self, data: str): - if data.strip(): - self.__textrepr += data - - def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): - if tag == 'meta': - meta = {k:v for k, v in attrs} - name = meta.get('name', 'harmful metadata') - content = meta.get('content', 'harmful data') - self.__meta[name] = content - else: - self.__textrepr += self.get_starttag_text() - - def remove_all(self, output_filename: str) -> bool: - if self.__validation_queue: - raise ValueError - with open(output_filename, 'w') as f: - f.write(self.__textrepr) - 
return True - - def get_meta(self) -> Dict[str, Any]: - if self.__validation_queue: - raise ValueError - return self.__meta diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index 30c3b52..e93ee4f 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py @@ -1,3 +1,4 @@ +import logging import glob import os import mimetypes @@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy T = TypeVar('T', bound='abstract.AbstractParser') +mimetypes.add_type('application/epub+zip', '.epub') +# EPUB Navigation Control XML File +mimetypes.add_type('application/x-dtbncx+xml', '.ncx') + def __load_all_parsers(): """ Loads every parser in a dynamic way """ @@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: if mtype in parser_class.mimetypes: try: return parser_class(filename), mtype - except ValueError: + except ValueError as e: + logging.info("Got an exception when trying to instanciate " + "%s for %s: %s", parser_class, filename, e) return None, mtype return None, mtype diff --git a/libmat2/web.py b/libmat2/web.py new file mode 100644 index 0000000..13d5fc8 --- /dev/null +++ b/libmat2/web.py @@ -0,0 +1,122 @@ +from html import parser +from typing import Dict, Any, List, Tuple +import re +import string + +from . 
import abstract + + +class CSSParser(abstract.AbstractParser): + """There is no such thing as metadata in CSS files, + only comments of the form `/* … */`, so we're removing the latter.""" + mimetypes = {'text/css', } + flags = re.MULTILINE | re.DOTALL + + def remove_all(self) -> bool: + with open(self.filename, encoding='utf-8') as f: + cleaned = re.sub(r'/\*.+?\*/', '', f.read(), 0, self.flags) + with open(self.output_filename, 'w', encoding='utf-8') as f: + f.write(cleaned) + return True + + def get_meta(self) -> Dict[str, Any]: + metadata = {} + with open(self.filename, encoding='utf-8') as f: + cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags) + for match in cssdoc: + for line in match.splitlines(): + try: + k, v = line.split(':') + metadata[k.strip(string.whitespace + '*')] = v.strip() + except ValueError: + metadata['harmful data'] = line.strip() + return metadata + + +class HTMLParser(abstract.AbstractParser): + mimetypes = {'text/html', 'application/x-dtbncx+xml', } + def __init__(self, filename): + super().__init__(filename) + self.__parser = _HTMLParser(self.filename) + with open(filename, encoding='utf-8') as f: + self.__parser.feed(f.read()) + self.__parser.close() + + def get_meta(self) -> Dict[str, Any]: + return self.__parser.get_meta() + + def remove_all(self) -> bool: + return self.__parser.remove_all(self.output_filename) + + +class _HTMLParser(parser.HTMLParser): + """Python doesn't have a validating html parser in its stdlib, so + we're using an internal queue to track all the opening/closing tags, + and hoping for the best. 
+ """ + tag_blacklist = {'doctitle', 'meta'} # everything is lowercase + def __init__(self, filename): + super().__init__() + self.filename = filename + self.__textrepr = '' + self.__meta = {} + self.__validation_queue = [] + # We're using a counter instead of a boolean to handle nested tags + self.__in_dangerous_tag = 0 + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): + self.__validation_queue.append(tag) + if tag in self.tag_blacklist: + self.__in_dangerous_tag += 1 + return + + if self.__in_dangerous_tag == 0: + self.__textrepr += self.get_starttag_text() + + def handle_endtag(self, tag: str): + if not self.__validation_queue: + raise ValueError("The closing tag %s doesn't have a corresponding " + "opening one in %s." % (tag, self.filename)) + + previous_tag = self.__validation_queue.pop() + if tag != previous_tag: + raise ValueError("The closing tag %s doesn't match the previous " + "tag %s in %s" % + (tag, previous_tag, self.filename)) + elif tag in self.tag_blacklist: + self.__in_dangerous_tag -= 1 + return + + if self.__in_dangerous_tag == 0: + # There is no `get_endtag_text()` method :/ + self.__textrepr += '\n' + + def handle_data(self, data: str): + if self.__in_dangerous_tag == 0 and data.strip(): + self.__textrepr += data + + def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): + if tag in self.tag_blacklist: + meta = {k:v for k, v in attrs} + name = meta.get('name', 'harmful metadata') + content = meta.get('content', 'harmful data') + self.__meta[name] = content + else: + if self.__in_dangerous_tag == 0: + self.__textrepr += self.get_starttag_text() + + def remove_all(self, output_filename: str) -> bool: + if self.__validation_queue: + raise ValueError("Some tags (%s) were left unclosed in %s" % ( + ', '.join(self.__validation_queue), + self.filename)) + with open(output_filename, 'w', encoding='utf-8') as f: + f.write(self.__textrepr) + return True + + def get_meta(self) -> Dict[str, Any]: + if 
self.__validation_queue: + raise ValueError("Some tags (%s) were left unclosed in %s" % ( + ', '.join(self.__validation_queue), + self.filename)) + return self.__meta -- cgit v1.3