From 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed Mon Sep 17 00:00:00 2001 From: jvoisin Date: Wed, 20 Feb 2019 16:28:11 -0800 Subject: Implement epub support --- libmat2/epub.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 libmat2/epub.py (limited to 'libmat2/epub.py') diff --git a/libmat2/epub.py b/libmat2/epub.py new file mode 100644 index 0000000..09b7937 --- /dev/null +++ b/libmat2/epub.py @@ -0,0 +1,47 @@ +import logging +import re +import xml.etree.ElementTree as ET # type: ignore + +from . import archive, office + +class EPUBParser(archive.ArchiveBasedAbstractParser): + mimetypes = {'application/epub+zip', } + + def __init__(self, filename): + super().__init__(filename) + self.files_to_keep = set(map(re.compile, { # type: ignore + 'META-INF/container.xml', + 'mimetype', + 'OEBPS/content.opf', + })) + + def _specific_get_meta(self, full_path, file_path): + if file_path != 'OEBPS/content.opf': + return {} + + with open(full_path, encoding='utf-8') as f: + try: + results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)", + f.read(), re.I|re.M) + return {k:v for (k, v) in results} + except (TypeError, UnicodeDecodeError): + # We didn't manage to parse the xml file + return {file_path: 'harmful content', } + + def _specific_cleanup(self, full_path: str): + if not full_path.endswith('OEBPS/content.opf'): + return True + + try: + tree, namespace = office._parse_xml(full_path) + except ET.ParseError: + logging.error("Unable to parse %s in %s.", full_path, self.filename) + return False + parent_map = {c:p for p in tree.iter() for c in p} + + for item in tree.iterfind('.//', namespace): + if item.tag.strip().lower().endswith('metadata'): + parent_map[item].remove(item) + break # there is only a single block + tree.write(full_path, xml_declaration=True) + return True -- cgit v1.3