From 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Wed, 20 Feb 2019 16:28:11 -0800
Subject: Implement epub support

---
 libmat2/epub.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 libmat2/epub.py

(limited to 'libmat2/epub.py')
diff --git a/libmat2/epub.py b/libmat2/epub.py
new file mode 100644
index 0000000..09b7937
--- /dev/null
+++ b/libmat2/epub.py
@@ -0,0 +1,47 @@
+import logging
+import re
+import xml.etree.ElementTree as ET  # type: ignore
+
+from . import archive, office
+
+class EPUBParser(archive.ArchiveBasedAbstractParser):
+    """EPUB parser: reads and strips the metadata of `OEBPS/content.opf`."""
+    mimetypes = {'application/epub+zip', }
+
+    def __init__(self, filename):
+        super().__init__(filename)
+        # The names are escaped, so that `.` only matches a literal dot.
+        self.files_to_keep = set(map(re.compile, map(re.escape, (  # type: ignore
+            'META-INF/container.xml', 'mimetype', 'OEBPS/content.opf'))))
+
+    def _specific_get_meta(self, full_path, file_path):
+        """Return the `meta`/`dc`/`cp` elements of the package file as a dict."""
+        if file_path != 'OEBPS/content.opf':
+            return {}
+        try:
+            with open(full_path, encoding='utf-8') as f:
+                content = f.read()
+        except UnicodeDecodeError:  # not valid utf-8: we can't inspect it
+            return {file_path: 'harmful content', }
+        # Lazy value group: elements sharing a line must not be merged together.
+        return dict(re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+?)</\1>",
+                               content, re.I|re.M))
+
+    def _specific_cleanup(self, full_path: str):
+        """Drop <metadata> from `OEBPS/content.opf`; False if unparseable."""
+        if not full_path.endswith('OEBPS/content.opf'):
+            return True  # nothing to clean in the other files
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+        # ElementTree nodes don't know their parent: build a child->parent map.
+        parent_map = {c:p for p in tree.iter() for c in p}
+
+        for item in tree.iterfind('.//', namespace):
+            if item.tag.strip().lower().endswith('metadata'):
+                parent_map[item].remove(item)
+                break  # there is only a single <metadata> block
+        tree.write(full_path, xml_declaration=True)
+        return True
-- 
cgit v1.3