Improve epub support

author: jvoisin 2019-02-27 23:04:38 +0100
committer: jvoisin 2019-02-27 23:04:38 +0100
commit: 73d2966e8c10eb6c083a2abacc53f3297d16376e (patch)
tree: 24830b2e95097f220379930e8c654ad073c04bc0 /libmat2/epub.py
parent: eb2e702f3700a0ac88d10a524a5f6c573a52a8dd (diff)
1 file changed, 39 insertions, 7 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index 09b7937..d385465 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -1,11 +1,13 @@
 import logging
 import re
+import uuid
 import xml.etree.ElementTree as ET  # type: ignore
 from . import archive, office
 class EPUBParser(archive.ArchiveBasedAbstractParser):
    mimetypes = {'application/epub+zip', }
+    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
    def __init__(self, filename):
        super().__init__(filename)
@@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
            'mimetype',
            'OEBPS/content.opf',
            }))
+        self.uniqid = uuid.uuid4()
    def _specific_get_meta(self, full_path, file_path):
        if file_path != 'OEBPS/content.opf':
@@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
                                     f.read(), re.I|re.M)
                return {k:v for (k, v) in results}
            except (TypeError, UnicodeDecodeError):
-                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }
    def _specific_cleanup(self, full_path: str):
+        """Route the file at `full_path` to the cleanup handler for its name.
+
+        Paths ending in `OEBPS/content.opf` or `OEBPS/toc.ncx` are passed to
+        their dedicated handler and that handler's result is returned; every
+        other path returns True without being modified here.
+        """
-        if not full_path.endswith('OEBPS/content.opf'):
+        if full_path.endswith('OEBPS/content.opf'):
-            return True
+            return self.__handle_contentopf(full_path)
+        elif full_path.endswith('OEBPS/toc.ncx'):
+            return self.__handle_tocncx(full_path)
+        # No dedicated handler for this path: report success as-is.
+        return True
+    def __handle_tocncx(self, full_path: str):
+        """Empty the first `head` element of the XML file at `full_path`.
+
+        The matched element is cleared and given a single `meta` child whose
+        `name` and `content` attributes are empty strings, then the tree is
+        written back over `full_path`.
+
+        Returns False (after logging an error) when the file cannot be
+        parsed, True otherwise.
+        """
+        try:
+            parsed = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+        tree, namespace = parsed
+        candidates = (node for node in tree.iterfind('.//', namespace)
+                      if node.tag.strip().lower().endswith('head'))
+        # Only the first matching element is rewritten.
+        head = next(candidates, None)
+        if head is not None:  # pragma: nocover
+            head.clear()
+            ET.SubElement(head, 'meta', attrib={'name': '', 'content': ''})
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
+    def __handle_contentopf(self, full_path: str):
+        """Replace the `metadata` block of the XML file at `full_path`.
+
+        The first element whose tag ends with `metadata` is cleared and
+        refilled with an `identifier` element carrying `self.uniqid`,
+        followed by empty `language` and `title` elements, all in
+        `self.metadata_namespace`. The tree is then written back over
+        `full_path`.
+
+        Returns False (after logging an error) when the file cannot be
+        parsed, True otherwise.
+        """
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False
-        parent_map = {c:p for p in tree.iter() for c in p}
-        for item in tree.iterfind('.//', namespace):
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('metadata'):
-                parent_map[item].remove(item)
+                item.clear()
+                # item with mandatory content
+                uniqid = ET.Element(self.metadata_namespace + 'identifier')
+                uniqid.text = str(self.uniqid)
+                uniqid.set('id', 'id')
+                item.append(uniqid)
+                # items without mandatory content; an ordered tuple (not a
+                # set) keeps the element order identical from run to run
+                for name in ('language', 'title'):
+                    placeholder = ET.Element(self.metadata_namespace + name)
+                    item.append(placeholder)
                break  # there is only a single <metadata> block
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
author	jvoisin	2019-02-27 23:04:38 +0100
committer	jvoisin	2019-02-27 23:04:38 +0100
commit	73d2966e8c10eb6c083a2abacc53f3297d16376e (patch)
tree	24830b2e95097f220379930e8c654ad073c04bc0 /libmat2/epub.py
parent	eb2e702f3700a0ac88d10a524a5f6c573a52a8dd (diff)