From 73d2966e8c10eb6c083a2abacc53f3297d16376e Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Wed, 27 Feb 2019 23:04:38 +0100
Subject: Improve epub support

---
 libmat2/epub.py | 46 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

(limited to 'libmat2/epub.py')

diff --git a/libmat2/epub.py b/libmat2/epub.py
index 09b7937..d385465 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -1,11 +1,13 @@
 import logging
 import re
+import uuid
 import xml.etree.ElementTree as ET  # type: ignore
 
 from . import archive, office
 
 class EPUBParser(archive.ArchiveBasedAbstractParser):
     mimetypes = {'application/epub+zip', }
+    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
 
     def __init__(self, filename):
         super().__init__(filename)
@@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
             'mimetype',
             'OEBPS/content.opf',
             }))
+        self.uniqid = uuid.uuid4()
 
     def _specific_get_meta(self, full_path, file_path):
         if file_path != 'OEBPS/content.opf':
@@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
                                      f.read(), re.I|re.M)
                 return {k:v for (k, v) in results}
             except (TypeError, UnicodeDecodeError):
-                # We didn't manage to parse the xml file
                 return {file_path: 'harmful content', }
 
     def _specific_cleanup(self, full_path: str):
-        if not full_path.endswith('OEBPS/content.opf'):
-            return True
+        if full_path.endswith('OEBPS/content.opf'):
+            return self.__handle_contentopf(full_path)
+        elif full_path.endswith('OEBPS/toc.ncx'):
+            return self.__handle_tocncx(full_path)
+        return True
+
+    def __handle_tocncx(self, full_path: str):
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
+            if item.tag.strip().lower().endswith('head'):
+                item.clear()
+                ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
+                break
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
 
+    def __handle_contentopf(self, full_path: str):
         try:
             tree, namespace = office._parse_xml(full_path)
         except ET.ParseError:
             logging.error("Unable to parse %s in %s.", full_path, self.filename)
             return False
-        parent_map = {c:p for p in tree.iter() for c in p}
 
-        for item in tree.iterfind('.//', namespace):
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
             if item.tag.strip().lower().endswith('metadata'):
-                parent_map[item].remove(item)
+                item.clear()
+
+                # item with mandatory content
+                uniqid = ET.Element(self.metadata_namespace + 'identifier')
+                uniqid.text = str(self.uniqid)
+                uniqid.set('id', 'id')
+                item.append(uniqid)
+
+                # items without mandatory content
+                for name in {'language', 'title'}:
+                    uniqid = ET.Element(self.metadata_namespace + name)
+                    item.append(uniqid)
                 break  # there is only a single <metadata> block
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
-- 
cgit v1.3