diff options
| author | jvoisin | 2019-02-27 23:04:38 +0100 |
|---|---|---|
| committer | jvoisin | 2019-02-27 23:04:38 +0100 |
| commit | 73d2966e8c10eb6c083a2abacc53f3297d16376e (patch) | |
| tree | 24830b2e95097f220379930e8c654ad073c04bc0 /libmat2/epub.py | |
| parent | eb2e702f3700a0ac88d10a524a5f6c573a52a8dd (diff) | |
Improve epub support
Diffstat (limited to 'libmat2/epub.py')
| -rw-r--r-- | libmat2/epub.py | 46 |
1 file changed, 39 insertions, 7 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index 09b7937..d385465 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
| @@ -1,11 +1,13 @@ | |||
| 1 | import logging | 1 | import logging |
| 2 | import re | 2 | import re |
| 3 | import uuid | ||
| 3 | import xml.etree.ElementTree as ET # type: ignore | 4 | import xml.etree.ElementTree as ET # type: ignore |
| 4 | 5 | ||
| 5 | from . import archive, office | 6 | from . import archive, office |
| 6 | 7 | ||
| 7 | class EPUBParser(archive.ArchiveBasedAbstractParser): | 8 | class EPUBParser(archive.ArchiveBasedAbstractParser): |
| 8 | mimetypes = {'application/epub+zip', } | 9 | mimetypes = {'application/epub+zip', } |
| 10 | metadata_namespace = '{http://purl.org/dc/elements/1.1/}' | ||
| 9 | 11 | ||
| 10 | def __init__(self, filename): | 12 | def __init__(self, filename): |
| 11 | super().__init__(filename) | 13 | super().__init__(filename) |
| @@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser): | |||
| 14 | 'mimetype', | 16 | 'mimetype', |
| 15 | 'OEBPS/content.opf', | 17 | 'OEBPS/content.opf', |
| 16 | })) | 18 | })) |
| 19 | self.uniqid = uuid.uuid4() | ||
| 17 | 20 | ||
| 18 | def _specific_get_meta(self, full_path, file_path): | 21 | def _specific_get_meta(self, full_path, file_path): |
| 19 | if file_path != 'OEBPS/content.opf': | 22 | if file_path != 'OEBPS/content.opf': |
| @@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser): | |||
| 25 | f.read(), re.I|re.M) | 28 | f.read(), re.I|re.M) |
| 26 | return {k:v for (k, v) in results} | 29 | return {k:v for (k, v) in results} |
| 27 | except (TypeError, UnicodeDecodeError): | 30 | except (TypeError, UnicodeDecodeError): |
| 28 | # We didn't manage to parse the xml file | ||
| 29 | return {file_path: 'harmful content', } | 31 | return {file_path: 'harmful content', } |
| 30 | 32 | ||
| 31 | def _specific_cleanup(self, full_path: str): | 33 | def _specific_cleanup(self, full_path: str): |
| 32 | if not full_path.endswith('OEBPS/content.opf'): | 34 | if full_path.endswith('OEBPS/content.opf'): |
| 33 | return True | 35 | return self.__handle_contentopf(full_path) |
| 36 | elif full_path.endswith('OEBPS/toc.ncx'): | ||
| 37 | return self.__handle_tocncx(full_path) | ||
| 38 | return True | ||
| 39 | |||
| 40 | def __handle_tocncx(self, full_path: str): | ||
| 41 | try: | ||
| 42 | tree, namespace = office._parse_xml(full_path) | ||
| 43 | except ET.ParseError: # pragma: nocover | ||
| 44 | logging.error("Unable to parse %s in %s.", full_path, self.filename) | ||
| 45 | return False | ||
| 46 | |||
| 47 | for item in tree.iterfind('.//', namespace): # pragma: nocover | ||
| 48 | if item.tag.strip().lower().endswith('head'): | ||
| 49 | item.clear() | ||
| 50 | ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''}) | ||
| 51 | break | ||
| 52 | tree.write(full_path, xml_declaration=True, encoding='utf-8', | ||
| 53 | short_empty_elements=False) | ||
| 54 | return True | ||
| 34 | 55 | ||
| 56 | def __handle_contentopf(self, full_path: str): | ||
| 35 | try: | 57 | try: |
| 36 | tree, namespace = office._parse_xml(full_path) | 58 | tree, namespace = office._parse_xml(full_path) |
| 37 | except ET.ParseError: | 59 | except ET.ParseError: |
| 38 | logging.error("Unable to parse %s in %s.", full_path, self.filename) | 60 | logging.error("Unable to parse %s in %s.", full_path, self.filename) |
| 39 | return False | 61 | return False |
| 40 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 41 | 62 | ||
| 42 | for item in tree.iterfind('.//', namespace): | 63 | for item in tree.iterfind('.//', namespace): # pragma: nocover |
| 43 | if item.tag.strip().lower().endswith('metadata'): | 64 | if item.tag.strip().lower().endswith('metadata'): |
| 44 | parent_map[item].remove(item) | 65 | item.clear() |
| 66 | |||
| 67 | # item with mandatory content | ||
| 68 | uniqid = ET.Element(self.metadata_namespace + 'identifier') | ||
| 69 | uniqid.text = str(self.uniqid) | ||
| 70 | uniqid.set('id', 'id') | ||
| 71 | item.append(uniqid) | ||
| 72 | |||
| 73 | # items without mandatory content | ||
| 74 | for name in {'language', 'title'}: | ||
| 75 | uniqid = ET.Element(self.metadata_namespace + name) | ||
| 76 | item.append(uniqid) | ||
| 45 | break # there is only a single <metadata> block | 77 | break # there is only a single <metadata> block |
| 46 | tree.write(full_path, xml_declaration=True) | 78 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 47 | return True | 79 | return True |
