summary | refs | log | tree | commit | diff
path: root/libmat2/epub.py
diff options
context:
space:
mode:
author: jvoisin, 2019-02-27 23:04:38 +0100
committer: jvoisin, 2019-02-27 23:04:38 +0100
commit: 73d2966e8c10eb6c083a2abacc53f3297d16376e (patch)
tree: 24830b2e95097f220379930e8c654ad073c04bc0 /libmat2/epub.py
parent: eb2e702f3700a0ac88d10a524a5f6c573a52a8dd (diff)
Improve epub support
Diffstat (limited to 'libmat2/epub.py')
-rw-r--r-- libmat2/epub.py 46
1 files changed, 39 insertions, 7 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index 09b7937..d385465 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -1,11 +1,13 @@
1import logging 1import logging
2import re 2import re
3import uuid
3import xml.etree.ElementTree as ET # type: ignore 4import xml.etree.ElementTree as ET # type: ignore
4 5
5from . import archive, office 6from . import archive, office
6 7
7class EPUBParser(archive.ArchiveBasedAbstractParser): 8class EPUBParser(archive.ArchiveBasedAbstractParser):
8 mimetypes = {'application/epub+zip', } 9 mimetypes = {'application/epub+zip', }
10 metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
9 11
10 def __init__(self, filename): 12 def __init__(self, filename):
11 super().__init__(filename) 13 super().__init__(filename)
@@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
14 'mimetype', 16 'mimetype',
15 'OEBPS/content.opf', 17 'OEBPS/content.opf',
16 })) 18 }))
19 self.uniqid = uuid.uuid4()
17 20
18 def _specific_get_meta(self, full_path, file_path): 21 def _specific_get_meta(self, full_path, file_path):
19 if file_path != 'OEBPS/content.opf': 22 if file_path != 'OEBPS/content.opf':
@@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
25 f.read(), re.I|re.M) 28 f.read(), re.I|re.M)
26 return {k:v for (k, v) in results} 29 return {k:v for (k, v) in results}
27 except (TypeError, UnicodeDecodeError): 30 except (TypeError, UnicodeDecodeError):
28 # We didn't manage to parse the xml file
29 return {file_path: 'harmful content', } 31 return {file_path: 'harmful content', }
30 32
31 def _specific_cleanup(self, full_path: str): 33 def _specific_cleanup(self, full_path: str):
32 if not full_path.endswith('OEBPS/content.opf'): 34 if full_path.endswith('OEBPS/content.opf'):
33 return True 35 return self.__handle_contentopf(full_path)
36 elif full_path.endswith('OEBPS/toc.ncx'):
37 return self.__handle_tocncx(full_path)
38 return True
39
40 def __handle_tocncx(self, full_path: str):
41 try:
42 tree, namespace = office._parse_xml(full_path)
43 except ET.ParseError: # pragma: nocover
44 logging.error("Unable to parse %s in %s.", full_path, self.filename)
45 return False
46
47 for item in tree.iterfind('.//', namespace): # pragma: nocover
48 if item.tag.strip().lower().endswith('head'):
49 item.clear()
50 ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
51 break
52 tree.write(full_path, xml_declaration=True, encoding='utf-8',
53 short_empty_elements=False)
54 return True
34 55
56 def __handle_contentopf(self, full_path: str):
35 try: 57 try:
36 tree, namespace = office._parse_xml(full_path) 58 tree, namespace = office._parse_xml(full_path)
37 except ET.ParseError: 59 except ET.ParseError:
38 logging.error("Unable to parse %s in %s.", full_path, self.filename) 60 logging.error("Unable to parse %s in %s.", full_path, self.filename)
39 return False 61 return False
40 parent_map = {c:p for p in tree.iter() for c in p}
41 62
42 for item in tree.iterfind('.//', namespace): 63 for item in tree.iterfind('.//', namespace): # pragma: nocover
43 if item.tag.strip().lower().endswith('metadata'): 64 if item.tag.strip().lower().endswith('metadata'):
44 parent_map[item].remove(item) 65 item.clear()
66
67 # item with mandatory content
68 uniqid = ET.Element(self.metadata_namespace + 'identifier')
69 uniqid.text = str(self.uniqid)
70 uniqid.set('id', 'id')
71 item.append(uniqid)
72
73 # items without mandatory content
74 for name in {'language', 'title'}:
75 uniqid = ET.Element(self.metadata_namespace + name)
76 item.append(uniqid)
45 break # there is only a single <metadata> block 77 break # there is only a single <metadata> block
46 tree.write(full_path, xml_declaration=True) 78 tree.write(full_path, xml_declaration=True, encoding='utf-8')
47 return True 79 return True