summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2021-02-07 17:17:16 +0100
committerjvoisin2021-02-07 17:24:50 +0100
commitec082d64833ed209d0bbec9ae5171e9378ffcb87 (patch)
treea5d06cd1ffd2524242348e2cbf0b60186cbf43f0
parentf8111547ae9e414901f38c8598704a25380cc06c (diff)
Improve a bit the support of epub
-rw-r--r--libmat2/epub.py28
1 files changed, 26 insertions, 2 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index fd38411..52fab1c 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -16,11 +16,17 @@ class EPUBParser(archive.ZipParser):
16 'mimetype', 16 'mimetype',
17 'OEBPS/content.opf', 17 'OEBPS/content.opf',
18 'content.opf', 18 'content.opf',
19 'hmh.opf',
20 'OPS/.+.xml'
19 })) 21 }))
22 self.files_to_omit = set(map(re.compile, { # type: ignore
23 'iTunesMetadata.plist'
24 'META-INF/calibre_bookmarks.txt'
25 }))
20 self.uniqid = uuid.uuid4() 26 self.uniqid = uuid.uuid4()
21 27
22 def _specific_get_meta(self, full_path, file_path): 28 def _specific_get_meta(self, full_path, file_path):
23 if not file_path.endswith('content.opf'): 29 if not file_path.endswith('.opf'):
24 return {} 30 return {}
25 31
26 with open(full_path, encoding='utf-8') as f: 32 with open(full_path, encoding='utf-8') as f:
@@ -32,12 +38,30 @@ class EPUBParser(archive.ZipParser):
32 return {file_path: 'harmful content', } 38 return {file_path: 'harmful content', }
33 39
34 def _specific_cleanup(self, full_path: str): 40 def _specific_cleanup(self, full_path: str):
35 if full_path.endswith('content.opf'): 41 if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
36 return self.__handle_contentopf(full_path) 42 return self.__handle_contentopf(full_path)
37 elif full_path.endswith('OEBPS/toc.ncx'): 43 elif full_path.endswith('OEBPS/toc.ncx'):
38 return self.__handle_tocncx(full_path) 44 return self.__handle_tocncx(full_path)
45 elif re.search('/OPS/[^/]+.xml$', full_path):
46 return self.__handle_ops_xml(full_path)
39 return True 47 return True
40 48
49 def __handle_ops_xml(self, full_path: str):
50 try:
51 tree, namespace = office._parse_xml(full_path)
52 except ET.ParseError: # pragma: nocover
53 logging.error("Unable to parse %s in %s.", full_path, self.filename)
54 return False
55
56 for item in tree.iterfind('.//', namespace): # pragma: nocover
57 if item.tag.strip().lower().endswith('head'):
58 item.clear()
59 break
60 tree.write(full_path, xml_declaration=True, encoding='utf-8',
61 short_empty_elements=False)
62 return True
63
64
41 def __handle_tocncx(self, full_path: str): 65 def __handle_tocncx(self, full_path: str):
42 try: 66 try:
43 tree, namespace = office._parse_xml(full_path) 67 tree, namespace = office._parse_xml(full_path)