Improve epub support a bit

author: jvoisin 2021-02-07 17:17:16 +0100
committer: jvoisin 2021-02-07 17:24:50 +0100
commit: ec082d64833ed209d0bbec9ae5171e9378ffcb87 (patch)
tree: a5d06cd1ffd2524242348e2cbf0b60186cbf43f0
parent: f8111547ae9e414901f38c8598704a25380cc06c (diff)
1 file changed, 26 insertions, 2 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index fd38411..52fab1c 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -16,11 +16,17 @@ class EPUBParser(archive.ZipParser):
            'mimetype',
            'OEBPS/content.opf',
            'content.opf',
+            'hmh.opf',
+            'OPS/.+.xml'
            }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            'iTunesMetadata.plist'
+            'META-INF/calibre_bookmarks.txt'
+             }))
        self.uniqid = uuid.uuid4()
    def _specific_get_meta(self, full_path, file_path):
-        if not file_path.endswith('content.opf'):
+        if not file_path.endswith('.opf'):
            return {}
        with open(full_path, encoding='utf-8') as f:
@@ -32,12 +38,30 @@ class EPUBParser(archive.ZipParser):
                return {file_path: 'harmful content', }
    def _specific_cleanup(self, full_path: str):
-        if full_path.endswith('content.opf'):
+        if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
            return self.__handle_contentopf(full_path)
        elif full_path.endswith('OEBPS/toc.ncx'):
            return self.__handle_tocncx(full_path)
+        elif re.search('/OPS/[^/]+.xml$', full_path):
+            return self.__handle_ops_xml(full_path)
        return True
+    def __handle_ops_xml(self, full_path: str):
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
+            if item.tag.strip().lower().endswith('head'):
+                item.clear()
+                break
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
    def __handle_tocncx(self, full_path: str):
        try:
            tree, namespace = office._parse_xml(full_path)
author	jvoisin	2021-02-07 17:17:16 +0100
committer	jvoisin	2021-02-07 17:24:50 +0100
commit	ec082d64833ed209d0bbec9ae5171e9378ffcb87 (patch)
tree	a5d06cd1ffd2524242348e2cbf0b60186cbf43f0
parent	f8111547ae9e414901f38c8598704a25380cc06c (diff)

diff --git a/libmat2/epub.py b/libmat2/epub.py
index fd38411..52fab1c 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -16,11 +16,17 @@ class EPUBParser(archive.ZipParser):
16	'mimetype',	16	'mimetype',
17	'OEBPS/content.opf',	17	'OEBPS/content.opf',
18	'content.opf',	18	'content.opf',
		19	'hmh.opf',
		20	'OPS/.+.xml'
19	}))	21	}))
		22	self.files_to_omit = set(map(re.compile, { # type: ignore
		23	'iTunesMetadata.plist'
		24	'META-INF/calibre_bookmarks.txt'
		25	}))
20	self.uniqid = uuid.uuid4()	26	self.uniqid = uuid.uuid4()
21		27
22	def _specific_get_meta(self, full_path, file_path):	28	def _specific_get_meta(self, full_path, file_path):
23	if not file_path.endswith('content.opf'):	29	if not file_path.endswith('.opf'):
24	return {}	30	return {}
25		31
26	with open(full_path, encoding='utf-8') as f:	32	with open(full_path, encoding='utf-8') as f:
@@ -32,12 +38,30 @@ class EPUBParser(archive.ZipParser):
32	return {file_path: 'harmful content', }	38	return {file_path: 'harmful content', }
33		39
34	def _specific_cleanup(self, full_path: str):	40	def _specific_cleanup(self, full_path: str):
35	if full_path.endswith('content.opf'):	41	if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
36	return self.__handle_contentopf(full_path)	42	return self.__handle_contentopf(full_path)
37	elif full_path.endswith('OEBPS/toc.ncx'):	43	elif full_path.endswith('OEBPS/toc.ncx'):
38	return self.__handle_tocncx(full_path)	44	return self.__handle_tocncx(full_path)
		45	elif re.search('/OPS/[^/]+.xml$', full_path):
		46	return self.__handle_ops_xml(full_path)
39	return True	47	return True
40		48
		49	def __handle_ops_xml(self, full_path: str):
		50	try:
		51	tree, namespace = office._parse_xml(full_path)
		52	except ET.ParseError: # pragma: nocover
		53	logging.error("Unable to parse %s in %s.", full_path, self.filename)
		54	return False
		55
		56	for item in tree.iterfind('.//', namespace): # pragma: nocover
		57	if item.tag.strip().lower().endswith('head'):
		58	item.clear()
		59	break
		60	tree.write(full_path, xml_declaration=True, encoding='utf-8',
		61	short_empty_elements=False)
		62	return True
		63
		64
41	def __handle_tocncx(self, full_path: str):	65	def __handle_tocncx(self, full_path: str):
42	try:	66	try:
43	tree, namespace = office._parse_xml(full_path)	67	tree, namespace = office._parse_xml(full_path)