summaryrefslogtreecommitdiff
path: root/libmat2/epub.py
diff options
context:
space:
mode:
authorjvoisin2019-02-20 16:28:11 -0800
committerjvoisin2019-02-20 16:28:11 -0800
commit02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch)
tree701c6f5e316265e5a95a162356965ecf2fb8d6b2 /libmat2/epub.py
parent6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff)
Implement epub support
Diffstat (limited to 'libmat2/epub.py')
-rw-r--r--libmat2/epub.py47
1 files changed, 47 insertions, 0 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
new file mode 100644
index 0000000..09b7937
--- /dev/null
+++ b/libmat2/epub.py
@@ -0,0 +1,47 @@
1import logging
2import re
3import xml.etree.ElementTree as ET # type: ignore
4
5from . import archive, office
6
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """Archive-based parser for files of type ``application/epub+zip``."""

    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        """Initialise the base parser and register the member-name patterns.

        The patterns are compiled as regular expressions and stored in
        ``self.files_to_keep``.
        NOTE(review): how ``files_to_keep`` is consumed is defined by the
        base class, which is not visible here -- confirm in ``archive``.
        """
        super().__init__(filename)
        kept_names = {
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        }
        self.files_to_keep = {re.compile(name) for name in kept_names}  # type: ignore

    def _specific_get_meta(self, full_path, file_path):
        """Collect metadata from the package document of the archive.

        Only ``OEBPS/content.opf`` is inspected; any other member yields an
        empty dict. The result maps each matched ``meta``/``dc``/``cp`` tag
        name to the text found between its opening and closing tag. If the
        file cannot be read as UTF-8 text, it is reported as
        ``{file_path: 'harmful content'}`` instead.
        """
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as opf:
            try:
                content = opf.read()
                tag_text_pairs = re.findall(
                    r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                    content,
                    re.I|re.M,
                )
                # findall yields (tag, text) tuples; later duplicates win.
                return dict(tag_text_pairs)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        """Remove the ``metadata`` element from the package document.

        Files whose path does not end with ``OEBPS/content.opf`` are left
        untouched and reported as successfully handled.

        Returns ``True`` when nothing had to be done or the file was
        rewritten, ``False`` when the document could not be parsed.
        """
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        # Elements do not know their parent, so build a child -> parent index.
        parent_of = {}
        for parent in tree.iter():
            for child in parent:
                parent_of[child] = parent

        # Only the first match is removed: there is only a single <metadata>
        # block.
        metadata = next(
            (node for node in tree.iterfind('.//', namespace)
             if node.tag.strip().lower().endswith('metadata')),
            None,
        )
        # Explicit None check: an Element without children is falsy.
        if metadata is not None:
            parent_of[metadata].remove(metadata)

        tree.write(full_path, xml_declaration=True)
        return True