diff options
Diffstat (limited to 'libmat2/epub.py')
| -rw-r--r-- | libmat2/epub.py | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py new file mode 100644 index 0000000..09b7937 --- /dev/null +++ b/libmat2/epub.py | |||
| @@ -0,0 +1,47 @@ | |||
| 1 | import logging | ||
| 2 | import re | ||
| 3 | import xml.etree.ElementTree as ET # type: ignore | ||
| 4 | |||
| 5 | from . import archive, office | ||
| 6 | |||
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """Archive-based parser for EPUB files (``application/epub+zip``).

    Metadata is read from, and stripped out of, the package document
    ``OEBPS/content.opf``; every other archive member is left to the
    base class.
    """
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        """Initialise the parser and register the members to keep as-is.

        NOTE(review): `files_to_keep` is presumably consumed by
        `archive.ArchiveBasedAbstractParser` -- confirm against the base
        class.
        """
        super().__init__(filename)
        # The entries are literal member paths, not patterns: escape them
        # so that '.' only matches a dot and not an arbitrary character.
        self.files_to_keep = set(map(re.compile, map(re.escape, {  # type: ignore
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        })))

    def _specific_get_meta(self, full_path, file_path):
        """Return the metadata found in the package document.

        :param full_path: path of the extracted member on disk.
        :param file_path: path of the member inside the archive.
        :return: an empty dict for any member other than
            ``OEBPS/content.opf``; otherwise a dict mapping each matched
            ``meta``/``dc``/``cp`` tag name to its text content, or
            ``{file_path: 'harmful content'}`` if the file could not be
            read as UTF-8 text.
        """
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                # Group 1 is the tag name, group 2 the text between the
                # opening tag and its matching closing tag.
                results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                     f.read(), re.I|re.M)
                return dict(results)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        """Remove the ``<metadata>`` element from the package document.

        :param full_path: path of the extracted member on disk.
        :return: ``True`` when the member needs no cleaning or was
            rewritten, ``False`` when it could not be parsed as XML.
        """
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        # ElementTree elements do not know their parent, so build a
        # child -> parent lookup to be able to detach an element.
        parent_map = {c: p for p in tree.iter() for c in p}

        for item in tree.iterfind('.//', namespace):
            if item.tag.strip().lower().endswith('metadata'):
                parent_map[item].remove(item)
                break  # there is only a single <metadata> block

        # ElementTree defaults to US-ASCII; EPUB requires its XML
        # resources to be UTF-8 or UTF-16, so state the encoding.
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
