diff options
| -rw-r--r-- | libmat2/epub.py | 28 |
1 files changed, 26 insertions, 2 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py index fd38411..52fab1c 100644 --- a/libmat2/epub.py +++ b/libmat2/epub.py | |||
| @@ -16,11 +16,17 @@ class EPUBParser(archive.ZipParser): | |||
| 16 | 'mimetype', | 16 | 'mimetype', |
| 17 | 'OEBPS/content.opf', | 17 | 'OEBPS/content.opf', |
| 18 | 'content.opf', | 18 | 'content.opf', |
| 19 | 'hmh.opf', | ||
| 20 | 'OPS/.+.xml' | ||
| 19 | })) | 21 | })) |
| 22 | self.files_to_omit = set(map(re.compile, { # type: ignore | ||
| 23 | 'iTunesMetadata.plist' | ||
| 24 | 'META-INF/calibre_bookmarks.txt' | ||
| 25 | })) | ||
| 20 | self.uniqid = uuid.uuid4() | 26 | self.uniqid = uuid.uuid4() |
| 21 | 27 | ||
| 22 | def _specific_get_meta(self, full_path, file_path): | 28 | def _specific_get_meta(self, full_path, file_path): |
| 23 | if not file_path.endswith('content.opf'): | 29 | if not file_path.endswith('.opf'): |
| 24 | return {} | 30 | return {} |
| 25 | 31 | ||
| 26 | with open(full_path, encoding='utf-8') as f: | 32 | with open(full_path, encoding='utf-8') as f: |
| @@ -32,12 +38,30 @@ class EPUBParser(archive.ZipParser): | |||
| 32 | return {file_path: 'harmful content', } | 38 | return {file_path: 'harmful content', } |
| 33 | 39 | ||
| 34 | def _specific_cleanup(self, full_path: str): | 40 | def _specific_cleanup(self, full_path: str): |
| 35 | if full_path.endswith('content.opf'): | 41 | if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'): |
| 36 | return self.__handle_contentopf(full_path) | 42 | return self.__handle_contentopf(full_path) |
| 37 | elif full_path.endswith('OEBPS/toc.ncx'): | 43 | elif full_path.endswith('OEBPS/toc.ncx'): |
| 38 | return self.__handle_tocncx(full_path) | 44 | return self.__handle_tocncx(full_path) |
| 45 | elif re.search('/OPS/[^/]+.xml$', full_path): | ||
| 46 | return self.__handle_ops_xml(full_path) | ||
| 39 | return True | 47 | return True |
| 40 | 48 | ||
| 49 | def __handle_ops_xml(self, full_path: str): | ||
| 50 | try: | ||
| 51 | tree, namespace = office._parse_xml(full_path) | ||
| 52 | except ET.ParseError: # pragma: nocover | ||
| 53 | logging.error("Unable to parse %s in %s.", full_path, self.filename) | ||
| 54 | return False | ||
| 55 | |||
| 56 | for item in tree.iterfind('.//', namespace): # pragma: nocover | ||
| 57 | if item.tag.strip().lower().endswith('head'): | ||
| 58 | item.clear() | ||
| 59 | break | ||
| 60 | tree.write(full_path, xml_declaration=True, encoding='utf-8', | ||
| 61 | short_empty_elements=False) | ||
| 62 | return True | ||
| 63 | |||
| 64 | |||
| 41 | def __handle_tocncx(self, full_path: str): | 65 | def __handle_tocncx(self, full_path: str): |
| 42 | try: | 66 | try: |
| 43 | tree, namespace = office._parse_xml(full_path) | 67 | tree, namespace = office._parse_xml(full_path) |
