diff options
| author | jvoisin | 2019-02-27 23:04:38 +0100 |
|---|---|---|
| committer | jvoisin | 2019-02-27 23:04:38 +0100 |
| commit | 73d2966e8c10eb6c083a2abacc53f3297d16376e (patch) | |
| tree | 24830b2e95097f220379930e8c654ad073c04bc0 | |
| parent | eb2e702f3700a0ac88d10a524a5f6c573a52a8dd (diff) | |
Improve epub support
| -rw-r--r-- | libmat2/epub.py | 46 | ||||
| -rw-r--r-- | libmat2/web.py | 87 | ||||
| -rw-r--r-- | tests/test_corrupted_files.py | 7 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 6 |
4 files changed, 114 insertions, 32 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py index 09b7937..d385465 100644 --- a/libmat2/epub.py +++ b/libmat2/epub.py | |||
| @@ -1,11 +1,13 @@ | |||
| 1 | import logging | 1 | import logging |
| 2 | import re | 2 | import re |
| 3 | import uuid | ||
| 3 | import xml.etree.ElementTree as ET # type: ignore | 4 | import xml.etree.ElementTree as ET # type: ignore |
| 4 | 5 | ||
| 5 | from . import archive, office | 6 | from . import archive, office |
| 6 | 7 | ||
| 7 | class EPUBParser(archive.ArchiveBasedAbstractParser): | 8 | class EPUBParser(archive.ArchiveBasedAbstractParser): |
| 8 | mimetypes = {'application/epub+zip', } | 9 | mimetypes = {'application/epub+zip', } |
| 10 | metadata_namespace = '{http://purl.org/dc/elements/1.1/}' | ||
| 9 | 11 | ||
| 10 | def __init__(self, filename): | 12 | def __init__(self, filename): |
| 11 | super().__init__(filename) | 13 | super().__init__(filename) |
| @@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser): | |||
| 14 | 'mimetype', | 16 | 'mimetype', |
| 15 | 'OEBPS/content.opf', | 17 | 'OEBPS/content.opf', |
| 16 | })) | 18 | })) |
| 19 | self.uniqid = uuid.uuid4() | ||
| 17 | 20 | ||
| 18 | def _specific_get_meta(self, full_path, file_path): | 21 | def _specific_get_meta(self, full_path, file_path): |
| 19 | if file_path != 'OEBPS/content.opf': | 22 | if file_path != 'OEBPS/content.opf': |
| @@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser): | |||
| 25 | f.read(), re.I|re.M) | 28 | f.read(), re.I|re.M) |
| 26 | return {k:v for (k, v) in results} | 29 | return {k:v for (k, v) in results} |
| 27 | except (TypeError, UnicodeDecodeError): | 30 | except (TypeError, UnicodeDecodeError): |
| 28 | # We didn't manage to parse the xml file | ||
| 29 | return {file_path: 'harmful content', } | 31 | return {file_path: 'harmful content', } |
| 30 | 32 | ||
| 31 | def _specific_cleanup(self, full_path: str): | 33 | def _specific_cleanup(self, full_path: str): |
| 32 | if not full_path.endswith('OEBPS/content.opf'): | 34 | if full_path.endswith('OEBPS/content.opf'): |
| 33 | return True | 35 | return self.__handle_contentopf(full_path) |
| 36 | elif full_path.endswith('OEBPS/toc.ncx'): | ||
| 37 | return self.__handle_tocncx(full_path) | ||
| 38 | return True | ||
| 39 | |||
| 40 | def __handle_tocncx(self, full_path: str): | ||
| 41 | try: | ||
| 42 | tree, namespace = office._parse_xml(full_path) | ||
| 43 | except ET.ParseError: # pragma: nocover | ||
| 44 | logging.error("Unable to parse %s in %s.", full_path, self.filename) | ||
| 45 | return False | ||
| 46 | |||
| 47 | for item in tree.iterfind('.//', namespace): # pragma: nocover | ||
| 48 | if item.tag.strip().lower().endswith('head'): | ||
| 49 | item.clear() | ||
| 50 | ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''}) | ||
| 51 | break | ||
| 52 | tree.write(full_path, xml_declaration=True, encoding='utf-8', | ||
| 53 | short_empty_elements=False) | ||
| 54 | return True | ||
| 34 | 55 | ||
| 56 | def __handle_contentopf(self, full_path: str): | ||
| 35 | try: | 57 | try: |
| 36 | tree, namespace = office._parse_xml(full_path) | 58 | tree, namespace = office._parse_xml(full_path) |
| 37 | except ET.ParseError: | 59 | except ET.ParseError: |
| 38 | logging.error("Unable to parse %s in %s.", full_path, self.filename) | 60 | logging.error("Unable to parse %s in %s.", full_path, self.filename) |
| 39 | return False | 61 | return False |
| 40 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 41 | 62 | ||
| 42 | for item in tree.iterfind('.//', namespace): | 63 | for item in tree.iterfind('.//', namespace): # pragma: nocover |
| 43 | if item.tag.strip().lower().endswith('metadata'): | 64 | if item.tag.strip().lower().endswith('metadata'): |
| 44 | parent_map[item].remove(item) | 65 | item.clear() |
| 66 | |||
| 67 | # item with mandatory content | ||
| 68 | uniqid = ET.Element(self.metadata_namespace + 'identifier') | ||
| 69 | uniqid.text = str(self.uniqid) | ||
| 70 | uniqid.set('id', 'id') | ||
| 71 | item.append(uniqid) | ||
| 72 | |||
| 73 | # items without mandatory content | ||
| 74 | for name in {'language', 'title'}: | ||
| 75 | uniqid = ET.Element(self.metadata_namespace + name) | ||
| 76 | item.append(uniqid) | ||
| 45 | break # there is only a single <metadata> block | 77 | break # there is only a single <metadata> block |
| 46 | tree.write(full_path, xml_declaration=True) | 78 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 47 | return True | 79 | return True |
diff --git a/libmat2/web.py b/libmat2/web.py index c11b47d..067f5f9 100644 --- a/libmat2/web.py +++ b/libmat2/web.py | |||
| @@ -1,10 +1,13 @@ | |||
| 1 | from html import parser | 1 | from html import parser, escape |
| 2 | from typing import Dict, Any, List, Tuple | 2 | from typing import Dict, Any, List, Tuple, Set |
| 3 | import re | 3 | import re |
| 4 | import string | 4 | import string |
| 5 | 5 | ||
| 6 | from . import abstract | 6 | from . import abstract |
| 7 | 7 | ||
| 8 | assert Set | ||
| 9 | |||
| 10 | # pylint: disable=too-many-instance-attributes | ||
| 8 | 11 | ||
| 9 | class CSSParser(abstract.AbstractParser): | 12 | class CSSParser(abstract.AbstractParser): |
| 10 | """There is no such things as metadata in CSS files, | 13 | """There is no such things as metadata in CSS files, |
| @@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser): | |||
| 33 | return metadata | 36 | return metadata |
| 34 | 37 | ||
| 35 | 38 | ||
| 36 | class HTMLParser(abstract.AbstractParser): | 39 | class AbstractHTMLParser(abstract.AbstractParser): |
| 37 | mimetypes = {'text/html', 'application/x-dtbncx+xml', } | 40 | tags_blacklist = set() # type: Set[str] |
| 41 | # In some html/xml based formats some tags are mandatory, | ||
| 42 | # so we're keeping them, but are discarding their contents | ||
| 43 | tags_required_blacklist = set() # type: Set[str] | ||
| 44 | |||
| 38 | def __init__(self, filename): | 45 | def __init__(self, filename): |
| 39 | super().__init__(filename) | 46 | super().__init__(filename) |
| 40 | self.__parser = _HTMLParser(self.filename) | 47 | self.__parser = _HTMLParser(self.filename, self.tags_blacklist, |
| 48 | self.tags_required_blacklist) | ||
| 41 | with open(filename, encoding='utf-8') as f: | 49 | with open(filename, encoding='utf-8') as f: |
| 42 | self.__parser.feed(f.read()) | 50 | self.__parser.feed(f.read()) |
| 43 | self.__parser.close() | 51 | self.__parser.close() |
| @@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser): | |||
| 49 | return self.__parser.remove_all(self.output_filename) | 57 | return self.__parser.remove_all(self.output_filename) |
| 50 | 58 | ||
| 51 | 59 | ||
| 60 | class HTMLParser(AbstractHTMLParser): | ||
| 61 | mimetypes = {'text/html', } | ||
| 62 | tags_blacklist = {'meta', } | ||
| 63 | tags_required_blacklist = {'title', } | ||
| 64 | |||
| 65 | |||
| 66 | class DTBNCXParser(AbstractHTMLParser): | ||
| 67 | mimetypes = {'application/x-dtbncx+xml', } | ||
| 68 | tags_required_blacklist = {'title', 'doctitle', 'meta'} | ||
| 69 | |||
| 70 | |||
| 52 | class _HTMLParser(parser.HTMLParser): | 71 | class _HTMLParser(parser.HTMLParser): |
| 53 | """Python doesn't have a validating html parser in its stdlib, so | 72 | """Python doesn't have a validating html parser in its stdlib, so |
| 54 | we're using an internal queue to track all the opening/closing tags, | 73 | we're using an internal queue to track all the opening/closing tags, |
| 55 | and hoping for the best. | 74 | and hoping for the best. |
| 56 | """ | 75 | """ |
| 57 | tag_blacklist = {'doctitle', 'meta', 'title'} # everything is lowercase | 76 | def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): |
| 58 | def __init__(self, filename): | ||
| 59 | super().__init__() | 77 | super().__init__() |
| 60 | self.filename = filename | 78 | self.filename = filename |
| 61 | self.__textrepr = '' | 79 | self.__textrepr = '' |
| 62 | self.__meta = {} | 80 | self.__meta = {} |
| 63 | self.__validation_queue = [] | 81 | self.__validation_queue = [] # type: List[str] |
| 64 | # We're using a counter instead of a boolean to handle nested tags | 82 | # We're using counters instead of booleans, to handle nested tags |
| 83 | self.__in_dangerous_but_required_tag = 0 | ||
| 65 | self.__in_dangerous_tag = 0 | 84 | self.__in_dangerous_tag = 0 |
| 66 | 85 | ||
| 86 | if required_blacklisted_tags & blacklisted_tags: # pragma: nocover | ||
| 87 | raise ValueError("There is an overlap between %s and %s" % ( | ||
| 88 | required_blacklisted_tags, blacklisted_tags)) | ||
| 89 | self.tag_required_blacklist = required_blacklisted_tags | ||
| 90 | self.tag_blacklist = blacklisted_tags | ||
| 91 | |||
| 67 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): | 92 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): |
| 68 | self.__validation_queue.append(tag) | 93 | original_tag = self.get_starttag_text() |
| 94 | self.__validation_queue.append(original_tag) | ||
| 95 | |||
| 96 | if tag in self.tag_required_blacklist: | ||
| 97 | self.__in_dangerous_but_required_tag += 1 | ||
| 69 | if tag in self.tag_blacklist: | 98 | if tag in self.tag_blacklist: |
| 70 | self.__in_dangerous_tag += 1 | 99 | self.__in_dangerous_tag += 1 |
| 71 | return | ||
| 72 | 100 | ||
| 73 | if self.__in_dangerous_tag == 0: | 101 | if self.__in_dangerous_tag == 0: |
| 74 | self.__textrepr += self.get_starttag_text() | 102 | if self.__in_dangerous_but_required_tag <= 1: |
| 103 | self.__textrepr += original_tag | ||
| 75 | 104 | ||
| 76 | def handle_endtag(self, tag: str): | 105 | def handle_endtag(self, tag: str): |
| 77 | if not self.__validation_queue: | 106 | if not self.__validation_queue: |
| @@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser): | |||
| 79 | "opening one in %s." % (tag, self.filename)) | 108 | "opening one in %s." % (tag, self.filename)) |
| 80 | 109 | ||
| 81 | previous_tag = self.__validation_queue.pop() | 110 | previous_tag = self.__validation_queue.pop() |
| 82 | if tag != previous_tag: | 111 | previous_tag = previous_tag[1:-1] # remove < and > |
| 112 | previous_tag = previous_tag.split(' ')[0] # remove attributes | ||
| 113 | if tag != previous_tag.lower(): | ||
| 83 | raise ValueError("The closing tag %s doesn't match the previous " | 114 | raise ValueError("The closing tag %s doesn't match the previous " |
| 84 | "tag %s in %s" % | 115 | "tag %s in %s" % |
| 85 | (tag, previous_tag, self.filename)) | 116 | (tag, previous_tag, self.filename)) |
| 86 | elif tag in self.tag_blacklist: | ||
| 87 | self.__in_dangerous_tag -= 1 | ||
| 88 | return | ||
| 89 | 117 | ||
| 90 | if self.__in_dangerous_tag == 0: | 118 | if self.__in_dangerous_tag == 0: |
| 91 | # There is no `get_endtag_text()` method :/ | 119 | if self.__in_dangerous_but_required_tag <= 1: |
| 92 | self.__textrepr += '</' + tag + '>\n' | 120 | # There is no `get_endtag_text()` method :/ |
| 121 | self.__textrepr += '</' + previous_tag + '>' | ||
| 122 | |||
| 123 | if tag in self.tag_required_blacklist: | ||
| 124 | self.__in_dangerous_but_required_tag -= 1 | ||
| 125 | elif tag in self.tag_blacklist: | ||
| 126 | self.__in_dangerous_tag -= 1 | ||
| 93 | 127 | ||
| 94 | def handle_data(self, data: str): | 128 | def handle_data(self, data: str): |
| 95 | if self.__in_dangerous_tag == 0 and data.strip(): | 129 | if self.__in_dangerous_but_required_tag == 0: |
| 96 | self.__textrepr += data | 130 | if self.__in_dangerous_tag == 0: |
| 131 | if data.strip(): | ||
| 132 | self.__textrepr += escape(data) | ||
| 97 | 133 | ||
| 98 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): | 134 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): |
| 99 | if tag in self.tag_blacklist: | 135 | if tag in self.tag_required_blacklist | self.tag_blacklist: |
| 100 | meta = {k:v for k, v in attrs} | 136 | meta = {k:v for k, v in attrs} |
| 101 | name = meta.get('name', 'harmful metadata') | 137 | name = meta.get('name', 'harmful metadata') |
| 102 | content = meta.get('content', 'harmful data') | 138 | content = meta.get('content', 'harmful data') |
| 103 | self.__meta[name] = content | 139 | self.__meta[name] = content |
| 104 | else: | 140 | |
| 141 | if self.__in_dangerous_tag != 0: | ||
| 142 | return | ||
| 143 | elif tag in self.tag_required_blacklist: | ||
| 144 | self.__textrepr += '<' + tag + ' />' | ||
| 145 | return | ||
| 146 | |||
| 147 | if self.__in_dangerous_but_required_tag == 0: | ||
| 105 | if self.__in_dangerous_tag == 0: | 148 | if self.__in_dangerous_tag == 0: |
| 106 | self.__textrepr += self.get_starttag_text() | 149 | self.__textrepr += self.get_starttag_text() |
| 107 | 150 | ||
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 53c856a..b2cec00 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -253,13 +253,13 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 253 | os.remove('./tests/data/clean.cleaned.html') | 253 | os.remove('./tests/data/clean.cleaned.html') |
| 254 | 254 | ||
| 255 | with open('./tests/data/clean.html', 'w') as f: | 255 | with open('./tests/data/clean.html', 'w') as f: |
| 256 | f.write('</close>') | 256 | f.write('</meta>') |
| 257 | with self.assertRaises(ValueError): | 257 | with self.assertRaises(ValueError): |
| 258 | web.HTMLParser('./tests/data/clean.html') | 258 | web.HTMLParser('./tests/data/clean.html') |
| 259 | os.remove('./tests/data/clean.html') | 259 | os.remove('./tests/data/clean.html') |
| 260 | 260 | ||
| 261 | with open('./tests/data/clean.html', 'w') as f: | 261 | with open('./tests/data/clean.html', 'w') as f: |
| 262 | f.write('<notclosed>') | 262 | f.write('<meta><a>test</a><set/></meta><title></title><meta>') |
| 263 | p = web.HTMLParser('./tests/data/clean.html') | 263 | p = web.HTMLParser('./tests/data/clean.html') |
| 264 | with self.assertRaises(ValueError): | 264 | with self.assertRaises(ValueError): |
| 265 | p.get_meta() | 265 | p.get_meta() |
| @@ -269,6 +269,9 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 269 | os.remove('./tests/data/clean.html') | 269 | os.remove('./tests/data/clean.html') |
| 270 | 270 | ||
| 271 | with open('./tests/data/clean.html', 'w') as f: | 271 | with open('./tests/data/clean.html', 'w') as f: |
| 272 | f.write('<meta><meta/></meta>') | ||
| 273 | f.write('<title><title>pouet</title></title>') | ||
| 274 | f.write('<title><mysupertag/></title>') | ||
| 272 | f.write('<doctitle><br/></doctitle><br/><notclosed>') | 275 | f.write('<doctitle><br/></doctitle><br/><notclosed>') |
| 273 | p = web.HTMLParser('./tests/data/clean.html') | 276 | p = web.HTMLParser('./tests/data/clean.html') |
| 274 | with self.assertRaises(ValueError): | 277 | with self.assertRaises(ValueError): |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 249c56d..f4b1890 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -3,6 +3,7 @@ | |||
| 3 | import unittest | 3 | import unittest |
| 4 | import shutil | 4 | import shutil |
| 5 | import os | 5 | import os |
| 6 | import re | ||
| 6 | import zipfile | 7 | import zipfile |
| 7 | 8 | ||
| 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless | 9 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless |
| @@ -644,7 +645,10 @@ class TestCleaning(unittest.TestCase): | |||
| 644 | self.assertTrue(ret) | 645 | self.assertTrue(ret) |
| 645 | 646 | ||
| 646 | p = epub.EPUBParser('./tests/data/clean.cleaned.epub') | 647 | p = epub.EPUBParser('./tests/data/clean.cleaned.epub') |
| 647 | self.assertEqual(p.get_meta(), {}) | 648 | meta = p.get_meta() |
| 649 | res = re.match(meta['OEBPS/content.opf']['metadata'], '^<dc:identifier>[0-9a-f-]+</dc:identifier><dc:title /><dc:language />$') | ||
| 650 | self.assertNotEqual(res, False) | ||
| 651 | |||
| 648 | self.assertTrue(p.remove_all()) | 652 | self.assertTrue(p.remove_all()) |
| 649 | 653 | ||
| 650 | os.remove('./tests/data/clean.epub') | 654 | os.remove('./tests/data/clean.epub') |
