diff options
| author | jvoisin | 2019-02-27 23:53:07 +0100 |
|---|---|---|
| committer | jvoisin | 2019-02-27 23:53:07 +0100 |
| commit | 55214206b5f0f1c12fec378967f89ef4cb5674f9 (patch) | |
| tree | 82c94503dfd1d844b4c53d9083c69483d9d4f68d | |
| parent | 73d2966e8c10eb6c083a2abacc53f3297d16376e (diff) | |
Improve the previous commit
- More tests
- More documentation
- Minor code cleanup
| -rw-r--r-- | libmat2/web.py | 38 | ||||
| -rw-r--r-- | tests/test_corrupted_files.py | 4 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 27 |
3 files changed, 51 insertions, 18 deletions
diff --git a/libmat2/web.py b/libmat2/web.py index 067f5f9..62e7747 100644 --- a/libmat2/web.py +++ b/libmat2/web.py | |||
| @@ -38,8 +38,8 @@ class CSSParser(abstract.AbstractParser): | |||
| 38 | 38 | ||
| 39 | class AbstractHTMLParser(abstract.AbstractParser): | 39 | class AbstractHTMLParser(abstract.AbstractParser): |
| 40 | tags_blacklist = set() # type: Set[str] | 40 | tags_blacklist = set() # type: Set[str] |
| 41 | # In some html/xml based formats some tags are mandatory, | 41 | # In some html/xml-based formats some tags are mandatory, |
| 42 | # so we're keeping them, but are discarding their contents | 42 | # so we're keeping them, but are discarding their content |
| 43 | tags_required_blacklist = set() # type: Set[str] | 43 | tags_required_blacklist = set() # type: Set[str] |
| 44 | 44 | ||
| 45 | def __init__(self, filename): | 45 | def __init__(self, filename): |
| @@ -72,6 +72,12 @@ class _HTMLParser(parser.HTMLParser): | |||
| 72 | """Python doesn't have a validating html parser in its stdlib, so | 72 | """Python doesn't have a validating html parser in its stdlib, so |
| 73 | we're using an internal queue to track all the opening/closing tags, | 73 | we're using an internal queue to track all the opening/closing tags, |
| 74 | and hoping for the best. | 74 | and hoping for the best. |
| 75 | |||
| 76 | Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text | ||
| 77 | method, so we have to use get_starttag_text instead, put its result in a | ||
| 78 | LIFO, and transform it into a closing tag when needed. | ||
| 79 | |||
| 80 | Also, gotcha: the `tag` parameters are always in lowercase. | ||
| 75 | """ | 81 | """ |
| 76 | def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): | 82 | def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): |
| 77 | super().__init__() | 83 | super().__init__() |
| @@ -79,6 +85,7 @@ class _HTMLParser(parser.HTMLParser): | |||
| 79 | self.__textrepr = '' | 85 | self.__textrepr = '' |
| 80 | self.__meta = {} | 86 | self.__meta = {} |
| 81 | self.__validation_queue = [] # type: List[str] | 87 | self.__validation_queue = [] # type: List[str] |
| 88 | |||
| 82 | # We're using counters instead of booleans, to handle nested tags | 89 | # We're using counters instead of booleans, to handle nested tags |
| 83 | self.__in_dangerous_but_required_tag = 0 | 90 | self.__in_dangerous_but_required_tag = 0 |
| 84 | self.__in_dangerous_tag = 0 | 91 | self.__in_dangerous_tag = 0 |
| @@ -93,15 +100,16 @@ class _HTMLParser(parser.HTMLParser): | |||
| 93 | original_tag = self.get_starttag_text() | 100 | original_tag = self.get_starttag_text() |
| 94 | self.__validation_queue.append(original_tag) | 101 | self.__validation_queue.append(original_tag) |
| 95 | 102 | ||
| 96 | if tag in self.tag_required_blacklist: | ||
| 97 | self.__in_dangerous_but_required_tag += 1 | ||
| 98 | if tag in self.tag_blacklist: | 103 | if tag in self.tag_blacklist: |
| 99 | self.__in_dangerous_tag += 1 | 104 | self.__in_dangerous_tag += 1 |
| 100 | 105 | ||
| 101 | if self.__in_dangerous_tag == 0: | 106 | if self.__in_dangerous_tag == 0: |
| 102 | if self.__in_dangerous_but_required_tag <= 1: | 107 | if self.__in_dangerous_but_required_tag == 0: |
| 103 | self.__textrepr += original_tag | 108 | self.__textrepr += original_tag |
| 104 | 109 | ||
| 110 | if tag in self.tag_required_blacklist: | ||
| 111 | self.__in_dangerous_but_required_tag += 1 | ||
| 112 | |||
| 105 | def handle_endtag(self, tag: str): | 113 | def handle_endtag(self, tag: str): |
| 106 | if not self.__validation_queue: | 114 | if not self.__validation_queue: |
| 107 | raise ValueError("The closing tag %s doesn't have a corresponding " | 115 | raise ValueError("The closing tag %s doesn't have a corresponding " |
| @@ -115,14 +123,15 @@ class _HTMLParser(parser.HTMLParser): | |||
| 115 | "tag %s in %s" % | 123 | "tag %s in %s" % |
| 116 | (tag, previous_tag, self.filename)) | 124 | (tag, previous_tag, self.filename)) |
| 117 | 125 | ||
| 126 | if tag in self.tag_required_blacklist: | ||
| 127 | self.__in_dangerous_but_required_tag -= 1 | ||
| 128 | |||
| 118 | if self.__in_dangerous_tag == 0: | 129 | if self.__in_dangerous_tag == 0: |
| 119 | if self.__in_dangerous_but_required_tag <= 1: | 130 | if self.__in_dangerous_but_required_tag == 0: |
| 120 | # There is no `get_endtag_text()` method :/ | 131 | # There is no `get_endtag_text()` method :/ |
| 121 | self.__textrepr += '</' + previous_tag + '>' | 132 | self.__textrepr += '</' + previous_tag + '>' |
| 122 | 133 | ||
| 123 | if tag in self.tag_required_blacklist: | 134 | if tag in self.tag_blacklist: |
| 124 | self.__in_dangerous_but_required_tag -= 1 | ||
| 125 | elif tag in self.tag_blacklist: | ||
| 126 | self.__in_dangerous_tag -= 1 | 135 | self.__in_dangerous_tag -= 1 |
| 127 | 136 | ||
| 128 | def handle_data(self, data: str): | 137 | def handle_data(self, data: str): |
| @@ -138,14 +147,13 @@ class _HTMLParser(parser.HTMLParser): | |||
| 138 | content = meta.get('content', 'harmful data') | 147 | content = meta.get('content', 'harmful data') |
| 139 | self.__meta[name] = content | 148 | self.__meta[name] = content |
| 140 | 149 | ||
| 141 | if self.__in_dangerous_tag != 0: | 150 | if self.__in_dangerous_tag == 0: |
| 151 | if tag in self.tag_required_blacklist: | ||
| 152 | self.__textrepr += '<' + tag + ' />' | ||
| 142 | return | 153 | return |
| 143 | elif tag in self.tag_required_blacklist: | ||
| 144 | self.__textrepr += '<' + tag + ' />' | ||
| 145 | return | ||
| 146 | 154 | ||
| 147 | if self.__in_dangerous_but_required_tag == 0: | 155 | if self.__in_dangerous_tag == 0: |
| 148 | if self.__in_dangerous_tag == 0: | 156 | if self.__in_dangerous_but_required_tag == 0: |
| 149 | self.__textrepr += self.get_starttag_text() | 157 | self.__textrepr += self.get_starttag_text() |
| 150 | 158 | ||
| 151 | def remove_all(self, output_filename: str) -> bool: | 159 | def remove_all(self, output_filename: str) -> bool: |
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index b2cec00..4a16d51 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -269,9 +269,6 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 269 | os.remove('./tests/data/clean.html') | 269 | os.remove('./tests/data/clean.html') |
| 270 | 270 | ||
| 271 | with open('./tests/data/clean.html', 'w') as f: | 271 | with open('./tests/data/clean.html', 'w') as f: |
| 272 | f.write('<meta><meta/></meta>') | ||
| 273 | f.write('<title><title>pouet</title></title>') | ||
| 274 | f.write('<title><mysupertag/></title>') | ||
| 275 | f.write('<doctitle><br/></doctitle><br/><notclosed>') | 272 | f.write('<doctitle><br/></doctitle><br/><notclosed>') |
| 276 | p = web.HTMLParser('./tests/data/clean.html') | 273 | p = web.HTMLParser('./tests/data/clean.html') |
| 277 | with self.assertRaises(ValueError): | 274 | with self.assertRaises(ValueError): |
| @@ -281,6 +278,7 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 281 | p.remove_all() | 278 | p.remove_all() |
| 282 | os.remove('./tests/data/clean.html') | 279 | os.remove('./tests/data/clean.html') |
| 283 | 280 | ||
| 281 | |||
| 284 | def test_epub(self): | 282 | def test_epub(self): |
| 285 | with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: | 283 | with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: |
| 286 | zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') | 284 | zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index f4b1890..46e234e 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -633,6 +633,33 @@ class TestCleaning(unittest.TestCase): | |||
| 633 | os.remove('./tests/data/clean.cleaned.html') | 633 | os.remove('./tests/data/clean.cleaned.html') |
| 634 | os.remove('./tests/data/clean.cleaned.cleaned.html') | 634 | os.remove('./tests/data/clean.cleaned.cleaned.html') |
| 635 | 635 | ||
| 636 | with open('./tests/data/clean.html', 'w') as f: | ||
| 637 | f.write('<title><title><pouet/><meta/></title></title><test/>') | ||
| 638 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 639 | self.assertTrue(p.remove_all()) | ||
| 640 | with open('./tests/data/clean.cleaned.html', 'r') as f: | ||
| 641 | self.assertEqual(f.read(), '<title></title><test/>') | ||
| 642 | os.remove('./tests/data/clean.html') | ||
| 643 | os.remove('./tests/data/clean.cleaned.html') | ||
| 644 | |||
| 645 | with open('./tests/data/clean.html', 'w') as f: | ||
| 646 | f.write('<test><title>Some<b>metadata</b><br/></title></test>') | ||
| 647 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 648 | self.assertTrue(p.remove_all()) | ||
| 649 | with open('./tests/data/clean.cleaned.html', 'r') as f: | ||
| 650 | self.assertEqual(f.read(), '<test><title></title></test>') | ||
| 651 | os.remove('./tests/data/clean.html') | ||
| 652 | os.remove('./tests/data/clean.cleaned.html') | ||
| 653 | |||
| 654 | with open('./tests/data/clean.html', 'w') as f: | ||
| 655 | f.write('<meta><meta/></meta>') | ||
| 656 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 657 | self.assertTrue(p.remove_all()) | ||
| 658 | with open('./tests/data/clean.cleaned.html', 'r') as f: | ||
| 659 | self.assertEqual(f.read(), '') | ||
| 660 | os.remove('./tests/data/clean.html') | ||
| 661 | os.remove('./tests/data/clean.cleaned.html') | ||
| 662 | |||
| 636 | 663 | ||
| 637 | def test_epub(self): | 664 | def test_epub(self): |
| 638 | shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') | 665 | shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') |
