diff options
| -rw-r--r-- | libmat2/html.py | 69 | ||||
| -rw-r--r-- | tests/data/dirty.html | 14 | ||||
| -rw-r--r-- | tests/test_corrupted_files.py | 39 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 20 |
4 files changed, 140 insertions, 2 deletions
diff --git a/libmat2/html.py b/libmat2/html.py new file mode 100644 index 0000000..d0e9a2b --- /dev/null +++ b/libmat2/html.py | |||
| @@ -0,0 +1,69 @@ | |||
| 1 | from html import parser | ||
| 2 | from typing import Dict, Any, List, Tuple | ||
| 3 | |||
| 4 | from . import abstract | ||
| 5 | |||
| 6 | |||
class HTMLParser(abstract.AbstractParser):
    """Metadata handler for `text/html` files.

    The whole file is parsed once at construction time by an internal
    `_HTMLParser`; `get_meta` and `remove_all` merely forward to it.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        """Parse `filename` eagerly.

        Any exception raised by `_HTMLParser` while feeding it the file
        content propagates out of the constructor.
        """
        super().__init__(filename)
        self.__parser = _HTMLParser()
        with open(filename) as source:
            markup = source.read()
            self.__parser.feed(markup)
            # Flush whatever the parser is still buffering.
            self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected by the internal parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Write the cleaned document to `self.output_filename`.

        NOTE(review): `output_filename` is not set in this class; it is
        presumably provided by `abstract.AbstractParser` -- confirm.
        """
        destination = self.output_filename
        return self.__parser.remove_all(destination)
| 21 | |||
| 22 | |||
| 23 | class _HTMLParser(parser.HTMLParser): | ||
| 24 | """Python doesn't have a validating html parser in its stdlib, so | ||
| 25 | we're using an internal queue to track all the opening/closing tags, | ||
| 26 | and hoping for the best. | ||
| 27 | """ | ||
| 28 | def __init__(self): | ||
| 29 | super().__init__() | ||
| 30 | self.__textrepr = '' | ||
| 31 | self.__meta = {} | ||
| 32 | self.__validation_queue = [] | ||
| 33 | |||
| 34 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): | ||
| 35 | self.__textrepr += self.get_starttag_text() | ||
| 36 | self.__validation_queue.append(tag) | ||
| 37 | |||
| 38 | def handle_endtag(self, tag: str): | ||
| 39 | if not self.__validation_queue: | ||
| 40 | raise ValueError | ||
| 41 | elif tag != self.__validation_queue.pop(): | ||
| 42 | raise ValueError | ||
| 43 | # There is no `get_endtag_text()` method :/ | ||
| 44 | self.__textrepr += '</' + tag + '>\n' | ||
| 45 | |||
| 46 | def handle_data(self, data: str): | ||
| 47 | if data.strip(): | ||
| 48 | self.__textrepr += data | ||
| 49 | |||
| 50 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): | ||
| 51 | if tag == 'meta': | ||
| 52 | meta = {k:v for k, v in attrs} | ||
| 53 | name = meta.get('name', 'harmful metadata') | ||
| 54 | content = meta.get('content', 'harmful data') | ||
| 55 | self.__meta[name] = content | ||
| 56 | else: | ||
| 57 | self.__textrepr += self.get_starttag_text() | ||
| 58 | |||
| 59 | def remove_all(self, output_filename: str) -> bool: | ||
| 60 | if self.__validation_queue: | ||
| 61 | raise ValueError | ||
| 62 | with open(output_filename, 'w') as f: | ||
| 63 | f.write(self.__textrepr) | ||
| 64 | return True | ||
| 65 | |||
| 66 | def get_meta(self) -> Dict[str, Any]: | ||
| 67 | if self.__validation_queue: | ||
| 68 | raise ValueError | ||
| 69 | return self.__meta | ||
diff --git a/tests/data/dirty.html b/tests/data/dirty.html new file mode 100644 index 0000000..1aa1723 --- /dev/null +++ b/tests/data/dirty.html | |||
| @@ -0,0 +1,14 @@ | |||
| 1 | <html> | ||
| 2 | <head> | ||
| 3 | <meta content="vim" name="generator"/> | ||
| 4 | <meta content="jvoisin" name="author"/> | ||
| 5 | </head> | ||
| 6 | <body> | ||
| 7 | <p> | ||
| 8 | <h1>Hello</h1> | ||
| 9 | I am a web page. | ||
| 10 | Please <b>love</b> me. | ||
| 11 | Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/> | ||
| 12 | </p> | ||
| 13 | </body> | ||
| 14 | </html> | ||
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index b2e7798..8728cb2 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -7,7 +7,7 @@ import logging | |||
| 7 | import zipfile | 7 | import zipfile |
| 8 | 8 | ||
| 9 | from libmat2 import pdf, images, audio, office, parser_factory, torrent | 9 | from libmat2 import pdf, images, audio, office, parser_factory, torrent |
| 10 | from libmat2 import harmless, video | 10 | from libmat2 import harmless, video, html |
| 11 | 11 | ||
| 12 | # No need to log messages: should something go wrong, | 12 | # No need to log messages: should something go wrong, |
| 13 | # the testsuite _will_ fail. | 13 | # the testsuite _will_ fail. |
| @@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 232 | self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') | 232 | self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') |
| 233 | self.assertFalse(p.remove_all()) | 233 | self.assertFalse(p.remove_all()) |
| 234 | os.remove('./tests/data/dirty.zip') | 234 | os.remove('./tests/data/dirty.zip') |
| 235 | |||
def test_html(self):
    """Check how the html parser reacts to unbalanced or malformed markup."""
    dirty_path = './tests/data/dirty.html'
    clean_path = './tests/data/clean.html'
    cleaned_path = './tests/data/clean.cleaned.html'

    # A closing tag that doesn't match the opening one is rejected
    # as soon as the file is parsed.
    shutil.copy(dirty_path, clean_path)
    with open(clean_path, 'a') as handle:
        handle.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        html.HTMLParser(clean_path)
    os.remove(clean_path)

    # Yes, we're able to deal with malformed html :/
    shutil.copy(dirty_path, clean_path)
    with open(clean_path, 'a') as handle:
        handle.write('<meta name=\'this" is="weird"/>')
    dirty_parser = html.HTMLParser(clean_path)
    self.assertTrue(dirty_parser.remove_all())
    cleaned_parser = html.HTMLParser(cleaned_path)
    self.assertEqual(cleaned_parser.get_meta(), {})
    os.remove(clean_path)
    os.remove(cleaned_path)

    # A closing tag without any opening one is rejected while parsing.
    with open(clean_path, 'w') as handle:
        handle.write('</close>')
    with self.assertRaises(ValueError):
        html.HTMLParser(clean_path)
    os.remove(clean_path)

    # An unclosed tag is only reported once a result is requested.
    with open(clean_path, 'w') as handle:
        handle.write('<notclosed>')
    unclosed = html.HTMLParser(clean_path)
    with self.assertRaises(ValueError):
        unclosed.get_meta()
    unclosed = html.HTMLParser(clean_path)
    with self.assertRaises(ValueError):
        unclosed.remove_all()
    os.remove(clean_path)
| 270 | |||
| 271 | |||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 548b076..8753e09 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -6,7 +6,7 @@ import os | |||
| 6 | import zipfile | 6 | import zipfile |
| 7 | 7 | ||
| 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless | 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless |
| 9 | from libmat2 import check_dependencies, video, archive | 9 | from libmat2 import check_dependencies, video, archive, html |
| 10 | 10 | ||
| 11 | 11 | ||
| 12 | class TestCheckDependencies(unittest.TestCase): | 12 | class TestCheckDependencies(unittest.TestCase): |
| @@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase): | |||
| 596 | os.remove('./tests/data/clean.gif') | 596 | os.remove('./tests/data/clean.gif') |
| 597 | os.remove('./tests/data/clean.cleaned.gif') | 597 | os.remove('./tests/data/clean.cleaned.gif') |
| 598 | os.remove('./tests/data/clean.cleaned.cleaned.gif') | 598 | os.remove('./tests/data/clean.cleaned.cleaned.gif') |
| 599 | |||
def test_html(self):
    """Clean an html file, then check that the result is itself clean."""
    clean_path = './tests/data/clean.html'
    cleaned_path = './tests/data/clean.cleaned.html'
    twice_cleaned_path = './tests/data/clean.cleaned.cleaned.html'

    shutil.copy('./tests/data/dirty.html', clean_path)
    dirty_parser = html.HTMLParser(clean_path)

    # The original file advertises its author.
    self.assertEqual(dirty_parser.get_meta()['author'], 'jvoisin')

    self.assertTrue(dirty_parser.remove_all())

    # The cleaned file has no metadata left, and can be cleaned again.
    cleaned_parser = html.HTMLParser(cleaned_path)
    self.assertEqual(cleaned_parser.get_meta(), {})
    self.assertTrue(cleaned_parser.remove_all())

    for leftover in (clean_path, cleaned_path, twice_cleaned_path):
        os.remove(leftover)
