summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libmat2/html.py69
-rw-r--r--tests/data/dirty.html14
-rw-r--r--tests/test_corrupted_files.py39
-rw-r--r--tests/test_libmat2.py20
4 files changed, 140 insertions, 2 deletions
diff --git a/libmat2/html.py b/libmat2/html.py
new file mode 100644
index 0000000..d0e9a2b
--- /dev/null
+++ b/libmat2/html.py
@@ -0,0 +1,69 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3
4from . import abstract
5
6
class HTMLParser(abstract.AbstractParser):
    """Parser for HTML files; the real work is delegated to `_HTMLParser`.

    The whole file is read and parsed when the object is created, so a
    document with mismatched tags already raises `ValueError` from the
    constructor.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        super().__init__(filename)
        with open(filename) as html_file:
            content = html_file.read()
        self.__parser = _HTMLParser()
        self.__parser.feed(content)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected while parsing the file."""
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write the metadata-free document to `self.output_filename`.

        NOTE(review): `output_filename` is not defined in this class;
        presumably `abstract.AbstractParser` sets it -- confirm.
        """
        return self.__parser.remove_all(self.output_filename)
21
22
class _HTMLParser(parser.HTMLParser):
    """Rebuild an HTML document while collecting and dropping its metadata.

    Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Self-closing ``<meta .../>`` tags are recorded as metadata and left out
    of the rebuilt document. Other start tags, end tags, text and
    character/entity references are copied to the output. Comments,
    declarations and processing instructions are not handled, so they do
    not reach the output.
    """
    def __init__(self):
        # Keep character references exactly as written in the source: with
        # the default `convert_charrefs=True`, "&lt;" would reach
        # handle_data() as "<" and be written back as live markup.
        super().__init__(convert_charrefs=False)
        self.__output = []  # pieces of the rebuilt document, joined on write
        self.__pending_text = []  # text and references seen since the last tag
        self.__meta = {}
        self.__validation_queue = []

    def __flush_text(self):
        """Move the buffered text run to the output, unless it is blank."""
        text = ''.join(self.__pending_text)
        self.__pending_text = []
        # Whitespace-only runs between tags are dropped.
        if text.strip():
            self.__output.append(text)

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Copy an opening tag verbatim and remember that it must be closed."""
        self.__flush_text()
        self.__output.append(self.get_starttag_text())
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Copy a closing tag, checking it matches the last opened tag.

        Raises:
            ValueError: if no tag is open, or if `tag` is not the most
                recently opened one.
        """
        self.__flush_text()
        if not self.__validation_queue:
            raise ValueError(
                "closing tag </%s> has no matching opening tag" % tag)
        expected = self.__validation_queue.pop()
        if tag != expected:
            raise ValueError(
                "closing tag </%s> does not match <%s>" % (tag, expected))
        # There is no `get_endtag_text()` method :/
        self.__output.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Buffer a piece of text; it is emitted with the rest of its run."""
        self.__pending_text.append(data)

    def handle_entityref(self, name: str):
        """Buffer a named reference such as ``&amp;`` in its escaped form."""
        self.__pending_text.append('&' + name + ';')

    def handle_charref(self, name: str):
        """Buffer a numeric reference such as ``&#62;`` in its escaped form."""
        self.__pending_text.append('&#' + name + ';')

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record self-closing ``<meta/>`` tags; copy any other one verbatim."""
        self.__flush_text()
        if tag == 'meta':
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__output.append(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the rebuilt, meta-free document to `output_filename`.

        Returns:
            True once the file has been written.

        Raises:
            ValueError: if some opened tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError(
                "unclosed tags: %s" % ', '.join(self.__validation_queue))
        # Text following the last tag is still sitting in the buffer.
        self.__flush_text()
        with open(output_filename, 'w') as f:
            f.write(''.join(self.__output))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata found in self-closing ``<meta/>`` tags.

        Raises:
            ValueError: if some opened tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError(
                "unclosed tags: %s" % ', '.join(self.__validation_queue))
        return self.__meta
diff --git a/tests/data/dirty.html b/tests/data/dirty.html
new file mode 100644
index 0000000..1aa1723
--- /dev/null
+++ b/tests/data/dirty.html
@@ -0,0 +1,14 @@
1<html>
2 <head>
3 <meta content="vim" name="generator"/>
4 <meta content="jvoisin" name="author"/>
5</head>
6<body>
7 <p>
8 <h1>Hello</h1>
9 I am a web page.
10 Please <b>love</b> me.
11 Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
12 </p>
13</body>
14</html>
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index b2e7798..8728cb2 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
7import zipfile 7import zipfile
8 8
9from libmat2 import pdf, images, audio, office, parser_factory, torrent 9from libmat2 import pdf, images, audio, office, parser_factory, torrent
10from libmat2 import harmless, video 10from libmat2 import harmless, video, html
11 11
12# No need to logging messages, should something go wrong, 12# No need to logging messages, should something go wrong,
13# the testsuite _will_ fail. 13# the testsuite _will_ fail.
@@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
232 self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') 232 self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
233 self.assertFalse(p.remove_all()) 233 self.assertFalse(p.remove_all())
234 os.remove('./tests/data/dirty.zip') 234 os.remove('./tests/data/dirty.zip')
235
def test_html(self):
    """Check how the HTML parser reacts to several malformed documents."""
    dirty = './tests/data/dirty.html'
    clean = './tests/data/clean.html'
    cleaned = './tests/data/clean.cleaned.html'

    # A closing tag that doesn't match the opening one is rejected
    # as soon as the file is parsed.
    shutil.copy(dirty, clean)
    with open(clean, 'a') as f:
        f.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        html.HTMLParser(clean)
    os.remove(clean)

    # Yes, we're able to deal with malformed html :/
    shutil.copy(dirty, clean)
    with open(clean, 'a') as f:
        f.write('<meta name=\'this" is="weird"/>')
    p = html.HTMLParser(clean)
    self.assertTrue(p.remove_all())
    p = html.HTMLParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    os.remove(clean)
    os.remove(cleaned)

    # A closing tag without any opening one is rejected while parsing.
    with open(clean, 'w') as f:
        f.write('</close>')
    with self.assertRaises(ValueError):
        html.HTMLParser(clean)
    os.remove(clean)

    # A tag that is never closed parses fine, but both accessors refuse
    # to work on the resulting document.
    with open(clean, 'w') as f:
        f.write('<notclosed>')
    p = html.HTMLParser(clean)
    with self.assertRaises(ValueError):
        p.get_meta()
    p = html.HTMLParser(clean)
    with self.assertRaises(ValueError):
        p.remove_all()
    os.remove(clean)
270
271
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 548b076..8753e09 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
6import zipfile 6import zipfile
7 7
8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless 8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
9from libmat2 import check_dependencies, video, archive 9from libmat2 import check_dependencies, video, archive, html
10 10
11 11
12class TestCheckDependencies(unittest.TestCase): 12class TestCheckDependencies(unittest.TestCase):
@@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
596 os.remove('./tests/data/clean.gif') 596 os.remove('./tests/data/clean.gif')
597 os.remove('./tests/data/clean.cleaned.gif') 597 os.remove('./tests/data/clean.cleaned.gif')
598 os.remove('./tests/data/clean.cleaned.cleaned.gif') 598 os.remove('./tests/data/clean.cleaned.cleaned.gif')
599
def test_html(self):
    """Clean an HTML file, then check the cleaned output has no metadata."""
    source = './tests/data/clean.html'
    first_pass = './tests/data/clean.cleaned.html'
    second_pass = './tests/data/clean.cleaned.cleaned.html'

    shutil.copy('./tests/data/dirty.html', source)
    p = html.HTMLParser(source)

    # The untouched copy still carries its <meta> information.
    self.assertEqual(p.get_meta()['author'], 'jvoisin')
    self.assertTrue(p.remove_all())

    # The test expects the cleaned copy next to the source, free of
    # metadata and cleanable again.
    p = html.HTMLParser(first_pass)
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    for leftover in (source, first_pass, second_pass):
        os.remove(leftover)