3 files changed, 71 insertions, 2 deletions
diff --git a/tests/data/dirty.html b/tests/data/dirty.html
new file mode 100644
index 0000000..1aa1723
--- /dev/null
+++ b/tests/data/dirty.html
@@ -0,0 +1,14 @@
+<html>
+        <head>
+                <meta content="vim" name="generator"/>
+                <meta content="jvoisin" name="author"/>
+</head>
+<body>
+        <p>
+                <h1>Hello</h1>
+                I am a web page.
+                Please <b>love</b> me.
+                Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
+        </p>
+</body>
+</html>
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index b2e7798..8728cb2 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
 import zipfile
 from libmat2 import pdf, images, audio, office, parser_factory, torrent
-from libmat2 import harmless, video
+from libmat2 import harmless, video, html
 # No need to logging messages, should something go wrong,
 # the testsuite _will_ fail.
@@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/dirty.zip')
+    def test_html(self):
+        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
+        with open('./tests/data/clean.html', 'a') as f:
+            f.write('<open>but not</closed>')
+        with self.assertRaises(ValueError):
+            html.HTMLParser('./tests/data/clean.html')
+        os.remove('./tests/data/clean.html')
+        # Yes, we're able to deal with malformed html :/
+        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
+        with open('./tests/data/clean.html', 'a') as f:
+            f.write('<meta name=\'this" is="weird"/>')
+        p = html.HTMLParser('./tests/data/clean.html')
+        self.assertTrue(p.remove_all())
+        p = html.HTMLParser('./tests/data/clean.cleaned.html')
+        self.assertEqual(p.get_meta(), {})
+        os.remove('./tests/data/clean.html')
+        os.remove('./tests/data/clean.cleaned.html')
+        with open('./tests/data/clean.html', 'w') as f:
+            f.write('</close>')
+        with self.assertRaises(ValueError):
+            html.HTMLParser('./tests/data/clean.html')
+        os.remove('./tests/data/clean.html')
+        with open('./tests/data/clean.html', 'w') as f:
+            f.write('<notclosed>')
+        p = html.HTMLParser('./tests/data/clean.html')
+        with self.assertRaises(ValueError):
+            p.get_meta()
+        p = html.HTMLParser('./tests/data/clean.html')
+        with self.assertRaises(ValueError):
+            p.remove_all()
+        os.remove('./tests/data/clean.html')
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 548b076..8753e09 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
 import zipfile
 from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
-from libmat2 import check_dependencies, video, archive
+from libmat2 import check_dependencies, video, archive, html
 class TestCheckDependencies(unittest.TestCase):
@@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
        os.remove('./tests/data/clean.gif')
        os.remove('./tests/data/clean.cleaned.gif')
        os.remove('./tests/data/clean.cleaned.cleaned.gif')
+    def test_html(self):
+        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
+        p = html.HTMLParser('./tests/data/clean.html')
+        meta = p.get_meta()
+        self.assertEqual(meta['author'], 'jvoisin')
+        ret = p.remove_all()
+        self.assertTrue(ret)
+        p = html.HTMLParser('./tests/data/clean.cleaned.html')
+        self.assertEqual(p.get_meta(), {})
+        self.assertTrue(p.remove_all())
+        os.remove('./tests/data/clean.html')
+        os.remove('./tests/data/clean.cleaned.html')
+        os.remove('./tests/data/clean.cleaned.cleaned.html')

diff --git a/tests/data/dirty.html b/tests/data/dirty.html new file mode 100644 index 0000000..1aa1723 --- /dev/null +++ b/tests/data/dirty.html
@@ -0,0 +1,14 @@
		1	<html>
		2	<head>
		3	<meta content="vim" name="generator"/>
		4	<meta content="jvoisin" name="author"/>
		5	</head>
		6	<body>
		7	<p>
		8	<h1>Hello</h1>
		9	I am a web page.
		10	Please <b>love</b> me.
		11	Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
		12	</p>
		13	</body>
		14	</html>


diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index b2e7798..8728cb2 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
7	import zipfile	7	import zipfile
8		8
9	from libmat2 import pdf, images, audio, office, parser_factory, torrent	9	from libmat2 import pdf, images, audio, office, parser_factory, torrent
10	from libmat2 import harmless, video	10	from libmat2 import harmless, video, html
11		11
12	# No need to logging messages, should something go wrong,	12	# No need to logging messages, should something go wrong,
13	# the testsuite _will_ fail.	13	# the testsuite _will_ fail.
@@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
232	self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')	232	self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
233	self.assertFalse(p.remove_all())	233	self.assertFalse(p.remove_all())
234	os.remove('./tests/data/dirty.zip')	234	os.remove('./tests/data/dirty.zip')
		235
		236	def test_html(self):
		237	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
		238	with open('./tests/data/clean.html', 'a') as f:
		239	f.write('<open>but not</closed>')
		240	with self.assertRaises(ValueError):
		241	html.HTMLParser('./tests/data/clean.html')
		242	os.remove('./tests/data/clean.html')
		243
		244	# Yes, we're able to deal with malformed html :/
		245	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
		246	with open('./tests/data/clean.html', 'a') as f:
		247	f.write('<meta name=\'this" is="weird"/>')
		248	p = html.HTMLParser('./tests/data/clean.html')
		249	self.assertTrue(p.remove_all())
		250	p = html.HTMLParser('./tests/data/clean.cleaned.html')
		251	self.assertEqual(p.get_meta(), {})
		252	os.remove('./tests/data/clean.html')
		253	os.remove('./tests/data/clean.cleaned.html')
		254
		255	with open('./tests/data/clean.html', 'w') as f:
		256	f.write('</close>')
		257	with self.assertRaises(ValueError):
		258	html.HTMLParser('./tests/data/clean.html')
		259	os.remove('./tests/data/clean.html')
		260
		261	with open('./tests/data/clean.html', 'w') as f:
		262	f.write('<notclosed>')
		263	p = html.HTMLParser('./tests/data/clean.html')
		264	with self.assertRaises(ValueError):
		265	p.get_meta()
		266	p = html.HTMLParser('./tests/data/clean.html')
		267	with self.assertRaises(ValueError):
		268	p.remove_all()
		269	os.remove('./tests/data/clean.html')
		270
		271


diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 548b076..8753e09 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
6	import zipfile	6	import zipfile
7		7
8	from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless	8	from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
9	from libmat2 import check_dependencies, video, archive	9	from libmat2 import check_dependencies, video, archive, html
10		10
11		11
12	class TestCheckDependencies(unittest.TestCase):	12	class TestCheckDependencies(unittest.TestCase):
@@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
596	os.remove('./tests/data/clean.gif')	596	os.remove('./tests/data/clean.gif')
597	os.remove('./tests/data/clean.cleaned.gif')	597	os.remove('./tests/data/clean.cleaned.gif')
598	os.remove('./tests/data/clean.cleaned.cleaned.gif')	598	os.remove('./tests/data/clean.cleaned.cleaned.gif')
		599
		600	def test_html(self):
		601	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
		602	p = html.HTMLParser('./tests/data/clean.html')
		603
		604	meta = p.get_meta()
		605	self.assertEqual(meta['author'], 'jvoisin')
		606
		607	ret = p.remove_all()
		608	self.assertTrue(ret)
		609
		610	p = html.HTMLParser('./tests/data/clean.cleaned.html')
		611	self.assertEqual(p.get_meta(), {})
		612	self.assertTrue(p.remove_all())
		613
		614	os.remove('./tests/data/clean.html')
		615	os.remove('./tests/data/clean.cleaned.html')
		616	os.remove('./tests/data/clean.cleaned.cleaned.html')