Implement epub support

author: jvoisin 2019-02-20 16:28:11 -0800
committer: jvoisin 2019-02-20 16:28:11 -0800
commit: 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch)
tree: 701c6f5e316265e5a95a162356965ecf2fb8d6b2 /tests
parent: 6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff)
5 files changed, 105 insertions, 13 deletions
diff --git a/tests/data/dirty.css b/tests/data/dirty.css
new file mode 100644
index 0000000..f52caf9
--- /dev/null
+++ b/tests/data/dirty.css
@@ -0,0 +1,14 @@
+/**
+ * This is my super css framework
+ * version: 1.0
+ * author : jvoisin
+ */
+body {
+        color: red;
+        background-color: blue;
+}
+.underline {
+        text-decoration: underline; /* underline is cool */     
+}
diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/data/dirty.epub
Binary files differ
diff --git a/tests/dirty.epub b/tests/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/dirty.epub
Binary files differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 8728cb2..53c856a 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
 import zipfile
 from libmat2 import pdf, images, audio, office, parser_factory, torrent
-from libmat2 import harmless, video, html
+from libmat2 import harmless, video, web
 # No need to logging messages, should something go wrong,
 # the testsuite _will_ fail.
@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
        os.remove('./tests/data/--output.avi')
    def test_zip(self):
-        with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
+        with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
            zout.write('./tests/data/dirty.flac')
            zout.write('./tests/data/dirty.docx')
            zout.write('./tests/data/dirty.jpg')
            zout.write('./tests/data/embedded_corrupted.docx')
-        p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
+        p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
        self.assertEqual(mimetype, 'application/zip')
        meta = p.get_meta()
        self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
        self.assertFalse(p.remove_all())
-        os.remove('./tests/data/dirty.zip')
+        os.remove('./tests/data/clean.zip')
    def test_html(self):
        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
        with open('./tests/data/clean.html', 'a') as f:
            f.write('<open>but not</closed>')
        with self.assertRaises(ValueError):
-            html.HTMLParser('./tests/data/clean.html')
+            web.HTMLParser('./tests/data/clean.html')
        os.remove('./tests/data/clean.html')
        # Yes, we're able to deal with malformed html :/
        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
        with open('./tests/data/clean.html', 'a') as f:
            f.write('<meta name=\'this" is="weird"/>')
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
        self.assertTrue(p.remove_all())
-        p = html.HTMLParser('./tests/data/clean.cleaned.html')
+        p = web.HTMLParser('./tests/data/clean.cleaned.html')
        self.assertEqual(p.get_meta(), {})
        os.remove('./tests/data/clean.html')
        os.remove('./tests/data/clean.cleaned.html')
@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
        with open('./tests/data/clean.html', 'w') as f:
            f.write('</close>')
        with self.assertRaises(ValueError):
-            html.HTMLParser('./tests/data/clean.html')
+            web.HTMLParser('./tests/data/clean.html')
        os.remove('./tests/data/clean.html')
        with open('./tests/data/clean.html', 'w') as f:
            f.write('<notclosed>')
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
        with self.assertRaises(ValueError):
            p.get_meta()
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
        with self.assertRaises(ValueError):
            p.remove_all()
        os.remove('./tests/data/clean.html')
+        with open('./tests/data/clean.html', 'w') as f:
+            f.write('<doctitle><br/></doctitle><br/><notclosed>')
+        p = web.HTMLParser('./tests/data/clean.html')
+        with self.assertRaises(ValueError):
+            p.get_meta()
+        p = web.HTMLParser('./tests/data/clean.html')
+        with self.assertRaises(ValueError):
+            p.remove_all()
+        os.remove('./tests/data/clean.html')
+    def test_epub(self):
+        with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
+            zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
+        p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
+        self.assertEqual(mimetype, 'application/epub+zip')
+        meta = p.get_meta()
+        self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
+                'harmful content')
+        self.assertFalse(p.remove_all())
+        os.remove('./tests/data/clean.epub')
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 8753e09..249c56d 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
 import zipfile
 from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
-from libmat2 import check_dependencies, video, archive, html
+from libmat2 import check_dependencies, video, archive, web, epub
 class TestCheckDependencies(unittest.TestCase):
@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'this is a test comment')
+    def test_epub(self):
+        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
+        self.assertEqual(mimetype, 'application/epub+zip')
+        meta = p.get_meta()
+        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
+        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
+        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
+        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
+    def test_css(self):
+        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
+        self.assertEqual(mimetype, 'text/css')
+        meta = p.get_meta()
+        self.assertEqual(meta['author'], 'jvoisin')
+        self.assertEqual(meta['version'], '1.0')
+        self.assertEqual(meta['harmful data'], 'underline is cool')
 class TestRemovingThumbnails(unittest.TestCase):
    def test_odt(self):
        shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
    def test_html(self):
        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
        meta = p.get_meta()
        self.assertEqual(meta['author'], 'jvoisin')
@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
        ret = p.remove_all()
        self.assertTrue(ret)
-        p = html.HTMLParser('./tests/data/clean.cleaned.html')
+        p = web.HTMLParser('./tests/data/clean.cleaned.html')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.html')
        os.remove('./tests/data/clean.cleaned.html')
        os.remove('./tests/data/clean.cleaned.cleaned.html')
+    def test_epub(self):
+        shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
+        p = epub.EPUBParser('./tests/data/clean.epub')
+        meta = p.get_meta()
+        self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
+        ret = p.remove_all()
+        self.assertTrue(ret)
+        p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
+        self.assertEqual(p.get_meta(), {})
+        self.assertTrue(p.remove_all())
+        os.remove('./tests/data/clean.epub')
+        os.remove('./tests/data/clean.cleaned.epub')
+        os.remove('./tests/data/clean.cleaned.cleaned.epub')
+    def test_css(self):
+        shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
+        p = web.CSSParser('./tests/data/clean.css')
+        self.assertEqual(p.get_meta(), {
+            'harmful data': 'underline is cool',
+            'version': '1.0',
+            'author': 'jvoisin'})
+        ret = p.remove_all()
+        self.assertTrue(ret)
+        p = web.CSSParser('./tests/data/clean.cleaned.css')
+        self.assertEqual(p.get_meta(), {})
+        self.assertTrue(p.remove_all())
+        os.remove('./tests/data/clean.css')
+        os.remove('./tests/data/clean.cleaned.css')
+        os.remove('./tests/data/clean.cleaned.cleaned.css')
author	jvoisin	2019-02-20 16:28:11 -0800
committer	jvoisin	2019-02-20 16:28:11 -0800
commit	02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch)
tree	701c6f5e316265e5a95a162356965ecf2fb8d6b2 /tests
parent	6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff)

diff --git a/tests/data/dirty.css b/tests/data/dirty.css
new file mode 100644
index 0000000..f52caf9
--- /dev/null
+++ b/tests/data/dirty.css
@@ -0,0 +1,14 @@
		1	/**
		2	* This is my super css framework
		3	* version: 1.0
		4	* author : jvoisin
		5	*/
		6
		7	body {
		8	color: red;
		9	background-color: blue;
		10	}
		11
		12	.underline {
		13	text-decoration: underline; /* underline is cool */
		14	}


diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/data/dirty.epub
Binary files differ


diff --git a/tests/dirty.epub b/tests/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/dirty.epub
Binary files differ


diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 8728cb2..53c856a 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
7	import zipfile	7	import zipfile
8		8
9	from libmat2 import pdf, images, audio, office, parser_factory, torrent	9	from libmat2 import pdf, images, audio, office, parser_factory, torrent
10	from libmat2 import harmless, video, html	10	from libmat2 import harmless, video, web
11		11
12	# No need to logging messages, should something go wrong,	12	# No need to logging messages, should something go wrong,
13	# the testsuite _will_ fail.	13	# the testsuite _will_ fail.
@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
220	os.remove('./tests/data/--output.avi')	220	os.remove('./tests/data/--output.avi')
221		221
222	def test_zip(self):	222	def test_zip(self):
223	with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:	223	with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
224	zout.write('./tests/data/dirty.flac')	224	zout.write('./tests/data/dirty.flac')
225	zout.write('./tests/data/dirty.docx')	225	zout.write('./tests/data/dirty.docx')
226	zout.write('./tests/data/dirty.jpg')	226	zout.write('./tests/data/dirty.jpg')
227	zout.write('./tests/data/embedded_corrupted.docx')	227	zout.write('./tests/data/embedded_corrupted.docx')
228	p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')	228	p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
229	self.assertEqual(mimetype, 'application/zip')	229	self.assertEqual(mimetype, 'application/zip')
230	meta = p.get_meta()	230	meta = p.get_meta()
231	self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')	231	self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
232	self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')	232	self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
233	self.assertFalse(p.remove_all())	233	self.assertFalse(p.remove_all())
234	os.remove('./tests/data/dirty.zip')	234	os.remove('./tests/data/clean.zip')
235		235
236	def test_html(self):	236	def test_html(self):
237	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')	237	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
238	with open('./tests/data/clean.html', 'a') as f:	238	with open('./tests/data/clean.html', 'a') as f:
239	f.write('<open>but not</closed>')	239	f.write('<open>but not</closed>')
240	with self.assertRaises(ValueError):	240	with self.assertRaises(ValueError):
241	html.HTMLParser('./tests/data/clean.html')	241	web.HTMLParser('./tests/data/clean.html')
242	os.remove('./tests/data/clean.html')	242	os.remove('./tests/data/clean.html')
243		243
244	# Yes, we're able to deal with malformed html :/	244	# Yes, we're able to deal with malformed html :/
245	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')	245	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
246	with open('./tests/data/clean.html', 'a') as f:	246	with open('./tests/data/clean.html', 'a') as f:
247	f.write('<meta name=\'this" is="weird"/>')	247	f.write('<meta name=\'this" is="weird"/>')
248	p = html.HTMLParser('./tests/data/clean.html')	248	p = web.HTMLParser('./tests/data/clean.html')
249	self.assertTrue(p.remove_all())	249	self.assertTrue(p.remove_all())
250	p = html.HTMLParser('./tests/data/clean.cleaned.html')	250	p = web.HTMLParser('./tests/data/clean.cleaned.html')
251	self.assertEqual(p.get_meta(), {})	251	self.assertEqual(p.get_meta(), {})
252	os.remove('./tests/data/clean.html')	252	os.remove('./tests/data/clean.html')
253	os.remove('./tests/data/clean.cleaned.html')	253	os.remove('./tests/data/clean.cleaned.html')
@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
255	with open('./tests/data/clean.html', 'w') as f:	255	with open('./tests/data/clean.html', 'w') as f:
256	f.write('</close>')	256	f.write('</close>')
257	with self.assertRaises(ValueError):	257	with self.assertRaises(ValueError):
258	html.HTMLParser('./tests/data/clean.html')	258	web.HTMLParser('./tests/data/clean.html')
259	os.remove('./tests/data/clean.html')	259	os.remove('./tests/data/clean.html')
260		260
261	with open('./tests/data/clean.html', 'w') as f:	261	with open('./tests/data/clean.html', 'w') as f:
262	f.write('<notclosed>')	262	f.write('<notclosed>')
263	p = html.HTMLParser('./tests/data/clean.html')	263	p = web.HTMLParser('./tests/data/clean.html')
264	with self.assertRaises(ValueError):	264	with self.assertRaises(ValueError):
265	p.get_meta()	265	p.get_meta()
266	p = html.HTMLParser('./tests/data/clean.html')	266	p = web.HTMLParser('./tests/data/clean.html')
267	with self.assertRaises(ValueError):	267	with self.assertRaises(ValueError):
268	p.remove_all()	268	p.remove_all()
269	os.remove('./tests/data/clean.html')	269	os.remove('./tests/data/clean.html')
270		270
		271	with open('./tests/data/clean.html', 'w') as f:
		272	f.write('<doctitle><br/></doctitle><br/><notclosed>')
		273	p = web.HTMLParser('./tests/data/clean.html')
		274	with self.assertRaises(ValueError):
		275	p.get_meta()
		276	p = web.HTMLParser('./tests/data/clean.html')
		277	with self.assertRaises(ValueError):
		278	p.remove_all()
		279	os.remove('./tests/data/clean.html')
		280
		281	def test_epub(self):
		282	with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
		283	zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
		284	p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
		285	self.assertEqual(mimetype, 'application/epub+zip')
		286	meta = p.get_meta()
		287	self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
		288	'harmful content')
		289
		290	self.assertFalse(p.remove_all())
		291	os.remove('./tests/data/clean.epub')
271		292


diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 8753e09..249c56d 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
6	import zipfile	6	import zipfile
7		7
8	from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless	8	from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
9	from libmat2 import check_dependencies, video, archive, html	9	from libmat2 import check_dependencies, video, archive, web, epub
10		10
11		11
12	class TestCheckDependencies(unittest.TestCase):	12	class TestCheckDependencies(unittest.TestCase):
@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
177	meta = p.get_meta()	177	meta = p.get_meta()
178	self.assertEqual(meta['Comment'], 'this is a test comment')	178	self.assertEqual(meta['Comment'], 'this is a test comment')
179		179
		180	def test_epub(self):
		181	p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
		182	self.assertEqual(mimetype, 'application/epub+zip')
		183	meta = p.get_meta()
		184	self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
		185	self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
		186	self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
		187	self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
		188
		189	def test_css(self):
		190	p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
		191	self.assertEqual(mimetype, 'text/css')
		192	meta = p.get_meta()
		193	self.assertEqual(meta['author'], 'jvoisin')
		194	self.assertEqual(meta['version'], '1.0')
		195	self.assertEqual(meta['harmful data'], 'underline is cool')
		196
180	class TestRemovingThumbnails(unittest.TestCase):	197	class TestRemovingThumbnails(unittest.TestCase):
181	def test_odt(self):	198	def test_odt(self):
182	shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')	199	shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
599		616
600	def test_html(self):	617	def test_html(self):
601	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')	618	shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
602	p = html.HTMLParser('./tests/data/clean.html')	619	p = web.HTMLParser('./tests/data/clean.html')
603		620
604	meta = p.get_meta()	621	meta = p.get_meta()
605	self.assertEqual(meta['author'], 'jvoisin')	622	self.assertEqual(meta['author'], 'jvoisin')
@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
607	ret = p.remove_all()	624	ret = p.remove_all()
608	self.assertTrue(ret)	625	self.assertTrue(ret)
609		626
610	p = html.HTMLParser('./tests/data/clean.cleaned.html')	627	p = web.HTMLParser('./tests/data/clean.cleaned.html')
611	self.assertEqual(p.get_meta(), {})	628	self.assertEqual(p.get_meta(), {})
612	self.assertTrue(p.remove_all())	629	self.assertTrue(p.remove_all())
613		630
614	os.remove('./tests/data/clean.html')	631	os.remove('./tests/data/clean.html')
615	os.remove('./tests/data/clean.cleaned.html')	632	os.remove('./tests/data/clean.cleaned.html')
616	os.remove('./tests/data/clean.cleaned.cleaned.html')	633	os.remove('./tests/data/clean.cleaned.cleaned.html')
		634
		635
		636	def test_epub(self):
		637	shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
		638	p = epub.EPUBParser('./tests/data/clean.epub')
		639
		640	meta = p.get_meta()
		641	self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
		642
		643	ret = p.remove_all()
		644	self.assertTrue(ret)
		645
		646	p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
		647	self.assertEqual(p.get_meta(), {})
		648	self.assertTrue(p.remove_all())
		649
		650	os.remove('./tests/data/clean.epub')
		651	os.remove('./tests/data/clean.cleaned.epub')
		652	os.remove('./tests/data/clean.cleaned.cleaned.epub')
		653
		654
		655	def test_css(self):
		656	shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
		657	p = web.CSSParser('./tests/data/clean.css')
		658
		659	self.assertEqual(p.get_meta(), {
		660	'harmful data': 'underline is cool',
		661	'version': '1.0',
		662	'author': 'jvoisin'})
		663
		664	ret = p.remove_all()
		665	self.assertTrue(ret)
		666
		667	p = web.CSSParser('./tests/data/clean.cleaned.css')
		668	self.assertEqual(p.get_meta(), {})
		669	self.assertTrue(p.remove_all())
		670
		671	os.remove('./tests/data/clean.css')
		672	os.remove('./tests/data/clean.cleaned.css')
		673	os.remove('./tests/data/clean.cleaned.cleaned.css')