From 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed Mon Sep 17 00:00:00 2001 From: jvoisin Date: Wed, 20 Feb 2019 16:28:11 -0800 Subject: Implement epub support --- tests/data/dirty.css | 14 ++++++++++ tests/data/dirty.epub | Bin 0 -> 296324 bytes tests/dirty.epub | Bin 0 -> 296324 bytes tests/test_corrupted_files.py | 41 ++++++++++++++++++++------- tests/test_libmat2.py | 63 ++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 105 insertions(+), 13 deletions(-) create mode 100644 tests/data/dirty.css create mode 100644 tests/data/dirty.epub create mode 100644 tests/dirty.epub (limited to 'tests') diff --git a/tests/data/dirty.css b/tests/data/dirty.css new file mode 100644 index 0000000..f52caf9 --- /dev/null +++ b/tests/data/dirty.css @@ -0,0 +1,14 @@ +/** + * This is my super css framework + * version: 1.0 + * author : jvoisin + */ + +body { + color: red; + background-color: blue; +} + +.underline { + text-decoration: underline; /* underline is cool */ +} diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub new file mode 100644 index 0000000..6389963 Binary files /dev/null and b/tests/data/dirty.epub differ diff --git a/tests/dirty.epub b/tests/dirty.epub new file mode 100644 index 0000000..6389963 Binary files /dev/null and b/tests/dirty.epub differ diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 8728cb2..53c856a 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -7,7 +7,7 @@ import logging import zipfile from libmat2 import pdf, images, audio, office, parser_factory, torrent -from libmat2 import harmless, video, html +from libmat2 import harmless, video, web # No need to logging messages, should something go wrong, # the testsuite _will_ fail. @@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase): os.remove('./tests/data/--output.avi') def test_zip(self): - with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout: + with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout: zout.write('./tests/data/dirty.flac') zout.write('./tests/data/dirty.docx') zout.write('./tests/data/dirty.jpg') zout.write('./tests/data/embedded_corrupted.docx') - p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip') + p, mimetype = parser_factory.get_parser('./tests/data/clean.zip') self.assertEqual(mimetype, 'application/zip') meta = p.get_meta() self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') self.assertFalse(p.remove_all()) - os.remove('./tests/data/dirty.zip') + os.remove('./tests/data/clean.zip') def test_html(self): shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') with open('./tests/data/clean.html', 'a') as f: f.write('but not') with self.assertRaises(ValueError): - html.HTMLParser('./tests/data/clean.html') + web.HTMLParser('./tests/data/clean.html') os.remove('./tests/data/clean.html') # Yes, we're able to deal with malformed html :/ shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') with open('./tests/data/clean.html', 'a') as f: f.write('') - p = html.HTMLParser('./tests/data/clean.html') + p = web.HTMLParser('./tests/data/clean.html') self.assertTrue(p.remove_all()) - p = html.HTMLParser('./tests/data/clean.cleaned.html') + p = web.HTMLParser('./tests/data/clean.cleaned.html') self.assertEqual(p.get_meta(), {}) os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.cleaned.html') @@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase): with open('./tests/data/clean.html', 'w') as f: f.write('') with self.assertRaises(ValueError): - html.HTMLParser('./tests/data/clean.html') + web.HTMLParser('./tests/data/clean.html') os.remove('./tests/data/clean.html') with open('./tests/data/clean.html', 'w') as f: f.write('') - p = html.HTMLParser('./tests/data/clean.html') + p = web.HTMLParser('./tests/data/clean.html') with self.assertRaises(ValueError): p.get_meta() - p = html.HTMLParser('./tests/data/clean.html') + p = web.HTMLParser('./tests/data/clean.html') with self.assertRaises(ValueError): p.remove_all() os.remove('./tests/data/clean.html') + with open('./tests/data/clean.html', 'w') as f: + f.write('

') + p = web.HTMLParser('./tests/data/clean.html') + with self.assertRaises(ValueError): + p.get_meta() + p = web.HTMLParser('./tests/data/clean.html') + with self.assertRaises(ValueError): + p.remove_all() + os.remove('./tests/data/clean.html') + + def test_epub(self): + with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: + zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') + p, mimetype = parser_factory.get_parser('./tests/data/clean.epub') + self.assertEqual(mimetype, 'application/epub+zip') + meta = p.get_meta() + self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'], + 'harmful content') + + self.assertFalse(p.remove_all()) + os.remove('./tests/data/clean.epub') diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 8753e09..249c56d 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -6,7 +6,7 @@ import os import zipfile from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless -from libmat2 import check_dependencies, video, archive, html +from libmat2 import check_dependencies, video, archive, web, epub class TestCheckDependencies(unittest.TestCase): @@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase): meta = p.get_meta() self.assertEqual(meta['Comment'], 'this is a test comment') + def test_epub(self): + p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub') + self.assertEqual(mimetype, 'application/epub+zip') + meta = p.get_meta() + self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers') + self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner ') + self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh') + self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner ') + + def test_css(self): + p, mimetype = parser_factory.get_parser('./tests/data/dirty.css') + self.assertEqual(mimetype, 'text/css') + meta = p.get_meta() + self.assertEqual(meta['author'], 'jvoisin') + self.assertEqual(meta['version'], '1.0') + self.assertEqual(meta['harmful data'], 'underline is cool') + class TestRemovingThumbnails(unittest.TestCase): def test_odt(self): shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') @@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase): def test_html(self): shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') - p = html.HTMLParser('./tests/data/clean.html') + p = web.HTMLParser('./tests/data/clean.html') meta = p.get_meta() self.assertEqual(meta['author'], 'jvoisin') @@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase): ret = p.remove_all() self.assertTrue(ret) - p = html.HTMLParser('./tests/data/clean.cleaned.html') + p = web.HTMLParser('./tests/data/clean.cleaned.html') self.assertEqual(p.get_meta(), {}) self.assertTrue(p.remove_all()) os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.cleaned.html') os.remove('./tests/data/clean.cleaned.cleaned.html') + + + def test_epub(self): + shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') + p = epub.EPUBParser('./tests/data/clean.epub') + + meta = p.get_meta() + self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm') + + ret = p.remove_all() + self.assertTrue(ret) + + p = epub.EPUBParser('./tests/data/clean.cleaned.epub') + self.assertEqual(p.get_meta(), {}) + self.assertTrue(p.remove_all()) + + os.remove('./tests/data/clean.epub') + os.remove('./tests/data/clean.cleaned.epub') + os.remove('./tests/data/clean.cleaned.cleaned.epub') + + + def test_css(self): + shutil.copy('./tests/data/dirty.css', './tests/data/clean.css') + p = web.CSSParser('./tests/data/clean.css') + + self.assertEqual(p.get_meta(), { + 'harmful data': 'underline is cool', + 'version': '1.0', + 'author': 'jvoisin'}) + + ret = p.remove_all() + self.assertTrue(ret) + + p = web.CSSParser('./tests/data/clean.cleaned.css') + self.assertEqual(p.get_meta(), {}) + self.assertTrue(p.remove_all()) + + os.remove('./tests/data/clean.css') + os.remove('./tests/data/clean.cleaned.css') + os.remove('./tests/data/clean.cleaned.cleaned.css') -- cgit v1.3