diff options
Diffstat (limited to 'tests')
| -rw-r--r-- | tests/data/dirty.css | 14 | ||||
| -rw-r--r-- | tests/data/dirty.epub | bin | 0 -> 296324 bytes | |||
| -rw-r--r-- | tests/dirty.epub | bin | 0 -> 296324 bytes | |||
| -rw-r--r-- | tests/test_corrupted_files.py | 41 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 63 |
5 files changed, 105 insertions, 13 deletions
diff --git a/tests/data/dirty.css b/tests/data/dirty.css new file mode 100644 index 0000000..f52caf9 --- /dev/null +++ b/tests/data/dirty.css | |||
| @@ -0,0 +1,14 @@ | |||
| 1 | /** | ||
| 2 | * This is my super css framework | ||
| 3 | * version: 1.0 | ||
| 4 | * author : jvoisin | ||
| 5 | */ | ||
| 6 | |||
| 7 | body { | ||
| 8 | color: red; | ||
| 9 | background-color: blue; | ||
| 10 | } | ||
| 11 | |||
| 12 | .underline { | ||
| 13 | text-decoration: underline; /* underline is cool */ | ||
| 14 | } | ||
diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub new file mode 100644 index 0000000..6389963 --- /dev/null +++ b/tests/data/dirty.epub | |||
| Binary files differ | |||
diff --git a/tests/dirty.epub b/tests/dirty.epub new file mode 100644 index 0000000..6389963 --- /dev/null +++ b/tests/dirty.epub | |||
| Binary files differ | |||
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 8728cb2..53c856a 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -7,7 +7,7 @@ import logging | |||
| 7 | import zipfile | 7 | import zipfile |
| 8 | 8 | ||
| 9 | from libmat2 import pdf, images, audio, office, parser_factory, torrent | 9 | from libmat2 import pdf, images, audio, office, parser_factory, torrent |
| 10 | from libmat2 import harmless, video, html | 10 | from libmat2 import harmless, video, web |
| 11 | 11 | ||
| 12 | # No need to logging messages, should something go wrong, | 12 | # No need to logging messages, should something go wrong, |
| 13 | # the testsuite _will_ fail. | 13 | # the testsuite _will_ fail. |
| @@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 220 | os.remove('./tests/data/--output.avi') | 220 | os.remove('./tests/data/--output.avi') |
| 221 | 221 | ||
| 222 | def test_zip(self): | 222 | def test_zip(self): |
| 223 | with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout: | 223 | with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout: |
| 224 | zout.write('./tests/data/dirty.flac') | 224 | zout.write('./tests/data/dirty.flac') |
| 225 | zout.write('./tests/data/dirty.docx') | 225 | zout.write('./tests/data/dirty.docx') |
| 226 | zout.write('./tests/data/dirty.jpg') | 226 | zout.write('./tests/data/dirty.jpg') |
| 227 | zout.write('./tests/data/embedded_corrupted.docx') | 227 | zout.write('./tests/data/embedded_corrupted.docx') |
| 228 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip') | 228 | p, mimetype = parser_factory.get_parser('./tests/data/clean.zip') |
| 229 | self.assertEqual(mimetype, 'application/zip') | 229 | self.assertEqual(mimetype, 'application/zip') |
| 230 | meta = p.get_meta() | 230 | meta = p.get_meta() |
| 231 | self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') | 231 | self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') |
| 232 | self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') | 232 | self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') |
| 233 | self.assertFalse(p.remove_all()) | 233 | self.assertFalse(p.remove_all()) |
| 234 | os.remove('./tests/data/dirty.zip') | 234 | os.remove('./tests/data/clean.zip') |
| 235 | 235 | ||
| 236 | def test_html(self): | 236 | def test_html(self): |
| 237 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') | 237 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') |
| 238 | with open('./tests/data/clean.html', 'a') as f: | 238 | with open('./tests/data/clean.html', 'a') as f: |
| 239 | f.write('<open>but not</closed>') | 239 | f.write('<open>but not</closed>') |
| 240 | with self.assertRaises(ValueError): | 240 | with self.assertRaises(ValueError): |
| 241 | html.HTMLParser('./tests/data/clean.html') | 241 | web.HTMLParser('./tests/data/clean.html') |
| 242 | os.remove('./tests/data/clean.html') | 242 | os.remove('./tests/data/clean.html') |
| 243 | 243 | ||
| 244 | # Yes, we're able to deal with malformed html :/ | 244 | # Yes, we're able to deal with malformed html :/ |
| 245 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') | 245 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') |
| 246 | with open('./tests/data/clean.html', 'a') as f: | 246 | with open('./tests/data/clean.html', 'a') as f: |
| 247 | f.write('<meta name=\'this" is="weird"/>') | 247 | f.write('<meta name=\'this" is="weird"/>') |
| 248 | p = html.HTMLParser('./tests/data/clean.html') | 248 | p = web.HTMLParser('./tests/data/clean.html') |
| 249 | self.assertTrue(p.remove_all()) | 249 | self.assertTrue(p.remove_all()) |
| 250 | p = html.HTMLParser('./tests/data/clean.cleaned.html') | 250 | p = web.HTMLParser('./tests/data/clean.cleaned.html') |
| 251 | self.assertEqual(p.get_meta(), {}) | 251 | self.assertEqual(p.get_meta(), {}) |
| 252 | os.remove('./tests/data/clean.html') | 252 | os.remove('./tests/data/clean.html') |
| 253 | os.remove('./tests/data/clean.cleaned.html') | 253 | os.remove('./tests/data/clean.cleaned.html') |
| @@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 255 | with open('./tests/data/clean.html', 'w') as f: | 255 | with open('./tests/data/clean.html', 'w') as f: |
| 256 | f.write('</close>') | 256 | f.write('</close>') |
| 257 | with self.assertRaises(ValueError): | 257 | with self.assertRaises(ValueError): |
| 258 | html.HTMLParser('./tests/data/clean.html') | 258 | web.HTMLParser('./tests/data/clean.html') |
| 259 | os.remove('./tests/data/clean.html') | 259 | os.remove('./tests/data/clean.html') |
| 260 | 260 | ||
| 261 | with open('./tests/data/clean.html', 'w') as f: | 261 | with open('./tests/data/clean.html', 'w') as f: |
| 262 | f.write('<notclosed>') | 262 | f.write('<notclosed>') |
| 263 | p = html.HTMLParser('./tests/data/clean.html') | 263 | p = web.HTMLParser('./tests/data/clean.html') |
| 264 | with self.assertRaises(ValueError): | 264 | with self.assertRaises(ValueError): |
| 265 | p.get_meta() | 265 | p.get_meta() |
| 266 | p = html.HTMLParser('./tests/data/clean.html') | 266 | p = web.HTMLParser('./tests/data/clean.html') |
| 267 | with self.assertRaises(ValueError): | 267 | with self.assertRaises(ValueError): |
| 268 | p.remove_all() | 268 | p.remove_all() |
| 269 | os.remove('./tests/data/clean.html') | 269 | os.remove('./tests/data/clean.html') |
| 270 | 270 | ||
| 271 | with open('./tests/data/clean.html', 'w') as f: | ||
| 272 | f.write('<doctitle><br/></doctitle><br/><notclosed>') | ||
| 273 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 274 | with self.assertRaises(ValueError): | ||
| 275 | p.get_meta() | ||
| 276 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 277 | with self.assertRaises(ValueError): | ||
| 278 | p.remove_all() | ||
| 279 | os.remove('./tests/data/clean.html') | ||
| 280 | |||
| 281 | def test_epub(self): | ||
| 282 | with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: | ||
| 283 | zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') | ||
| 284 | p, mimetype = parser_factory.get_parser('./tests/data/clean.epub') | ||
| 285 | self.assertEqual(mimetype, 'application/epub+zip') | ||
| 286 | meta = p.get_meta() | ||
| 287 | self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'], | ||
| 288 | 'harmful content') | ||
| 289 | |||
| 290 | self.assertFalse(p.remove_all()) | ||
| 291 | os.remove('./tests/data/clean.epub') | ||
| 271 | 292 | ||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 8753e09..249c56d 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -6,7 +6,7 @@ import os | |||
| 6 | import zipfile | 6 | import zipfile |
| 7 | 7 | ||
| 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless | 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless |
| 9 | from libmat2 import check_dependencies, video, archive, html | 9 | from libmat2 import check_dependencies, video, archive, web, epub |
| 10 | 10 | ||
| 11 | 11 | ||
| 12 | class TestCheckDependencies(unittest.TestCase): | 12 | class TestCheckDependencies(unittest.TestCase): |
| @@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase): | |||
| 177 | meta = p.get_meta() | 177 | meta = p.get_meta() |
| 178 | self.assertEqual(meta['Comment'], 'this is a test comment') | 178 | self.assertEqual(meta['Comment'], 'this is a test comment') |
| 179 | 179 | ||
| 180 | def test_epub(self): | ||
| 181 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub') | ||
| 182 | self.assertEqual(mimetype, 'application/epub+zip') | ||
| 183 | meta = p.get_meta() | ||
| 184 | self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers') | ||
| 185 | self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>') | ||
| 186 | self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh') | ||
| 187 | self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>') | ||
| 188 | |||
| 189 | def test_css(self): | ||
| 190 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.css') | ||
| 191 | self.assertEqual(mimetype, 'text/css') | ||
| 192 | meta = p.get_meta() | ||
| 193 | self.assertEqual(meta['author'], 'jvoisin') | ||
| 194 | self.assertEqual(meta['version'], '1.0') | ||
| 195 | self.assertEqual(meta['harmful data'], 'underline is cool') | ||
| 196 | |||
| 180 | class TestRemovingThumbnails(unittest.TestCase): | 197 | class TestRemovingThumbnails(unittest.TestCase): |
| 181 | def test_odt(self): | 198 | def test_odt(self): |
| 182 | shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') | 199 | shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') |
| @@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase): | |||
| 599 | 616 | ||
| 600 | def test_html(self): | 617 | def test_html(self): |
| 601 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') | 618 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') |
| 602 | p = html.HTMLParser('./tests/data/clean.html') | 619 | p = web.HTMLParser('./tests/data/clean.html') |
| 603 | 620 | ||
| 604 | meta = p.get_meta() | 621 | meta = p.get_meta() |
| 605 | self.assertEqual(meta['author'], 'jvoisin') | 622 | self.assertEqual(meta['author'], 'jvoisin') |
| @@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase): | |||
| 607 | ret = p.remove_all() | 624 | ret = p.remove_all() |
| 608 | self.assertTrue(ret) | 625 | self.assertTrue(ret) |
| 609 | 626 | ||
| 610 | p = html.HTMLParser('./tests/data/clean.cleaned.html') | 627 | p = web.HTMLParser('./tests/data/clean.cleaned.html') |
| 611 | self.assertEqual(p.get_meta(), {}) | 628 | self.assertEqual(p.get_meta(), {}) |
| 612 | self.assertTrue(p.remove_all()) | 629 | self.assertTrue(p.remove_all()) |
| 613 | 630 | ||
| 614 | os.remove('./tests/data/clean.html') | 631 | os.remove('./tests/data/clean.html') |
| 615 | os.remove('./tests/data/clean.cleaned.html') | 632 | os.remove('./tests/data/clean.cleaned.html') |
| 616 | os.remove('./tests/data/clean.cleaned.cleaned.html') | 633 | os.remove('./tests/data/clean.cleaned.cleaned.html') |
| 634 | |||
| 635 | |||
| 636 | def test_epub(self): | ||
| 637 | shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') | ||
| 638 | p = epub.EPUBParser('./tests/data/clean.epub') | ||
| 639 | |||
| 640 | meta = p.get_meta() | ||
| 641 | self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm') | ||
| 642 | |||
| 643 | ret = p.remove_all() | ||
| 644 | self.assertTrue(ret) | ||
| 645 | |||
| 646 | p = epub.EPUBParser('./tests/data/clean.cleaned.epub') | ||
| 647 | self.assertEqual(p.get_meta(), {}) | ||
| 648 | self.assertTrue(p.remove_all()) | ||
| 649 | |||
| 650 | os.remove('./tests/data/clean.epub') | ||
| 651 | os.remove('./tests/data/clean.cleaned.epub') | ||
| 652 | os.remove('./tests/data/clean.cleaned.cleaned.epub') | ||
| 653 | |||
| 654 | |||
| 655 | def test_css(self): | ||
| 656 | shutil.copy('./tests/data/dirty.css', './tests/data/clean.css') | ||
| 657 | p = web.CSSParser('./tests/data/clean.css') | ||
| 658 | |||
| 659 | self.assertEqual(p.get_meta(), { | ||
| 660 | 'harmful data': 'underline is cool', | ||
| 661 | 'version': '1.0', | ||
| 662 | 'author': 'jvoisin'}) | ||
| 663 | |||
| 664 | ret = p.remove_all() | ||
| 665 | self.assertTrue(ret) | ||
| 666 | |||
| 667 | p = web.CSSParser('./tests/data/clean.cleaned.css') | ||
| 668 | self.assertEqual(p.get_meta(), {}) | ||
| 669 | self.assertTrue(p.remove_all()) | ||
| 670 | |||
| 671 | os.remove('./tests/data/clean.css') | ||
| 672 | os.remove('./tests/data/clean.cleaned.css') | ||
| 673 | os.remove('./tests/data/clean.cleaned.cleaned.css') | ||
