From 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Wed, 20 Feb 2019 16:28:11 -0800
Subject: Implement epub support

---
 tests/data/dirty.css          |  14 ++++++++++
 tests/data/dirty.epub         | Bin 0 -> 296324 bytes
 tests/dirty.epub              | Bin 0 -> 296324 bytes
 tests/test_corrupted_files.py |  41 ++++++++++++++++++++-------
 tests/test_libmat2.py         |  63 ++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 105 insertions(+), 13 deletions(-)
 create mode 100644 tests/data/dirty.css
 create mode 100644 tests/data/dirty.epub
 create mode 100644 tests/dirty.epub

(limited to 'tests')
diff --git a/tests/data/dirty.css b/tests/data/dirty.css
new file mode 100644
index 0000000..f52caf9
--- /dev/null
+++ b/tests/data/dirty.css
@@ -0,0 +1,14 @@
+/**
+ * This is my super css framework
+ * version: 1.0
+ * author : jvoisin
+ */
+
+body {
+	color: red;
+	background-color: blue;
+}
+
+.underline {
+	text-decoration: underline; /* underline is cool */	
+}
diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub
new file mode 100644
index 0000000..6389963
Binary files /dev/null and b/tests/data/dirty.epub differ
diff --git a/tests/dirty.epub b/tests/dirty.epub
new file mode 100644
index 0000000..6389963
Binary files /dev/null and b/tests/dirty.epub differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 8728cb2..53c856a 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
 import zipfile
 
 from libmat2 import pdf, images, audio, office, parser_factory, torrent
-from libmat2 import harmless, video, html
+from libmat2 import harmless, video, web
 
 # No need to logging messages, should something go wrong,
 # the testsuite _will_ fail.
@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
         os.remove('./tests/data/--output.avi')
 
     def test_zip(self):
-        with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
+        with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
             zout.write('./tests/data/dirty.flac')
             zout.write('./tests/data/dirty.docx')
             zout.write('./tests/data/dirty.jpg')
             zout.write('./tests/data/embedded_corrupted.docx')
-        p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
+        p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
         self.assertEqual(mimetype, 'application/zip')
         meta = p.get_meta()
         self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
         self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
         self.assertFalse(p.remove_all())
-        os.remove('./tests/data/dirty.zip')
+        os.remove('./tests/data/clean.zip')
 
     def test_html(self):
         shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
         with open('./tests/data/clean.html', 'a') as f:
             f.write('<open>but not</closed>')
         with self.assertRaises(ValueError):
-            html.HTMLParser('./tests/data/clean.html')
+            web.HTMLParser('./tests/data/clean.html')
         os.remove('./tests/data/clean.html')
 
         # Yes, we're able to deal with malformed html :/
         shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
         with open('./tests/data/clean.html', 'a') as f:
             f.write('<meta name=\'this" is="weird"/>')
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
         self.assertTrue(p.remove_all())
-        p = html.HTMLParser('./tests/data/clean.cleaned.html')
+        p = web.HTMLParser('./tests/data/clean.cleaned.html')
         self.assertEqual(p.get_meta(), {})
         os.remove('./tests/data/clean.html')
         os.remove('./tests/data/clean.cleaned.html')
@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
         with open('./tests/data/clean.html', 'w') as f:
             f.write('</close>')
         with self.assertRaises(ValueError):
-            html.HTMLParser('./tests/data/clean.html')
+            web.HTMLParser('./tests/data/clean.html')
         os.remove('./tests/data/clean.html')
 
         with open('./tests/data/clean.html', 'w') as f:
             f.write('<notclosed>')
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
         with self.assertRaises(ValueError):
             p.get_meta()
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
         with self.assertRaises(ValueError):
             p.remove_all()
         os.remove('./tests/data/clean.html')
 
+        with open('./tests/data/clean.html', 'w') as f:
+            f.write('<doctitle><br/></doctitle><br/><notclosed>')
+        p = web.HTMLParser('./tests/data/clean.html')
+        with self.assertRaises(ValueError):
+            p.get_meta()
+        p = web.HTMLParser('./tests/data/clean.html')
+        with self.assertRaises(ValueError):
+            p.remove_all()
+        os.remove('./tests/data/clean.html')
+
+    def test_epub(self):
+        with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
+            zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
+        p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
+        self.assertEqual(mimetype, 'application/epub+zip')
+        meta = p.get_meta()
+        self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
+                'harmful content')
+
+        self.assertFalse(p.remove_all())
+        os.remove('./tests/data/clean.epub')
 
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 8753e09..249c56d 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
 import zipfile
 
 from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
-from libmat2 import check_dependencies, video, archive, html
+from libmat2 import check_dependencies, video, archive, web, epub
 
 
 class TestCheckDependencies(unittest.TestCase):
@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
         meta = p.get_meta()
         self.assertEqual(meta['Comment'], 'this is a test comment')
 
+    def test_epub(self):
+        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
+        self.assertEqual(mimetype, 'application/epub+zip')
+        meta = p.get_meta()
+        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
+        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
+        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
+        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
+
+    def test_css(self):
+        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
+        self.assertEqual(mimetype, 'text/css')
+        meta = p.get_meta()
+        self.assertEqual(meta['author'], 'jvoisin')
+        self.assertEqual(meta['version'], '1.0')
+        self.assertEqual(meta['harmful data'], 'underline is cool')
+
 class TestRemovingThumbnails(unittest.TestCase):
     def test_odt(self):
         shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
 
     def test_html(self):
         shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
-        p = html.HTMLParser('./tests/data/clean.html')
+        p = web.HTMLParser('./tests/data/clean.html')
 
         meta = p.get_meta()
         self.assertEqual(meta['author'], 'jvoisin')
@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
         ret = p.remove_all()
         self.assertTrue(ret)
 
-        p = html.HTMLParser('./tests/data/clean.cleaned.html')
+        p = web.HTMLParser('./tests/data/clean.cleaned.html')
         self.assertEqual(p.get_meta(), {})
         self.assertTrue(p.remove_all())
 
         os.remove('./tests/data/clean.html')
         os.remove('./tests/data/clean.cleaned.html')
         os.remove('./tests/data/clean.cleaned.cleaned.html')
+
+
+    def test_epub(self):
+        shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
+        p = epub.EPUBParser('./tests/data/clean.epub')
+
+        meta = p.get_meta()
+        self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
+        self.assertEqual(p.get_meta(), {})
+        self.assertTrue(p.remove_all())
+
+        os.remove('./tests/data/clean.epub')
+        os.remove('./tests/data/clean.cleaned.epub')
+        os.remove('./tests/data/clean.cleaned.cleaned.epub')
+
+
+    def test_css(self):
+        shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
+        p = web.CSSParser('./tests/data/clean.css')
+
+        self.assertEqual(p.get_meta(), {
+            'harmful data': 'underline is cool',
+            'version': '1.0',
+            'author': 'jvoisin'})
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        p = web.CSSParser('./tests/data/clean.cleaned.css')
+        self.assertEqual(p.get_meta(), {})
+        self.assertTrue(p.remove_all())
+
+        os.remove('./tests/data/clean.css')
+        os.remove('./tests/data/clean.cleaned.css')
+        os.remove('./tests/data/clean.cleaned.cleaned.css')
-- 
cgit v1.3