summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2019-02-20 16:28:11 -0800
committerjvoisin2019-02-20 16:28:11 -0800
commit02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch)
tree701c6f5e316265e5a95a162356965ecf2fb8d6b2
parent6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff)
Implement epub support
Diffstat (limited to '')
-rw-r--r--libmat2/epub.py47
-rw-r--r--libmat2/html.py69
-rw-r--r--libmat2/parser_factory.py9
-rw-r--r--libmat2/web.py122
-rw-r--r--tests/data/dirty.css14
-rw-r--r--tests/data/dirty.epubbin0 -> 296324 bytes
-rw-r--r--tests/dirty.epubbin0 -> 296324 bytes
-rw-r--r--tests/test_corrupted_files.py41
-rw-r--r--tests/test_libmat2.py63
9 files changed, 282 insertions, 83 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
new file mode 100644
index 0000000..09b7937
--- /dev/null
+++ b/libmat2/epub.py
@@ -0,0 +1,47 @@
1import logging
2import re
3import xml.etree.ElementTree as ET # type: ignore
4
5from . import archive, office
6
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """Parser for EPUB files: zip archives whose packaging files
    (container.xml, mimetype, content.opf) are exempted from removal,
    and whose OPF manifest carries the document metadata."""
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        super().__init__(filename)
        # These entries are preserved by the cleaning pass: without them
        # the archive no longer looks like an epub.
        skeleton = (
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        )
        self.files_to_keep = {re.compile(path) for path in skeleton}  # type: ignore

    def _specific_get_meta(self, full_path, file_path):
        """Harvest metadata from the OPF manifest; other members carry none."""
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                # Grab every <meta…>/<dc:…>/<cp:…> element as a (tag, text) pair.
                found = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                   f.read(), re.I|re.M)
                return dict(found)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        """Drop the <metadata> element from the OPF manifest, in place."""
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        # ElementTree has no parent pointers, so build a child→parent map.
        parents = {}
        for node in tree.iter():
            for child in node:
                parents[child] = node

        for element in tree.iterfind('.//', namespace):
            if element.tag.strip().lower().endswith('metadata'):
                parents[element].remove(element)
                break  # there is only a single <metadata> block
        tree.write(full_path, xml_declaration=True)
        return True
diff --git a/libmat2/html.py b/libmat2/html.py
deleted file mode 100644
index d0e9a2b..0000000
--- a/libmat2/html.py
+++ /dev/null
@@ -1,69 +0,0 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3
4from . import abstract
5
6
class HTMLParser(abstract.AbstractParser):
    """Facade over _HTMLParser: the whole file is fed to the validating
    parser at construction time, so a malformed document raises
    ValueError right in the constructor."""
    mimetypes = {'text/html', }
    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser()
        with open(filename) as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        # Metadata was harvested from <meta/> tags while feed() ran.
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        # Write the metadata-free representation to output_filename.
        return self.__parser.remove_all(self.output_filename)
21
22
class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.
    """
    def __init__(self):
        super().__init__()
        self.__textrepr = ''  # cleaned-up serialisation of the document
        self.__meta = {}  # metadata harvested from <meta/> tags
        self.__validation_queue = []  # stack of currently-open tags

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        # Echo the tag verbatim and remember it for balance checking.
        self.__textrepr += self.get_starttag_text()
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        # A closing tag with no opening one means an invalid document.
        if not self.__validation_queue:
            raise ValueError
        elif tag != self.__validation_queue.pop():
            # Mismatched closing tag: invalid document as well.
            raise ValueError
        # There is no `get_endtag_text()` method :/
        self.__textrepr += '</' + tag + '>\n'

    def handle_data(self, data: str):
        # Keep non-blank text content verbatim.
        if data.strip():
            self.__textrepr += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        if tag == 'meta':
            # <meta/> carries the metadata: record it instead of echoing
            # it into the cleaned output.
            meta = {k:v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__textrepr += self.get_starttag_text()

    def remove_all(self, output_filename: str) -> bool:
        # Refuse to write anything out if some tags were never closed.
        if self.__validation_queue:
            raise ValueError
        with open(output_filename, 'w') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        # Same balance check as remove_all: unclosed tags are an error.
        if self.__validation_queue:
            raise ValueError
        return self.__meta
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py
index 30c3b52..e93ee4f 100644
--- a/libmat2/parser_factory.py
+++ b/libmat2/parser_factory.py
@@ -1,3 +1,4 @@
1import logging
1import glob 2import glob
2import os 3import os
3import mimetypes 4import mimetypes
@@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy
10 11
11T = TypeVar('T', bound='abstract.AbstractParser') 12T = TypeVar('T', bound='abstract.AbstractParser')
12 13
14mimetypes.add_type('application/epub+zip', '.epub')
15# EPUB Navigation Control XML File
16mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
17
13 18
14def __load_all_parsers(): 19def __load_all_parsers():
15 """ Loads every parser in a dynamic way """ 20 """ Loads every parser in a dynamic way """
@@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
49 if mtype in parser_class.mimetypes: 54 if mtype in parser_class.mimetypes:
50 try: 55 try:
51 return parser_class(filename), mtype 56 return parser_class(filename), mtype
52 except ValueError: 57 except ValueError as e:
58 logging.info("Got an exception when trying to instanciate "
59 "%s for %s: %s", parser_class, filename, e)
53 return None, mtype 60 return None, mtype
54 return None, mtype 61 return None, mtype
diff --git a/libmat2/web.py b/libmat2/web.py
new file mode 100644
index 0000000..13d5fc8
--- /dev/null
+++ b/libmat2/web.py
@@ -0,0 +1,122 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3import re
4import string
5
6from . import abstract
7
8
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Rewrite the stylesheet to output_filename with every comment removed."""
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.+?\*/', '', f.read(), count=0, flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Report `key: value` pairs found inside comments as metadata.

        Comment lines that don't look like `key: value` are lumped
        under the generic 'harmful data' key.
        """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                try:
                    # Split on the *first* colon only, so values that
                    # themselves contain ':' (e.g. URLs such as
                    # `source: http://…`) are parsed instead of being
                    # misreported as harmful data.
                    k, v = line.split(':', 1)
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
34
35
class HTMLParser(abstract.AbstractParser):
    """Thin wrapper delegating all the work to _HTMLParser; handles both
    plain html and epub NCX navigation files."""
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }
    def __init__(self, filename):
        super().__init__(filename)
        # Feeding the whole document here means a malformed file raises
        # ValueError straight from the constructor.
        self.__backend = _HTMLParser(self.filename)
        with open(filename, encoding='utf-8') as f:
            content = f.read()
        self.__backend.feed(content)
        self.__backend.close()

    def get_meta(self) -> Dict[str, Any]:
        return self.__backend.get_meta()

    def remove_all(self) -> bool:
        return self.__backend.remove_all(self.output_filename)
51
class _HTMLParser(parser.HTMLParser):
    """Python doesn't ship a validating html parser in its stdlib, so we
    keep a stack of every opened tag and check each closing tag against
    it, hoping for the best.
    """
    tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self.__cleaned = ''    # document text with the metadata stripped
        self.__harvested = {}  # metadata collected from blacklisted tags
        self.__open_tags = []  # stack of not-yet-closed tags
        # A depth counter instead of a boolean, so nested blacklisted
        # tags are handled correctly.
        self.__depth = 0

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        self.__open_tags.append(tag)
        if tag in self.tag_blacklist:
            self.__depth += 1
        elif self.__depth == 0:
            self.__cleaned += self.get_starttag_text()

    def handle_endtag(self, tag: str):
        if not self.__open_tags:
            raise ValueError(f"The closing tag {tag} doesn't have a "
                             f"corresponding opening one in {self.filename}.")

        expected = self.__open_tags.pop()
        if tag != expected:
            raise ValueError(f"The closing tag {tag} doesn't match the "
                             f"previous tag {expected} in {self.filename}")
        if tag in self.tag_blacklist:
            self.__depth -= 1
        elif self.__depth == 0:
            # There is no `get_endtag_text()` method :/
            self.__cleaned += '</' + tag + '>\n'

    def handle_data(self, data: str):
        # Keep non-blank text, unless we're inside a blacklisted tag.
        if self.__depth == 0 and data.strip():
            self.__cleaned += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        if tag not in self.tag_blacklist:
            if self.__depth == 0:
                self.__cleaned += self.get_starttag_text()
            return
        # Self-closing blacklisted tags (typically <meta/>) carry the
        # metadata: record it instead of echoing it.
        harvested = dict(attrs)
        self.__harvested[harvested.get('name', 'harmful metadata')] = \
            harvested.get('content', 'harmful data')

    def remove_all(self, output_filename: str) -> bool:
        self.__check_balanced()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        self.__check_balanced()
        return self.__harvested

    def __check_balanced(self):
        # Unclosed tags mean the parse can't be trusted: refuse to answer.
        if self.__open_tags:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__open_tags), self.filename))
diff --git a/tests/data/dirty.css b/tests/data/dirty.css
new file mode 100644
index 0000000..f52caf9
--- /dev/null
+++ b/tests/data/dirty.css
@@ -0,0 +1,14 @@
1/**
2 * This is my super css framework
3 * version: 1.0
4 * author : jvoisin
5 */
6
7body {
8 color: red;
9 background-color: blue;
10}
11
12.underline {
13 text-decoration: underline; /* underline is cool */
14}
diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/data/dirty.epub
Binary files differ
diff --git a/tests/dirty.epub b/tests/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/dirty.epub
Binary files differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 8728cb2..53c856a 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
7import zipfile 7import zipfile
8 8
9from libmat2 import pdf, images, audio, office, parser_factory, torrent 9from libmat2 import pdf, images, audio, office, parser_factory, torrent
10from libmat2 import harmless, video, html 10from libmat2 import harmless, video, web
11 11
12# No need to logging messages, should something go wrong, 12# No need to logging messages, should something go wrong,
13# the testsuite _will_ fail. 13# the testsuite _will_ fail.
@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
220 os.remove('./tests/data/--output.avi') 220 os.remove('./tests/data/--output.avi')
221 221
222 def test_zip(self): 222 def test_zip(self):
223 with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout: 223 with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
224 zout.write('./tests/data/dirty.flac') 224 zout.write('./tests/data/dirty.flac')
225 zout.write('./tests/data/dirty.docx') 225 zout.write('./tests/data/dirty.docx')
226 zout.write('./tests/data/dirty.jpg') 226 zout.write('./tests/data/dirty.jpg')
227 zout.write('./tests/data/embedded_corrupted.docx') 227 zout.write('./tests/data/embedded_corrupted.docx')
228 p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip') 228 p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
229 self.assertEqual(mimetype, 'application/zip') 229 self.assertEqual(mimetype, 'application/zip')
230 meta = p.get_meta() 230 meta = p.get_meta()
231 self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') 231 self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
232 self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') 232 self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
233 self.assertFalse(p.remove_all()) 233 self.assertFalse(p.remove_all())
234 os.remove('./tests/data/dirty.zip') 234 os.remove('./tests/data/clean.zip')
235 235
236 def test_html(self): 236 def test_html(self):
237 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') 237 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
238 with open('./tests/data/clean.html', 'a') as f: 238 with open('./tests/data/clean.html', 'a') as f:
239 f.write('<open>but not</closed>') 239 f.write('<open>but not</closed>')
240 with self.assertRaises(ValueError): 240 with self.assertRaises(ValueError):
241 html.HTMLParser('./tests/data/clean.html') 241 web.HTMLParser('./tests/data/clean.html')
242 os.remove('./tests/data/clean.html') 242 os.remove('./tests/data/clean.html')
243 243
244 # Yes, we're able to deal with malformed html :/ 244 # Yes, we're able to deal with malformed html :/
245 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') 245 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
246 with open('./tests/data/clean.html', 'a') as f: 246 with open('./tests/data/clean.html', 'a') as f:
247 f.write('<meta name=\'this" is="weird"/>') 247 f.write('<meta name=\'this" is="weird"/>')
248 p = html.HTMLParser('./tests/data/clean.html') 248 p = web.HTMLParser('./tests/data/clean.html')
249 self.assertTrue(p.remove_all()) 249 self.assertTrue(p.remove_all())
250 p = html.HTMLParser('./tests/data/clean.cleaned.html') 250 p = web.HTMLParser('./tests/data/clean.cleaned.html')
251 self.assertEqual(p.get_meta(), {}) 251 self.assertEqual(p.get_meta(), {})
252 os.remove('./tests/data/clean.html') 252 os.remove('./tests/data/clean.html')
253 os.remove('./tests/data/clean.cleaned.html') 253 os.remove('./tests/data/clean.cleaned.html')
@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
255 with open('./tests/data/clean.html', 'w') as f: 255 with open('./tests/data/clean.html', 'w') as f:
256 f.write('</close>') 256 f.write('</close>')
257 with self.assertRaises(ValueError): 257 with self.assertRaises(ValueError):
258 html.HTMLParser('./tests/data/clean.html') 258 web.HTMLParser('./tests/data/clean.html')
259 os.remove('./tests/data/clean.html') 259 os.remove('./tests/data/clean.html')
260 260
261 with open('./tests/data/clean.html', 'w') as f: 261 with open('./tests/data/clean.html', 'w') as f:
262 f.write('<notclosed>') 262 f.write('<notclosed>')
263 p = html.HTMLParser('./tests/data/clean.html') 263 p = web.HTMLParser('./tests/data/clean.html')
264 with self.assertRaises(ValueError): 264 with self.assertRaises(ValueError):
265 p.get_meta() 265 p.get_meta()
266 p = html.HTMLParser('./tests/data/clean.html') 266 p = web.HTMLParser('./tests/data/clean.html')
267 with self.assertRaises(ValueError): 267 with self.assertRaises(ValueError):
268 p.remove_all() 268 p.remove_all()
269 os.remove('./tests/data/clean.html') 269 os.remove('./tests/data/clean.html')
270 270
271 with open('./tests/data/clean.html', 'w') as f:
272 f.write('<doctitle><br/></doctitle><br/><notclosed>')
273 p = web.HTMLParser('./tests/data/clean.html')
274 with self.assertRaises(ValueError):
275 p.get_meta()
276 p = web.HTMLParser('./tests/data/clean.html')
277 with self.assertRaises(ValueError):
278 p.remove_all()
279 os.remove('./tests/data/clean.html')
280
    def test_epub(self):
        # Craft an epub whose content.opf is actually a jpg: get_meta()
        # must report the member as harmful instead of crashing, and
        # remove_all() must refuse to clean the archive.
        with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
            zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
        p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
                         'harmful content')

        self.assertFalse(p.remove_all())
        os.remove('./tests/data/clean.epub')
271 292
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 8753e09..249c56d 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
6import zipfile 6import zipfile
7 7
8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless 8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
9from libmat2 import check_dependencies, video, archive, html 9from libmat2 import check_dependencies, video, archive, web, epub
10 10
11 11
12class TestCheckDependencies(unittest.TestCase): 12class TestCheckDependencies(unittest.TestCase):
@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
177 meta = p.get_meta() 177 meta = p.get_meta()
178 self.assertEqual(meta['Comment'], 'this is a test comment') 178 self.assertEqual(meta['Comment'], 'this is a test comment')
179 179
    def test_epub(self):
        # dirty.epub carries real-world metadata (values reference a
        # Project Gutenberg ebook): check that metadata is surfaced from
        # the OPF manifest, the NCX file, and embedded jpg/html members.
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
188
    def test_css(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
        self.assertEqual(mimetype, 'text/css')
        meta = p.get_meta()
        # `key: value` comment lines become metadata entries…
        self.assertEqual(meta['author'], 'jvoisin')
        self.assertEqual(meta['version'], '1.0')
        # …while free-form comment text lands under 'harmful data'.
        self.assertEqual(meta['harmful data'], 'underline is cool')
196
180class TestRemovingThumbnails(unittest.TestCase): 197class TestRemovingThumbnails(unittest.TestCase):
181 def test_odt(self): 198 def test_odt(self):
182 shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') 199 shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
599 616
600 def test_html(self): 617 def test_html(self):
601 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') 618 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
602 p = html.HTMLParser('./tests/data/clean.html') 619 p = web.HTMLParser('./tests/data/clean.html')
603 620
604 meta = p.get_meta() 621 meta = p.get_meta()
605 self.assertEqual(meta['author'], 'jvoisin') 622 self.assertEqual(meta['author'], 'jvoisin')
@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
607 ret = p.remove_all() 624 ret = p.remove_all()
608 self.assertTrue(ret) 625 self.assertTrue(ret)
609 626
610 p = html.HTMLParser('./tests/data/clean.cleaned.html') 627 p = web.HTMLParser('./tests/data/clean.cleaned.html')
611 self.assertEqual(p.get_meta(), {}) 628 self.assertEqual(p.get_meta(), {})
612 self.assertTrue(p.remove_all()) 629 self.assertTrue(p.remove_all())
613 630
614 os.remove('./tests/data/clean.html') 631 os.remove('./tests/data/clean.html')
615 os.remove('./tests/data/clean.cleaned.html') 632 os.remove('./tests/data/clean.cleaned.html')
616 os.remove('./tests/data/clean.cleaned.cleaned.html') 633 os.remove('./tests/data/clean.cleaned.cleaned.html')
634
635
    def test_epub(self):
        shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
        p = epub.EPUBParser('./tests/data/clean.epub')

        # The dirty file exposes metadata from its OPF manifest…
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')

        ret = p.remove_all()
        self.assertTrue(ret)

        # …and the cleaned copy must come out metadata-free and still
        # be cleanable a second time.
        p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.epub')
        os.remove('./tests/data/clean.cleaned.epub')
        os.remove('./tests/data/clean.cleaned.cleaned.epub')
653
654
    def test_css(self):
        shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
        p = web.CSSParser('./tests/data/clean.css')

        # The dirty stylesheet exposes comment-borne metadata…
        self.assertEqual(p.get_meta(), {
            'harmful data': 'underline is cool',
            'version': '1.0',
            'author': 'jvoisin'})

        ret = p.remove_all()
        self.assertTrue(ret)

        # …and the cleaned copy must be empty and re-cleanable.
        p = web.CSSParser('./tests/data/clean.cleaned.css')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.css')
        os.remove('./tests/data/clean.cleaned.css')
        os.remove('./tests/data/clean.cleaned.cleaned.css')