summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2019-02-20 16:28:11 -0800
committerjvoisin2019-02-20 16:28:11 -0800
commit02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch)
tree701c6f5e316265e5a95a162356965ecf2fb8d6b2
parent6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff)
Implement epub support
Diffstat (limited to '')
-rw-r--r--libmat2/epub.py47
-rw-r--r--libmat2/html.py69
-rw-r--r--libmat2/parser_factory.py9
-rw-r--r--libmat2/web.py122
-rw-r--r--tests/data/dirty.css14
-rw-r--r--tests/data/dirty.epubbin0 -> 296324 bytes
-rw-r--r--tests/dirty.epubbin0 -> 296324 bytes
-rw-r--r--tests/test_corrupted_files.py41
-rw-r--r--tests/test_libmat2.py63
9 files changed, 282 insertions, 83 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
new file mode 100644
index 0000000..09b7937
--- /dev/null
+++ b/libmat2/epub.py
@@ -0,0 +1,47 @@
1import logging
2import re
3import xml.etree.ElementTree as ET # type: ignore
4
5from . import archive, office
6
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """Parser for EPUB files: zip archives whose packaging files
    (container.xml, mimetype, content.opf) are exempted from removal,
    and whose OPF manifest carries the document metadata."""
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        super().__init__(filename)
        # These entries are preserved by the cleaning pass: without them
        # the archive no longer looks like an epub.
        skeleton = (
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        )
        self.files_to_keep = {re.compile(path) for path in skeleton}  # type: ignore

    def _specific_get_meta(self, full_path, file_path):
        """Harvest metadata from the OPF manifest; other members carry none."""
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                # Grab every <meta…>/<dc:…>/<cp:…> element as a (tag, text) pair.
                found = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                   f.read(), re.I|re.M)
                return dict(found)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        """Drop the <metadata> element from the OPF manifest, in place."""
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        # ElementTree has no parent pointers, so build a child→parent map.
        parents = {}
        for node in tree.iter():
            for child in node:
                parents[child] = node

        for element in tree.iterfind('.//', namespace):
            if element.tag.strip().lower().endswith('metadata'):
                parents[element].remove(element)
                break  # there is only a single <metadata> block
        tree.write(full_path, xml_declaration=True)
        return True
diff --git a/libmat2/html.py b/libmat2/html.py
deleted file mode 100644
index d0e9a2b..0000000
--- a/libmat2/html.py
+++ /dev/null
@@ -1,69 +0,0 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3
4from . import abstract
5
6
class HTMLParser(abstract.AbstractParser):
    """Facade over _HTMLParser: the whole file is fed to the validating
    parser at construction time, so a malformed document raises
    ValueError right in the constructor."""
    mimetypes = {'text/html', }
    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser()
        with open(filename) as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        # Metadata was harvested from <meta/> tags while feed() ran.
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        # Write the metadata-free representation to output_filename.
        return self.__parser.remove_all(self.output_filename)
21
22
class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.
    """
    def __init__(self):
        super().__init__()
        self.__textrepr = ''  # cleaned-up serialisation of the document
        self.__meta = {}  # metadata harvested from <meta/> tags
        self.__validation_queue = []  # stack of currently-open tags

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        # Echo the tag verbatim and remember it for balance checking.
        self.__textrepr += self.get_starttag_text()
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        # A closing tag with no opening one means an invalid document.
        if not self.__validation_queue:
            raise ValueError
        elif tag != self.__validation_queue.pop():
            # Mismatched closing tag: invalid document as well.
            raise ValueError
        # There is no `get_endtag_text()` method :/
        self.__textrepr += '</' + tag + '>\n'

    def handle_data(self, data: str):
        # Keep non-blank text content verbatim.
        if data.strip():
            self.__textrepr += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        if tag == 'meta':
            # <meta/> carries the metadata: record it instead of echoing
            # it into the cleaned output.
            meta = {k:v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__textrepr += self.get_starttag_text()

    def remove_all(self, output_filename: str) -> bool:
        # Refuse to write anything out if some tags were never closed.
        if self.__validation_queue:
            raise ValueError
        with open(output_filename, 'w') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        # Same balance check as remove_all: unclosed tags are an error.
        if self.__validation_queue:
            raise ValueError
        return self.__meta
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py
index 30c3b52..e93ee4f 100644
--- a/libmat2/parser_factory.py
+++ b/libmat2/parser_factory.py
@@ -1,3 +1,4 @@
1import logging
1import glob 2import glob
2import os 3import os
3import mimetypes 4import mimetypes
@@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy
10 11
11T = TypeVar('T', bound='abstract.AbstractParser') 12T = TypeVar('T', bound='abstract.AbstractParser')
12 13
14mimetypes.add_type('application/epub+zip', '.epub')
15# EPUB Navigation Control XML File
16mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
17
13 18
14def __load_all_parsers(): 19def __load_all_parsers():
15 """ Loads every parser in a dynamic way """ 20 """ Loads every parser in a dynamic way """
@@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
49 if mtype in parser_class.mimetypes: 54 if mtype in parser_class.mimetypes:
50 try: 55 try:
51 return parser_class(filename), mtype 56 return parser_class(filename), mtype
52 except ValueError: 57 except ValueError as e:
58 logging.info("Got an exception when trying to instanciate "
59 "%s for %s: %s", parser_class, filename, e)
53 return None, mtype 60 return None, mtype
54 return None, mtype 61 return None, mtype
diff --git a/libmat2/web.py b/libmat2/web.py
new file mode 100644
index 0000000..13d5fc8
--- /dev/null
+++ b/libmat2/web.py
@@ -0,0 +1,122 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3import re
4import string
5
6from . import abstract
7
8
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Rewrite the stylesheet to output_filename with every comment removed."""
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.+?\*/', '', f.read(), count=0, flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Report `key: value` pairs found inside comments as metadata.

        Comment lines that don't look like `key: value` are lumped
        under the generic 'harmful data' key.
        """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                try:
                    # Split on the *first* colon only, so values that
                    # themselves contain ':' (e.g. URLs such as
                    # `source: http://…`) are parsed instead of being
                    # misreported as harmful data.
                    k, v = line.split(':', 1)
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
34
35
class HTMLParser(abstract.AbstractParser):
    """Thin wrapper delegating all the work to _HTMLParser; handles both
    plain html and epub NCX navigation files."""
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }
    def __init__(self, filename):
        super().__init__(filename)
        # Feeding the whole document here means a malformed file raises
        # ValueError straight from the constructor.
        self.__backend = _HTMLParser(self.filename)
        with open(filename, encoding='utf-8') as f:
            content = f.read()
        self.__backend.feed(content)
        self.__backend.close()

    def get_meta(self) -> Dict[str, Any]:
        return self.__backend.get_meta()

    def remove_all(self) -> bool:
        return self.__backend.remove_all(self.output_filename)
51
class _HTMLParser(parser.HTMLParser):
    """Python doesn't ship a validating html parser in its stdlib, so we
    keep a stack of every opened tag and check each closing tag against
    it, hoping for the best.
    """
    tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self.__cleaned = ''    # document text with the metadata stripped
        self.__harvested = {}  # metadata collected from blacklisted tags
        self.__open_tags = []  # stack of not-yet-closed tags
        # A depth counter instead of a boolean, so nested blacklisted
        # tags are handled correctly.
        self.__depth = 0

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        self.__open_tags.append(tag)
        if tag in self.tag_blacklist:
            self.__depth += 1
        elif self.__depth == 0:
            self.__cleaned += self.get_starttag_text()

    def handle_endtag(self, tag: str):
        if not self.__open_tags:
            raise ValueError(f"The closing tag {tag} doesn't have a "
                             f"corresponding opening one in {self.filename}.")

        expected = self.__open_tags.pop()
        if tag != expected:
            raise ValueError(f"The closing tag {tag} doesn't match the "
                             f"previous tag {expected} in {self.filename}")
        if tag in self.tag_blacklist:
            self.__depth -= 1
        elif self.__depth == 0:
            # There is no `get_endtag_text()` method :/
            self.__cleaned += '</' + tag + '>\n'

    def handle_data(self, data: str):
        # Keep non-blank text, unless we're inside a blacklisted tag.
        if self.__depth == 0 and data.strip():
            self.__cleaned += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        if tag not in self.tag_blacklist:
            if self.__depth == 0:
                self.__cleaned += self.get_starttag_text()
            return
        # Self-closing blacklisted tags (typically <meta/>) carry the
        # metadata: record it instead of echoing it.
        harvested = dict(attrs)
        self.__harvested[harvested.get('name', 'harmful metadata')] = \
            harvested.get('content', 'harmful data')

    def remove_all(self, output_filename: str) -> bool:
        self.__check_balanced()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        self.__check_balanced()
        return self.__harvested

    def __check_balanced(self):
        # Unclosed tags mean the parse can't be trusted: refuse to answer.
        if self.__open_tags:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__open_tags), self.filename))
diff --git a/tests/data/dirty.css b/tests/data/dirty.css
new file mode 100644
index 0000000..f52caf9
--- /dev/null
+++ b/tests/data/dirty.css
@@ -0,0 +1,14 @@
1/**
2 * This is my super css framework
3 * version: 1.0
4 * author : jvoisin
5 */
6
7body {
8 color: red;
9 background-color: blue;
10}
11
12.underline {
13 text-decoration: underline; /* underline is cool */
14}
diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/data/dirty.epub
Binary files differ
diff --git a/tests/dirty.epub b/tests/dirty.epub
new file mode 100644
index 0000000..6389963
--- /dev/null
+++ b/tests/dirty.epub
Binary files differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 8728cb2..53c856a 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
7import zipfile 7import zipfile
8 8
9from libmat2 import pdf, images, audio, office, parser_factory, torrent 9from libmat2 import pdf, images, audio, office, parser_factory, torrent
10from libmat2 import harmless, video, html 10from libmat2 import harmless, video, web
11 11
12# No need to logging messages, should something go wrong, 12# No need to logging messages, should something go wrong,
13# the testsuite _will_ fail. 13# the testsuite _will_ fail.
@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
220 os.remove('./tests/data/--output.avi') 220 os.remove('./tests/data/--output.avi')
221 221
222 def test_zip(self): 222 def test_zip(self):
223 with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout: 223 with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
224 zout.write('./tests/data/dirty.flac') 224 zout.write('./tests/data/dirty.flac')
225 zout.write('./tests/data/dirty.docx') 225 zout.write('./tests/data/dirty.docx')
226 zout.write('./tests/data/dirty.jpg') 226 zout.write('./tests/data/dirty.jpg')
227 zout.write('./tests/data/embedded_corrupted.docx') 227 zout.write('./tests/data/embedded_corrupted.docx')
228 p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip') 228 p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
229 self.assertEqual(mimetype, 'application/zip') 229 self.assertEqual(mimetype, 'application/zip')
230 meta = p.get_meta() 230 meta = p.get_meta()
231 self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') 231 self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
232 self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') 232 self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
233 self.assertFalse(p.remove_all()) 233 self.assertFalse(p.remove_all())
234 os.remove('./tests/data/dirty.zip') 234 os.remove('./tests/data/clean.zip')
235 235
236 def test_html(self): 236 def test_html(self):
237 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') 237 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
238 with open('./tests/data/clean.html', 'a') as f: 238 with open('./tests/data/clean.html', 'a') as f:
239 f.write('<open>but not</closed>') 239 f.write('<open>but not</closed>')
240 with self.assertRaises(ValueError): 240 with self.assertRaises(ValueError):
241 html.HTMLParser('./tests/data/clean.html') 241 web.HTMLParser('./tests/data/clean.html')
242 os.remove('./tests/data/clean.html') 242 os.remove('./tests/data/clean.html')
243 243
244 # Yes, we're able to deal with malformed html :/ 244 # Yes, we're able to deal with malformed html :/
245 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') 245 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
246 with open('./tests/data/clean.html', 'a') as f: 246 with open('./tests/data/clean.html', 'a') as f:
247 f.write('<meta name=\'this" is="weird"/>') 247 f.write('<meta name=\'this" is="weird"/>')
248 p = html.HTMLParser('./tests/data/clean.html') 248 p = web.HTMLParser('./tests/data/clean.html')
249 self.assertTrue(p.remove_all()) 249 self.assertTrue(p.remove_all())
250 p = html.HTMLParser('./tests/data/clean.cleaned.html') 250 p = web.HTMLParser('./tests/data/clean.cleaned.html')
251 self.assertEqual(p.get_meta(), {}) 251 self.assertEqual(p.get_meta(), {})
252 os.remove('./tests/data/clean.html') 252 os.remove('./tests/data/clean.html')
253 os.remove('./tests/data/clean.cleaned.html') 253 os.remove('./tests/data/clean.cleaned.html')
@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
255 with open('./tests/data/clean.html', 'w') as f: 255 with open('./tests/data/clean.html', 'w') as f:
256 f.write('</close>') 256 f.write('</close>')
257 with self.assertRaises(ValueError): 257 with self.assertRaises(ValueError):
258 html.HTMLParser('./tests/data/clean.html') 258 web.HTMLParser('./tests/data/clean.html')
259 os.remove('./tests/data/clean.html') 259 os.remove('./tests/data/clean.html')
260 260
261 with open('./tests/data/clean.html', 'w') as f: 261 with open('./tests/data/clean.html', 'w') as f:
262 f.write('<notclosed>') 262 f.write('<notclosed>')
263 p = html.HTMLParser('./tests/data/clean.html') 263 p = web.HTMLParser('./tests/data/clean.html')
264 with self.assertRaises(ValueError): 264 with self.assertRaises(ValueError):
265 p.get_meta() 265 p.get_meta()
266 p = html.HTMLParser('./tests/data/clean.html') 266 p = web.HTMLParser('./tests/data/clean.html')
267 with self.assertRaises(ValueError): 267 with self.assertRaises(ValueError):
268 p.remove_all() 268 p.remove_all()
269 os.remove('./tests/data/clean.html') 269 os.remove('./tests/data/clean.html')
270 270
271 with open('./tests/data/clean.html', 'w') as f:
272 f.write('<doctitle><br/></doctitle><br/><notclosed>')
273 p = web.HTMLParser('./tests/data/clean.html')
274 with self.assertRaises(ValueError):
275 p.get_meta()
276 p = web.HTMLParser('./tests/data/clean.html')
277 with self.assertRaises(ValueError):
278 p.remove_all()
279 os.remove('./tests/data/clean.html')
280
    def test_epub(self):
        # Craft an epub whose content.opf is actually a jpg: get_meta()
        # must report the member as harmful instead of crashing, and
        # remove_all() must refuse to clean the archive.
        with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
            zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
        p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
                         'harmful content')

        self.assertFalse(p.remove_all())
        os.remove('./tests/data/clean.epub')
271 292
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 8753e09..249c56d 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
6import zipfile 6import zipfile
7 7
8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless 8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
9from libmat2 import check_dependencies, video, archive, html 9from libmat2 import check_dependencies, video, archive, web, epub
10 10
11 11
12class TestCheckDependencies(unittest.TestCase): 12class TestCheckDependencies(unittest.TestCase):
@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
177 meta = p.get_meta() 177 meta = p.get_meta()
178 self.assertEqual(meta['Comment'], 'this is a test comment') 178 self.assertEqual(meta['Comment'], 'this is a test comment')
179 179
    def test_epub(self):
        # dirty.epub carries real-world metadata (values reference a
        # Project Gutenberg ebook): check that metadata is surfaced from
        # the OPF manifest, the NCX file, and embedded jpg/html members.
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
188
    def test_css(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
        self.assertEqual(mimetype, 'text/css')
        meta = p.get_meta()
        # `key: value` comment lines become metadata entries…
        self.assertEqual(meta['author'], 'jvoisin')
        self.assertEqual(meta['version'], '1.0')
        # …while free-form comment text lands under 'harmful data'.
        self.assertEqual(meta['harmful data'], 'underline is cool')
196
180class TestRemovingThumbnails(unittest.TestCase): 197class TestRemovingThumbnails(unittest.TestCase):
181 def test_odt(self): 198 def test_odt(self):
182 shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') 199 shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
599 616
600 def test_html(self): 617 def test_html(self):
601 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') 618 shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
602 p = html.HTMLParser('./tests/data/clean.html') 619 p = web.HTMLParser('./tests/data/clean.html')
603 620
604 meta = p.get_meta() 621 meta = p.get_meta()
605 self.assertEqual(meta['author'], 'jvoisin') 622 self.assertEqual(meta['author'], 'jvoisin')
@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
607 ret = p.remove_all() 624 ret = p.remove_all()
608 self.assertTrue(ret) 625 self.assertTrue(ret)
609 626
610 p = html.HTMLParser('./tests/data/clean.cleaned.html') 627 p = web.HTMLParser('./tests/data/clean.cleaned.html')
611 self.assertEqual(p.get_meta(), {}) 628 self.assertEqual(p.get_meta(), {})
612 self.assertTrue(p.remove_all()) 629 self.assertTrue(p.remove_all())
613 630
614 os.remove('./tests/data/clean.html') 631 os.remove('./tests/data/clean.html')
615 os.remove('./tests/data/clean.cleaned.html') 632 os.remove('./tests/data/clean.cleaned.html')
616 os.remove('./tests/data/clean.cleaned.cleaned.html') 633 os.remove('./tests/data/clean.cleaned.cleaned.html')
634
635
    def test_epub(self):
        shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
        p = epub.EPUBParser('./tests/data/clean.epub')

        # The dirty file exposes metadata from its OPF manifest…
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')

        ret = p.remove_all()
        self.assertTrue(ret)

        # …and the cleaned copy must come out metadata-free and still
        # be cleanable a second time.
        p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.epub')
        os.remove('./tests/data/clean.cleaned.epub')
        os.remove('./tests/data/clean.cleaned.cleaned.epub')
653
654
    def test_css(self):
        shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
        p = web.CSSParser('./tests/data/clean.css')

        # The dirty stylesheet exposes comment-borne metadata…
        self.assertEqual(p.get_meta(), {
            'harmful data': 'underline is cool',
            'version': '1.0',
            'author': 'jvoisin'})

        ret = p.remove_all()
        self.assertTrue(ret)

        # …and the cleaned copy must be empty and re-cleanable.
        p = web.CSSParser('./tests/data/clean.cleaned.css')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.css')
        os.remove('./tests/data/clean.cleaned.css')
        os.remove('./tests/data/clean.cleaned.cleaned.css')