diff options
| author | jvoisin | 2019-02-20 16:28:11 -0800 |
|---|---|---|
| committer | jvoisin | 2019-02-20 16:28:11 -0800 |
| commit | 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch) | |
| tree | 701c6f5e316265e5a95a162356965ecf2fb8d6b2 | |
| parent | 6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff) | |
Implement epub support
| -rw-r--r-- | libmat2/epub.py | 47 | ||||
| -rw-r--r-- | libmat2/html.py | 69 | ||||
| -rw-r--r-- | libmat2/parser_factory.py | 9 | ||||
| -rw-r--r-- | libmat2/web.py | 122 | ||||
| -rw-r--r-- | tests/data/dirty.css | 14 | ||||
| -rw-r--r-- | tests/data/dirty.epub | bin | 0 -> 296324 bytes | |||
| -rw-r--r-- | tests/dirty.epub | bin | 0 -> 296324 bytes | |||
| -rw-r--r-- | tests/test_corrupted_files.py | 41 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 63 |
9 files changed, 282 insertions, 83 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py new file mode 100644 index 0000000..09b7937 --- /dev/null +++ b/libmat2/epub.py | |||
| @@ -0,0 +1,47 @@ | |||
| 1 | import logging | ||
| 2 | import re | ||
| 3 | import xml.etree.ElementTree as ET # type: ignore | ||
| 4 | |||
| 5 | from . import archive, office | ||
| 6 | |||
| 7 | class EPUBParser(archive.ArchiveBasedAbstractParser): | ||
| 8 | mimetypes = {'application/epub+zip', } | ||
| 9 | |||
| 10 | def __init__(self, filename): | ||
| 11 | super().__init__(filename) | ||
| 12 | self.files_to_keep = set(map(re.compile, { # type: ignore | ||
| 13 | 'META-INF/container.xml', | ||
| 14 | 'mimetype', | ||
| 15 | 'OEBPS/content.opf', | ||
| 16 | })) | ||
| 17 | |||
| 18 | def _specific_get_meta(self, full_path, file_path): | ||
| 19 | if file_path != 'OEBPS/content.opf': | ||
| 20 | return {} | ||
| 21 | |||
| 22 | with open(full_path, encoding='utf-8') as f: | ||
| 23 | try: | ||
| 24 | results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>", | ||
| 25 | f.read(), re.I|re.M) | ||
| 26 | return {k:v for (k, v) in results} | ||
| 27 | except (TypeError, UnicodeDecodeError): | ||
| 28 | # We didn't manage to parse the xml file | ||
| 29 | return {file_path: 'harmful content', } | ||
| 30 | |||
| 31 | def _specific_cleanup(self, full_path: str): | ||
| 32 | if not full_path.endswith('OEBPS/content.opf'): | ||
| 33 | return True | ||
| 34 | |||
| 35 | try: | ||
| 36 | tree, namespace = office._parse_xml(full_path) | ||
| 37 | except ET.ParseError: | ||
| 38 | logging.error("Unable to parse %s in %s.", full_path, self.filename) | ||
| 39 | return False | ||
| 40 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 41 | |||
| 42 | for item in tree.iterfind('.//', namespace): | ||
| 43 | if item.tag.strip().lower().endswith('metadata'): | ||
| 44 | parent_map[item].remove(item) | ||
| 45 | break # there is only a single <metadata> block | ||
| 46 | tree.write(full_path, xml_declaration=True) | ||
| 47 | return True | ||
diff --git a/libmat2/html.py b/libmat2/html.py deleted file mode 100644 index d0e9a2b..0000000 --- a/libmat2/html.py +++ /dev/null | |||
| @@ -1,69 +0,0 @@ | |||
| 1 | from html import parser | ||
| 2 | from typing import Dict, Any, List, Tuple | ||
| 3 | |||
| 4 | from . import abstract | ||
| 5 | |||
| 6 | |||
| 7 | class HTMLParser(abstract.AbstractParser): | ||
| 8 | mimetypes = {'text/html', } | ||
| 9 | def __init__(self, filename): | ||
| 10 | super().__init__(filename) | ||
| 11 | self.__parser = _HTMLParser() | ||
| 12 | with open(filename) as f: | ||
| 13 | self.__parser.feed(f.read()) | ||
| 14 | self.__parser.close() | ||
| 15 | |||
| 16 | def get_meta(self) -> Dict[str, Any]: | ||
| 17 | return self.__parser.get_meta() | ||
| 18 | |||
| 19 | def remove_all(self) -> bool: | ||
| 20 | return self.__parser.remove_all(self.output_filename) | ||
| 21 | |||
| 22 | |||
| 23 | class _HTMLParser(parser.HTMLParser): | ||
| 24 | """Python doesn't have a validating html parser in its stdlib, so | ||
| 25 | we're using an internal queue to track all the opening/closing tags, | ||
| 26 | and hoping for the best. | ||
| 27 | """ | ||
| 28 | def __init__(self): | ||
| 29 | super().__init__() | ||
| 30 | self.__textrepr = '' | ||
| 31 | self.__meta = {} | ||
| 32 | self.__validation_queue = [] | ||
| 33 | |||
| 34 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): | ||
| 35 | self.__textrepr += self.get_starttag_text() | ||
| 36 | self.__validation_queue.append(tag) | ||
| 37 | |||
| 38 | def handle_endtag(self, tag: str): | ||
| 39 | if not self.__validation_queue: | ||
| 40 | raise ValueError | ||
| 41 | elif tag != self.__validation_queue.pop(): | ||
| 42 | raise ValueError | ||
| 43 | # There is no `get_endtag_text()` method :/ | ||
| 44 | self.__textrepr += '</' + tag + '>\n' | ||
| 45 | |||
| 46 | def handle_data(self, data: str): | ||
| 47 | if data.strip(): | ||
| 48 | self.__textrepr += data | ||
| 49 | |||
| 50 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): | ||
| 51 | if tag == 'meta': | ||
| 52 | meta = {k:v for k, v in attrs} | ||
| 53 | name = meta.get('name', 'harmful metadata') | ||
| 54 | content = meta.get('content', 'harmful data') | ||
| 55 | self.__meta[name] = content | ||
| 56 | else: | ||
| 57 | self.__textrepr += self.get_starttag_text() | ||
| 58 | |||
| 59 | def remove_all(self, output_filename: str) -> bool: | ||
| 60 | if self.__validation_queue: | ||
| 61 | raise ValueError | ||
| 62 | with open(output_filename, 'w') as f: | ||
| 63 | f.write(self.__textrepr) | ||
| 64 | return True | ||
| 65 | |||
| 66 | def get_meta(self) -> Dict[str, Any]: | ||
| 67 | if self.__validation_queue: | ||
| 68 | raise ValueError | ||
| 69 | return self.__meta | ||
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index 30c3b52..e93ee4f 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | import logging | ||
| 1 | import glob | 2 | import glob |
| 2 | import os | 3 | import os |
| 3 | import mimetypes | 4 | import mimetypes |
| @@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy | |||
| 10 | 11 | ||
| 11 | T = TypeVar('T', bound='abstract.AbstractParser') | 12 | T = TypeVar('T', bound='abstract.AbstractParser') |
| 12 | 13 | ||
| 14 | mimetypes.add_type('application/epub+zip', '.epub') | ||
| 15 | # EPUB Navigation Control XML File | ||
| 16 | mimetypes.add_type('application/x-dtbncx+xml', '.ncx') | ||
| 17 | |||
| 13 | 18 | ||
| 14 | def __load_all_parsers(): | 19 | def __load_all_parsers(): |
| 15 | """ Loads every parser in a dynamic way """ | 20 | """ Loads every parser in a dynamic way """ |
| @@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: | |||
| 49 | if mtype in parser_class.mimetypes: | 54 | if mtype in parser_class.mimetypes: |
| 50 | try: | 55 | try: |
| 51 | return parser_class(filename), mtype | 56 | return parser_class(filename), mtype |
| 52 | except ValueError: | 57 | except ValueError as e: |
| 58 | logging.info("Got an exception when trying to instantiate " | ||
| 59 | "%s for %s: %s", parser_class, filename, e) | ||
| 53 | return None, mtype | 60 | return None, mtype |
| 54 | return None, mtype | 61 | return None, mtype |
diff --git a/libmat2/web.py b/libmat2/web.py new file mode 100644 index 0000000..13d5fc8 --- /dev/null +++ b/libmat2/web.py | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | from html import parser | ||
| 2 | from typing import Dict, Any, List, Tuple | ||
| 3 | import re | ||
| 4 | import string | ||
| 5 | |||
| 6 | from . import abstract | ||
| 7 | |||
| 8 | |||
| 9 | class CSSParser(abstract.AbstractParser): | ||
| 10 | """There is no such thing as metadata in CSS files, | ||
| 11 | only comments of the form `/* … */`, so we're removing the latter.""" | ||
| 12 | mimetypes = {'text/css', } | ||
| 13 | flags = re.MULTILINE | re.DOTALL | ||
| 14 | |||
| 15 | def remove_all(self) -> bool: | ||
| 16 | with open(self.filename, encoding='utf-8') as f: | ||
| 17 | cleaned = re.sub(r'/\*.+?\*/', '', f.read(), 0, self.flags) | ||
| 18 | with open(self.output_filename, 'w', encoding='utf-8') as f: | ||
| 19 | f.write(cleaned) | ||
| 20 | return True | ||
| 21 | |||
| 22 | def get_meta(self) -> Dict[str, Any]: | ||
| 23 | metadata = {} | ||
| 24 | with open(self.filename, encoding='utf-8') as f: | ||
| 25 | cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags) | ||
| 26 | for match in cssdoc: | ||
| 27 | for line in match.splitlines(): | ||
| 28 | try: | ||
| 29 | k, v = line.split(':') | ||
| 30 | metadata[k.strip(string.whitespace + '*')] = v.strip() | ||
| 31 | except ValueError: | ||
| 32 | metadata['harmful data'] = line.strip() | ||
| 33 | return metadata | ||
| 34 | |||
| 35 | |||
| 36 | class HTMLParser(abstract.AbstractParser): | ||
| 37 | mimetypes = {'text/html', 'application/x-dtbncx+xml', } | ||
| 38 | def __init__(self, filename): | ||
| 39 | super().__init__(filename) | ||
| 40 | self.__parser = _HTMLParser(self.filename) | ||
| 41 | with open(filename, encoding='utf-8') as f: | ||
| 42 | self.__parser.feed(f.read()) | ||
| 43 | self.__parser.close() | ||
| 44 | |||
| 45 | def get_meta(self) -> Dict[str, Any]: | ||
| 46 | return self.__parser.get_meta() | ||
| 47 | |||
| 48 | def remove_all(self) -> bool: | ||
| 49 | return self.__parser.remove_all(self.output_filename) | ||
| 50 | |||
| 51 | |||
| 52 | class _HTMLParser(parser.HTMLParser): | ||
| 53 | """Python doesn't have a validating html parser in its stdlib, so | ||
| 54 | we're using an internal queue to track all the opening/closing tags, | ||
| 55 | and hoping for the best. | ||
| 56 | """ | ||
| 57 | tag_blacklist = {'doctitle', 'meta'} # everything is lowercase | ||
| 58 | def __init__(self, filename): | ||
| 59 | super().__init__() | ||
| 60 | self.filename = filename | ||
| 61 | self.__textrepr = '' | ||
| 62 | self.__meta = {} | ||
| 63 | self.__validation_queue = [] | ||
| 64 | # We're using a counter instead of a boolean to handle nested tags | ||
| 65 | self.__in_dangerous_tag = 0 | ||
| 66 | |||
| 67 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): | ||
| 68 | self.__validation_queue.append(tag) | ||
| 69 | if tag in self.tag_blacklist: | ||
| 70 | self.__in_dangerous_tag += 1 | ||
| 71 | return | ||
| 72 | |||
| 73 | if self.__in_dangerous_tag == 0: | ||
| 74 | self.__textrepr += self.get_starttag_text() | ||
| 75 | |||
| 76 | def handle_endtag(self, tag: str): | ||
| 77 | if not self.__validation_queue: | ||
| 78 | raise ValueError("The closing tag %s doesn't have a corresponding " | ||
| 79 | "opening one in %s." % (tag, self.filename)) | ||
| 80 | |||
| 81 | previous_tag = self.__validation_queue.pop() | ||
| 82 | if tag != previous_tag: | ||
| 83 | raise ValueError("The closing tag %s doesn't match the previous " | ||
| 84 | "tag %s in %s" % | ||
| 85 | (tag, previous_tag, self.filename)) | ||
| 86 | elif tag in self.tag_blacklist: | ||
| 87 | self.__in_dangerous_tag -= 1 | ||
| 88 | return | ||
| 89 | |||
| 90 | if self.__in_dangerous_tag == 0: | ||
| 91 | # There is no `get_endtag_text()` method :/ | ||
| 92 | self.__textrepr += '</' + tag + '>\n' | ||
| 93 | |||
| 94 | def handle_data(self, data: str): | ||
| 95 | if self.__in_dangerous_tag == 0 and data.strip(): | ||
| 96 | self.__textrepr += data | ||
| 97 | |||
| 98 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): | ||
| 99 | if tag in self.tag_blacklist: | ||
| 100 | meta = {k:v for k, v in attrs} | ||
| 101 | name = meta.get('name', 'harmful metadata') | ||
| 102 | content = meta.get('content', 'harmful data') | ||
| 103 | self.__meta[name] = content | ||
| 104 | else: | ||
| 105 | if self.__in_dangerous_tag == 0: | ||
| 106 | self.__textrepr += self.get_starttag_text() | ||
| 107 | |||
| 108 | def remove_all(self, output_filename: str) -> bool: | ||
| 109 | if self.__validation_queue: | ||
| 110 | raise ValueError("Some tags (%s) were left unclosed in %s" % ( | ||
| 111 | ', '.join(self.__validation_queue), | ||
| 112 | self.filename)) | ||
| 113 | with open(output_filename, 'w', encoding='utf-8') as f: | ||
| 114 | f.write(self.__textrepr) | ||
| 115 | return True | ||
| 116 | |||
| 117 | def get_meta(self) -> Dict[str, Any]: | ||
| 118 | if self.__validation_queue: | ||
| 119 | raise ValueError("Some tags (%s) were left unclosed in %s" % ( | ||
| 120 | ', '.join(self.__validation_queue), | ||
| 121 | self.filename)) | ||
| 122 | return self.__meta | ||
diff --git a/tests/data/dirty.css b/tests/data/dirty.css new file mode 100644 index 0000000..f52caf9 --- /dev/null +++ b/tests/data/dirty.css | |||
| @@ -0,0 +1,14 @@ | |||
| 1 | /** | ||
| 2 | * This is my super css framework | ||
| 3 | * version: 1.0 | ||
| 4 | * author : jvoisin | ||
| 5 | */ | ||
| 6 | |||
| 7 | body { | ||
| 8 | color: red; | ||
| 9 | background-color: blue; | ||
| 10 | } | ||
| 11 | |||
| 12 | .underline { | ||
| 13 | text-decoration: underline; /* underline is cool */ | ||
| 14 | } | ||
diff --git a/tests/data/dirty.epub b/tests/data/dirty.epub new file mode 100644 index 0000000..6389963 --- /dev/null +++ b/tests/data/dirty.epub | |||
| Binary files differ | |||
diff --git a/tests/dirty.epub b/tests/dirty.epub new file mode 100644 index 0000000..6389963 --- /dev/null +++ b/tests/dirty.epub | |||
| Binary files differ | |||
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 8728cb2..53c856a 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -7,7 +7,7 @@ import logging | |||
| 7 | import zipfile | 7 | import zipfile |
| 8 | 8 | ||
| 9 | from libmat2 import pdf, images, audio, office, parser_factory, torrent | 9 | from libmat2 import pdf, images, audio, office, parser_factory, torrent |
| 10 | from libmat2 import harmless, video, html | 10 | from libmat2 import harmless, video, web |
| 11 | 11 | ||
| 12 | # No need to logging messages, should something go wrong, | 12 | # No need to logging messages, should something go wrong, |
| 13 | # the testsuite _will_ fail. | 13 | # the testsuite _will_ fail. |
| @@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 220 | os.remove('./tests/data/--output.avi') | 220 | os.remove('./tests/data/--output.avi') |
| 221 | 221 | ||
| 222 | def test_zip(self): | 222 | def test_zip(self): |
| 223 | with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout: | 223 | with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout: |
| 224 | zout.write('./tests/data/dirty.flac') | 224 | zout.write('./tests/data/dirty.flac') |
| 225 | zout.write('./tests/data/dirty.docx') | 225 | zout.write('./tests/data/dirty.docx') |
| 226 | zout.write('./tests/data/dirty.jpg') | 226 | zout.write('./tests/data/dirty.jpg') |
| 227 | zout.write('./tests/data/embedded_corrupted.docx') | 227 | zout.write('./tests/data/embedded_corrupted.docx') |
| 228 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip') | 228 | p, mimetype = parser_factory.get_parser('./tests/data/clean.zip') |
| 229 | self.assertEqual(mimetype, 'application/zip') | 229 | self.assertEqual(mimetype, 'application/zip') |
| 230 | meta = p.get_meta() | 230 | meta = p.get_meta() |
| 231 | self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') | 231 | self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') |
| 232 | self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') | 232 | self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') |
| 233 | self.assertFalse(p.remove_all()) | 233 | self.assertFalse(p.remove_all()) |
| 234 | os.remove('./tests/data/dirty.zip') | 234 | os.remove('./tests/data/clean.zip') |
| 235 | 235 | ||
| 236 | def test_html(self): | 236 | def test_html(self): |
| 237 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') | 237 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') |
| 238 | with open('./tests/data/clean.html', 'a') as f: | 238 | with open('./tests/data/clean.html', 'a') as f: |
| 239 | f.write('<open>but not</closed>') | 239 | f.write('<open>but not</closed>') |
| 240 | with self.assertRaises(ValueError): | 240 | with self.assertRaises(ValueError): |
| 241 | html.HTMLParser('./tests/data/clean.html') | 241 | web.HTMLParser('./tests/data/clean.html') |
| 242 | os.remove('./tests/data/clean.html') | 242 | os.remove('./tests/data/clean.html') |
| 243 | 243 | ||
| 244 | # Yes, we're able to deal with malformed html :/ | 244 | # Yes, we're able to deal with malformed html :/ |
| 245 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') | 245 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') |
| 246 | with open('./tests/data/clean.html', 'a') as f: | 246 | with open('./tests/data/clean.html', 'a') as f: |
| 247 | f.write('<meta name=\'this" is="weird"/>') | 247 | f.write('<meta name=\'this" is="weird"/>') |
| 248 | p = html.HTMLParser('./tests/data/clean.html') | 248 | p = web.HTMLParser('./tests/data/clean.html') |
| 249 | self.assertTrue(p.remove_all()) | 249 | self.assertTrue(p.remove_all()) |
| 250 | p = html.HTMLParser('./tests/data/clean.cleaned.html') | 250 | p = web.HTMLParser('./tests/data/clean.cleaned.html') |
| 251 | self.assertEqual(p.get_meta(), {}) | 251 | self.assertEqual(p.get_meta(), {}) |
| 252 | os.remove('./tests/data/clean.html') | 252 | os.remove('./tests/data/clean.html') |
| 253 | os.remove('./tests/data/clean.cleaned.html') | 253 | os.remove('./tests/data/clean.cleaned.html') |
| @@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 255 | with open('./tests/data/clean.html', 'w') as f: | 255 | with open('./tests/data/clean.html', 'w') as f: |
| 256 | f.write('</close>') | 256 | f.write('</close>') |
| 257 | with self.assertRaises(ValueError): | 257 | with self.assertRaises(ValueError): |
| 258 | html.HTMLParser('./tests/data/clean.html') | 258 | web.HTMLParser('./tests/data/clean.html') |
| 259 | os.remove('./tests/data/clean.html') | 259 | os.remove('./tests/data/clean.html') |
| 260 | 260 | ||
| 261 | with open('./tests/data/clean.html', 'w') as f: | 261 | with open('./tests/data/clean.html', 'w') as f: |
| 262 | f.write('<notclosed>') | 262 | f.write('<notclosed>') |
| 263 | p = html.HTMLParser('./tests/data/clean.html') | 263 | p = web.HTMLParser('./tests/data/clean.html') |
| 264 | with self.assertRaises(ValueError): | 264 | with self.assertRaises(ValueError): |
| 265 | p.get_meta() | 265 | p.get_meta() |
| 266 | p = html.HTMLParser('./tests/data/clean.html') | 266 | p = web.HTMLParser('./tests/data/clean.html') |
| 267 | with self.assertRaises(ValueError): | 267 | with self.assertRaises(ValueError): |
| 268 | p.remove_all() | 268 | p.remove_all() |
| 269 | os.remove('./tests/data/clean.html') | 269 | os.remove('./tests/data/clean.html') |
| 270 | 270 | ||
| 271 | with open('./tests/data/clean.html', 'w') as f: | ||
| 272 | f.write('<doctitle><br/></doctitle><br/><notclosed>') | ||
| 273 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 274 | with self.assertRaises(ValueError): | ||
| 275 | p.get_meta() | ||
| 276 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 277 | with self.assertRaises(ValueError): | ||
| 278 | p.remove_all() | ||
| 279 | os.remove('./tests/data/clean.html') | ||
| 280 | |||
| 281 | def test_epub(self): | ||
| 282 | with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: | ||
| 283 | zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') | ||
| 284 | p, mimetype = parser_factory.get_parser('./tests/data/clean.epub') | ||
| 285 | self.assertEqual(mimetype, 'application/epub+zip') | ||
| 286 | meta = p.get_meta() | ||
| 287 | self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'], | ||
| 288 | 'harmful content') | ||
| 289 | |||
| 290 | self.assertFalse(p.remove_all()) | ||
| 291 | os.remove('./tests/data/clean.epub') | ||
| 271 | 292 | ||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 8753e09..249c56d 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -6,7 +6,7 @@ import os | |||
| 6 | import zipfile | 6 | import zipfile |
| 7 | 7 | ||
| 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless | 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless |
| 9 | from libmat2 import check_dependencies, video, archive, html | 9 | from libmat2 import check_dependencies, video, archive, web, epub |
| 10 | 10 | ||
| 11 | 11 | ||
| 12 | class TestCheckDependencies(unittest.TestCase): | 12 | class TestCheckDependencies(unittest.TestCase): |
| @@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase): | |||
| 177 | meta = p.get_meta() | 177 | meta = p.get_meta() |
| 178 | self.assertEqual(meta['Comment'], 'this is a test comment') | 178 | self.assertEqual(meta['Comment'], 'this is a test comment') |
| 179 | 179 | ||
| 180 | def test_epub(self): | ||
| 181 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub') | ||
| 182 | self.assertEqual(mimetype, 'application/epub+zip') | ||
| 183 | meta = p.get_meta() | ||
| 184 | self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers') | ||
| 185 | self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>') | ||
| 186 | self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh') | ||
| 187 | self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>') | ||
| 188 | |||
| 189 | def test_css(self): | ||
| 190 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.css') | ||
| 191 | self.assertEqual(mimetype, 'text/css') | ||
| 192 | meta = p.get_meta() | ||
| 193 | self.assertEqual(meta['author'], 'jvoisin') | ||
| 194 | self.assertEqual(meta['version'], '1.0') | ||
| 195 | self.assertEqual(meta['harmful data'], 'underline is cool') | ||
| 196 | |||
| 180 | class TestRemovingThumbnails(unittest.TestCase): | 197 | class TestRemovingThumbnails(unittest.TestCase): |
| 181 | def test_odt(self): | 198 | def test_odt(self): |
| 182 | shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') | 199 | shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') |
| @@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase): | |||
| 599 | 616 | ||
| 600 | def test_html(self): | 617 | def test_html(self): |
| 601 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') | 618 | shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') |
| 602 | p = html.HTMLParser('./tests/data/clean.html') | 619 | p = web.HTMLParser('./tests/data/clean.html') |
| 603 | 620 | ||
| 604 | meta = p.get_meta() | 621 | meta = p.get_meta() |
| 605 | self.assertEqual(meta['author'], 'jvoisin') | 622 | self.assertEqual(meta['author'], 'jvoisin') |
| @@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase): | |||
| 607 | ret = p.remove_all() | 624 | ret = p.remove_all() |
| 608 | self.assertTrue(ret) | 625 | self.assertTrue(ret) |
| 609 | 626 | ||
| 610 | p = html.HTMLParser('./tests/data/clean.cleaned.html') | 627 | p = web.HTMLParser('./tests/data/clean.cleaned.html') |
| 611 | self.assertEqual(p.get_meta(), {}) | 628 | self.assertEqual(p.get_meta(), {}) |
| 612 | self.assertTrue(p.remove_all()) | 629 | self.assertTrue(p.remove_all()) |
| 613 | 630 | ||
| 614 | os.remove('./tests/data/clean.html') | 631 | os.remove('./tests/data/clean.html') |
| 615 | os.remove('./tests/data/clean.cleaned.html') | 632 | os.remove('./tests/data/clean.cleaned.html') |
| 616 | os.remove('./tests/data/clean.cleaned.cleaned.html') | 633 | os.remove('./tests/data/clean.cleaned.cleaned.html') |
| 634 | |||
| 635 | |||
| 636 | def test_epub(self): | ||
| 637 | shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') | ||
| 638 | p = epub.EPUBParser('./tests/data/clean.epub') | ||
| 639 | |||
| 640 | meta = p.get_meta() | ||
| 641 | self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm') | ||
| 642 | |||
| 643 | ret = p.remove_all() | ||
| 644 | self.assertTrue(ret) | ||
| 645 | |||
| 646 | p = epub.EPUBParser('./tests/data/clean.cleaned.epub') | ||
| 647 | self.assertEqual(p.get_meta(), {}) | ||
| 648 | self.assertTrue(p.remove_all()) | ||
| 649 | |||
| 650 | os.remove('./tests/data/clean.epub') | ||
| 651 | os.remove('./tests/data/clean.cleaned.epub') | ||
| 652 | os.remove('./tests/data/clean.cleaned.cleaned.epub') | ||
| 653 | |||
| 654 | |||
| 655 | def test_css(self): | ||
| 656 | shutil.copy('./tests/data/dirty.css', './tests/data/clean.css') | ||
| 657 | p = web.CSSParser('./tests/data/clean.css') | ||
| 658 | |||
| 659 | self.assertEqual(p.get_meta(), { | ||
| 660 | 'harmful data': 'underline is cool', | ||
| 661 | 'version': '1.0', | ||
| 662 | 'author': 'jvoisin'}) | ||
| 663 | |||
| 664 | ret = p.remove_all() | ||
| 665 | self.assertTrue(ret) | ||
| 666 | |||
| 667 | p = web.CSSParser('./tests/data/clean.cleaned.css') | ||
| 668 | self.assertEqual(p.get_meta(), {}) | ||
| 669 | self.assertTrue(p.remove_all()) | ||
| 670 | |||
| 671 | os.remove('./tests/data/clean.css') | ||
| 672 | os.remove('./tests/data/clean.cleaned.css') | ||
| 673 | os.remove('./tests/data/clean.cleaned.cleaned.css') | ||
