diff options
| author | jvoisin | 2018-05-06 21:58:31 +0200 |
|---|---|---|
| committer | jvoisin | 2018-05-06 21:58:31 +0200 |
| commit | b02d72887afd4498b03cdd767ca46676fb150622 (patch) | |
| tree | 9e10596ac428a4fce2b7da14ee8fb972ff826a68 | |
| parent | 459e9b82f76d9aa5c93c288d2ce7b8be34c601b4 (diff) | |
Test for faulty files, and document how MAT2 is behaving wrt. them
| -rw-r--r-- | doc/implementation_notes.md | 8 | ||||
| -rw-r--r-- | src/images.py | 7 | ||||
| -rw-r--r-- | src/parser_factory.py | 5 | ||||
| -rw-r--r-- | src/pdf.py | 9 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 12 |
5 files changed, 38 insertions, 3 deletions
diff --git a/doc/implementation_notes.md b/doc/implementation_notes.md index 60e9081..59e7d94 100644 --- a/doc/implementation_notes.md +++ b/doc/implementation_notes.md | |||
| @@ -9,6 +9,14 @@ that only cleans the superficial metadata of your file, but not | |||
| 9 | the ones that might be in **embedded** resources. Like for example, | 9 | the ones that might be in **embedded** resources. Like for example, |
| 10 | images in a PDF or an office document. | 10 | images in a PDF or an office document. |
| 11 | 11 | ||
| 12 | Race conditions | ||
| 13 | --------------- | ||
| 14 | |||
| 15 | MAT2 does its very best to avoid crashing at runtime. This is why it checks | ||
| 16 | whether the file is valid __at parser creation__. MAT2 doesn't take any measures to | ||
| 17 | ensure that the file is not changed between the time the parser is | ||
| 18 | instantiated and the call to clean or show the metadata. | ||
| 19 | |||
| 12 | Symlink attacks | 20 | Symlink attacks |
| 13 | --------------- | 21 | --------------- |
| 14 | 22 | ||
diff --git a/src/images.py b/src/images.py index 7c1abaa..6cc3dfe 100644 --- a/src/images.py +++ b/src/images.py | |||
| @@ -20,6 +20,13 @@ class PNGParser(abstract.AbstractParser): | |||
| 20 | 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', | 20 | 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', |
| 21 | 'Megapixels', 'ImageHeight'} | 21 | 'Megapixels', 'ImageHeight'} |
| 22 | 22 | ||
| 23 | def __init__(self, filename): | ||
| 24 | super().__init__(filename) | ||
| 25 | try: # better fail here than later | ||
| 26 | cairo.ImageSurface.create_from_png(self.filename) | ||
| 27 | except MemoryError: | ||
| 28 | raise ValueError | ||
| 29 | |||
| 23 | def get_meta(self): | 30 | def get_meta(self): |
| 24 | out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename]) | 31 | out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename]) |
| 25 | meta = json.loads(out.decode('utf-8'))[0] | 32 | meta = json.loads(out.decode('utf-8'))[0] |
diff --git a/src/parser_factory.py b/src/parser_factory.py index 68e9e9c..80aedae 100644 --- a/src/parser_factory.py +++ b/src/parser_factory.py | |||
| @@ -30,5 +30,8 @@ def get_parser(filename: str) -> (T, str): | |||
| 30 | 30 | ||
| 31 | for c in _get_parsers(): | 31 | for c in _get_parsers(): |
| 32 | if mtype in c.mimetypes: | 32 | if mtype in c.mimetypes: |
| 33 | return c(filename), mtype | 33 | try: |
| 34 | return c(filename), mtype | ||
| 35 | except ValueError: | ||
| 36 | return None, mtype | ||
| 34 | return None, mtype | 37 | return None, mtype |
diff --git a/src/pdf.py b/src/pdf.py --- a/src/pdf.py +++ b/src/pdf.py | |||
| @@ -11,7 +11,7 @@ import io | |||
| 11 | import cairo | 11 | import cairo |
| 12 | import gi | 12 | import gi |
| 13 | gi.require_version('Poppler', '0.18') | 13 | gi.require_version('Poppler', '0.18') |
| 14 | from gi.repository import Poppler | 14 | from gi.repository import Poppler, GLib |
| 15 | 15 | ||
| 16 | from . import abstract | 16 | from . import abstract |
| 17 | 17 | ||
| @@ -28,6 +28,10 @@ class PDFParser(abstract.AbstractParser): | |||
| 28 | super().__init__(filename) | 28 | super().__init__(filename) |
| 29 | self.uri = 'file://' + os.path.abspath(self.filename) | 29 | self.uri = 'file://' + os.path.abspath(self.filename) |
| 30 | self.__scale = 2 # how much precision do we want for the render | 30 | self.__scale = 2 # how much precision do we want for the render |
| 31 | try: # Check now that the file is valid, to avoid surprises later | ||
| 32 | Poppler.Document.new_from_file(self.uri, None) | ||
| 33 | except GLib.GError: # Invalid PDF | ||
| 34 | raise ValueError | ||
| 31 | 35 | ||
| 32 | def remove_all_lightweight(self): | 36 | def remove_all_lightweight(self): |
| 33 | """ | 37 | """ |
| @@ -116,8 +120,9 @@ class PDFParser(abstract.AbstractParser): | |||
| 116 | def get_meta(self): | 120 | def get_meta(self): |
| 117 | """ Return a dict with all the meta of the file | 121 | """ Return a dict with all the meta of the file |
| 118 | """ | 122 | """ |
| 119 | document = Poppler.Document.new_from_file(self.uri, None) | ||
| 120 | metadata = {} | 123 | metadata = {} |
| 124 | document = Poppler.Document.new_from_file(self.uri, None) | ||
| 125 | |||
| 121 | for key in self.meta_list: | 126 | for key in self.meta_list: |
| 122 | if document.get_property(key): | 127 | if document.get_property(key): |
| 123 | metadata[key] = document.get_property(key) | 128 | metadata[key] = document.get_property(key) |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 1950444..17afaf4 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -16,6 +16,18 @@ class TestParserFactory(unittest.TestCase): | |||
| 16 | self.assertEqual(mimetype, 'audio/mpeg') | 16 | self.assertEqual(mimetype, 'audio/mpeg') |
| 17 | self.assertEqual(parser.__class__, audio.MP3Parser) | 17 | self.assertEqual(parser.__class__, audio.MP3Parser) |
| 18 | 18 | ||
| 19 | class TestCorruptedFiles(unittest.TestCase): | ||
| 20 | def test_pdf(self): | ||
| 21 | shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') | ||
| 22 | with self.assertRaises(ValueError): | ||
| 23 | pdf.PDFParser('./tests/data/clean.png') | ||
| 24 | os.remove('./tests/data/clean.png') | ||
| 25 | |||
| 26 | def test_png(self): | ||
| 27 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') | ||
| 28 | with self.assertRaises(ValueError): | ||
| 29 | images.PNGParser('./tests/data/clean.pdf') | ||
| 30 | os.remove('./tests/data/clean.pdf') | ||
| 19 | 31 | ||
| 20 | class TestGetMeta(unittest.TestCase): | 32 | class TestGetMeta(unittest.TestCase): |
| 21 | def test_pdf(self): | 33 | def test_pdf(self): |
