summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2018-05-06 21:58:31 +0200
committerjvoisin2018-05-06 21:58:31 +0200
commitb02d72887afd4498b03cdd767ca46676fb150622 (patch)
tree9e10596ac428a4fce2b7da14ee8fb972ff826a68
parent459e9b82f76d9aa5c93c288d2ce7b8be34c601b4 (diff)
Test for faulty files, and document how MAT2 is behaving wrt. them
-rw-r--r--doc/implementation_notes.md8
-rw-r--r--src/images.py7
-rw-r--r--src/parser_factory.py5
-rw-r--r--src/pdf.py9
-rw-r--r--tests/test_libmat2.py12
5 files changed, 38 insertions, 3 deletions
diff --git a/doc/implementation_notes.md b/doc/implementation_notes.md
index 60e9081..59e7d94 100644
--- a/doc/implementation_notes.md
+++ b/doc/implementation_notes.md
@@ -9,6 +9,14 @@ that only cleans the superficial metadata of your file, but not
9the ones that might be in **embedded** resources. Like for example, 9the ones that might be in **embedded** resources. Like for example,
10images in a PDF or an office document. 10images in a PDF or an office document.
11 11
12Race conditions
13---------------
14
15MAT2 does its very best to avoid crashing at runtime. This is why it checks
16whether the file is valid __at parser creation__. MAT2 takes no measures to
17ensure that the file is not changed between the time the parser is
18instantiated and the call to clean or show the metadata.
19
12Symlink attacks 20Symlink attacks
13--------------- 21---------------
14 22
diff --git a/src/images.py b/src/images.py
index 7c1abaa..6cc3dfe 100644
--- a/src/images.py
+++ b/src/images.py
@@ -20,6 +20,13 @@ class PNGParser(abstract.AbstractParser):
20 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', 20 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize',
21 'Megapixels', 'ImageHeight'} 21 'Megapixels', 'ImageHeight'}
22 22
23 def __init__(self, filename):
24 super().__init__(filename)
25 try: # better fail here than later
26 cairo.ImageSurface.create_from_png(self.filename)
27 except MemoryError:
28 raise ValueError
29
23 def get_meta(self): 30 def get_meta(self):
24 out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename]) 31 out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename])
25 meta = json.loads(out.decode('utf-8'))[0] 32 meta = json.loads(out.decode('utf-8'))[0]
diff --git a/src/parser_factory.py b/src/parser_factory.py
index 68e9e9c..80aedae 100644
--- a/src/parser_factory.py
+++ b/src/parser_factory.py
@@ -30,5 +30,8 @@ def get_parser(filename: str) -> (T, str):
30 30
31 for c in _get_parsers(): 31 for c in _get_parsers():
32 if mtype in c.mimetypes: 32 if mtype in c.mimetypes:
33 return c(filename), mtype 33 try:
34 return c(filename), mtype
35 except ValueError:
36 return None, mtype
34 return None, mtype 37 return None, mtype
diff --git a/src/pdf.py b/src/pdf.py
index 6e639cd..3ba3d4a 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -11,7 +11,7 @@ import io
11import cairo 11import cairo
12import gi 12import gi
13gi.require_version('Poppler', '0.18') 13gi.require_version('Poppler', '0.18')
14from gi.repository import Poppler 14from gi.repository import Poppler, GLib
15 15
16from . import abstract 16from . import abstract
17 17
@@ -28,6 +28,10 @@ class PDFParser(abstract.AbstractParser):
28 super().__init__(filename) 28 super().__init__(filename)
29 self.uri = 'file://' + os.path.abspath(self.filename) 29 self.uri = 'file://' + os.path.abspath(self.filename)
30 self.__scale = 2 # how much precision do we want for the render 30 self.__scale = 2 # how much precision do we want for the render
31 try: # Check now that the file is valid, to avoid surprises later
32 Poppler.Document.new_from_file(self.uri, None)
33 except GLib.GError: # Invalid PDF
34 raise ValueError
31 35
32 def remove_all_lightweight(self): 36 def remove_all_lightweight(self):
33 """ 37 """
@@ -116,8 +120,9 @@ class PDFParser(abstract.AbstractParser):
116 def get_meta(self): 120 def get_meta(self):
117 """ Return a dict with all the meta of the file 121 """ Return a dict with all the meta of the file
118 """ 122 """
119 document = Poppler.Document.new_from_file(self.uri, None)
120 metadata = {} 123 metadata = {}
124 document = Poppler.Document.new_from_file(self.uri, None)
125
121 for key in self.meta_list: 126 for key in self.meta_list:
122 if document.get_property(key): 127 if document.get_property(key):
123 metadata[key] = document.get_property(key) 128 metadata[key] = document.get_property(key)
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 1950444..17afaf4 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -16,6 +16,18 @@ class TestParserFactory(unittest.TestCase):
16 self.assertEqual(mimetype, 'audio/mpeg') 16 self.assertEqual(mimetype, 'audio/mpeg')
17 self.assertEqual(parser.__class__, audio.MP3Parser) 17 self.assertEqual(parser.__class__, audio.MP3Parser)
18 18
19class TestCorruptedFiles(unittest.TestCase):
20 def test_pdf(self):
21 shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
22 with self.assertRaises(ValueError):
23 pdf.PDFParser('./tests/data/clean.png')
24 os.remove('./tests/data/clean.png')
25
26 def test_png(self):
27 shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
28 with self.assertRaises(ValueError):
29 images.PNGParser('./tests/data/clean.pdf')
30 os.remove('./tests/data/clean.pdf')
19 31
20class TestGetMeta(unittest.TestCase): 32class TestGetMeta(unittest.TestCase):
21 def test_pdf(self): 33 def test_pdf(self):