Test for faulty files, and document how MAT2 is behaving wrt. them

author: jvoisin 2018-05-06 21:58:31 +0200
committer: jvoisin 2018-05-06 21:58:31 +0200
commit: b02d72887afd4498b03cdd767ca46676fb150622 (patch)
tree: 9e10596ac428a4fce2b7da14ee8fb972ff826a68
parent: 459e9b82f76d9aa5c93c288d2ce7b8be34c601b4 (diff)
5 files changed, 38 insertions, 3 deletions
diff --git a/doc/implementation_notes.md b/doc/implementation_notes.md
index 60e9081..59e7d94 100644
--- a/doc/implementation_notes.md
+++ b/doc/implementation_notes.md
@@ -9,6 +9,14 @@ that only cleans the superficial metadata of your file, but not
 the ones that might be in **embeded** resources. Like for example,
 images in a PDF or an office document.
+Race conditions
+---------------
+MAT2 does its very best to avoid crashing at runtime. This is why it's checking
+if the file is valid __at parser creation__. MAT2 doesn't take any measure to
+ensure that the file is not changed between the time the parser is
+instantiated, and the call to clean or show the metadata.
 Symlink attacks
 ---------------
diff --git a/src/images.py b/src/images.py
index 7c1abaa..6cc3dfe 100644
--- a/src/images.py
+++ b/src/images.py
@@ -20,6 +20,13 @@ class PNGParser(abstract.AbstractParser):
            'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize',
            'Megapixels', 'ImageHeight'}
+    def __init__(self, filename):
+        super().__init__(filename)
+        try:  # better fail here than later
+            cairo.ImageSurface.create_from_png(self.filename)
+        except MemoryError:
+            raise ValueError
    def get_meta(self):
        out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename])
        meta = json.loads(out.decode('utf-8'))[0]
diff --git a/src/parser_factory.py b/src/parser_factory.py
index 68e9e9c..80aedae 100644
--- a/src/parser_factory.py
+++ b/src/parser_factory.py
@@ -30,5 +30,8 @@ def get_parser(filename: str) -> (T, str):
    for c in _get_parsers():
        if mtype in c.mimetypes:
-            return c(filename), mtype
+            try:
+                return c(filename), mtype
+            except ValueError:
+                return None, mtype
    return None, mtype
diff --git a/src/pdf.py b/src/pdf.py
index 6e639cd..3ba3d4a 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -11,7 +11,7 @@ import io
 import cairo
 import gi
 gi.require_version('Poppler', '0.18')
-from gi.repository import Poppler
+from gi.repository import Poppler, GLib
 from . import abstract
@@ -28,6 +28,10 @@ class PDFParser(abstract.AbstractParser):
        super().__init__(filename)
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.__scale = 2  # how much precision do we want for the render
+        try:  # Check now that the file is valid, to avoid surprises later
+            Poppler.Document.new_from_file(self.uri, None)
+        except GLib.GError:  # Invalid PDF
+            raise ValueError
    def remove_all_lightweight(self):
        """
@@ -116,8 +120,9 @@ class PDFParser(abstract.AbstractParser):
    def get_meta(self):
        """ Return a dict with all the meta of the file
        """
-        document = Poppler.Document.new_from_file(self.uri, None)
        metadata = {}
+        document = Poppler.Document.new_from_file(self.uri, None)
        for key in self.meta_list:
            if document.get_property(key):
                metadata[key] = document.get_property(key)
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 1950444..17afaf4 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -16,6 +16,18 @@ class TestParserFactory(unittest.TestCase):
        self.assertEqual(mimetype, 'audio/mpeg')
        self.assertEqual(parser.__class__, audio.MP3Parser)
+class TestCorruptedFiles(unittest.TestCase):
+    def test_pdf(self):
+        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
+        with self.assertRaises(ValueError):
+            pdf.PDFParser('./tests/data/clean.png')
+        os.remove('./tests/data/clean.png')
+    def test_png(self):
+        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
+        with self.assertRaises(ValueError):
+            images.PNGParser('./tests/data/clean.pdf')
+        os.remove('./tests/data/clean.pdf')
 class TestGetMeta(unittest.TestCase):
    def test_pdf(self):
author	jvoisin	2018-05-06 21:58:31 +0200
committer	jvoisin	2018-05-06 21:58:31 +0200
commit	b02d72887afd4498b03cdd767ca46676fb150622 (patch)
tree	9e10596ac428a4fce2b7da14ee8fb972ff826a68
parent	459e9b82f76d9aa5c93c288d2ce7b8be34c601b4 (diff)