Add lightweight processing for PDF

author: jvoisin 2018-04-14 21:23:31 +0200
committer: jvoisin 2018-04-14 21:23:31 +0200
commit: 96299c6a5350f59eab022a09400eddcc347daede (patch)
tree: 492df3a7637b2d1cb45424615ab2777043043eab
parent: 6f4ed2490fbcde0b74e7b8251ad71e29b430b8ef (diff)
5 files changed, 84 insertions, 15 deletions
diff --git a/main.py b/main.py
index be2508e..2cb05ff 100755
--- a/main.py
+++ b/main.py
@@ -31,6 +31,8 @@ def create_arg_parser():
                      help='list all supported fileformats')
    info.add_argument('-s', '--show', action='store_true',
                      help='list all the harmful metadata of a file without removing them')
+    info.add_argument('-L', '--lightweight', action='store_true',
+                      help='remove SOME metadata')
    return parser
@@ -50,7 +52,7 @@ def show_meta(filename:str):
            print("  %s: harmful content" % k)
-def clean_meta(filename:str):
+def clean_meta(filename:str, is_lightweigth:bool):
    if not __check_file(filename, os.R_OK|os.W_OK):
        return
@@ -58,7 +60,10 @@ def clean_meta(filename:str):
    if p is None:
        print("[-] %s's format (%s) is not supported" % (filename, mtype))
        return
-    p.remove_all()
+    if is_lightweigth:
+        p.remove_all_lightweight()
+    else:
+        p.remove_all()
 def show_parsers():
@@ -78,12 +83,12 @@ def __get_files_recursively(files):
                for _f in _files:
                    yield os.path.join(path, _f)
-def __do_clean_async(q):
+def __do_clean_async(is_lightweigth, q):
+    """ Worker loop: clean every file taken from the queue `q`, in
+    lightweight mode when `is_lightweigth` is true, until a `None`
+    sentinel is received. """
    while True:
        f = q.get()
        if f is None:  # nothing more to process
            return
-        clean_meta(f)
+        # clean_meta() is declared as (filename, is_lightweigth): the
+        # filename goes first, the mode flag second.
+        clean_meta(f, is_lightweigth)
        q.task_done()
@@ -109,7 +114,7 @@ def main():
            q.put(f)
        for _ in range(multiprocessing.cpu_count()):
-            worker = Thread(target=__do_clean_async, args=(q, ))
+            worker = Thread(target=__do_clean_async, args=(mode, q))
            worker.start()
            threads.append(worker)
diff --git a/src/abstract.py b/src/abstract.py
index 04c1535..93e8421 100644
--- a/src/abstract.py
+++ b/src/abstract.py
@@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
    @abc.abstractmethod
    def remove_all(self) -> bool:
        pass
+    def remove_all_lightweight(self) -> bool:
+        """ Remove _SOME_ metadata.
+        This default implementation simply delegates to `remove_all()`
+        and returns its result; subclasses may override it.
+        """
+        return self.remove_all()
diff --git a/src/pdf.py b/src/pdf.py
index c119449..6e639cd 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.__scale = 2  # how much precision do we want for the render
+    def remove_all_lightweight(self):
+        """
+            Load the document into Poppler, render pages on a new PDFSurface.
+            The intermediate PDF is written to a temporary file, which is
+            then passed through `__remove_superficial_meta` to produce
+            `self.output_filename`. The temporary file is always deleted,
+            even if rendering fails. Returns True on success.
+        """
+        document = Poppler.Document.new_from_file(self.uri, None)
+        pages_count = document.get_n_pages()
+        # mkstemp() returns an already-open file descriptor along with the
+        # path; only the path is handed to cairo, so close the descriptor
+        # right away instead of leaking it.
+        tmp_fd, tmp_path = tempfile.mkstemp()
+        os.close(tmp_fd)
+        try:
+            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized per page below
+            pdf_context = cairo.Context(pdf_surface)  # context draws on the surface
+            for pagenum in range(pages_count):
+                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
+                page = document.get_page(pagenum)
+                page_width, page_height = page.get_size()
+                pdf_surface.set_size(page_width, page_height)
+                pdf_context.save()
+                page.render_for_printing(pdf_context)
+                pdf_context.restore()
+                pdf_context.show_page()  # draw pdf_context on pdf_surface
+            pdf_surface.finish()
+            self.__remove_superficial_meta(tmp_path, self.output_filename)
+        finally:
+            # Do not leave the intermediate file behind when a step above raises.
+            os.remove(tmp_path)
+        return True
    def remove_all(self):
        """
            Load the document into Poppler, render pages on PNG,
-            and shove those PNG into a new PDF. Metadata from the new
+            and shove those PNG into a new PDF.
-            PDF are removed via Poppler, because there is no way to tell
-            cairo to not add "created by cairo" during rendering.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()
        _, tmp_path = tempfile.mkstemp()
-        pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
+        pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
        pdf_context = cairo.Context(pdf_surface)
        for pagenum in range(pages_count):
@@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
        pdf_surface.finish()
        # Removes metadata added by Poppler
-        document = Poppler.Document.new_from_file('file://' + tmp_path)
+        self.__remove_superficial_meta(tmp_path, self.output_filename)
-        document.set_producer('')
-        document.set_creator('')
-        document.save('file://' + os.path.abspath(self.output_filename))
        os.remove(tmp_path)
        return True
+    def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
+        """ Open the PDF at `in_file` with Poppler, blank its `producer`
+        and `creator` fields, and save the result to `out_file`.
+        `in_file` is appended to 'file://' as-is, whereas `out_file` goes
+        through os.path.abspath() first.
+        NOTE(review): `in_file` presumably has to be an absolute path
+        (the visible callers pass a mkstemp() path) - confirm.
+        Always returns True.
+        """
+        document = Poppler.Document.new_from_file('file://' + in_file)
+        document.set_producer('')
+        document.set_creator('')
+        document.save('file://' + os.path.abspath(out_file))
+        return True
    def __parse_metadata_field(self, data:str) -> dict:
        metadata = {}
diff --git a/tests/test_climat2.py b/tests/test_climat2.py
index b9c52b5..64345eb 100644
--- a/tests/test_climat2.py
+++ b/tests/test_climat2.py
@@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
    def test_help(self):
        proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
-        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
+        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
    def test_no_arg(self):
        proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
-        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
+        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
 class TestGetMeta(unittest.TestCase):
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 6141dbe..34f7301 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
        os.remove('./tests/data/clean.odt')
+class TestLightWeightCleaning(unittest.TestCase):
+    """ Exercise `remove_all_lightweight()` on copies of the sample files
+    stored under ./tests/data. """
+    def test_pdf(self):
+        # Work on a copy, so that the dirty fixture itself is never modified.
+        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
+        p = pdf.PDFParser('./tests/data/clean.pdf')
+        # Sanity check: the input does carry metadata before cleaning.
+        meta = p.get_meta()
+        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
+        ret = p.remove_all_lightweight()
+        self.assertTrue(ret)
+        # The cleaned output is read back from '<input>.cleaned'.
+        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
+        # Only these three fields are expected to be reported after cleaning.
+        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
+        self.assertEqual(p.get_meta(), expected_meta)
+        # NOTE(review): only the copy is deleted here; 'clean.pdf.cleaned'
+        # stays on disk - confirm this is intended.
+        os.remove('./tests/data/clean.pdf')
+    def test_png(self):
+        # Work on a copy, so that the dirty fixture itself is never modified.
+        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
+        p = images.PNGParser('./tests/data/clean.png')
+        # Sanity check: the input does carry metadata before cleaning.
+        meta = p.get_meta()
+        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
+        # NOTE(review): presumably PNGParser has no lightweight override and
+        # this exercises the AbstractParser fallback - confirm.
+        ret = p.remove_all_lightweight()
+        self.assertTrue(ret)
+        p = images.PNGParser('./tests/data/clean.png.cleaned')
+        self.assertEqual(p.get_meta(), {})
+        # NOTE(review): 'clean.png.cleaned' is not removed - confirm intended.
+        os.remove('./tests/data/clean.png')
 class TestCleaning(unittest.TestCase):
    def test_pdf(self):
author	jvoisin	2018-04-14 21:23:31 +0200
committer	jvoisin	2018-04-14 21:23:31 +0200
commit	96299c6a5350f59eab022a09400eddcc347daede (patch)
tree	492df3a7637b2d1cb45424615ab2777043043eab
parent	6f4ed2490fbcde0b74e7b8251ad71e29b430b8ef (diff)