diff options
| -rw-r--r-- | main.py | 46 | ||||
| -rw-r--r-- | src/__init__.py | 0 | ||||
| -rw-r--r-- | src/parsers/__init__.py | 0 | ||||
| -rw-r--r-- | src/parsers/abstract.py | 10 | ||||
| -rw-r--r-- | src/parsers/pdf.py | 106 | ||||
| -rw-r--r-- | tests/data/dirty.pdf | bin | 0 -> 543475 bytes | |||
| -rw-r--r-- | tests/main.py | 22 |
7 files changed, 184 insertions, 0 deletions
| @@ -0,0 +1,46 @@ | |||
| 1 | import sys | ||
| 2 | from shutil import copyfile | ||
| 3 | import argparse | ||
| 4 | |||
| 5 | from src.parsers import pdf | ||
| 6 | |||
| 7 | |||
def create_arg_parser():
    """Build the command-line interface of the tool.

    Returns:
        An ``argparse.ArgumentParser`` that accepts any number of positional
        ``files`` plus the boolean switches ``-c/--check``, ``-l/--list`` and
        ``-s/--show``, grouped under an "Information" heading.
    """
    cli = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')
    cli.add_argument('files', nargs='*')

    # (short flag, long flag, help text) of every informational switch.
    switches = (
        ('-c', '--check', 'check if a file is free of harmful metadatas'),
        ('-l', '--list', 'list all supported fileformats'),
        ('-s', '--show',
         'list all the harmful metadata of a file without removing them'),
    )
    group = cli.add_argument_group('Information')
    for short_flag, long_flag, description in switches:
        group.add_argument(short_flag, long_flag, action='store_true',
                           help=description)
    return cli
| 20 | |||
def show_meta(file_name: str):
    """Print the metadata of *file_name*, one ``key: value`` pair per line.

    The file is wrapped in a ``pdf.PDFParser`` and the mapping returned by
    its ``get_meta()`` is written to standard output.
    """
    found = pdf.PDFParser(file_name).get_meta()
    for name in found:
        print("%s: %s" % (name, found[name]))
| 25 | |||
def main():
    """Run the command-line tool.

    * With ``--show``, print the metadata of every given file.
    * Without any file, print the usage help.
    * Otherwise restore the first given file from its ``.bak`` copy, clean it
      with ``pdf.PDFParser.remove_all()`` and print ``ok``.

    Returns:
        ``0`` once the selected action has completed.

    Raises:
        OSError: if ``<file>.bak`` cannot be copied over ``<file>``.
    """
    argparser = create_arg_parser()
    args = argparser.parse_args()

    if args.show:
        for f in args.files:
            show_meta(f)
        return 0
    if not args.files:
        # ArgumentParser has no show_help(); print_help() is the real API,
        # and the parser object in this scope is named ``argparser``.
        argparser.print_help()
        return 0

    # Take the file from the parsed positionals: sys.argv[1] may be an option
    # such as ``-c`` rather than a file name. Only the first file is handled,
    # as before.
    target = args.files[0]
    # NOTE(review): the target is overwritten from "<file>.bak" before being
    # cleaned, and this raises when no backup exists. It looks like
    # development scaffolding - confirm whether it should stay.
    copyfile(target + '.bak', target)
    pdf.PDFParser(target).remove_all()
    print("ok")
    return 0
| 42 | |||
| 43 | |||
if __name__ == '__main__':
    # Hand main()'s return value to the interpreter as the exit status
    # instead of discarding it (a None result still exits with status 0).
    sys.exit(main())
diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/__init__.py | |||
diff --git a/src/parsers/__init__.py b/src/parsers/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/parsers/__init__.py | |||
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py new file mode 100644 index 0000000..a9129cc --- /dev/null +++ b/src/parsers/abstract.py | |||
| @@ -0,0 +1,10 @@ | |||
class AbstractParser:
    """Base class holding the state shared by file-format parsers.

    It stores the name of the file to handle and an initially empty set of
    metadata field names. ``get_meta`` and ``remove_all`` are placeholders
    that raise ``NotImplementedError`` until a subclass overrides them.
    """

    def __init__(self, filename: str):
        """Remember *filename* and start with an empty ``meta_list`` set."""
        self.meta_list = set()
        self.filename = filename

    def remove_all(self):
        """Placeholder for metadata removal; always raises here."""
        raise NotImplementedError()

    def get_meta(self):
        """Placeholder for metadata retrieval; always raises here."""
        raise NotImplementedError()
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py new file mode 100644 index 0000000..c25b324 --- /dev/null +++ b/src/parsers/pdf.py | |||
| @@ -0,0 +1,106 @@ | |||
| 1 | """ Handle PDF | ||
| 2 | |||
| 3 | """ | ||
| 4 | |||
| 5 | import os | ||
| 6 | import logging | ||
| 7 | import tempfile | ||
| 8 | import shutil | ||
| 9 | import io | ||
| 10 | |||
| 11 | import cairo | ||
| 12 | import gi | ||
| 13 | gi.require_version('Poppler', '0.18') | ||
| 14 | from gi.repository import Poppler | ||
| 15 | |||
| 16 | try: | ||
| 17 | from PIL import Image | ||
| 18 | except ImportError: | ||
| 19 | Image = None | ||
| 20 | |||
| 21 | from . import abstract | ||
| 22 | |||
| 23 | logging.basicConfig(level=logging.DEBUG) | ||
| 24 | |||
| 25 | |||
class PDFParser(abstract.AbstractParser):
    """Read and strip the metadata of a PDF file using Poppler and cairo."""

    def __init__(self, filename):
        """Prepare a parser for *filename*.

        Sets ``meta_list`` to the Poppler document properties reported by
        ``get_meta``, ``uri`` to the ``file://`` URI of the file, and
        ``password`` to ``None``.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        self.uri = self._path_to_uri(self.filename)
        self.password = None

    @staticmethod
    def _path_to_uri(path: str) -> str:
        """Return the absolute, percent-encoded ``file://`` URI of *path*.

        Plain ``'file://' + path`` concatenation yields an invalid URI as
        soon as the path contains characters such as spaces, ``#`` or ``%``.
        """
        # Imported locally so this class does not depend on a module-level
        # import that the file does not have.
        from pathlib import Path
        return Path(os.path.abspath(path)).as_uri()

    def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
        """Re-encode a PNG image with Pillow's ``optimize`` option.

        Currently unused: the only call site in ``remove_all`` is commented
        out. The image is resized to its own dimensions, so only the PNG
        encoding changes.

        Args:
            img: buffer holding the image to re-encode.

        Returns:
            *img* itself when Pillow is not installed, otherwise a new
            buffer, rewound to its start, holding the re-encoded PNG.
        """
        if Image is None:
            return img
        ret = io.BytesIO()
        im = Image.open(img)
        w, h = im.size
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = im.resize((w, h), Image.LANCZOS)
        resized.save(ret, optimize=True, format="PNG")
        ret.seek(0)

        return ret

    def remove_all(self):
        """Write a metadata-free copy of the document to ``OUT_clean.pdf``.

        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF (``OUT.pdf``). Metadata from the
        new PDF are removed via Poppler, because there is no way to tell
        cairo to not add "created by cairo" during rendering.

        Both output files are created in the current working directory.

        TODO: Improve the resolution
        TODO: Don't use a temp file

        Returns:
            ``True`` once ``OUT_clean.pdf`` has been saved.
        """
        scale = 2  # pages are rendered at twice their nominal size
        document = Poppler.Document.new_from_file(self.uri, self.password)
        n_pages = document.get_n_pages()

        # The initial surface size is a dummy; it is reset for every page.
        pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(n_pages):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, n_pages)

            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                             int(page_width) * scale,
                                             int(page_height) * scale)
            img_context = cairo.Context(img_surface)

            img_context.scale(scale, scale)
            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Serialise the rendered page to an in-memory PNG.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            # Optional re-encoding hook, currently disabled:
            #buf = self.__optimize_image_size(buf)

            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width * scale, page_height * scale)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Re-open the cairo output to overwrite the producer/creator fields.
        document = Poppler.Document.new_from_file(self._path_to_uri('OUT.pdf'), self.password)
        document.set_producer('totally not MAT2 ;)')
        document.set_creator('')
        document.save(self._path_to_uri("OUT_clean.pdf"))

        return True

    def get_meta(self):
        """Return a dict with all the meta of the file.

        Only the properties named in ``meta_list`` whose value is truthy are
        included.
        """
        # Log the URI instead of print()-ing it: print() does not interpolate
        # "%s" and would pollute the metadata listing on stdout.
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = value
        return metadata
diff --git a/tests/data/dirty.pdf b/tests/data/dirty.pdf new file mode 100644 index 0000000..0d88779 --- /dev/null +++ b/tests/data/dirty.pdf | |||
| Binary files differ | |||
diff --git a/tests/main.py b/tests/main.py new file mode 100644 index 0000000..52828af --- /dev/null +++ b/tests/main.py | |||
| @@ -0,0 +1,22 @@ | |||
| 1 | #!/usr/bin/python3 | ||
| 2 | |||
| 3 | import unittest | ||
| 4 | |||
class TestCleaning(unittest.TestCase):
    """Placeholder checks on ``str`` methods; no PDF is exercised here."""

    def test_pdf(self):
        """``str.upper`` turns a lowercase word into uppercase."""
        shouted = 'foo'.upper()
        self.assertEqual(shouted, 'FOO')

    def test_isupper(self):
        """``str.isupper`` accepts all-caps text and rejects mixed case."""
        all_caps = 'FOO'.isupper()
        mixed_case = 'Foo'.isupper()
        self.assertTrue(all_caps)
        self.assertFalse(mixed_case)

    def test_split(self):
        """``str.split`` cuts on whitespace and rejects a non-string separator."""
        sentence = 'hello world'
        words = sentence.split()
        self.assertEqual(words, ['hello', 'world'])
        # check that split fails when the separator is not a string
        self.assertRaises(TypeError, sentence.split, 2)
| 19 | |||
| 20 | |||
# Run the test cases above when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
