summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- main.py 46
-rw-r--r-- src/__init__.py 0
-rw-r--r-- src/parsers/__init__.py 0
-rw-r--r-- src/parsers/abstract.py 10
-rw-r--r-- src/parsers/pdf.py 106
-rw-r--r-- tests/data/dirty.pdf bin 0 -> 543475 bytes
-rw-r--r-- tests/main.py 22
7 files changed, 184 insertions, 0 deletions
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e4157e6
--- /dev/null
+++ b/main.py
@@ -0,0 +1,46 @@
1import sys
2from shutil import copyfile
3import argparse
4
5from src.parsers import pdf
6
7
def create_arg_parser():
    """Build the command-line argument parser.

    The parser accepts any number of positional ``files`` and three boolean
    flags, grouped under "Information": ``-c/--check``, ``-l/--list`` and
    ``-s/--show``.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')
    parser.add_argument('files', nargs='*')

    info = parser.add_argument_group('Information')
    info.add_argument('-c', '--check', action='store_true',
                      help='check if a file is free of harmful metadata')
    info.add_argument('-l', '--list', action='store_true',
                      help='list all supported fileformats')
    info.add_argument('-s', '--show', action='store_true',
                      help='list all the harmful metadata of a file without removing them')
    return parser
20
def show_meta(file_name: str):
    """Print each metadata entry of ``file_name`` as a ``key: value`` line.

    The file is always opened with the PDF parser, whatever its extension.

    Args:
        file_name: path of the file to inspect.
    """
    parser = pdf.PDFParser(file_name)
    for key, value in parser.get_meta().items():
        print("%s: %s" % (key, value))
25
def main():
    """Command-line entry point.

    * With ``--show``, print the metadata of every file given and stop.
    * With no file given, print the usage help.
    * Otherwise, clean the first file given.

    Returns:
        int: the process exit status (always 0 when no exception is raised).
    """
    argparser = create_arg_parser()
    args = argparser.parse_args()

    if args.show:
        for f in args.files:
            show_meta(f)
        return 0
    elif not args.files:
        argparser.print_help()
        return 0

    # Take the target from the parsed positionals rather than sys.argv[1],
    # which is an option flag whenever one precedes the file name.
    target = args.files[0]

    # NOTE(review): the target is overwritten from its '<name>.bak' sibling
    # before cleaning, so that file must exist. This looks like development
    # scaffolding to restore a pristine input -- confirm it is still wanted.
    copyfile(target + '.bak', target)
    p = pdf.PDFParser(target)
    p.remove_all()
    print("ok")
    return 0


if __name__ == '__main__':
    # Propagate main()'s status to the shell instead of discarding it.
    sys.exit(main())
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/__init__.py
diff --git a/src/parsers/__init__.py b/src/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/parsers/__init__.py
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
new file mode 100644
index 0000000..a9129cc
--- /dev/null
+++ b/src/parsers/abstract.py
@@ -0,0 +1,10 @@
class AbstractParser:
    """Base class for file-format parsers.

    Subclasses are expected to override ``get_meta`` and ``remove_all``;
    the implementations here only raise ``NotImplementedError``.
    """

    def __init__(self, filename: str):
        """Remember the target file and start with an empty metadata-key set.

        Args:
            filename: path of the file this parser operates on.
        """
        self.filename = filename
        # Names of the metadata fields the parser knows about; empty here.
        self.meta_list = set()

    def get_meta(self):
        """Return the metadata of the file (to be provided by subclasses).

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def remove_all(self):
        """Remove the metadata of the file (to be provided by subclasses).

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
new file mode 100644
index 0000000..c25b324
--- /dev/null
+++ b/src/parsers/pdf.py
@@ -0,0 +1,106 @@
1""" Handle PDF
2
3"""
4
5import os
6import logging
7import tempfile
8import shutil
9import io
10
11import cairo
12import gi
13gi.require_version('Poppler', '0.18')
14from gi.repository import Poppler
15
16try:
17 from PIL import Image
18except ImportError:
19 Image = None
20
21from . import abstract
22
23logging.basicConfig(level=logging.DEBUG)
24
25
class PDFParser(abstract.AbstractParser):
    """Inspect and strip the metadata of a PDF file using Poppler and cairo."""

    def __init__(self, filename):
        """Prepare a parser for ``filename``.

        Args:
            filename: path of the PDF file; it is made absolute and turned
                into a ``file://`` URI, which is what Poppler expects.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        self.uri = self._path_to_uri(self.filename)
        self.password = None

    @staticmethod
    def _path_to_uri(path):
        """Return a percent-encoded ``file://`` URI for ``path``.

        Plain string concatenation yields an invalid URI as soon as the path
        contains characters such as spaces, ``#`` or ``%``.
        """
        import pathlib  # local import: keeps this class self-contained

        return pathlib.Path(os.path.abspath(path)).as_uri()

    def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
        """Re-encode a PNG held in ``img`` with PIL's ``optimize`` flag.

        The image is resized to its own dimensions, so only the encoding
        changes. Currently unused: the only call site in ``remove_all`` is
        commented out.

        Args:
            img: buffer positioned at the start of a PNG image.

        Returns:
            A new buffer, rewound to position 0, holding the re-encoded PNG;
            ``img`` itself when PIL is not installed.
        """
        if Image is None:
            return img
        ret = io.BytesIO()
        im = Image.open(img)
        w, h = im.size
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = im.resize((w, h), Image.LANCZOS)
        resized.save(ret, optimize=True, format="PNG")
        ret.seek(0)

        return ret

    def remove_all(self):
        """Write a metadata-free copy of the document to ``OUT_clean.pdf``.

        Load the document into Poppler, render each page to a PNG, and put
        those PNGs into a new PDF (``OUT.pdf``). The metadata of the new PDF
        is then overwritten via Poppler, because there is no way to tell
        cairo not to add "created by cairo" during rendering. Both output
        files are created in the current working directory.

        TODO: Improve the resolution
        TODO: Don't use a temp file

        Returns:
            bool: always ``True``.
        """
        scale = 2  # pages are rasterised at twice their nominal size

        document = Poppler.Document.new_from_file(self.uri, self.password)
        pages_count = document.get_n_pages()

        # The initial surface size is a placeholder; it is reset per page.
        pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(pages_count):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                             int(page_width) * scale,
                                             int(page_height) * scale)
            img_context = cairo.Context(img_surface)

            img_context.scale(scale, scale)
            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through an in-memory PNG so that only pixels,
            # not PDF objects, reach the output document.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width * scale, page_height * scale)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Reopen the cairo output with Poppler to overwrite the producer and
        # creator fields that cairo filled in.
        document = Poppler.Document.new_from_file(self._path_to_uri('OUT.pdf'), self.password)
        document.set_producer('totally not MAT2 ;)')
        document.set_creator('')
        document.save(self._path_to_uri("OUT_clean.pdf"))

        return True

    def get_meta(self):
        """Return a dict with all the non-empty metadata of the file.

        Only the keys listed in ``self.meta_list`` are looked up, in sorted
        order so that the resulting dict has a stable iteration order.

        Returns:
            dict: metadata key mapped to its value, for every key whose
            value is truthy.
        """
        # Log rather than print: the original print() never applied the
        # '%s' placeholder and mixed debug output into stdout.
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in sorted(self.meta_list):
            value = document.get_property(key)
            if value:
                metadata[key] = value
        return metadata
diff --git a/tests/data/dirty.pdf b/tests/data/dirty.pdf
new file mode 100644
index 0000000..0d88779
--- /dev/null
+++ b/tests/data/dirty.pdf
Binary files differ
diff --git a/tests/main.py b/tests/main.py
new file mode 100644
index 0000000..52828af
--- /dev/null
+++ b/tests/main.py
@@ -0,0 +1,22 @@
1#!/usr/bin/python3
2
3import unittest
4
class TestCleaning(unittest.TestCase):
    """Placeholder test case.

    NOTE(review): none of these tests touch the project code yet; they only
    exercise built-in ``str`` methods.
    """

    def test_pdf(self):
        # Despite its name, this does not open or clean a PDF.
        self.assertEqual('foo'.upper(), 'FOO')

    def test_isupper(self):
        self.assertTrue('FOO'.isupper())
        self.assertFalse('Foo'.isupper())

    def test_split(self):
        s = 'hello world'
        self.assertEqual(s.split(), ['hello', 'world'])
        # check that s.split fails when the separator is not a string
        with self.assertRaises(TypeError):
            s.split(2)


if __name__ == '__main__':
    unittest.main()