summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2018-04-14 21:23:31 +0200
committerjvoisin2018-04-14 21:23:31 +0200
commit96299c6a5350f59eab022a09400eddcc347daede (patch)
tree492df3a7637b2d1cb45424615ab2777043043eab
parent6f4ed2490fbcde0b74e7b8251ad71e29b430b8ef (diff)
Add lightweight processing for PDF
-rwxr-xr-xmain.py15
-rw-r--r--src/abstract.py4
-rw-r--r--src/pdf.py45
-rw-r--r--tests/test_climat2.py4
-rw-r--r--tests/test_libmat2.py31
5 files changed, 84 insertions, 15 deletions
diff --git a/main.py b/main.py
index be2508e..2cb05ff 100755
--- a/main.py
+++ b/main.py
@@ -31,6 +31,8 @@ def create_arg_parser():
31 help='list all supported fileformats') 31 help='list all supported fileformats')
32 info.add_argument('-s', '--show', action='store_true', 32 info.add_argument('-s', '--show', action='store_true',
33 help='list all the harmful metadata of a file without removing them') 33 help='list all the harmful metadata of a file without removing them')
34 info.add_argument('-L', '--lightweight', action='store_true',
35 help='remove SOME metadata')
34 return parser 36 return parser
35 37
36 38
@@ -50,7 +52,7 @@ def show_meta(filename:str):
50 print(" %s: harmful content" % k) 52 print(" %s: harmful content" % k)
51 53
52 54
53def clean_meta(filename:str): 55def clean_meta(filename:str, is_lightweigth:bool):
54 if not __check_file(filename, os.R_OK|os.W_OK): 56 if not __check_file(filename, os.R_OK|os.W_OK):
55 return 57 return
56 58
@@ -58,7 +60,10 @@ def clean_meta(filename:str):
58 if p is None: 60 if p is None:
59 print("[-] %s's format (%s) is not supported" % (filename, mtype)) 61 print("[-] %s's format (%s) is not supported" % (filename, mtype))
60 return 62 return
61 p.remove_all() 63 if is_lightweigth:
64 p.remove_all_lightweight()
65 else:
66 p.remove_all()
62 67
63 68
64def show_parsers(): 69def show_parsers():
@@ -78,12 +83,12 @@ def __get_files_recursively(files):
78 for _f in _files: 83 for _f in _files:
79 yield os.path.join(path, _f) 84 yield os.path.join(path, _f)
80 85
81def __do_clean_async(q): 86def __do_clean_async(is_lightweigth, q):
82 while True: 87 while True:
83 f = q.get() 88 f = q.get()
84 if f is None: # nothing more to process 89 if f is None: # nothing more to process
85 return 90 return
86 clean_meta(f) 91 clean_meta(is_lightweigth, f)
87 q.task_done() 92 q.task_done()
88 93
89 94
@@ -109,7 +114,7 @@ def main():
109 q.put(f) 114 q.put(f)
110 115
111 for _ in range(multiprocessing.cpu_count()): 116 for _ in range(multiprocessing.cpu_count()):
112 worker = Thread(target=__do_clean_async, args=(q, )) 117 worker = Thread(target=__do_clean_async, args=(mode, q))
113 worker.start() 118 worker.start()
114 threads.append(worker) 119 threads.append(worker)
115 120
diff --git a/src/abstract.py b/src/abstract.py
index 04c1535..93e8421 100644
--- a/src/abstract.py
+++ b/src/abstract.py
@@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
16 @abc.abstractmethod 16 @abc.abstractmethod
17 def remove_all(self) -> bool: 17 def remove_all(self) -> bool:
18 pass 18 pass
19
20 def remove_all_lightweight(self) -> bool:
21 """ Remove _SOME_ metadata. """
22 return self.remove_all()
diff --git a/src/pdf.py b/src/pdf.py
index c119449..6e639cd 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
29 self.uri = 'file://' + os.path.abspath(self.filename) 29 self.uri = 'file://' + os.path.abspath(self.filename)
30 self.__scale = 2 # how much precision do we want for the render 30 self.__scale = 2 # how much precision do we want for the render
31 31
32 def remove_all_lightweight(self):
33 """
34 Load the document into Poppler, render pages on a new PDFSurface.
35 """
36 document = Poppler.Document.new_from_file(self.uri, None)
37 pages_count = document.get_n_pages()
38
39 tmp_path = tempfile.mkstemp()[1]
40 pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
41 pdf_context = cairo.Context(pdf_surface) # context draws on the surface
42
43 for pagenum in range(pages_count):
44 logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
45 page = document.get_page(pagenum)
46 page_width, page_height = page.get_size()
47 pdf_surface.set_size(page_width, page_height)
48 pdf_context.save()
49 page.render_for_printing(pdf_context)
50 pdf_context.restore()
51 pdf_context.show_page() # draw pdf_context on pdf_surface
52 pdf_surface.finish()
53
54 self.__remove_superficial_meta(tmp_path, self.output_filename)
55 os.remove(tmp_path)
56
57 return True
58
32 def remove_all(self): 59 def remove_all(self):
33 """ 60 """
34 Load the document into Poppler, render pages on PNG, 61 Load the document into Poppler, render pages on PNG,
35 and shove those PNG into a new PDF. Metadata from the new 62 and shove those PNG into a new PDF.
36 PDF are removed via Poppler, because there is no way to tell
37 cairo to not add "created by cairo" during rendering.
38 """ 63 """
39 document = Poppler.Document.new_from_file(self.uri, None) 64 document = Poppler.Document.new_from_file(self.uri, None)
40 pages_count = document.get_n_pages() 65 pages_count = document.get_n_pages()
41 66
42 _, tmp_path = tempfile.mkstemp() 67 _, tmp_path = tempfile.mkstemp()
43 pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) 68 pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
44 pdf_context = cairo.Context(pdf_surface) 69 pdf_context = cairo.Context(pdf_surface)
45 70
46 for pagenum in range(pages_count): 71 for pagenum in range(pages_count):
@@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
69 pdf_surface.finish() 94 pdf_surface.finish()
70 95
71 # Removes metadata added by Poppler 96 # Removes metadata added by Poppler
72 document = Poppler.Document.new_from_file('file://' + tmp_path) 97 self.__remove_superficial_meta(tmp_path, self.output_filename)
73 document.set_producer('')
74 document.set_creator('')
75 document.save('file://' + os.path.abspath(self.output_filename))
76 os.remove(tmp_path) 98 os.remove(tmp_path)
77 99
78 return True 100 return True
79 101
102 def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
103 document = Poppler.Document.new_from_file('file://' + in_file)
104 document.set_producer('')
105 document.set_creator('')
106 document.save('file://' + os.path.abspath(out_file))
107 return True
108
80 109
81 def __parse_metadata_field(self, data:str) -> dict: 110 def __parse_metadata_field(self, data:str) -> dict:
82 metadata = {} 111 metadata = {}
diff --git a/tests/test_climat2.py b/tests/test_climat2.py
index b9c52b5..64345eb 100644
--- a/tests/test_climat2.py
+++ b/tests/test_climat2.py
@@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
6 def test_help(self): 6 def test_help(self):
7 proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE) 7 proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
8 stdout, _ = proc.communicate() 8 stdout, _ = proc.communicate()
9 self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout) 9 self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
10 10
11 def test_no_arg(self): 11 def test_no_arg(self):
12 proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE) 12 proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
13 stdout, _ = proc.communicate() 13 stdout, _ = proc.communicate()
14 self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout) 14 self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
15 15
16 16
17class TestGetMeta(unittest.TestCase): 17class TestGetMeta(unittest.TestCase):
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 6141dbe..34f7301 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
138 138
139 os.remove('./tests/data/clean.odt') 139 os.remove('./tests/data/clean.odt')
140 140
141class TestLightWeightCleaning(unittest.TestCase):
142 def test_pdf(self):
143 shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
144 p = pdf.PDFParser('./tests/data/clean.pdf')
145
146 meta = p.get_meta()
147 self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
148
149 ret = p.remove_all_lightweight()
150 self.assertTrue(ret)
151
152 p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
153 expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
154 self.assertEqual(p.get_meta(), expected_meta)
155
156 os.remove('./tests/data/clean.pdf')
157
158 def test_png(self):
159 shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
160 p = images.PNGParser('./tests/data/clean.png')
161
162 meta = p.get_meta()
163 self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
164
165 ret = p.remove_all_lightweight()
166 self.assertTrue(ret)
167
168 p = images.PNGParser('./tests/data/clean.png.cleaned')
169 self.assertEqual(p.get_meta(), {})
170
171 os.remove('./tests/data/clean.png')
141 172
142class TestCleaning(unittest.TestCase): 173class TestCleaning(unittest.TestCase):
143 def test_pdf(self): 174 def test_pdf(self):