diff options
| author | jvoisin | 2018-04-14 21:23:31 +0200 |
|---|---|---|
| committer | jvoisin | 2018-04-14 21:23:31 +0200 |
| commit | 96299c6a5350f59eab022a09400eddcc347daede (patch) | |
| tree | 492df3a7637b2d1cb45424615ab2777043043eab | |
| parent | 6f4ed2490fbcde0b74e7b8251ad71e29b430b8ef (diff) | |
Add lightweight processing for PDF
| -rwxr-xr-x | main.py | 15 | ||||
| -rw-r--r-- | src/abstract.py | 4 | ||||
| -rw-r--r-- | src/pdf.py | 45 | ||||
| -rw-r--r-- | tests/test_climat2.py | 4 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 31 |
5 files changed, 84 insertions, 15 deletions
| @@ -31,6 +31,8 @@ def create_arg_parser(): | |||
| 31 | help='list all supported fileformats') | 31 | help='list all supported fileformats') |
| 32 | info.add_argument('-s', '--show', action='store_true', | 32 | info.add_argument('-s', '--show', action='store_true', |
| 33 | help='list all the harmful metadata of a file without removing them') | 33 | help='list all the harmful metadata of a file without removing them') |
| 34 | info.add_argument('-L', '--lightweight', action='store_true', | ||
| 35 | help='remove SOME metadata') | ||
| 34 | return parser | 36 | return parser |
| 35 | 37 | ||
| 36 | 38 | ||
| @@ -50,7 +52,7 @@ def show_meta(filename:str): | |||
| 50 | print(" %s: harmful content" % k) | 52 | print(" %s: harmful content" % k) |
| 51 | 53 | ||
| 52 | 54 | ||
| 53 | def clean_meta(filename:str): | 55 | def clean_meta(filename:str, is_lightweigth:bool): |
| 54 | if not __check_file(filename, os.R_OK|os.W_OK): | 56 | if not __check_file(filename, os.R_OK|os.W_OK): |
| 55 | return | 57 | return |
| 56 | 58 | ||
| @@ -58,7 +60,10 @@ def clean_meta(filename:str): | |||
| 58 | if p is None: | 60 | if p is None: |
| 59 | print("[-] %s's format (%s) is not supported" % (filename, mtype)) | 61 | print("[-] %s's format (%s) is not supported" % (filename, mtype)) |
| 60 | return | 62 | return |
| 61 | p.remove_all() | 63 | if is_lightweigth: |
| 64 | p.remove_all_lightweight() | ||
| 65 | else: | ||
| 66 | p.remove_all() | ||
| 62 | 67 | ||
| 63 | 68 | ||
| 64 | def show_parsers(): | 69 | def show_parsers(): |
| @@ -78,12 +83,12 @@ def __get_files_recursively(files): | |||
| 78 | for _f in _files: | 83 | for _f in _files: |
| 79 | yield os.path.join(path, _f) | 84 | yield os.path.join(path, _f) |
| 80 | 85 | ||
| 81 | def __do_clean_async(q): | 86 | def __do_clean_async(is_lightweigth, q): |
| 82 | while True: | 87 | while True: |
| 83 | f = q.get() | 88 | f = q.get() |
| 84 | if f is None: # nothing more to process | 89 | if f is None: # nothing more to process |
| 85 | return | 90 | return |
| 86 | clean_meta(f) | 91 | clean_meta(is_lightweigth, f) |
| 87 | q.task_done() | 92 | q.task_done() |
| 88 | 93 | ||
| 89 | 94 | ||
| @@ -109,7 +114,7 @@ def main(): | |||
| 109 | q.put(f) | 114 | q.put(f) |
| 110 | 115 | ||
| 111 | for _ in range(multiprocessing.cpu_count()): | 116 | for _ in range(multiprocessing.cpu_count()): |
| 112 | worker = Thread(target=__do_clean_async, args=(q, )) | 117 | worker = Thread(target=__do_clean_async, args=(mode, q)) |
| 113 | worker.start() | 118 | worker.start() |
| 114 | threads.append(worker) | 119 | threads.append(worker) |
| 115 | 120 | ||
diff --git a/src/abstract.py b/src/abstract.py index 04c1535..93e8421 100644 --- a/src/abstract.py +++ b/src/abstract.py | |||
| @@ -16,3 +16,7 @@ class AbstractParser(abc.ABC): | |||
| 16 | @abc.abstractmethod | 16 | @abc.abstractmethod |
| 17 | def remove_all(self) -> bool: | 17 | def remove_all(self) -> bool: |
| 18 | pass | 18 | pass |
| 19 | |||
| 20 | def remove_all_lightweight(self) -> bool: | ||
| 21 | """ Remove _SOME_ metadata. """ | ||
| 22 | return self.remove_all() | ||
| @@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser): | |||
| 29 | self.uri = 'file://' + os.path.abspath(self.filename) | 29 | self.uri = 'file://' + os.path.abspath(self.filename) |
| 30 | self.__scale = 2 # how much precision do we want for the render | 30 | self.__scale = 2 # how much precision do we want for the render |
| 31 | 31 | ||
| 32 | def remove_all_lightweight(self): | ||
| 33 | """ | ||
| 34 | Load the document into Poppler, render pages on a new PDFSurface. | ||
| 35 | """ | ||
| 36 | document = Poppler.Document.new_from_file(self.uri, None) | ||
| 37 | pages_count = document.get_n_pages() | ||
| 38 | |||
| 39 | tmp_path = tempfile.mkstemp()[1] | ||
| 40 | pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) | ||
| 41 | pdf_context = cairo.Context(pdf_surface) # context draws on the surface | ||
| 42 | |||
| 43 | for pagenum in range(pages_count): | ||
| 44 | logging.info("Rendering page %d/%d", pagenum + 1, pages_count) | ||
| 45 | page = document.get_page(pagenum) | ||
| 46 | page_width, page_height = page.get_size() | ||
| 47 | pdf_surface.set_size(page_width, page_height) | ||
| 48 | pdf_context.save() | ||
| 49 | page.render_for_printing(pdf_context) | ||
| 50 | pdf_context.restore() | ||
| 51 | pdf_context.show_page() # draw pdf_context on pdf_surface | ||
| 52 | pdf_surface.finish() | ||
| 53 | |||
| 54 | self.__remove_superficial_meta(tmp_path, self.output_filename) | ||
| 55 | os.remove(tmp_path) | ||
| 56 | |||
| 57 | return True | ||
| 58 | |||
| 32 | def remove_all(self): | 59 | def remove_all(self): |
| 33 | """ | 60 | """ |
| 34 | Load the document into Poppler, render pages on PNG, | 61 | Load the document into Poppler, render pages on PNG, |
| 35 | and shove those PNG into a new PDF. Metadata from the new | 62 | and shove those PNG into a new PDF. |
| 36 | PDF are removed via Poppler, because there is no way to tell | ||
| 37 | cairo to not add "created by cairo" during rendering. | ||
| 38 | """ | 63 | """ |
| 39 | document = Poppler.Document.new_from_file(self.uri, None) | 64 | document = Poppler.Document.new_from_file(self.uri, None) |
| 40 | pages_count = document.get_n_pages() | 65 | pages_count = document.get_n_pages() |
| 41 | 66 | ||
| 42 | _, tmp_path = tempfile.mkstemp() | 67 | _, tmp_path = tempfile.mkstemp() |
| 43 | pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) | 68 | pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway |
| 44 | pdf_context = cairo.Context(pdf_surface) | 69 | pdf_context = cairo.Context(pdf_surface) |
| 45 | 70 | ||
| 46 | for pagenum in range(pages_count): | 71 | for pagenum in range(pages_count): |
| @@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser): | |||
| 69 | pdf_surface.finish() | 94 | pdf_surface.finish() |
| 70 | 95 | ||
| 71 | # Removes metadata added by Poppler | 96 | # Removes metadata added by Poppler |
| 72 | document = Poppler.Document.new_from_file('file://' + tmp_path) | 97 | self.__remove_superficial_meta(tmp_path, self.output_filename) |
| 73 | document.set_producer('') | ||
| 74 | document.set_creator('') | ||
| 75 | document.save('file://' + os.path.abspath(self.output_filename)) | ||
| 76 | os.remove(tmp_path) | 98 | os.remove(tmp_path) |
| 77 | 99 | ||
| 78 | return True | 100 | return True |
| 79 | 101 | ||
| 102 | def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool: | ||
| 103 | document = Poppler.Document.new_from_file('file://' + in_file) | ||
| 104 | document.set_producer('') | ||
| 105 | document.set_creator('') | ||
| 106 | document.save('file://' + os.path.abspath(out_file)) | ||
| 107 | return True | ||
| 108 | |||
| 80 | 109 | ||
| 81 | def __parse_metadata_field(self, data:str) -> dict: | 110 | def __parse_metadata_field(self, data:str) -> dict: |
| 82 | metadata = {} | 111 | metadata = {} |
diff --git a/tests/test_climat2.py b/tests/test_climat2.py index b9c52b5..64345eb 100644 --- a/tests/test_climat2.py +++ b/tests/test_climat2.py | |||
| @@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase): | |||
| 6 | def test_help(self): | 6 | def test_help(self): |
| 7 | proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE) | 7 | proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE) |
| 8 | stdout, _ = proc.communicate() | 8 | stdout, _ = proc.communicate() |
| 9 | self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout) | 9 | self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout) |
| 10 | 10 | ||
| 11 | def test_no_arg(self): | 11 | def test_no_arg(self): |
| 12 | proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE) | 12 | proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE) |
| 13 | stdout, _ = proc.communicate() | 13 | stdout, _ = proc.communicate() |
| 14 | self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout) | 14 | self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout) |
| 15 | 15 | ||
| 16 | 16 | ||
| 17 | class TestGetMeta(unittest.TestCase): | 17 | class TestGetMeta(unittest.TestCase): |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 6141dbe..34f7301 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase): | |||
| 138 | 138 | ||
| 139 | os.remove('./tests/data/clean.odt') | 139 | os.remove('./tests/data/clean.odt') |
| 140 | 140 | ||
| 141 | class TestLightWeightCleaning(unittest.TestCase): | ||
| 142 | def test_pdf(self): | ||
| 143 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') | ||
| 144 | p = pdf.PDFParser('./tests/data/clean.pdf') | ||
| 145 | |||
| 146 | meta = p.get_meta() | ||
| 147 | self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') | ||
| 148 | |||
| 149 | ret = p.remove_all_lightweight() | ||
| 150 | self.assertTrue(ret) | ||
| 151 | |||
| 152 | p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') | ||
| 153 | expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} | ||
| 154 | self.assertEqual(p.get_meta(), expected_meta) | ||
| 155 | |||
| 156 | os.remove('./tests/data/clean.pdf') | ||
| 157 | |||
| 158 | def test_png(self): | ||
| 159 | shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') | ||
| 160 | p = images.PNGParser('./tests/data/clean.png') | ||
| 161 | |||
| 162 | meta = p.get_meta() | ||
| 163 | self.assertEqual(meta['Comment'], 'This is a comment, be careful!') | ||
| 164 | |||
| 165 | ret = p.remove_all_lightweight() | ||
| 166 | self.assertTrue(ret) | ||
| 167 | |||
| 168 | p = images.PNGParser('./tests/data/clean.png.cleaned') | ||
| 169 | self.assertEqual(p.get_meta(), {}) | ||
| 170 | |||
| 171 | os.remove('./tests/data/clean.png') | ||
| 141 | 172 | ||
| 142 | class TestCleaning(unittest.TestCase): | 173 | class TestCleaning(unittest.TestCase): |
| 143 | def test_pdf(self): | 174 | def test_pdf(self): |
