diff options
| author | jvoisin | 2018-10-23 16:14:21 +0200 |
|---|---|---|
| committer | jvoisin | 2018-10-23 16:22:11 +0200 |
| commit | f1a071d460507fd1bb1721deafd2a8d9f88f5b05 (patch) | |
| tree | e17067895ef1fc9b91b00c0ba56d2e86975ceef1 | |
| parent | 38df679a88a19db3a4a82fdb8e20a42c9a53d1a1 (diff) | |
Implement lightweight cleaning for png and tiff
| -rw-r--r-- | .pylintrc | 13 | ||||
| -rw-r--r-- | libmat2/exiftool.py | 29 | ||||
| -rw-r--r-- | libmat2/images.py | 2 | ||||
| -rw-r--r-- | libmat2/video.py | 2 | ||||
| -rw-r--r-- | tests/test_corrupted_files.py | 7 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 36 | ||||
| -rw-r--r-- | tests/test_lightweigh_cleaning.py | 65 |
7 files changed, 111 insertions, 43 deletions
| @@ -6,11 +6,12 @@ max-locals=20 | |||
| 6 | disable= | 6 | disable= |
| 7 | fixme, | 7 | fixme, |
| 8 | invalid-name, | 8 | invalid-name, |
| 9 | duplicate-code, | ||
| 9 | missing-docstring, | 10 | missing-docstring, |
| 10 | protected-access, | 11 | protected-access, |
| 11 | abstract-method, | 12 | abstract-method, |
| 12 | wrong-import-position, | 13 | wrong-import-position, |
| 13 | catching-non-exception, | 14 | catching-non-exception, |
| 14 | cell-var-from-loop, | 15 | cell-var-from-loop, |
| 15 | locally-disabled, | 16 | locally-disabled, |
| 16 | invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation | 17 | invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation |
diff --git a/libmat2/exiftool.py b/libmat2/exiftool.py index 11dd36d..23d0d89 100644 --- a/libmat2/exiftool.py +++ b/libmat2/exiftool.py | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | import json | 1 | import json |
| 2 | import logging | ||
| 2 | import os | 3 | import os |
| 3 | import subprocess | 4 | import subprocess |
| 4 | from typing import Dict, Union, Set | 5 | from typing import Dict, Union, Set |
| @@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser): | |||
| 23 | meta.pop(key, None) | 24 | meta.pop(key, None) |
| 24 | return meta | 25 | return meta |
| 25 | 26 | ||
| 27 | def _lightweight_cleanup(self): | ||
| 28 | if os.path.exists(self.output_filename): | ||
| 29 | try: | ||
| 30 | # exiftool can't force output to existing files | ||
| 31 | os.remove(self.output_filename) | ||
| 32 | except OSError as e: # pragma: no cover | ||
| 33 | logging.error("The output file %s is already existing and \ | ||
| 34 | can't be overwritten: %s.", self.filename, e) | ||
| 35 | return False | ||
| 36 | |||
| 37 | # Note: '-All=' must be followed by a known exiftool option. | ||
| 38 | # Also, '-CommonIFD0' is needed for .tiff files | ||
| 39 | cmd = [_get_exiftool_path(), | ||
| 40 | '-all=', # remove metadata | ||
| 41 | '-adobe=', # remove adobe-specific metadata | ||
| 42 | '-exif:all=', # remove all exif metadata | ||
| 43 | '-Time:All=', # remove all timestamps | ||
| 44 | '-quiet', # don't show useless logs | ||
| 45 | '-CommonIFD0=', # remove IFD0 metadata | ||
| 46 | '-o', self.output_filename, | ||
| 47 | self.filename] | ||
| 48 | try: | ||
| 49 | subprocess.check_call(cmd) | ||
| 50 | except subprocess.CalledProcessError as e: # pragma: no cover | ||
| 51 | logging.error("Something went wrong during the processing of %s: %s", self.filename, e) | ||
| 52 | return False | ||
| 53 | return True | ||
| 54 | |||
| 26 | def _get_exiftool_path() -> str: # pragma: no cover | 55 | def _get_exiftool_path() -> str: # pragma: no cover |
| 27 | exiftool_path = '/usr/bin/exiftool' | 56 | exiftool_path = '/usr/bin/exiftool' |
| 28 | if os.path.isfile(exiftool_path): | 57 | if os.path.isfile(exiftool_path): |
diff --git a/libmat2/images.py b/libmat2/images.py index ad80892..03cecd3 100644 --- a/libmat2/images.py +++ b/libmat2/images.py | |||
| @@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser): | |||
| 35 | raise ValueError | 35 | raise ValueError |
| 36 | 36 | ||
| 37 | def remove_all(self) -> bool: | 37 | def remove_all(self) -> bool: |
| 38 | if self.lightweight_cleaning: | ||
| 39 | return self._lightweight_cleanup() | ||
| 38 | surface = cairo.ImageSurface.create_from_png(self.filename) | 40 | surface = cairo.ImageSurface.create_from_png(self.filename) |
| 39 | surface.write_to_png(self.output_filename) | 41 | surface.write_to_png(self.output_filename) |
| 40 | return True | 42 | return True |
diff --git a/libmat2/video.py b/libmat2/video.py index fe2a1af..b7ba0a0 100644 --- a/libmat2/video.py +++ b/libmat2/video.py | |||
| @@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser): | |||
| 26 | 26 | ||
| 27 | def remove_all(self): | 27 | def remove_all(self): |
| 28 | cmd = [_get_ffmpeg_path(), | 28 | cmd = [_get_ffmpeg_path(), |
| 29 | '-i', self.filename, # input file | 29 | '-i', self.filename, # input file |
| 30 | '-y', # overwrite existing output file | 30 | '-y', # overwrite existing output file |
| 31 | '-loglevel', 'panic', # Don't show log | 31 | '-loglevel', 'panic', # Don't show log |
| 32 | '-hide_banner', # hide the banner | 32 | '-hide_banner', # hide the banner |
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 82c6c3b..181d4d2 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 194 | images.JPGParser('./tests/data/clean.jpg') | 194 | images.JPGParser('./tests/data/clean.jpg') |
| 195 | os.remove('./tests/data/clean.jpg') | 195 | os.remove('./tests/data/clean.jpg') |
| 196 | 196 | ||
| 197 | def test_png_lightweight(self): | ||
| 198 | return | ||
| 199 | shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png') | ||
| 200 | p = images.PNGParser('./tests/data/clean.png') | ||
| 201 | self.assertTrue(p.remove_all()) | ||
| 202 | os.remove('./tests/data/clean.png') | ||
| 203 | |||
| 197 | def test_avi(self): | 204 | def test_avi(self): |
| 198 | try: | 205 | try: |
| 199 | video._get_ffmpeg_path() | 206 | video._get_ffmpeg_path() |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index f5fc9e8..46d6aaa 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase): | |||
| 212 | os.remove('./tests/data/revision_clean.docx') | 212 | os.remove('./tests/data/revision_clean.docx') |
| 213 | os.remove('./tests/data/revision_clean.cleaned.docx') | 213 | os.remove('./tests/data/revision_clean.cleaned.docx') |
| 214 | 214 | ||
| 215 | class TestLightWeightCleaning(unittest.TestCase): | ||
| 216 | def test_pdf(self): | ||
| 217 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') | ||
| 218 | p = pdf.PDFParser('./tests/data/clean.pdf') | ||
| 219 | |||
| 220 | meta = p.get_meta() | ||
| 221 | self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') | ||
| 222 | |||
| 223 | p.lightweight_cleaning = True | ||
| 224 | ret = p.remove_all() | ||
| 225 | self.assertTrue(ret) | ||
| 226 | |||
| 227 | p = pdf.PDFParser('./tests/data/clean.cleaned.pdf') | ||
| 228 | expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} | ||
| 229 | self.assertEqual(p.get_meta(), expected_meta) | ||
| 230 | |||
| 231 | os.remove('./tests/data/clean.pdf') | ||
| 232 | os.remove('./tests/data/clean.cleaned.pdf') | ||
| 233 | |||
| 234 | def test_png(self): | ||
| 235 | shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') | ||
| 236 | p = images.PNGParser('./tests/data/clean.png') | ||
| 237 | |||
| 238 | meta = p.get_meta() | ||
| 239 | self.assertEqual(meta['Comment'], 'This is a comment, be careful!') | ||
| 240 | |||
| 241 | p.lightweight_cleaning = True | ||
| 242 | ret = p.remove_all() | ||
| 243 | self.assertTrue(ret) | ||
| 244 | |||
| 245 | p = images.PNGParser('./tests/data/clean.cleaned.png') | ||
| 246 | self.assertEqual(p.get_meta(), {}) | ||
| 247 | |||
| 248 | os.remove('./tests/data/clean.png') | ||
| 249 | os.remove('./tests/data/clean.cleaned.png') | ||
| 250 | |||
| 251 | class TestCleaning(unittest.TestCase): | 215 | class TestCleaning(unittest.TestCase): |
| 252 | def test_pdf(self): | 216 | def test_pdf(self): |
| 253 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') | 217 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') |
diff --git a/tests/test_lightweigh_cleaning.py b/tests/test_lightweigh_cleaning.py new file mode 100644 index 0000000..7af31ad --- /dev/null +++ b/tests/test_lightweigh_cleaning.py | |||
| @@ -0,0 +1,65 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | |||
| 3 | import unittest | ||
| 4 | import shutil | ||
| 5 | import os | ||
| 6 | |||
| 7 | from libmat2 import pdf, images | ||
| 8 | |||
| 9 | class TestLightWeightCleaning(unittest.TestCase): | ||
| 10 | def test_pdf(self): | ||
| 11 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') | ||
| 12 | p = pdf.PDFParser('./tests/data/clean.pdf') | ||
| 13 | |||
| 14 | meta = p.get_meta() | ||
| 15 | self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') | ||
| 16 | |||
| 17 | p.lightweight_cleaning = True | ||
| 18 | ret = p.remove_all() | ||
| 19 | self.assertTrue(ret) | ||
| 20 | |||
| 21 | p = pdf.PDFParser('./tests/data/clean.cleaned.pdf') | ||
| 22 | expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} | ||
| 23 | self.assertEqual(p.get_meta(), expected_meta) | ||
| 24 | |||
| 25 | os.remove('./tests/data/clean.pdf') | ||
| 26 | os.remove('./tests/data/clean.cleaned.pdf') | ||
| 27 | |||
| 28 | def test_png(self): | ||
| 29 | shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') | ||
| 30 | p = images.PNGParser('./tests/data/clean.png') | ||
| 31 | |||
| 32 | meta = p.get_meta() | ||
| 33 | self.assertEqual(meta['Comment'], 'This is a comment, be careful!') | ||
| 34 | |||
| 35 | p.lightweight_cleaning = True | ||
| 36 | ret = p.remove_all() | ||
| 37 | self.assertTrue(ret) | ||
| 38 | |||
| 39 | p = images.PNGParser('./tests/data/clean.cleaned.png') | ||
| 40 | self.assertEqual(p.get_meta(), {}) | ||
| 41 | |||
| 42 | p = images.PNGParser('./tests/data/clean.png') | ||
| 43 | p.lightweight_cleaning = True | ||
| 44 | ret = p.remove_all() | ||
| 45 | self.assertTrue(ret) | ||
| 46 | |||
| 47 | os.remove('./tests/data/clean.png') | ||
| 48 | os.remove('./tests/data/clean.cleaned.png') | ||
| 49 | |||
| 50 | def test_jpg(self): | ||
| 51 | shutil.copy('./tests/data/dirty.jpg', './tests/data/clean.jpg') | ||
| 52 | p = images.JPGParser('./tests/data/clean.jpg') | ||
| 53 | |||
| 54 | meta = p.get_meta() | ||
| 55 | self.assertEqual(meta['Comment'], 'Created with GIMP') | ||
| 56 | |||
| 57 | p.lightweight_cleaning = True | ||
| 58 | ret = p.remove_all() | ||
| 59 | self.assertTrue(ret) | ||
| 60 | |||
| 61 | p = images.JPGParser('./tests/data/clean.cleaned.jpg') | ||
| 62 | self.assertEqual(p.get_meta(), {}) | ||
| 63 | |||
| 64 | os.remove('./tests/data/clean.jpg') | ||
| 65 | os.remove('./tests/data/clean.cleaned.jpg') | ||
