diff options
| -rw-r--r-- | .gitlab-ci.yml | 4 | ||||
| -rw-r--r-- | libmat2/__init__.py | 18 | ||||
| -rw-r--r-- | libmat2/abstract.py | 6 | ||||
| -rw-r--r-- | libmat2/exiftool.py | 61 | ||||
| -rw-r--r-- | libmat2/images.py | 45 | ||||
| -rw-r--r-- | libmat2/parser_factory.py | 2 | ||||
| -rw-r--r-- | libmat2/video.py | 58 | ||||
| -rwxr-xr-x | mat2 | 8 | ||||
| -rw-r--r-- | tests/data/dirty.avi | bin | 0 -> 375688 bytes | |||
| -rw-r--r-- | tests/test_libmat2.py | 29 |
10 files changed, 170 insertions, 61 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4f0a140..32ec086 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml | |||
| @@ -42,7 +42,7 @@ tests:debian: | |||
| 42 | stage: test | 42 | stage: test |
| 43 | script: | 43 | script: |
| 44 | - apt-get -qqy update | 44 | - apt-get -qqy update |
| 45 | - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage | 45 | - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg |
| 46 | - python3-coverage run --branch -m unittest discover -s tests/ | 46 | - python3-coverage run --branch -m unittest discover -s tests/ |
| 47 | - python3-coverage report --fail-under=100 -m --include 'libmat2/*' | 47 | - python3-coverage report --fail-under=100 -m --include 'libmat2/*' |
| 48 | 48 | ||
| @@ -62,5 +62,5 @@ tests:archlinux: | |||
| 62 | tags: | 62 | tags: |
| 63 | - whitewhale | 63 | - whitewhale |
| 64 | script: | 64 | script: |
| 65 | - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap | 65 | - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg |
| 66 | - python3 setup.py test | 66 | - python3 setup.py test |
diff --git a/libmat2/__init__.py b/libmat2/__init__.py index f55a14c..399a364 100644 --- a/libmat2/__init__.py +++ b/libmat2/__init__.py | |||
| @@ -1,11 +1,12 @@ | |||
| 1 | #!/usr/bin/env python3 | 1 | #!/usr/bin/env python3 |
| 2 | 2 | ||
| 3 | import os | ||
| 4 | import collections | 3 | import collections |
| 5 | import enum | 4 | import enum |
| 6 | import importlib | 5 | import importlib |
| 7 | from typing import Dict, Optional | 6 | from typing import Dict, Optional |
| 8 | 7 | ||
| 8 | from . import exiftool, video | ||
| 9 | |||
| 9 | # make pyflakes happy | 10 | # make pyflakes happy |
| 10 | assert Dict | 11 | assert Dict |
| 11 | assert Optional | 12 | assert Optional |
| @@ -37,24 +38,13 @@ DEPENDENCIES = { | |||
| 37 | 'mutagen': 'Mutagen', | 38 | 'mutagen': 'Mutagen', |
| 38 | } | 39 | } |
| 39 | 40 | ||
| 40 | def _get_exiftool_path() -> str: # pragma: no cover | ||
| 41 | exiftool_path = '/usr/bin/exiftool' | ||
| 42 | if os.path.isfile(exiftool_path): | ||
| 43 | if os.access(exiftool_path, os.X_OK): | ||
| 44 | return exiftool_path | ||
| 45 | |||
| 46 | # ArchLinux | ||
| 47 | exiftool_path = '/usr/bin/vendor_perl/exiftool' | ||
| 48 | if os.path.isfile(exiftool_path): | ||
| 49 | if os.access(exiftool_path, os.X_OK): | ||
| 50 | return exiftool_path | ||
| 51 | 41 | ||
| 52 | raise ValueError | ||
| 53 | 42 | ||
| 54 | def check_dependencies() -> dict: | 43 | def check_dependencies() -> dict: |
| 55 | ret = collections.defaultdict(bool) # type: Dict[str, bool] | 44 | ret = collections.defaultdict(bool) # type: Dict[str, bool] |
| 56 | 45 | ||
| 57 | ret['Exiftool'] = True if _get_exiftool_path() else False | 46 | ret['Exiftool'] = True if exiftool._get_exiftool_path() else False |
| 47 | ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False | ||
| 58 | 48 | ||
| 59 | for key, value in DEPENDENCIES.items(): | 49 | for key, value in DEPENDENCIES.items(): |
| 60 | ret[value] = True | 50 | ret[value] = True |
diff --git a/libmat2/abstract.py b/libmat2/abstract.py index 0084796..414a68b 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py | |||
| @@ -7,7 +7,8 @@ assert Set # make pyflakes happy | |||
| 7 | 7 | ||
| 8 | class AbstractParser(abc.ABC): | 8 | class AbstractParser(abc.ABC): |
| 9 | """ This is the base class of every parser. | 9 | """ This is the base class of every parser. |
| 10 | It might yield `ValueError` on instantiation on invalid files. | 10 | It might raise `ValueError` on instantiation on invalid files, |
| 11 | and `RuntimeError` when something went wrong in `remove_all`. | ||
| 11 | """ | 12 | """ |
| 12 | meta_list = set() # type: Set[str] | 13 | meta_list = set() # type: Set[str] |
| 13 | mimetypes = set() # type: Set[str] | 14 | mimetypes = set() # type: Set[str] |
| @@ -27,4 +28,7 @@ class AbstractParser(abc.ABC): | |||
| 27 | 28 | ||
| 28 | @abc.abstractmethod | 29 | @abc.abstractmethod |
| 29 | def remove_all(self) -> bool: | 30 | def remove_all(self) -> bool: |
| 31 | """ | ||
| 32 | :raises RuntimeError: Raised if the cleaning process went wrong. | ||
| 33 | """ | ||
| 30 | pass # pragma: no cover | 34 | pass # pragma: no cover |
diff --git a/libmat2/exiftool.py b/libmat2/exiftool.py new file mode 100644 index 0000000..e17d31b --- /dev/null +++ b/libmat2/exiftool.py | |||
| @@ -0,0 +1,61 @@ | |||
| 1 | import json | ||
| 2 | import os | ||
| 3 | import re | ||
| 4 | import shutil | ||
| 5 | import subprocess | ||
| 6 | import tempfile | ||
| 7 | |||
| 8 | from typing import Dict, Union, Set | ||
| 9 | |||
| 10 | from . import abstract | ||
| 11 | |||
| 12 | # Make pyflakes happy | ||
| 13 | assert Set | ||
| 14 | |||
| 15 | |||
| 16 | class ExiftoolParser(abstract.AbstractParser): | ||
| 17 | """ Exiftool is often the easiest way to get all the metadata | ||
| 18 | from a given file, hence why several parsers are re-using its `get_meta` | |
| 19 | method. | ||
| 20 | """ | ||
| 21 | meta_whitelist = set() # type: Set[str] | ||
| 22 | |||
| 23 | @staticmethod | ||
| 24 | def __handle_problematic_filename(filename: str, callback) -> bytes: | ||
| 25 | """ This method takes a filename with a problematic name, | ||
| 26 | and safely applies a `callback` to it.""" | |
| 27 | tmpdirname = tempfile.mkdtemp() | ||
| 28 | fname = os.path.join(tmpdirname, "temp_file") | ||
| 29 | shutil.copy(filename, fname) | ||
| 30 | out = callback(fname) | ||
| 31 | shutil.rmtree(tmpdirname) | ||
| 32 | return out | ||
| 33 | |||
| 34 | def get_meta(self) -> Dict[str, Union[str, dict]]: | ||
| 35 | """ There is no way to escape the leading dash(es) of the current | |
| 36 | self.filename to prevent parameter injections, so we need to take care | ||
| 37 | of this. | ||
| 38 | """ | ||
| 39 | fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f]) | ||
| 40 | if re.search('^[a-z0-9/]', self.filename) is None: | ||
| 41 | out = self.__handle_problematic_filename(self.filename, fun) | ||
| 42 | else: | ||
| 43 | out = fun(self.filename) | ||
| 44 | meta = json.loads(out.decode('utf-8'))[0] | ||
| 45 | for key in self.meta_whitelist: | ||
| 46 | meta.pop(key, None) | ||
| 47 | return meta | ||
| 48 | |||
| 49 | def _get_exiftool_path() -> str: # pragma: no cover | ||
| 50 | exiftool_path = '/usr/bin/exiftool' | ||
| 51 | if os.path.isfile(exiftool_path): | ||
| 52 | if os.access(exiftool_path, os.X_OK): | ||
| 53 | return exiftool_path | ||
| 54 | |||
| 55 | # ArchLinux | ||
| 56 | exiftool_path = '/usr/bin/vendor_perl/exiftool' | ||
| 57 | if os.path.isfile(exiftool_path): | ||
| 58 | if os.access(exiftool_path, os.X_OK): | ||
| 59 | return exiftool_path | ||
| 60 | |||
| 61 | raise RuntimeError("Unable to find exiftool") | ||
diff --git a/libmat2/images.py b/libmat2/images.py index a29cbb7..ad80892 100644 --- a/libmat2/images.py +++ b/libmat2/images.py | |||
| @@ -1,11 +1,6 @@ | |||
| 1 | import subprocess | ||
| 2 | import imghdr | 1 | import imghdr |
| 3 | import json | ||
| 4 | import os | 2 | import os |
| 5 | import shutil | 3 | from typing import Set |
| 6 | import tempfile | ||
| 7 | import re | ||
| 8 | from typing import Set, Dict, Union | ||
| 9 | 4 | ||
| 10 | import cairo | 5 | import cairo |
| 11 | 6 | ||
| @@ -13,44 +8,12 @@ import gi | |||
| 13 | gi.require_version('GdkPixbuf', '2.0') | 8 | gi.require_version('GdkPixbuf', '2.0') |
| 14 | from gi.repository import GdkPixbuf | 9 | from gi.repository import GdkPixbuf |
| 15 | 10 | ||
| 16 | from . import abstract, _get_exiftool_path | 11 | from . import exiftool |
| 17 | 12 | ||
| 18 | # Make pyflakes happy | 13 | # Make pyflakes happy |
| 19 | assert Set | 14 | assert Set |
| 20 | 15 | ||
| 21 | class _ImageParser(abstract.AbstractParser): | 16 | class PNGParser(exiftool.ExiftoolParser): |
| 22 | """ Since we use `exiftool` to get metadata from | ||
| 23 | all images fileformat, `get_meta` is implemented in this class, | ||
| 24 | and all the image-handling ones are inheriting from it.""" | ||
| 25 | meta_whitelist = set() # type: Set[str] | ||
| 26 | |||
| 27 | @staticmethod | ||
| 28 | def __handle_problematic_filename(filename: str, callback) -> bytes: | ||
| 29 | """ This method takes a filename with a problematic name, | ||
| 30 | and safely applies it a `callback`.""" | ||
| 31 | tmpdirname = tempfile.mkdtemp() | ||
| 32 | fname = os.path.join(tmpdirname, "temp_file") | ||
| 33 | shutil.copy(filename, fname) | ||
| 34 | out = callback(fname) | ||
| 35 | shutil.rmtree(tmpdirname) | ||
| 36 | return out | ||
| 37 | |||
| 38 | def get_meta(self) -> Dict[str, Union[str, dict]]: | ||
| 39 | """ There is no way to escape the leading(s) dash(es) of the current | ||
| 40 | self.filename to prevent parameter injections, so we need to take care | ||
| 41 | of this. | ||
| 42 | """ | ||
| 43 | fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f]) | ||
| 44 | if re.search('^[a-z0-9/]', self.filename) is None: | ||
| 45 | out = self.__handle_problematic_filename(self.filename, fun) | ||
| 46 | else: | ||
| 47 | out = fun(self.filename) | ||
| 48 | meta = json.loads(out.decode('utf-8'))[0] | ||
| 49 | for key in self.meta_whitelist: | ||
| 50 | meta.pop(key, None) | ||
| 51 | return meta | ||
| 52 | |||
| 53 | class PNGParser(_ImageParser): | ||
| 54 | mimetypes = {'image/png', } | 17 | mimetypes = {'image/png', } |
| 55 | meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', | 18 | meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', |
| 56 | 'Directory', 'FileSize', 'FileModifyDate', | 19 | 'Directory', 'FileSize', 'FileModifyDate', |
| @@ -77,7 +40,7 @@ class PNGParser(_ImageParser): | |||
| 77 | return True | 40 | return True |
| 78 | 41 | ||
| 79 | 42 | ||
| 80 | class GdkPixbufAbstractParser(_ImageParser): | 43 | class GdkPixbufAbstractParser(exiftool.ExiftoolParser): |
| 81 | """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it, | 44 | """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it, |
| 82 | this has the side-effect of completely removing metadata. | 45 | this has the side-effect of completely removing metadata. |
| 83 | """ | 46 | """ |
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index 621640b..4a0ca0d 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py | |||
| @@ -18,6 +18,8 @@ def __load_all_parsers(): | |||
| 18 | continue | 18 | continue |
| 19 | elif fname.endswith('__init__.py'): | 19 | elif fname.endswith('__init__.py'): |
| 20 | continue | 20 | continue |
| 21 | elif fname.endswith('exiftool.py'): | ||
| 22 | continue | ||
| 21 | basename = os.path.basename(fname) | 23 | basename = os.path.basename(fname) |
| 22 | name, _ = os.path.splitext(basename) | 24 | name, _ = os.path.splitext(basename) |
| 23 | importlib.import_module('.' + name, package='libmat2') | 25 | importlib.import_module('.' + name, package='libmat2') |
diff --git a/libmat2/video.py b/libmat2/video.py new file mode 100644 index 0000000..b9f3687 --- /dev/null +++ b/libmat2/video.py | |||
| @@ -0,0 +1,58 @@ | |||
| 1 | import os | ||
| 2 | import subprocess | ||
| 3 | |||
| 4 | from . import exiftool | ||
| 5 | |||
| 6 | |||
| 7 | class AVIParser(exiftool.ExiftoolParser): | ||
| 8 | mimetypes = {'video/x-msvideo', } | ||
| 9 | meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', | ||
| 10 | 'FileSize', 'FileModifyDate', 'FileAccessDate', | ||
| 11 | 'FileInodeChangeDate', 'FilePermissions', 'FileType', | ||
| 12 | 'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate', | ||
| 13 | 'FrameCount', 'StreamCount', 'StreamType', 'VideoCodec', | ||
| 14 | 'VideoFrameRate', 'VideoFrameCount', 'Quality', | ||
| 15 | 'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight', | ||
| 16 | 'Planes', 'BitDepth', 'Compression', 'ImageLength', | ||
| 17 | 'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors', | ||
| 18 | 'NumImportantColors', 'NumColors', 'NumImportantColors', | ||
| 19 | 'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask', | ||
| 20 | 'ColorSpace', 'AudioCodec', 'AudioCodecRate', | ||
| 21 | 'AudioSampleCount', 'AudioSampleCount', | ||
| 22 | 'AudioSampleRate', 'Encoding', 'NumChannels', | ||
| 23 | 'SampleRate', 'AvgBytesPerSec', 'BitsPerSample', | ||
| 24 | 'Duration', 'ImageSize', 'Megapixels'} | ||
| 25 | |||
| 26 | def remove_all(self) -> bool: | ||
| 27 | """ | ||
| 28 | TODO: handle problematic filenames starting with `-` and `--`, | ||
| 29 | check exiftool.py | ||
| 30 | """ | ||
| 31 | cmd = [_get_ffmpeg_path(), | ||
| 32 | '-i', self.filename, # input file | ||
| 33 | '-y', # overwrite existing output file | ||
| 34 | '-loglevel', 'panic', # Don't show log | ||
| 35 | '-hide_banner', # hide the banner | ||
| 36 | '-codec', 'copy', # don't decode anything, just copy (speed!) | ||
| 37 | '-map_metadata', '-1', # remove superficial metadata | |
| 38 | '-map_chapters', '-1', # remove chapters | ||
| 39 | '-fflags', '+bitexact', # don't add any metadata | ||
| 40 | '-flags:v', '+bitexact', # don't add any metadata | ||
| 41 | '-flags:a', '+bitexact', # don't add any metadata | ||
| 42 | self.output_filename] | ||
| 43 | |||
| 44 | try: | ||
| 45 | subprocess.check_call(cmd) | ||
| 46 | except subprocess.CalledProcessError: # pragma: no cover | ||
| 47 | return False | ||
| 48 | |||
| 49 | return True | ||
| 50 | |||
| 51 | |||
| 52 | def _get_ffmpeg_path() -> str: # pragma: no cover | ||
| 53 | ffmpeg_path = '/usr/bin/ffmpeg' | ||
| 54 | if os.path.isfile(ffmpeg_path): | ||
| 55 | if os.access(ffmpeg_path, os.X_OK): | ||
| 56 | return ffmpeg_path | ||
| 57 | |||
| 58 | raise RuntimeError("Unable to find ffmpeg") | ||
| @@ -97,7 +97,13 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) | |||
| 97 | return False | 97 | return False |
| 98 | p.unknown_member_policy = policy | 98 | p.unknown_member_policy = policy |
| 99 | p.lightweight_cleaning = is_lightweight | 99 | p.lightweight_cleaning = is_lightweight |
| 100 | return p.remove_all() | 100 | |
| 101 | try: | ||
| 102 | return p.remove_all() | ||
| 103 | except RuntimeError as e: | ||
| 104 | print("[-] %s can't be cleaned: %s" % (filename, e)) | ||
| 105 | return False | ||
| 106 | |||
| 101 | 107 | ||
| 102 | 108 | ||
| 103 | def show_parsers() -> bool: | 109 | def show_parsers() -> bool: |
diff --git a/tests/data/dirty.avi b/tests/data/dirty.avi new file mode 100644 index 0000000..850feab --- /dev/null +++ b/tests/data/dirty.avi | |||
| Binary files differ | |||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 665bab0..37adc6a 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -6,12 +6,16 @@ import os | |||
| 6 | import zipfile | 6 | import zipfile |
| 7 | 7 | ||
| 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless | 8 | from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless |
| 9 | from libmat2 import check_dependencies | 9 | from libmat2 import check_dependencies, video |
| 10 | 10 | ||
| 11 | 11 | ||
| 12 | class TestCheckDependencies(unittest.TestCase): | 12 | class TestCheckDependencies(unittest.TestCase): |
| 13 | def test_deps(self): | 13 | def test_deps(self): |
| 14 | ret = check_dependencies() | 14 | try: |
| 15 | ret = check_dependencies() | ||
| 16 | except RuntimeError: | ||
| 17 | return # this happens if not every dependency is installed | ||
| 18 | |||
| 15 | for value in ret.values(): | 19 | for value in ret.values(): |
| 16 | self.assertTrue(value) | 20 | self.assertTrue(value) |
| 17 | 21 | ||
| @@ -471,3 +475,24 @@ class TestCleaning(unittest.TestCase): | |||
| 471 | os.remove('./tests/data/clean.txt') | 475 | os.remove('./tests/data/clean.txt') |
| 472 | os.remove('./tests/data/clean.cleaned.txt') | 476 | os.remove('./tests/data/clean.cleaned.txt') |
| 473 | os.remove('./tests/data/clean.cleaned.cleaned.txt') | 477 | os.remove('./tests/data/clean.cleaned.cleaned.txt') |
| 478 | |||
| 479 | def test_avi(self): | ||
| 480 | shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi') | ||
| 481 | p = video.AVIParser('./tests/data/clean.avi') | ||
| 482 | |||
| 483 | meta = p.get_meta() | ||
| 484 | self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1') | ||
| 485 | |||
| 486 | try: | ||
| 487 | ret = p.remove_all() | ||
| 488 | except RuntimeError: | ||
| 489 | return # this happens if ffmpeg is not installed | |
| 490 | self.assertTrue(ret) | ||
| 491 | |||
| 492 | p = video.AVIParser('./tests/data/clean.cleaned.avi') | ||
| 493 | self.assertEqual(p.get_meta(), {}) | ||
| 494 | self.assertTrue(p.remove_all()) | ||
| 495 | |||
| 496 | os.remove('./tests/data/clean.avi') | ||
| 497 | os.remove('./tests/data/clean.cleaned.avi') | ||
| 498 | os.remove('./tests/data/clean.cleaned.cleaned.avi') | ||
