summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitlab-ci.yml4
-rw-r--r--libmat2/__init__.py18
-rw-r--r--libmat2/abstract.py6
-rw-r--r--libmat2/exiftool.py61
-rw-r--r--libmat2/images.py45
-rw-r--r--libmat2/parser_factory.py2
-rw-r--r--libmat2/video.py58
-rwxr-xr-xmat28
-rw-r--r--tests/data/dirty.avibin0 -> 375688 bytes
-rw-r--r--tests/test_libmat2.py29
10 files changed, 170 insertions, 61 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4f0a140..32ec086 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -42,7 +42,7 @@ tests:debian:
42 stage: test 42 stage: test
43 script: 43 script:
44 - apt-get -qqy update 44 - apt-get -qqy update
45 - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage 45 - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
46 - python3-coverage run --branch -m unittest discover -s tests/ 46 - python3-coverage run --branch -m unittest discover -s tests/
47 - python3-coverage report --fail-under=100 -m --include 'libmat2/*' 47 - python3-coverage report --fail-under=100 -m --include 'libmat2/*'
48 48
@@ -62,5 +62,5 @@ tests:archlinux:
62 tags: 62 tags:
63 - whitewhale 63 - whitewhale
64 script: 64 script:
65 - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap 65 - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
66 - python3 setup.py test 66 - python3 setup.py test
diff --git a/libmat2/__init__.py b/libmat2/__init__.py
index f55a14c..399a364 100644
--- a/libmat2/__init__.py
+++ b/libmat2/__init__.py
@@ -1,11 +1,12 @@
1#!/usr/bin/env python3 1#!/usr/bin/env python3
2 2
3import os
4import collections 3import collections
5import enum 4import enum
6import importlib 5import importlib
7from typing import Dict, Optional 6from typing import Dict, Optional
8 7
8from . import exiftool, video
9
9# make pyflakes happy 10# make pyflakes happy
10assert Dict 11assert Dict
11assert Optional 12assert Optional
@@ -37,24 +38,13 @@ DEPENDENCIES = {
37 'mutagen': 'Mutagen', 38 'mutagen': 'Mutagen',
38 } 39 }
39 40
40def _get_exiftool_path() -> str: # pragma: no cover
41 exiftool_path = '/usr/bin/exiftool'
42 if os.path.isfile(exiftool_path):
43 if os.access(exiftool_path, os.X_OK):
44 return exiftool_path
45
46 # ArchLinux
47 exiftool_path = '/usr/bin/vendor_perl/exiftool'
48 if os.path.isfile(exiftool_path):
49 if os.access(exiftool_path, os.X_OK):
50 return exiftool_path
51 41
52 raise ValueError
53 42
54def check_dependencies() -> dict: 43def check_dependencies() -> dict:
55 ret = collections.defaultdict(bool) # type: Dict[str, bool] 44 ret = collections.defaultdict(bool) # type: Dict[str, bool]
56 45
57 ret['Exiftool'] = True if _get_exiftool_path() else False 46 ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
47 ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False
58 48
59 for key, value in DEPENDENCIES.items(): 49 for key, value in DEPENDENCIES.items():
60 ret[value] = True 50 ret[value] = True
diff --git a/libmat2/abstract.py b/libmat2/abstract.py
index 0084796..414a68b 100644
--- a/libmat2/abstract.py
+++ b/libmat2/abstract.py
@@ -7,7 +7,8 @@ assert Set # make pyflakes happy
7 7
8class AbstractParser(abc.ABC): 8class AbstractParser(abc.ABC):
9 """ This is the base class of every parser. 9 """ This is the base class of every parser.
10 It might yield `ValueError` on instantiation on invalid files. 10 It might yield `ValueError` on instantiation on invalid files,
11 and `RuntimeError` when something went wrong in `remove_all`.
11 """ 12 """
12 meta_list = set() # type: Set[str] 13 meta_list = set() # type: Set[str]
13 mimetypes = set() # type: Set[str] 14 mimetypes = set() # type: Set[str]
@@ -27,4 +28,7 @@ class AbstractParser(abc.ABC):
27 28
28 @abc.abstractmethod 29 @abc.abstractmethod
29 def remove_all(self) -> bool: 30 def remove_all(self) -> bool:
31 """
32 :raises RuntimeError: Raised if the cleaning process went wrong.
33 """
30 pass # pragma: no cover 34 pass # pragma: no cover
diff --git a/libmat2/exiftool.py b/libmat2/exiftool.py
new file mode 100644
index 0000000..e17d31b
--- /dev/null
+++ b/libmat2/exiftool.py
@@ -0,0 +1,61 @@
1import json
2import os
3import re
4import shutil
5import subprocess
6import tempfile
7
8from typing import Dict, Union, Set
9
10from . import abstract
11
12# Make pyflakes happy
13assert Set
14
15
class ExiftoolParser(abstract.AbstractParser):
    """ Exiftool is often the easiest way to get all the metadata
    from a file, hence why several parsers are re-using its `get_meta`
    method.
    """
    # Keys removed from exiftool's report by `get_meta`; subclasses override
    # this with the fields they want to leave out.
    meta_whitelist = set()  # type: Set[str]

    @staticmethod
    def __handle_problematic_filename(filename: str, callback) -> bytes:
        """ Copy `filename` to a neutrally named temporary file, apply
        `callback` to the path of that copy and return its result.

        The temporary directory is removed on the way out, even if the
        copy or the `callback` raises, so nothing is left behind in /tmp.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            fname = os.path.join(tmpdirname, "temp_file")
            shutil.copy(filename, fname)
            return callback(fname)

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ Return the metadata that `exiftool -json` reports for
        `self.filename`, without the keys listed in `meta_whitelist`.

        There is no way to escape the leading dash(es) of the current
        self.filename to prevent parameter injections, so a file whose name
        doesn't start with a lowercase letter, a digit or a slash is copied
        to a safely named temporary file before being handed to exiftool.

        :raises RuntimeError: Raised if exiftool can't be found.
        :raises subprocess.CalledProcessError: Raised if exiftool fails.
        """
        def run_exiftool(path: str) -> bytes:
            return subprocess.check_output([_get_exiftool_path(), '-json', path])

        if re.search('^[a-z0-9/]', self.filename) is None:
            out = self.__handle_problematic_filename(self.filename, run_exiftool)
        else:
            out = run_exiftool(self.filename)

        # exiftool emits a JSON list with one entry per processed file.
        meta = json.loads(out.decode('utf-8'))[0]
        for key in self.meta_whitelist:
            meta.pop(key, None)
        return meta
48
def _get_exiftool_path() -> str:  # pragma: no cover
    """ Return the path of an executable `exiftool`.

    The two well-known locations are tried first, in order; if neither is
    an executable file, the binary is looked up in `$PATH`.

    :raises RuntimeError: Raised if no executable exiftool can be found.
    """
    known_locations = (
        '/usr/bin/exiftool',
        '/usr/bin/vendor_perl/exiftool',  # ArchLinux
    )
    for candidate in known_locations:
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate

    # Installations outside of the usual prefixes (eg. /usr/local/bin).
    found = shutil.which('exiftool')
    if found:
        return found

    raise RuntimeError("Unable to find exiftool")
diff --git a/libmat2/images.py b/libmat2/images.py
index a29cbb7..ad80892 100644
--- a/libmat2/images.py
+++ b/libmat2/images.py
@@ -1,11 +1,6 @@
1import subprocess
2import imghdr 1import imghdr
3import json
4import os 2import os
5import shutil 3from typing import Set
6import tempfile
7import re
8from typing import Set, Dict, Union
9 4
10import cairo 5import cairo
11 6
@@ -13,44 +8,12 @@ import gi
13gi.require_version('GdkPixbuf', '2.0') 8gi.require_version('GdkPixbuf', '2.0')
14from gi.repository import GdkPixbuf 9from gi.repository import GdkPixbuf
15 10
16from . import abstract, _get_exiftool_path 11from . import exiftool
17 12
18# Make pyflakes happy 13# Make pyflakes happy
19assert Set 14assert Set
20 15
21class _ImageParser(abstract.AbstractParser): 16class PNGParser(exiftool.ExiftoolParser):
22 """ Since we use `exiftool` to get metadata from
23 all images fileformat, `get_meta` is implemented in this class,
24 and all the image-handling ones are inheriting from it."""
25 meta_whitelist = set() # type: Set[str]
26
27 @staticmethod
28 def __handle_problematic_filename(filename: str, callback) -> bytes:
29 """ This method takes a filename with a problematic name,
30 and safely applies it a `callback`."""
31 tmpdirname = tempfile.mkdtemp()
32 fname = os.path.join(tmpdirname, "temp_file")
33 shutil.copy(filename, fname)
34 out = callback(fname)
35 shutil.rmtree(tmpdirname)
36 return out
37
38 def get_meta(self) -> Dict[str, Union[str, dict]]:
39 """ There is no way to escape the leading(s) dash(es) of the current
40 self.filename to prevent parameter injections, so we need to take care
41 of this.
42 """
43 fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
44 if re.search('^[a-z0-9/]', self.filename) is None:
45 out = self.__handle_problematic_filename(self.filename, fun)
46 else:
47 out = fun(self.filename)
48 meta = json.loads(out.decode('utf-8'))[0]
49 for key in self.meta_whitelist:
50 meta.pop(key, None)
51 return meta
52
53class PNGParser(_ImageParser):
54 mimetypes = {'image/png', } 17 mimetypes = {'image/png', }
55 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 18 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
56 'Directory', 'FileSize', 'FileModifyDate', 19 'Directory', 'FileSize', 'FileModifyDate',
@@ -77,7 +40,7 @@ class PNGParser(_ImageParser):
77 return True 40 return True
78 41
79 42
80class GdkPixbufAbstractParser(_ImageParser): 43class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
81 """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it, 44 """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
82 this has the side-effect of completely removing metadata. 45 this has the side-effect of completely removing metadata.
83 """ 46 """
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py
index 621640b..4a0ca0d 100644
--- a/libmat2/parser_factory.py
+++ b/libmat2/parser_factory.py
@@ -18,6 +18,8 @@ def __load_all_parsers():
18 continue 18 continue
19 elif fname.endswith('__init__.py'): 19 elif fname.endswith('__init__.py'):
20 continue 20 continue
21 elif fname.endswith('exiftool.py'):
22 continue
21 basename = os.path.basename(fname) 23 basename = os.path.basename(fname)
22 name, _ = os.path.splitext(basename) 24 name, _ = os.path.splitext(basename)
23 importlib.import_module('.' + name, package='libmat2') 25 importlib.import_module('.' + name, package='libmat2')
diff --git a/libmat2/video.py b/libmat2/video.py
new file mode 100644
index 0000000..b9f3687
--- /dev/null
+++ b/libmat2/video.py
@@ -0,0 +1,58 @@
1import os
2import subprocess
3
4from . import exiftool
5
6
class AVIParser(exiftool.ExiftoolParser):
    """ Parser for AVI videos: metadata are read with exiftool (inherited
    `get_meta`) and removed by remuxing the file with ffmpeg.
    """
    mimetypes = {'video/x-msvideo', }
    # Fields reported by exiftool that `get_meta` leaves out of its result.
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
                      'FileSize', 'FileModifyDate', 'FileAccessDate',
                      'FileInodeChangeDate', 'FilePermissions', 'FileType',
                      'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
                      'FrameCount', 'StreamCount', 'StreamType', 'VideoCodec',
                      'VideoFrameRate', 'VideoFrameCount', 'Quality',
                      'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight',
                      'Planes', 'BitDepth', 'Compression', 'ImageLength',
                      'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
                      'NumImportantColors', 'RedMask', 'GreenMask', 'BlueMask',
                      'AlphaMask', 'ColorSpace', 'AudioCodec', 'AudioCodecRate',
                      'AudioSampleCount', 'AudioSampleRate', 'Encoding',
                      'NumChannels', 'SampleRate', 'AvgBytesPerSec',
                      'BitsPerSample', 'Duration', 'ImageSize', 'Megapixels'}

    @staticmethod
    def _as_operand(path: str) -> str:
        """ Return `path` in a form that ffmpeg can't mistake for an option:
        a name starting with a dash is prefixed with `./`, which designates
        the very same file.
        """
        if path.startswith('-'):
            return os.path.join('.', path)
        return path

    def remove_all(self) -> bool:
        """ Write a copy of `self.filename` without metadata nor chapters
        to `self.output_filename`, using ffmpeg in stream-copy mode.

        :returns: True on success, False if ffmpeg exited with an error.
        :raises RuntimeError: Raised if ffmpeg can't be found.
        """
        cmd = [_get_ffmpeg_path(),
               '-i', self._as_operand(self.filename),  # input file
               '-y',                      # overwrite existing output file
               '-loglevel', 'panic',      # Don't show log
               '-hide_banner',            # hide the banner
               '-codec', 'copy',          # don't decode anything, just copy (speed!)
               '-map_metadata', '-1',     # remove superficial metadata
               '-map_chapters', '-1',     # remove chapters
               '-fflags', '+bitexact',    # don't add any metadata
               '-flags:v', '+bitexact',   # don't add any metadata
               '-flags:a', '+bitexact',   # don't add any metadata
               self._as_operand(self.output_filename)]

        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:  # pragma: no cover
            return False

        return True
50
51
def _get_ffmpeg_path() -> str:  # pragma: no cover
    """ Return the path of an executable `ffmpeg`.

    `/usr/bin/ffmpeg` is preferred; if it isn't an executable file, the
    binary is looked up in `$PATH`.

    :raises RuntimeError: Raised if no executable ffmpeg can be found.
    """
    # Imported here because this module doesn't import `shutil` globally.
    import shutil

    ffmpeg_path = '/usr/bin/ffmpeg'
    if os.path.isfile(ffmpeg_path) and os.access(ffmpeg_path, os.X_OK):
        return ffmpeg_path

    # Installations outside of the usual prefix (eg. /usr/local/bin).
    found = shutil.which('ffmpeg')
    if found:
        return found

    raise RuntimeError("Unable to find ffmpeg")
diff --git a/mat2 b/mat2
index 1665576..01263b6 100755
--- a/mat2
+++ b/mat2
@@ -97,7 +97,13 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy)
97 return False 97 return False
98 p.unknown_member_policy = policy 98 p.unknown_member_policy = policy
99 p.lightweight_cleaning = is_lightweight 99 p.lightweight_cleaning = is_lightweight
100 return p.remove_all() 100
101 try:
102 return p.remove_all()
103 except RuntimeError as e:
104 print("[-] %s can't be cleaned: %s" % (filename, e))
105 return False
106
101 107
102 108
103def show_parsers() -> bool: 109def show_parsers() -> bool:
diff --git a/tests/data/dirty.avi b/tests/data/dirty.avi
new file mode 100644
index 0000000..850feab
--- /dev/null
+++ b/tests/data/dirty.avi
Binary files differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 665bab0..37adc6a 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,12 +6,16 @@ import os
6import zipfile 6import zipfile
7 7
8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless 8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
9from libmat2 import check_dependencies 9from libmat2 import check_dependencies, video
10 10
11 11
12class TestCheckDependencies(unittest.TestCase): 12class TestCheckDependencies(unittest.TestCase):
13 def test_deps(self): 13 def test_deps(self):
14 ret = check_dependencies() 14 try:
15 ret = check_dependencies()
16 except RuntimeError:
17 return # this happens if not every dependency is installed
18
15 for value in ret.values(): 19 for value in ret.values():
16 self.assertTrue(value) 20 self.assertTrue(value)
17 21
@@ -471,3 +475,24 @@ class TestCleaning(unittest.TestCase):
471 os.remove('./tests/data/clean.txt') 475 os.remove('./tests/data/clean.txt')
472 os.remove('./tests/data/clean.cleaned.txt') 476 os.remove('./tests/data/clean.cleaned.txt')
473 os.remove('./tests/data/clean.cleaned.cleaned.txt') 477 os.remove('./tests/data/clean.cleaned.cleaned.txt')
478
479 def test_avi(self):
480 shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi')
481 p = video.AVIParser('./tests/data/clean.avi')
482
483 meta = p.get_meta()
484 self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')
485
486 try:
487 ret = p.remove_all()
488 except RuntimeError:
489 return # this happens if ffmepg is not installed
490 self.assertTrue(ret)
491
492 p = video.AVIParser('./tests/data/clean.cleaned.avi')
493 self.assertEqual(p.get_meta(), {})
494 self.assertTrue(p.remove_all())
495
496 os.remove('./tests/data/clean.avi')
497 os.remove('./tests/data/clean.cleaned.avi')
498 os.remove('./tests/data/clean.cleaned.cleaned.avi')