From af36529554c39a2eefcc2c8723715e2d25b401b8 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 8 Jun 2014 13:39:18 +0200 Subject: Rename the MAT folder to libmat. This commit fixes some issues for dumb operating systems that don't handle capitalization. --- MAT/__init__.py | 1 - MAT/archive.py | 335 ------------------------------------ MAT/audio.py | 53 ------ MAT/bencode/__init__.py | 1 - MAT/bencode/bencode.py | 143 ---------------- MAT/exceptions.py | 14 -- MAT/exiftool.py | 78 --------- MAT/hachoir_editor/__init__.py | 8 - MAT/hachoir_editor/field.py | 69 -------- MAT/hachoir_editor/fieldset.py | 352 -------------------------------------- MAT/hachoir_editor/typed_field.py | 253 --------------------------- MAT/images.py | 52 ------ MAT/mat.py | 186 -------------------- MAT/misc.py | 76 -------- MAT/mutagenstripper.py | 33 ---- MAT/office.py | 191 --------------------- MAT/parser.py | 135 --------------- MAT/strippers.py | 70 -------- 18 files changed, 2050 deletions(-) delete mode 100644 MAT/__init__.py delete mode 100644 MAT/archive.py delete mode 100644 MAT/audio.py delete mode 100644 MAT/bencode/__init__.py delete mode 100644 MAT/bencode/bencode.py delete mode 100644 MAT/exceptions.py delete mode 100644 MAT/exiftool.py delete mode 100644 MAT/hachoir_editor/__init__.py delete mode 100644 MAT/hachoir_editor/field.py delete mode 100644 MAT/hachoir_editor/fieldset.py delete mode 100644 MAT/hachoir_editor/typed_field.py delete mode 100644 MAT/images.py delete mode 100644 MAT/mat.py delete mode 100644 MAT/misc.py delete mode 100644 MAT/mutagenstripper.py delete mode 100644 MAT/office.py delete mode 100644 MAT/parser.py delete mode 100644 MAT/strippers.py (limited to 'MAT') diff --git a/MAT/__init__.py b/MAT/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/MAT/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/MAT/archive.py b/MAT/archive.py deleted file mode 100644 index d483dcc..0000000 --- a/MAT/archive.py +++ /dev/null @@ -1,335 +0,0 @@ 
-''' Take care of archives formats -''' - -import datetime -import logging -import os -import shutil -import stat -import tarfile -import tempfile -import zipfile - -import mat -import parser - -# Zip files do not support dates older than 01/01/1980 -ZIP_EPOCH = (1980, 1, 1, 0, 0, 0) -ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0) - - datetime.datetime(1970, 1, 1, 1, 0, 0)).total_seconds() - - -class GenericArchiveStripper(parser.GenericParser): - ''' Represent a generic archive - ''' - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(GenericArchiveStripper, self).__init__(filename, - parser, mime, backup, is_writable, **kwargs) - self.compression = '' - self.add2archive = kwargs['add2archive'] - self.tempdir = tempfile.mkdtemp() - - def __del__(self): - ''' Remove the files inside the temp dir, - then remove the temp dir - ''' - for root, dirs, files in os.walk(self.tempdir): - for item in files: - path_file = os.path.join(root, item) - mat.secure_remove(path_file) - shutil.rmtree(self.tempdir) - - def is_clean(self, list_unsupported=False): - ''' Virtual method to check for harmul metadata - ''' - raise NotImplementedError - - def list_unsupported(self): - ''' Get a list of every non-supported files present in the archive - ''' - return self.is_clean(list_unsupported=True) - - def remove_all(self): - ''' Virtual method to remove all metadata - ''' - raise NotImplementedError - - -class ZipStripper(GenericArchiveStripper): - ''' Represent a zip file - ''' - def __is_zipfile_clean(self, fileinfo): - ''' Check if a ZipInfo object is clean of metadata added - by zip itself, independently of the corresponding file metadata - ''' - if fileinfo.comment != '': - return False - elif fileinfo.date_time != ZIP_EPOCH: - return False - elif fileinfo.create_system != 3: # 3 is UNIX - return False - return True - - def is_clean(self, list_unsupported=False): - ''' Check if the given file is clean from harmful metadata - When 
list_unsupported is True, the method returns a list - of all non-supported/archives files contained in the - archive. - ''' - ret_list = [] - zipin = zipfile.ZipFile(self.filename, 'r') - if zipin.comment != '' and not list_unsupported: - logging.debug('%s has a comment' % self.filename) - return False - for item in zipin.infolist(): - zipin.extract(item, self.tempdir) - path = os.path.join(self.tempdir, item.filename) - if not self.__is_zipfile_clean(item) and not list_unsupported: - logging.debug('%s from %s has compromising zipinfo' % - (item.filename, self.filename)) - return False - if os.path.isfile(path): - cfile = mat.create_class_file(path, False, add2archive=self.add2archive) - if cfile is not None: - if not cfile.is_clean(): - logging.debug('%s from %s has metadata' % (item.filename, self.filename)) - if not list_unsupported: - return False - else: - logging.info('%s\'s fileformat is not supported or harmless.' - % item.filename) - basename, ext = os.path.splitext(path) - if os.path.basename(item.filename) not in ('mimetype', '.rels'): - if ext not in parser.NOMETA: - if not list_unsupported: - return False - ret_list.append(item.filename) - zipin.close() - if list_unsupported: - return ret_list - return True - - def get_meta(self): - ''' Return all the metadata of a zip archive''' - zipin = zipfile.ZipFile(self.filename, 'r') - metadata = {} - if zipin.comment != '': - metadata['comment'] = zipin.comment - for item in zipin.infolist(): - zipinfo_meta = self.__get_zipinfo_meta(item) - if zipinfo_meta != {}: # zipinfo metadata - metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta) - zipin.extract(item, self.tempdir) - path = os.path.join(self.tempdir, item.filename) - if os.path.isfile(path): - cfile = mat.create_class_file(path, False, add2archive=self.add2archive) - if cfile is not None: - cfile_meta = cfile.get_meta() - if cfile_meta != {}: - metadata[item.filename] = str(cfile_meta) - else: - logging.info('%s\'s fileformat is not supported or 
harmless' - % item.filename) - zipin.close() - return metadata - - def __get_zipinfo_meta(self, zipinfo): - ''' Return all the metadata of a ZipInfo - ''' - metadata = {} - if zipinfo.comment != '': - metadata['comment'] = zipinfo.comment - if zipinfo.date_time != ZIP_EPOCH: - metadata['modified'] = zipinfo.date_time - if zipinfo.create_system != 3: # 3 is UNIX - metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown" - return metadata - - def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]): - ''' Remove all metadata from a zip archive, even thoses - added by Python's zipfile itself. It will not add - files starting with "begining_blacklist", or ending with - "ending_blacklist". This method also add files present in - whitelist to the archive. - ''' - zipin = zipfile.ZipFile(self.filename, 'r') - zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) - for item in zipin.infolist(): - zipin.extract(item, self.tempdir) - path = os.path.join(self.tempdir, item.filename) - - beginning = any((True for f in beginning_blacklist if item.filename.startswith(f))) - ending = any((True for f in ending_blacklist if item.filename.endswith(f))) - - if os.path.isfile(path) and not beginning and not ending: - cfile = mat.create_class_file(path, False, add2archive=self.add2archive) - if cfile is not None: - # Handle read-only files inside archive - old_stat = os.stat(path).st_mode - os.chmod(path, old_stat|stat.S_IWUSR) - cfile.remove_all() - os.chmod(path, old_stat) - logging.debug('Processing %s from %s' % (item.filename, self.filename)) - elif item.filename not in whitelist: - logging.info('%s\'s format is not supported or harmless' % item.filename) - basename, ext = os.path.splitext(path) - if not (self.add2archive or ext in parser.NOMETA): - continue - os.utime(path, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS)) - zipout.write(path, item.filename) - zipin.close() - zipout.close() - - logging.info('%s processed' % self.filename) - 
self.do_backup() - return True - - -class TarStripper(GenericArchiveStripper): - ''' Represent a tarfile archive - ''' - def _remove(self, current_file): - ''' Remove the meta added by tarfile itself to the file - ''' - current_file.mtime = 0 - current_file.uid = 0 - current_file.gid = 0 - current_file.uname = '' - current_file.gname = '' - return current_file - - def remove_all(self, whitelist=[]): - ''' Remove all harmful metadata from the tarfile. - The method will also add every files matching - whitelist in the produced archive. - ''' - tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') - tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') - for item in tarin.getmembers(): - tarin.extract(item, self.tempdir) - if item.isfile(): - path = os.path.join(self.tempdir, item.name) - cfile = mat.create_class_file(path, False, add2archive=self.add2archive) - if cfile is not None: - # Handle read-only files inside archive - old_stat = os.stat(path).st_mode - os.chmod(path, old_stat|stat.S_IWUSR) - cfile.remove_all() - os.chmod(path, old_stat) - elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: - logging.debug('%s\' format is either not supported or harmless' % item.name) - elif item.name in whitelist: - logging.debug('%s is not supported, but MAT was told to add it anyway.' 
- % item.name) - else: # Don't add the file to the archive - logging.debug('%s will not be added' % item.name) - continue - tarout.add(path, item.name, filter=self._remove) - tarin.close() - tarout.close() - self.do_backup() - return True - - def is_file_clean(self, current_file): - ''' Check metadatas added by tarfile - ''' - if current_file.mtime != 0: - return False - elif current_file.uid != 0: - return False - elif current_file.gid != 0: - return False - elif current_file.uname != '': - return False - elif current_file.gname != '': - return False - return True - - def is_clean(self, list_unsupported=False): - ''' Check if the file is clean from harmful metadatas - When list_unsupported is True, the method returns a list - of all non-supported/archives files contained in the - archive. - ''' - ret_list = [] - tarin = tarfile.open(self.filename, 'r' + self.compression) - for item in tarin.getmembers(): - if not self.is_file_clean(item) and not list_unsupported: - logging.debug('%s from %s has compromising tarinfo' % - (item.name, self.filename)) - return False - tarin.extract(item, self.tempdir) - path = os.path.join(self.tempdir, item.name) - if item.isfile(): - cfile = mat.create_class_file(path, False, add2archive=self.add2archive) - if cfile is not None: - if not cfile.is_clean(): - logging.debug('%s from %s has metadata' % - (item.name.decode("utf8"), self.filename)) - if not list_unsupported: - return False - # Nested archives are treated like unsupported files - elif isinstance(cfile, GenericArchiveStripper): - ret_list.append(item.name) - else: - logging.error('%s\'s format is not supported or harmless' % item.name) - if os.path.splitext(path)[1] not in parser.NOMETA: - if not list_unsupported: - return False - ret_list.append(item.name) - tarin.close() - if list_unsupported: - return ret_list - return True - - def get_meta(self): - ''' Return a dict with all the meta of the tarfile - ''' - tarin = tarfile.open(self.filename, 'r' + self.compression) - 
metadata = {} - for item in tarin.getmembers(): - current_meta = {} - if item.isfile(): - tarin.extract(item, self.tempdir) - path = os.path.join(self.tempdir, item.name) - class_file = mat.create_class_file(path, False, add2archive=self.add2archive) - if class_file is not None: - meta = class_file.get_meta() - if meta: - current_meta['file'] = str(meta) - else: - logging.error('%s\'s format is not supported or harmless' % item.name) - - if not self.is_file_clean(item): # if there is meta - current_meta['mtime'] = item.mtime - current_meta['uid'] = item.uid - current_meta['gid'] = item.gid - current_meta['uname'] = item.uname - current_meta['gname'] = item.gname - metadata[item.name] = str(current_meta) - tarin.close() - return metadata - - -class TerminalZipStripper(ZipStripper): - ''' Represent a terminal level archive. - This type of archive can not contain nested archives. - It is used for formats like docx, which are basically - ziped xml. - ''' - - -class GzipStripper(TarStripper): - ''' Represent a tar.gz archive - ''' - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) - self.compression = ':gz' - - -class Bzip2Stripper(TarStripper): - ''' Represent a tar.bz2 archive - ''' - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) - self.compression = ':bz2' diff --git a/MAT/audio.py b/MAT/audio.py deleted file mode 100644 index dae9d75..0000000 --- a/MAT/audio.py +++ /dev/null @@ -1,53 +0,0 @@ -''' Care about audio fileformat -''' - -try: - from mutagen.flac import FLAC - from mutagen.oggvorbis import OggVorbis -except ImportError: - pass - -import parser -import mutagenstripper - - -class MpegAudioStripper(parser.GenericParser): - ''' Represent mpeg audio file (mp3, ...) 
- ''' - def _should_remove(self, field): - return field.name in ("id3v1", "id3v2") - - -class OggStripper(mutagenstripper.MutagenStripper): - ''' Represent an ogg vorbis file - ''' - def _create_mfile(self): - self.mfile = OggVorbis(self.filename) - - -class FlacStripper(mutagenstripper.MutagenStripper): - ''' Represent a Flac audio file - ''' - def _create_mfile(self): - self.mfile = FLAC(self.filename) - - def remove_all(self): - ''' Remove the "metadata" block from the file - ''' - super(FlacStripper, self).remove_all() - self.mfile.clear_pictures() - self.mfile.save() - return True - - def is_clean(self): - ''' Check if the "metadata" block is present in the file - ''' - return super(FlacStripper, self).is_clean() and not self.mfile.pictures - - def get_meta(self): - ''' Return the content of the metadata block if present - ''' - metadata = super(FlacStripper, self).get_meta() - if self.mfile.pictures: - metadata['picture:'] = 'yes' - return metadata diff --git a/MAT/bencode/__init__.py b/MAT/bencode/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/MAT/bencode/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/MAT/bencode/bencode.py b/MAT/bencode/bencode.py deleted file mode 100644 index a0cc99a..0000000 --- a/MAT/bencode/bencode.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2007 by Petru Paler -# Copyright 2011 by Julien (jvoisin) Voisin -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# - -''' - A quick (and also nice) lib to bencode/bdecode torrent files -''' - - -class BTFailure(Exception): - '''Custom Exception''' - pass - - -class Bencached(object): - '''Custom type : cached string''' - __slots__ = ['bencoded'] - - def __init__(self, string): - self.bencoded = string - - -def decode_int(x, f): - '''decode an int''' - f += 1 - newf = x.index('e', f) - if x[f:f+1] == '-0': - raise ValueError - elif x[f] == '0' and newf != f + 1: - raise ValueError - return int(x[f:newf]), newf + 1 - - -def decode_string(x, f): - '''decode a string''' - colon = x.index(':', f) - if x[f] == '0' and colon != f + 1: - raise ValueError - n = int(x[f:colon]) - colon += 1 - return x[colon:colon + n], colon + n - - -def decode_list(x, f): - '''decode a list''' - result = [] - f += 1 - while x[f] != 'e': - v, f = DECODE_FUNC[x[f]](x, f) - result.append(v) - return result, f + 1 - - -def decode_dict(x, f): - '''decode a dict''' - result = {} - f += 1 - while x[f] != 'e': - k, f = decode_string(x, f) - result[k], f = DECODE_FUNC[x[f]](x, f) - return result, f + 1 - - -def encode_bool(x, r): - '''bencode a boolean''' - encode_int(1 if r else 0, r) - - -def encode_int(x, r): - '''bencode an integer/float''' - r.extend(('i', str(x), 'e')) - - -def encode_list(x, r): - '''bencode a list/tuple''' - r.append('l') - [ENCODE_FUNC[type(item)](item, r) for item in x] - r.append('e') - - -def encode_dict(x, result): - '''bencode a dict''' - result.append('d') - ilist = list(x.items()) - ilist.sort() - 
for k, v in ilist: - result.extend((str(len(k)), ':', k)) - ENCODE_FUNC[type(v)](v, result) - result.append('e') - - -DECODE_FUNC = {str(x):decode_string for x in range(9)} -DECODE_FUNC['l'] = decode_list -DECODE_FUNC['d'] = decode_dict -DECODE_FUNC['i'] = decode_int - - -ENCODE_FUNC = {} -ENCODE_FUNC[Bencached] = lambda x, r: r.append(x.bencoded) -ENCODE_FUNC[int] = encode_int -ENCODE_FUNC[int] = encode_int -ENCODE_FUNC[bytes] = lambda x, r: r.extend((str(len(x)), ':', x)) -ENCODE_FUNC[list] = encode_list -ENCODE_FUNC[tuple] = encode_list -ENCODE_FUNC[dict] = encode_dict -ENCODE_FUNC[bool] = encode_bool - - -def bencode(string): - '''bencode $string''' - table = [] - ENCODE_FUNC[type(string)](string, table) - return ''.join(table) - - -def bdecode(string): - '''decode $string''' - try: - result, lenght = DECODE_FUNC[string[0]](string, 0) - except (IndexError, KeyError, ValueError): - raise BTFailure('Not a valid bencoded string') - if lenght != len(string): - raise BTFailure('Invalid bencoded value (data after valid prefix)') - return result diff --git a/MAT/exceptions.py b/MAT/exceptions.py deleted file mode 100644 index 47da15c..0000000 --- a/MAT/exceptions.py +++ /dev/null @@ -1,14 +0,0 @@ -''' Base exceptions for MAT -''' - - -class UnableToRemoveFile(Exception): - '''This exception is raised when a file could not be removed - ''' - pass - -class UnableToWriteFile(Exception): - '''This exception is raised when a file - can could not be chmod +w - ''' - pass diff --git a/MAT/exiftool.py b/MAT/exiftool.py deleted file mode 100644 index 9e38f04..0000000 --- a/MAT/exiftool.py +++ /dev/null @@ -1,78 +0,0 @@ -''' Care about images with help of the amazing (perl) library Exiftool. 
-''' - -import parser -import subprocess - - -class ExiftoolStripper(parser.GenericParser): - ''' A generic stripper class using exiftool as backend - ''' - - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(ExiftoolStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) - self.allowed = set(['ExifTool Version Number', 'File Name', 'Directory', - 'File Size', 'File Modification Date/Time', 'File Access Date/Time', 'File Permissions', - 'File Type', 'MIME Type', 'Image Width', 'Image Height', - 'Image Size', 'File Inode Change Date/Time']) - self._set_allowed() - - def _set_allowed(self): - ''' Virtual method. Set the allowed/harmless list of metadata - ''' - raise NotImplementedError - - def remove_all(self): - ''' Remove all metadata with help of exiftool - ''' - try: - if self.backup: - self.create_backup_copy() - # Note: '-All=' must be followed by a known exiftool option. - subprocess.call(['exiftool', '-m', '-all=', - '-adobe=', '-overwrite_original', self.filename], - stdout=open('/dev/null')) - return True - except: - return False - - def is_clean(self): - ''' Check if the file is clean with the help of exiftool - ''' - return not self.get_meta() - - def get_meta(self): - ''' Return every harmful meta with help of exiftool. 
- Exiftool output looks like this: - field name : value - field name : value - ''' - output = subprocess.Popen(['exiftool', self.filename], - stdout=subprocess.PIPE).communicate()[0] - meta = {} - for i in output.split('\n')[:-1]: # chop last char ('\n') - key = i.split(':')[0].strip() - if key not in self.allowed: - meta[key] = i.split(':')[1].strip() # add the field name to the metadata set - return meta - - -class JpegStripper(ExiftoolStripper): - ''' Care about jpeg files with help - of exiftool - ''' - def _set_allowed(self): - self.allowed.update(['JFIF Version', 'Resolution Unit', - 'X Resolution', 'Y Resolution', 'Encoding Process', - 'Bits Per Sample', 'Color Components', 'Y Cb Cr Sub Sampling']) - - -class PngStripper(ExiftoolStripper): - ''' Care about png files with help - of exiftool - ''' - def _set_allowed(self): - self.allowed.update(['Bit Depth', 'Color Type', - 'Compression', 'Filter', 'Interlace', 'Pixels Per Unit X', - 'Pixels Per Unit Y', 'Pixel Units', 'Significant Bits', - 'Background Color', 'SRGB Rendering']) diff --git a/MAT/hachoir_editor/__init__.py b/MAT/hachoir_editor/__init__.py deleted file mode 100644 index 1835676..0000000 --- a/MAT/hachoir_editor/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from field import ( - EditorError, FakeField) -from typed_field import ( - EditableField, EditableBits, EditableBytes, - EditableInteger, EditableString, - createEditableField) -from fieldset import EditableFieldSet, NewFieldSet, createEditor - diff --git a/MAT/hachoir_editor/field.py b/MAT/hachoir_editor/field.py deleted file mode 100644 index 6b1efe3..0000000 --- a/MAT/hachoir_editor/field.py +++ /dev/null @@ -1,69 +0,0 @@ -from hachoir_core.error import HachoirError -from hachoir_core.field import joinPath, MissingField - -class EditorError(HachoirError): - pass - -class FakeField(object): - """ - This class have API looks similar to Field API, but objects don't contain - any value: all values are _computed_ by parent methods. 
- - Example: FakeField(editor, "abc").size calls editor._getFieldSize("abc"). - """ - is_field_set = False - - def __init__(self, parent, name): - self._parent = parent - self._name = name - - def _getPath(self): - return joinPath(self._parent.path, self._name) - path = property(_getPath) - - def _getName(self): - return self._name - name = property(_getName) - - def _getAddress(self): - return self._parent._getFieldAddress(self._name) - address = property(_getAddress) - - def _getSize(self): - return self._parent.input[self._name].size - size = property(_getSize) - - def _getValue(self): - return self._parent.input[self._name].value - value = property(_getValue) - - def createDisplay(self): - # TODO: Returns new value if field is altered - return self._parent.input[self._name].display - display = property(createDisplay) - - def _getParent(self): - return self._parent - parent = property(_getParent) - - def hasValue(self): - return self._parent.input[self._name].hasValue() - - def __getitem__(self, key): - # TODO: Implement this function! 
- raise MissingField(self, key) - - def _isAltered(self): - return False - is_altered = property(_isAltered) - - def writeInto(self, output): - size = self.size - addr = self._parent._getFieldInputAddress(self._name) - input = self._parent.input - stream = input.stream - if size % 8: - output.copyBitsFrom(stream, addr, size, input.endian) - else: - output.copyBytesFrom(stream, addr, size//8) - diff --git a/MAT/hachoir_editor/fieldset.py b/MAT/hachoir_editor/fieldset.py deleted file mode 100644 index b7c9b07..0000000 --- a/MAT/hachoir_editor/fieldset.py +++ /dev/null @@ -1,352 +0,0 @@ -from hachoir_core.dict import UniqKeyError -from hachoir_core.field import MissingField, Float32, Float64, FakeArray -from hachoir_core.compatibility import any -from hachoir_core.i18n import _ -from typed_field import createEditableField -from field import EditorError -from collections import deque # Python 2.4 -import weakref # Python 2.1 -import struct - -class EditableFieldSet(object): - MAX_SIZE = (1 << 40) # Arbitrary limit to catch errors - is_field_set = True - - def __init__(self, parent, fieldset): - self._parent = parent - self.input = fieldset # original FieldSet - self._fields = {} # cache of editable fields - self._deleted = set() # Names of deleted fields - self._inserted = {} # Inserted field (name => list of field, - # where name is the name after) - - def array(self, key): - # FIXME: Use cache? - return FakeArray(self, key) - - def _getParent(self): - return self._parent - parent = property(_getParent) - - def _isAltered(self): - if self._inserted: - return True - if self._deleted: - return True - return any(field.is_altered for field in self._fields.itervalues()) - is_altered = property(_isAltered) - - def reset(self): - """ - Reset the field set and the input field set. 
- """ - for key, field in self._fields.iteritems(): - if not field.is_altered: - del self._fields[key] - self.input.reset() - - def __len__(self): - return len(self.input) \ - - len(self._deleted) \ - + sum( len(new) for new in self._inserted.itervalues() ) - - def __iter__(self): - for field in self.input: - name = field.name - if name in self._inserted: - for newfield in self._inserted[name]: - yield weakref.proxy(newfield) - if name not in self._deleted: - yield self[name] - if None in self._inserted: - for newfield in self._inserted[None]: - yield weakref.proxy(newfield) - - def insertBefore(self, name, *new_fields): - self._insert(name, new_fields, False) - - def insertAfter(self, name, *new_fields): - self._insert(name, new_fields, True) - - def insert(self, *new_fields): - self._insert(None, new_fields, True) - - def _insert(self, key, new_fields, next): - """ - key is the name of the field before which new_fields - will be inserted. If next is True, the fields will be inserted - _after_ this field. - """ - # Set unique field name - for field in new_fields: - if field._name.endswith("[]"): - self.input.setUniqueFieldName(field) - - # Check that there is no duplicate in inserted fields - new_names = list(field.name for field in new_fields) - names_set = set(new_names) - if len(names_set) != len(new_fields): - duplicates = (name for name in names_set if 1 < new_names.count(name)) - raise UniqKeyError(_("Duplicates in inserted fields: %s") % ", ".join(duplicates)) - - # Check that field names are not in input - if self.input: # Write special version for NewFieldSet? 
- for name in new_names: - if name in self.input and name not in self._deleted: - raise UniqKeyError(_("Field name '%s' already exists") % name) - - # Check that field names are not in inserted fields - for fields in self._inserted.itervalues(): - for field in fields: - if field.name in new_names: - raise UniqKeyError(_("Field name '%s' already exists") % field.name) - - # Input have already inserted field? - if key in self._inserted: - if next: - self._inserted[key].extend( reversed(new_fields) ) - else: - self._inserted[key].extendleft( reversed(new_fields) ) - return - - # Whould like to insert in inserted fields? - if key: - for fields in self._inserted.itervalues(): - names = [item.name for item in fields] - try: - pos = names.index(key) - except ValueError: - continue - if 0 <= pos: - if next: - pos += 1 - fields.rotate(-pos) - fields.extendleft( reversed(new_fields) ) - fields.rotate(pos) - return - - # Get next field. Use None if we are at the end. - if next: - index = self.input[key].index + 1 - try: - key = self.input[index].name - except IndexError: - key = None - - # Check that field names are not in input - if key not in self.input: - raise MissingField(self, key) - - # Insert in original input - self._inserted[key]= deque(new_fields) - - def _getDescription(self): - return self.input.description - description = property(_getDescription) - - def _getStream(self): - # FIXME: This property is maybe a bad idea since address may be differents - return self.input.stream - stream = property(_getStream) - - def _getName(self): - return self.input.name - name = property(_getName) - - def _getEndian(self): - return self.input.endian - endian = property(_getEndian) - - def _getAddress(self): - if self._parent: - return self._parent._getFieldAddress(self.name) - else: - return 0 - address = property(_getAddress) - - def _getAbsoluteAddress(self): - address = self.address - current = self._parent - while current: - address += current.address - current = 
current._parent - return address - absolute_address = property(_getAbsoluteAddress) - - def hasValue(self): - return False -# return self._parent.input[self.name].hasValue() - - def _getSize(self): - if self.is_altered: - return sum(field.size for field in self) - else: - return self.input.size - size = property(_getSize) - - def _getPath(self): - return self.input.path - path = property(_getPath) - - def _getOriginalField(self, name): - assert name in self.input - return self.input[name] - - def _getFieldInputAddress(self, name): - """ - Absolute address of a field from the input field set. - """ - assert name in self.input - return self.input[name].absolute_address - - def _getFieldAddress(self, name): - """ - Compute relative address of a field. The operation takes care of - deleted and resized fields. - """ - #assert name not in self._deleted - addr = 0 - for field in self: - if field.name == name: - return addr - addr += field.size - raise MissingField(self, name) - - def _getItemByPath(self, path): - if not path[0]: - path = path[1:] - field = self - for name in path: - field = field[name] - return field - - def __contains__(self, name): - try: - field = self[name] - return (field is not None) - except MissingField: - return False - - def __getitem__(self, key): - """ - Create a weak reference to an editable field (EditableField) for the - field with specified name. If the field is removed later, using the - editable field will raise a weakref.ReferenceError exception. - - May raise a MissingField error if the field doesn't exist in original - field set or it has been deleted. 
- """ - if "/" in key: - return self._getItemByPath(key.split("/")) - if isinstance(key, (int, long)): - raise EditorError("Integer index are not supported") - - if (key in self._deleted) or (key not in self.input): - raise MissingField(self, key) - if key not in self._fields: - field = self.input[key] - if field.is_field_set: - self._fields[key] = createEditableFieldSet(self, field) - else: - self._fields[key] = createEditableField(self, field) - return weakref.proxy(self._fields[key]) - - def __delitem__(self, name): - """ - Remove a field from the field set. May raise an MissingField exception - if the field has already been deleted. - """ - parts = name.partition('/') - if parts[2]: - fieldset = self[parts[0]] - del fieldset[parts[2]] - return - if name in self._deleted: - raise MissingField(self, name) - self._deleted.add(name) - if name in self._fields: - del self._fields[name] - - def writeInto(self, output): - """ - Write the content if this field set into the output stream - (OutputStream). 
- """ - if not self.is_altered: - # Not altered: just copy bits/bytes - input = self.input - if input.size % 8: - output.copyBitsFrom(input.stream, - input.absolute_address, input.size, input.endian) - else: - output.copyBytesFrom(input.stream, - input.absolute_address, input.size//8) - else: - # Altered: call writeInto() method of each field - realaddr = 0 - for field in self: - field.writeInto(output) - realaddr += field.size - - def _getValue(self): - raise EditorError('Field set "%s" has no value' % self.path) - def _setValue(self, value): - raise EditorError('Field set "%s" value is read only' % self.path) - value = property(_getValue, _setValue, "Value of field") - -class EditableFloat(EditableFieldSet): - _value = None - - def _isAltered(self): - return (self._value is not None) - is_altered = property(_isAltered) - - def writeInto(self, output): - if self._value is not None: - self._write(output) - else: - EditableFieldSet.writeInto(self, output) - - def _write(self, output): - format = self.input.struct_format - raw = struct.pack(format, self._value) - output.writeBytes(raw) - - def _setValue(self, value): - self.parent._is_altered = True - self._value = value - value = property(EditableFieldSet._getValue, _setValue) - -def createEditableFieldSet(parent, field): - cls = field.__class__ - # FIXME: Support Float80 - if cls in (Float32, Float64): - return EditableFloat(parent, field) - else: - return EditableFieldSet(parent, field) - -class NewFieldSet(EditableFieldSet): - def __init__(self, parent, name): - EditableFieldSet.__init__(self, parent, None) - self._name = name - self._endian = parent.endian - - def __iter__(self): - if None in self._inserted: - return iter(self._inserted[None]) - else: - raise StopIteration() - - def _getName(self): - return self._name - name = property(_getName) - - def _getEndian(self): - return self._endian - endian = property(_getEndian) - - is_altered = property(lambda self: True) - -def createEditor(fieldset): - return 
EditableFieldSet(None, fieldset) - diff --git a/MAT/hachoir_editor/typed_field.py b/MAT/hachoir_editor/typed_field.py deleted file mode 100644 index 0f0427b..0000000 --- a/MAT/hachoir_editor/typed_field.py +++ /dev/null @@ -1,253 +0,0 @@ -from hachoir_core.field import ( - RawBits, Bit, Bits, PaddingBits, - RawBytes, Bytes, PaddingBytes, - GenericString, Character, - isInteger, isString) -from field import FakeField - -class EditableField(FakeField): - """ - Pure virtual class used to write editable field class. - """ - - _is_altered = False - def __init__(self, parent, name, value=None): - FakeField.__init__(self, parent, name) - self._value = value - - def _isAltered(self): - return self._is_altered - is_altered = property(_isAltered) - - def hasValue(self): - return True - - def _computeSize(self): - raise NotImplementedError() - def _getValue(self): - return self._value - def _setValue(self, value): - self._value = value - - def _propGetValue(self): - if self._value is not None: - return self._getValue() - else: - return FakeField._getValue(self) - def _propSetValue(self, value): - self._setValue(value) - self._is_altered = True - value = property(_propGetValue, _propSetValue) - - def _getSize(self): - if self._value is not None: - return self._computeSize() - else: - return FakeField._getSize(self) - size = property(_getSize) - - def _write(self, output): - raise NotImplementedError() - - def writeInto(self, output): - if self._is_altered: - self._write(output) - else: - return FakeField.writeInto(self, output) - -class EditableFixedField(EditableField): - """ - Editable field with fixed size. 
- """ - - def __init__(self, parent, name, value=None, size=None): - EditableField.__init__(self, parent, name, value) - if size is not None: - self._size = size - else: - self._size = self._parent._getOriginalField(self._name).size - - def _getSize(self): - return self._size - size = property(_getSize) - -class EditableBits(EditableFixedField): - def __init__(self, parent, name, *args): - if args: - if len(args) != 2: - raise TypeError( - "Wrong argument count, EditableBits constructor prototype is: " - "(parent, name, [size, value])") - size = args[0] - value = args[1] - assert isinstance(value, (int, long)) - else: - size = None - value = None - EditableFixedField.__init__(self, parent, name, value, size) - if args: - self._setValue(args[1]) - self._is_altered = True - - def _setValue(self, value): - if not(0 <= value < (1 << self._size)): - raise ValueError("Invalid value, must be in range %s..%s" - % (0, (1 << self._size) - 1)) - self._value = value - - def _write(self, output): - output.writeBits(self._size, self._value, self._parent.endian) - -class EditableBytes(EditableField): - def _setValue(self, value): - if not value: raise ValueError( - "Unable to set empty string to a EditableBytes field") - self._value = value - - def _computeSize(self): - return len(self._value) * 8 - - def _write(self, output): - output.writeBytes(self._value) - -class EditableString(EditableField): - MAX_SIZE = { - "Pascal8": (1 << 8)-1, - "Pascal16": (1 << 16)-1, - "Pascal32": (1 << 32)-1, - } - - def __init__(self, parent, name, *args, **kw): - if len(args) == 2: - value = args[1] - assert isinstance(value, str) # TODO: support Unicode - elif not args: - value = None - else: - raise TypeError( - "Wrong argument count, EditableString constructor prototype is:" - "(parent, name, [format, value])") - EditableField.__init__(self, parent, name, value) - if len(args) == 2: - self._charset = kw.get('charset', None) - self._format = args[0] - if self._format in 
GenericString.PASCAL_FORMATS: - self._prefix_size = GenericString.PASCAL_FORMATS[self._format] - else: - self._prefix_size = 0 - self._suffix_str = GenericString.staticSuffixStr( - self._format, self._charset, self._parent.endian) - self._is_altered = True - else: - orig = self._parent._getOriginalField(name) - self._charset = orig.charset - self._format = orig.format - self._prefix_size = orig.content_offset - self._suffix_str = orig.suffix_str - - def _setValue(self, value): - size = len(value) - if self._format in self.MAX_SIZE and self.MAX_SIZE[self._format] < size: - raise ValueError("String is too big") - self._value = value - - def _computeSize(self): - return (self._prefix_size + len(self._value) + len(self._suffix_str))*8 - - def _write(self, output): - if self._format in GenericString.SUFFIX_FORMAT: - output.writeBytes(self._value) - output.writeBytes(self._suffix_str) - elif self._format == "fixed": - output.writeBytes(self._value) - else: - assert self._format in GenericString.PASCAL_FORMATS - size = GenericString.PASCAL_FORMATS[self._format] - output.writeInteger(len(self._value), False, size, self._parent.endian) - output.writeBytes(self._value) - -class EditableCharacter(EditableFixedField): - def __init__(self, parent, name, *args): - if args: - if len(args) != 3: - raise TypeError( - "Wrong argument count, EditableCharacter " - "constructor prototype is: (parent, name, [value])") - value = args[0] - if not isinstance(value, str) or len(value) != 1: - raise TypeError("EditableCharacter needs a character") - else: - value = None - EditableFixedField.__init__(self, parent, name, value, 8) - if args: - self._is_altered = True - - def _setValue(self, value): - if not isinstance(value, str) or len(value) != 1: - raise TypeError("EditableCharacter needs a character") - self._value = value - - def _write(self, output): - output.writeBytes(self._value) - -class EditableInteger(EditableFixedField): - VALID_VALUE_SIGNED = { - 8: (-(1 << 8), (1 << 8)-1), - 16: 
(-(1 << 15), (1 << 15)-1), - 32: (-(1 << 31), (1 << 31)-1), - } - VALID_VALUE_UNSIGNED = { - 8: (0, (1 << 8)-1), - 16: (0, (1 << 16)-1), - 32: (0, (1 << 32)-1) - } - - def __init__(self, parent, name, *args): - if args: - if len(args) != 3: - raise TypeError( - "Wrong argument count, EditableInteger constructor prototype is: " - "(parent, name, [signed, size, value])") - size = args[1] - value = args[2] - assert isinstance(value, (int, long)) - else: - size = None - value = None - EditableFixedField.__init__(self, parent, name, value, size) - if args: - self._signed = args[0] - self._is_altered = True - else: - self._signed = self._parent._getOriginalField(self._name).signed - - def _setValue(self, value): - if self._signed: - valid = self.VALID_VALUE_SIGNED - else: - valid = self.VALID_VALUE_UNSIGNED - minval, maxval = valid[self._size] - if not(minval <= value <= maxval): - raise ValueError("Invalid value, must be in range %s..%s" - % (minval, maxval)) - self._value = value - - def _write(self, output): - output.writeInteger( - self.value, self._signed, self._size//8, self._parent.endian) - -def createEditableField(fieldset, field): - if isInteger(field): - cls = EditableInteger - elif isString(field): - cls = EditableString - elif field.__class__ in (RawBytes, Bytes, PaddingBytes): - cls = EditableBytes - elif field.__class__ in (RawBits, Bits, Bit, PaddingBits): - cls = EditableBits - elif field.__class__ == Character: - cls = EditableCharacter - else: - cls = FakeField - return cls(fieldset, field.name) - diff --git a/MAT/images.py b/MAT/images.py deleted file mode 100644 index 67c710f..0000000 --- a/MAT/images.py +++ /dev/null @@ -1,52 +0,0 @@ -''' Takes care about pictures formats - -References: - - JFIF: http://www.ecma-international.org/publications/techreports/E-TR-098.htm - - PNG: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html - - PNG: http://www.w3.org/TR/PNG-Chunks.html -''' - -import parser - - -class 
JpegStripper(parser.GenericParser): - ''' Represents a jpeg file. - Custom Huffman and Quantization tables - are stripped: they may leak - some info, and the quality loss is minor. - ''' - def _should_remove(self, field): - ''' Return True if the field is compromising - ''' - field_list = frozenset([ - 'start_image', # start of the image - 'app0', # JFIF data - 'start_frame', # specify width, height, number of components - 'start_scan', # specify which slice of data the top-to-bottom scan contains - 'data', # actual data - 'end_image']) # end of the image - if field.name in field_list: - return False - elif field.name.startswith('quantization['): # custom Quant. tables - return False - elif field.name.startswith('huffman['): # custom Huffman tables - return False - return True - - -class PngStripper(parser.GenericParser): - ''' Represents a png file - ''' - def _should_remove(self, field): - ''' Return True if the field is compromising - ''' - field_list = frozenset([ - 'id', - 'header', # PNG header - 'physical', # the intended pixel size or aspect ratio - 'end']) # end of the image - if field.name in field_list: - return False - if field.name.startswith('data['): # data - return False - return True diff --git a/MAT/mat.py b/MAT/mat.py deleted file mode 100644 index 5b1fbda..0000000 --- a/MAT/mat.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/env python - -''' Metadata anonymisation toolkit library -''' - -import logging -import mimetypes -import os -import subprocess -import xml.sax - -import hachoir_core.cmd_line -import hachoir_parser - -import MAT.exceptions - -__version__ = '0.5.2' -__author__ = 'jvoisin' - -#Silence -LOGGING_LEVEL = logging.CRITICAL -hachoir_core.config.quiet = True -fname = '' - -#Verbose -#LOGGING_LEVEL = logging.DEBUG -#hachoir_core.config.quiet = False -#logname = 'report.log' - -logging.basicConfig(filename=fname, level=LOGGING_LEVEL) - -import strippers # this is loaded here because we need LOGGING_LEVEL - - -def get_logo(): - ''' 
Return the path to the logo - ''' - if os.path.isfile('./data/mat.png'): - return './data/mat.png' - elif os.path.isfile('/usr/share/pixmaps/mat.png'): - return '/usr/share/pixmaps/mat.png' - elif os.path.isfile('/usr/local/share/pixmaps/mat.png'): - return '/usr/local/share/pixmaps/mat.png' - - -def get_datadir(): - ''' Return the path to the data directory - ''' - if os.path.isdir('./data/'): - return './data/' - elif os.path.isdir('/usr/local/share/mat/'): - return '/usr/local/share/mat/' - elif os.path.isdir('/usr/share/mat/'): - return '/usr/share/mat/' - - -def list_supported_formats(): - ''' Return a list of all locally supported fileformat. - It parses that FORMATS file, and removes locally - non-supported formats. - ''' - handler = XMLParser() - parser = xml.sax.make_parser() - parser.setContentHandler(handler) - path = os.path.join(get_datadir(), 'FORMATS') - with open(path, 'r') as xmlfile: - parser.parse(xmlfile) - - localy_supported = [] - for item in handler.list: - if item['mimetype'].split(',')[0] in strippers.STRIPPERS: - localy_supported.append(item) - - return localy_supported - - -class XMLParser(xml.sax.handler.ContentHandler): - ''' Parse the supported format xml, and return a corresponding - list of dict - ''' - def __init__(self): - self.dict = {} - self.list = [] - self.content, self.key = '', '' - self.between = False - - def startElement(self, name, attrs): - ''' Called when entering into xml tag - ''' - self.between = True - self.key = name - self.content = '' - - def endElement(self, name): - ''' Called when exiting a xml tag - ''' - if name == 'format': # leaving a fileformat section - self.list.append(self.dict.copy()) - self.dict.clear() - else: - content = self.content.replace('\s', ' ') - self.dict[self.key] = content - self.between = False - - def characters(self, characters): - ''' Concatenate the content between opening and closing tags - ''' - if self.between: - self.content += characters - - -def secure_remove(filename): - ''' 
Securely remove the file - ''' - # I want the file removed, even if it's ro - try: - os.chmod(filename, 220) - except OSError: - logging.error('Unable to add write rights to %s' % filename) - raise MAT.exceptions.UnableToWriteFile - - try: - if not subprocess.call(['shred', '--remove', filename]): - return True - else: - raise OSError - except OSError: - logging.error('Unable to securely remove %s' % filename) - - try: - os.remove(filename) - except OSError: - logging.error('Unable to remove %s' % filename) - raise MAT.exceptions.UnableToRemoveFile - - return True - - -def create_class_file(name, backup, **kwargs): - ''' Return a $FILETYPEStripper() class, - corresponding to the filetype of the given file - ''' - if not os.path.isfile(name): # check if the file exists - logging.error('%s is not a valid file' % name) - return None - - if not os.access(name, os.R_OK): # check read permissions - logging.error('%s is is not readable' % name) - return None - - if not os.path.getsize(name): - #check if the file is not empty (hachoir crash on empty files) - logging.error('%s is empty' % name) - return None - - filename = '' - try: - filename = hachoir_core.cmd_line.unicodeFilename(name) - except TypeError: # get rid of "decoding Unicode is not supported" - filename = name - - parser = hachoir_parser.createParser(filename) - if not parser: - logging.info('Unable to parse %s' % filename) - return None - - mime = parser.mime_type - - if mime == 'application/zip': # some formats are zipped stuff - if mimetypes.guess_type(name)[0]: - mime = mimetypes.guess_type(name)[0] - - if mime.startswith('application/vnd.oasis.opendocument'): - mime = 'application/opendocument' # opendocument fileformat - elif mime.startswith('application/vnd.openxmlformats-officedocument'): - mime = 'application/officeopenxml' # office openxml - - is_writable = os.access(name, os.W_OK) - - try: - stripper_class = strippers.STRIPPERS[mime] - except KeyError: - logging.info('Don\'t have stripper for %s 
format' % mime) - return None - - return stripper_class(filename, parser, mime, backup, is_writable, **kwargs) diff --git a/MAT/misc.py b/MAT/misc.py deleted file mode 100644 index 450f381..0000000 --- a/MAT/misc.py +++ /dev/null @@ -1,76 +0,0 @@ -''' Care about misc formats -''' - -import parser - -from bencode import bencode - - -class TorrentStripper(parser.GenericParser): - ''' Represent a torrent file with the help - of the bencode lib from Petru Paler - ''' - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(TorrentStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) - self.fields = frozenset(['announce', 'info', 'name', 'path', 'piece length', 'pieces', - 'length', 'files', 'announce-list', 'nodes', 'httpseeds', 'private', 'root hash']) - - def __get_key_recursively(self, dictionary): - ''' Get recursively all keys from a dict and - its subdicts - ''' - for i, j in list(dictionary.items()): - if isinstance(j, dict): - return set([i]).union(self.__get_key_recursively(j)) - return set([i]) - - def is_clean(self): - ''' Check if the file is clean from harmful metadata - ''' - with open(self.filename, 'r') as f: - decoded = bencode.bdecode(f.read()) - return self.fields.issuperset(self.__get_key_recursively(decoded)) - - def __get_meta_recursively(self, dictionary): - ''' Get recursively all harmful metadata - ''' - d = dict() - for i, j in list(dictionary.items()): - if i not in self.fields: - d[i] = j - elif isinstance(j, dict): - d = dict(d.items() + list(self.__get_meta_recursively(j).items())) - return d - - def get_meta(self): - ''' Return a dict with all the meta of the file - ''' - with open(self.filename, 'r') as f: - decoded = bencode.bdecode(f.read()) - return self.__get_meta_recursively(decoded) - - def __remove_all_recursively(self, dictionary): - ''' Remove recursively all compromizing fields - ''' - d = dict() - for i, j in [i for i in list(dictionary.items()) if i in self.fields]: - if 
isinstance(j, dict): - d = dict(list(d.items()) + list(self.__get_meta_recursively(j).items())) - else: - d[i] = j - return d - - def remove_all(self): - ''' Remove all comprimizing fields - ''' - decoded = '' - with open(self.filename, 'r') as f: - decoded = bencode.bdecode(f.read()) - - cleaned = {i: j for i, j in list(decoded.items()) if i in self.fields} - - with open(self.output, 'w') as f: # encode the decoded torrent - f.write(bencode.bencode(cleaned)) # and write it in self.output - - self.do_backup() - return True diff --git a/MAT/mutagenstripper.py b/MAT/mutagenstripper.py deleted file mode 100644 index 403c9a7..0000000 --- a/MAT/mutagenstripper.py +++ /dev/null @@ -1,33 +0,0 @@ -''' Take care of mutagen-supported formats (audio) -''' - -import parser - - -class MutagenStripper(parser.GenericParser): - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(MutagenStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) - self._create_mfile() - - def _create_mfile(self): - raise NotImplementedError - - def is_clean(self): - return not self.mfile.tags - - def remove_all(self): - if self.backup: - self.create_backup_copy() - self.mfile.delete() - self.mfile.save() - return True - - def get_meta(self): - ''' - Return the content of the metadata block is present - ''' - metadata = {} - if self.mfile.tags: - for key, value in self.mfile.tags: - metadata[key] = value - return metadata diff --git a/MAT/office.py b/MAT/office.py deleted file mode 100644 index 0ca1ff1..0000000 --- a/MAT/office.py +++ /dev/null @@ -1,191 +0,0 @@ -''' Care about office's formats - -''' - -import logging -import os -import shutil -import tempfile -import xml.dom.minidom as minidom -import zipfile - -try: - import cairo - from gi.repository import Poppler -except ImportError: - logging.info('office.py loaded without PDF support') - pass - -import parser -import archive - - -class OpenDocumentStripper(archive.TerminalZipStripper): - ''' 
An open document file is a zip, with xml file into. - The one that interest us is meta.xml - ''' - - def get_meta(self): - ''' Return a dict with all the meta of the file by - trying to read the meta.xml file. - ''' - metadata = super(OpenDocumentStripper, self).get_meta() - zipin = zipfile.ZipFile(self.filename, 'r') - try: - content = zipin.read('meta.xml') - dom1 = minidom.parseString(content) - elements = dom1.getElementsByTagName('office:meta') - for i in elements[0].childNodes: - if i.tagName != 'meta:document-statistic': - nodename = ''.join(i.nodeName.split(':')[1:]) - metadata[nodename] = ''.join([j.data for j in i.childNodes]) - else: - # thank you w3c for not providing a nice - # method to get all attributes of a node - pass - except KeyError: # no meta.xml file found - logging.debug('%s has no opendocument metadata' % self.filename) - zipin.close() - return metadata - - def remove_all(self): - ''' Removes metadata - ''' - return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) - - def is_clean(self): - ''' Check if the file is clean from harmful metadatas - ''' - clean_super = super(OpenDocumentStripper, self).is_clean() - if clean_super is False: - return False - - zipin = zipfile.ZipFile(self.filename, 'r') - try: - zipin.getinfo('meta.xml') - except KeyError: # no meta.xml in the file - return True - zipin.close() - return False - - -class OpenXmlStripper(archive.TerminalZipStripper): - ''' Represent an office openxml document, which is like - an opendocument format, with some tricky stuff added. - It contains mostly xml, but can have media blobs, crap, ... - (I don't like this format.) - ''' - def remove_all(self): - return super(OpenXmlStripper, self).remove_all( - beginning_blacklist=('docProps/'), whitelist=('.rels')) - - def is_clean(self): - ''' Check if the file is clean from harmful metadatas. - This implementation is faster than something like - "return this.get_meta() == {}". 
- ''' - clean_super = super(OpenXmlStripper, self).is_clean() - if clean_super is False: - return False - - zipin = zipfile.ZipFile(self.filename, 'r') - for item in zipin.namelist(): - if item.startswith('docProps/'): - return False - zipin.close() - return True - - def get_meta(self): - ''' Return a dict with all the meta of the file - ''' - metadata = super(OpenXmlStripper, self).get_meta() - - zipin = zipfile.ZipFile(self.filename, 'r') - for item in zipin.namelist(): - if item.startswith('docProps/'): - metadata[item] = 'harmful content' - zipin.close() - return metadata - - -class PdfStripper(parser.GenericParser): - ''' Represent a PDF file - ''' - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) - self.uri = 'file://' + os.path.abspath(self.filename) - self.password = None - try: - self.pdf_quality = kwargs['low_pdf_quality'] - except KeyError: - self.pdf_quality = False - - self.meta_list = frozenset(['title', 'author', 'subject', - 'keywords', 'creator', 'producer', 'metadata']) - - def is_clean(self): - ''' Check if the file is clean from harmful metadatas - ''' - document = Poppler.Document.new_from_file(self.uri, self.password) - for key in self.meta_list: - if document.get_property(key): - return False - return True - - def remove_all(self): - ''' Opening the PDF with poppler, then doing a render - on a cairo pdfsurface for each pages. - - http://cairographics.org/documentation/pycairo/2/ - - The use of an intermediate tempfile is necessary because - python-cairo segfaults on unicode. 
- See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457 - ''' - document = Poppler.Document.new_from_file(self.uri, self.password) - try: - output = tempfile.mkstemp()[1] - page = document.get_page(0) - # assume that every pages are the same size - page_width, page_height = page.get_size() - surface = cairo.PDFSurface(output, page_width, page_height) - context = cairo.Context(surface) # context draws on the surface - logging.debug('PDF rendering of %s' % self.filename) - for pagenum in range(document.get_n_pages()): - page = document.get_page(pagenum) - context.translate(0, 0) - if self.pdf_quality: - page.render(context) # render the page on context - else: - page.render_for_printing(context) # render the page on context - context.show_page() # draw context on surface - surface.finish() - shutil.move(output, self.output) - except: - logging.error('Something went wrong when cleaning %s.' % self.filename) - return False - - try: - import pdfrw # For now, poppler cannot write meta, so we must use pdfrw - logging.debug('Removing %s\'s superficial metadata' % self.filename) - trailer = pdfrw.PdfReader(self.output) - trailer.Info.Producer = None - trailer.Info.Creator = None - writer = pdfrw.PdfWriter() - writer.trailer = trailer - writer.write(self.output) - self.do_backup() - except: - logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output) - return False - return True - - def get_meta(self): - ''' Return a dict with all the meta of the file - ''' - document = Poppler.Document.new_from_file(self.uri, self.password) - metadata = {} - for key in self.meta_list: - if document.get_property(key): - metadata[key] = document.get_property(key) - return metadata diff --git a/MAT/parser.py b/MAT/parser.py deleted file mode 100644 index 1765da8..0000000 --- a/MAT/parser.py +++ /dev/null @@ -1,135 +0,0 @@ -''' Parent class of all parser -''' - -import os -import shutil -import tempfile - -import hachoir_core -import hachoir_editor - 
-import mat - -NOMETA = frozenset(( - '.bmp', # "raw" image - '.rdf', # text - '.txt', # plain text - '.xml', # formated text (XML) - '.rels', # openXML formated text -)) - -FIELD = object() - - -class GenericParser(object): - ''' Parent class of all parsers - ''' - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - self.filename = '' - self.parser = parser - self.mime = mime - self.backup = backup - self.is_writable = is_writable - self.editor = hachoir_editor.createEditor(parser) - try: - self.filename = hachoir_core.cmd_line.unicodeFilename(filename) - except TypeError: # get rid of "decoding Unicode is not supported" - self.filename = filename - self.basename = os.path.basename(filename) - _, output = tempfile.mkstemp() - self.output = hachoir_core.cmd_line.unicodeFilename(output) - - def __del__(self): - ''' Remove tempfile if it was not used - ''' - if os.path.exists(self.output): - mat.secure_remove(self.output) - - def is_clean(self): - ''' - Check if the file is clean from harmful metadatas - ''' - for field in self.editor: - if self._should_remove(field): - return self._is_clean(self.editor) - return True - - def _is_clean(self, fieldset): - for field in fieldset: - remove = self._should_remove(field) - if remove is True: - return False - if remove is FIELD: - if not self._is_clean(field): - return False - return True - - def remove_all(self): - ''' Remove all compromising fields - ''' - state = self._remove_all(self.editor) - hachoir_core.field.writeIntoFile(self.editor, self.output) - self.do_backup() - return state - - def _remove_all(self, fieldset): - ''' Recursive way to handle tree metadatas - ''' - try: - for field in fieldset: - remove = self._should_remove(field) - if remove is True: - self._remove(fieldset, field.name) - if remove is FIELD: - self._remove_all(field) - return True - except: - return False - - def _remove(self, fieldset, field): - ''' Delete the given field - ''' - del fieldset[field] - - def 
get_meta(self): - ''' Return a dict with all the meta of the file - ''' - metadata = {} - self._get_meta(self.editor, metadata) - return metadata - - def _get_meta(self, fieldset, metadata): - ''' Recursive way to handle tree metadatas - ''' - for field in fieldset: - remove = self._should_remove(field) - if remove: - try: - metadata[field.name] = field.value - except: - metadata[field.name] = 'harmful content' - if remove is FIELD: - self._get_meta(field, None) - - def _should_remove(self, key): - ''' Return True if the field is compromising - abstract method - ''' - raise NotImplementedError - - def create_backup_copy(self): - ''' Create a backup copy - ''' - shutil.copy2(self.filename, self.filename + '.bak') - - def do_backup(self): - ''' Keep a backup of the file if asked. - - The process of double-renaming is not very elegant, - but it greatly simplify new strippers implementation. - ''' - if self.backup: - shutil.move(self.filename, self.filename + '.bak') - else: - mat.secure_remove(self.filename) - shutil.move(self.output, self.filename) diff --git a/MAT/strippers.py b/MAT/strippers.py deleted file mode 100644 index aea98da..0000000 --- a/MAT/strippers.py +++ /dev/null @@ -1,70 +0,0 @@ -''' Manage which fileformat can be processed -''' - -import archive -import audio -import gi -import images -import logging -import mat -import misc -import office -import subprocess - -STRIPPERS = { - 'application/x-tar': archive.TarStripper, - 'application/x-bzip2': archive.Bzip2Stripper, - 'application/x-gzip': archive.GzipStripper, - 'application/zip': archive.ZipStripper, - 'audio/mpeg': audio.MpegAudioStripper, - 'application/x-bittorrent': misc.TorrentStripper, - 'application/opendocument': office.OpenDocumentStripper, - 'application/officeopenxml': office.OpenXmlStripper, -} - -logging.basicConfig(level=mat.LOGGING_LEVEL) - -# PDF support -pdfSupport = True -try: - from gi.repository import Poppler -except ImportError: - logging.info('Unable to import Poppler: no 
PDF support') - pdfSupport = False - -try: - import cairo -except ImportError: - logging.info('Unable to import python-cairo: no PDF support') - pdfSupport = False - -try: - import pdfrw -except ImportError: - logging.info('Unable to import python-pdfrw: no PDf support') - pdfSupport = False - -if pdfSupport: - STRIPPERS['application/x-pdf'] = office.PdfStripper - STRIPPERS['application/pdf'] = office.PdfStripper - - -# audio format support with mutagen-python -try: - import mutagen - STRIPPERS['audio/x-flac'] = audio.FlacStripper - STRIPPERS['audio/vorbis'] = audio.OggStripper - STRIPPERS['audio/mpeg'] = audio.MpegAudioStripper -except ImportError: - logging.info('Unable to import python-mutagen: limited audio format support') - -# exiftool -try: - subprocess.check_output(['exiftool', '-ver']) - import exiftool - STRIPPERS['image/jpeg'] = exiftool.JpegStripper - STRIPPERS['image/png'] = exiftool.PngStripper -except OSError: # if exiftool is not installed, use hachoir instead - logging.info('Unable to find exiftool: limited images support') - STRIPPERS['image/jpeg'] = images.JpegStripper - STRIPPERS['image/png'] = images.PngStripper -- cgit v1.3