''' Take care of archives formats
'''

import datetime
import logging
import os
import shutil
import stat
import tarfile
import tempfile
import zipfile

import mat
import parser

# Zip files do not support dates older than 01/01/1980
ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
# Seconds between the Unix epoch and the zip epoch, used to reset the
# mtime of files before re-adding them to a zip archive.
# NOTE(review): the 1970-01-01 *01:00:00* reference looks like a UTC+1
# compensation hack - confirm before changing it.
ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
                     - datetime.datetime(1970, 1, 1, 1, 0, 0)).total_seconds()


class GenericArchiveStripper(parser.GenericParser):
    ''' Represent a generic archive
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(GenericArchiveStripper, self).__init__(filename,
                parser, mime, backup, is_writable, **kwargs)
        # tarfile mode suffix (e.g. ':gz'), overridden by subclasses
        self.compression = ''
        # whether unsupported files should still be added to the output
        self.add2archive = kwargs['add2archive']
        # scratch directory where archive members are extracted
        self.tempdir = tempfile.mkdtemp()

    def __del__(self):
        ''' Securely remove the files inside the temp dir,
            then remove the temp dir itself.
        '''
        # NOTE(review): relying on __del__ for cleanup is fragile (not
        # guaranteed at interpreter shutdown); an explicit close() or a
        # context manager would be safer - left as-is to keep the API.
        for root, dirs, files in os.walk(self.tempdir):
            for item in files:
                path_file = os.path.join(root, item)
                mat.secure_remove(path_file)
        shutil.rmtree(self.tempdir)

    def is_clean(self, list_unsupported=False):
        ''' Virtual method to check for harmful metadata
        '''
        raise NotImplementedError

    def list_unsupported(self):
        ''' Get a list of every non-supported file present in the archive
        '''
        return self.is_clean(list_unsupported=True)

    def remove_all(self):
        ''' Virtual method to remove all metadata
        '''
        raise NotImplementedError


class ZipStripper(GenericArchiveStripper):
    ''' Represent a zip file
    '''
    def __is_zipfile_clean(self, fileinfo):
        ''' Check if a ZipInfo object is clean of metadata added
            by zip itself, independently of the corresponding file metadata
        '''
        if fileinfo.comment != '':
            return False
        elif fileinfo.date_time != ZIP_EPOCH:
            return False
        elif fileinfo.create_system != 3:  # 3 is UNIX
            return False
        return True

    def is_clean(self, list_unsupported=False):
        ''' Check if the given file is clean from harmful metadata.
            When list_unsupported is True, the method returns a list
            of all non-supported/archives files contained in the
            archive instead of a boolean.
        '''
        ret_list = []
        zipin = zipfile.ZipFile(self.filename, 'r')
        if zipin.comment != '' and not list_unsupported:
            logging.debug('%s has a comment', self.filename)
            return False
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)
            if not self.__is_zipfile_clean(item) and not list_unsupported:
                logging.debug('%s from %s has compromising zipinfo',
                              item.filename, self.filename)
                return False
            if os.path.isfile(path):
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    if not cfile.is_clean():
                        logging.debug('%s from %s has metadata', item.filename, self.filename)
                        if not list_unsupported:
                            return False
                else:
                    logging.info('%s\'s fileformat is not supported or harmless.',
                                 item.filename)
                    ext = os.path.splitext(path)[1]
                    # 'mimetype' and '.rels' are structural files of odf/ooxml
                    # containers, not user content
                    if os.path.basename(item.filename) not in ('mimetype', '.rels'):
                        if ext not in parser.NOMETA:
                            if not list_unsupported:
                                return False
                            ret_list.append(item.filename)
        zipin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return all the metadata of a zip archive'''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        if zipin.comment != '':
            metadata['comment'] = zipin.comment
        for item in zipin.infolist():
            zipinfo_meta = self.__get_zipinfo_meta(item)
            if zipinfo_meta != {}:  # zipinfo metadata
                metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(path):
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    cfile_meta = cfile.get_meta()
                    if cfile_meta != {}:
                        metadata[item.filename] = str(cfile_meta)
                else:
                    logging.info('%s\'s fileformat is not supported or harmless',
                                 item.filename)
        zipin.close()
        return metadata

    def __get_zipinfo_meta(self, zipinfo):
        ''' Return all the metadata of a ZipInfo
        '''
        metadata = {}
        if zipinfo.comment != '':
            metadata['comment'] = zipinfo.comment
        if zipinfo.date_time != ZIP_EPOCH:
            metadata['modified'] = zipinfo.date_time
        if zipinfo.create_system != 3:  # 3 is UNIX
            metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
        return metadata

    def remove_all(self, whitelist=None, beginning_blacklist=None, ending_blacklist=None):
        ''' Remove all metadata from a zip archive, even those
            added by Python's zipfile itself. It will not add
            files starting with "beginning_blacklist", or ending with
            "ending_blacklist". This method also adds files present in
            whitelist to the archive.
        '''
        # Sentinel instead of mutable defaults: a shared [] default would be
        # carried over between calls.
        whitelist = [] if whitelist is None else whitelist
        beginning_blacklist = [] if beginning_blacklist is None else beginning_blacklist
        ending_blacklist = [] if ending_blacklist is None else ending_blacklist

        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)

            beginning = any(item.filename.startswith(f) for f in beginning_blacklist)
            ending = any(item.filename.endswith(f) for f in ending_blacklist)

            if os.path.isfile(path) and not beginning and not ending:
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    # Handle read-only files inside archive
                    old_stat = os.stat(path).st_mode
                    os.chmod(path, old_stat | stat.S_IWUSR)
                    cfile.remove_all()
                    os.chmod(path, old_stat)
                    logging.debug('Processing %s from %s', item.filename, self.filename)
                elif item.filename not in whitelist:
                    logging.info('%s\'s format is not supported or harmless', item.filename)
                    ext = os.path.splitext(path)[1]
                    if not (self.add2archive or ext in parser.NOMETA):
                        continue
                # Reset the mtime so that the rebuilt archive carries no
                # meaningful timestamps
                os.utime(path, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
                zipout.write(path, item.filename)
        zipin.close()
        zipout.close()

        logging.info('%s processed', self.filename)
        self.do_backup()
        return True


class TarStripper(GenericArchiveStripper):
    ''' Represent a tarfile archive
    '''
    def _remove(self, current_file):
        ''' Remove the meta added by tarfile itself to the file
        '''
        current_file.mtime = 0
        current_file.uid = 0
        current_file.gid = 0
        current_file.uname = ''
        current_file.gname = ''
        return current_file

    def remove_all(self, whitelist=None):
        ''' Remove all harmful metadata from the tarfile.
            The method will also add every file matching
            whitelist to the produced archive.
        '''
        # Sentinel instead of a shared mutable default
        whitelist = [] if whitelist is None else whitelist
        tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
        tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
        for item in tarin.getmembers():
            tarin.extract(item, self.tempdir)
            # BUGFIX: compute the path for every member. The original only
            # set it for regular files, so non-file members (directories, …)
            # were re-added under the stale path of the previous iteration.
            path = os.path.join(self.tempdir, item.name)
            if item.isfile():
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    # Handle read-only files inside archive
                    old_stat = os.stat(path).st_mode
                    os.chmod(path, old_stat | stat.S_IWUSR)
                    cfile.remove_all()
                    os.chmod(path, old_stat)
                elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
                    logging.debug('%s\' format is either not supported or harmless', item.name)
                elif item.name in whitelist:
                    logging.debug('%s is not supported, but MAT was told to add it anyway.',
                                  item.name)
                else:  # Don't add the file to the archive
                    logging.debug('%s will not be added', item.name)
                    continue
            # _remove strips the tar-level metadata (owner, mtime, …)
            tarout.add(path, item.name, filter=self._remove)
        tarin.close()
        tarout.close()
        self.do_backup()
        return True

    def is_file_clean(self, current_file):
        ''' Check metadata added by tarfile itself (owner, timestamps)
        '''
        if current_file.mtime != 0:
            return False
        elif current_file.uid != 0:
            return False
        elif current_file.gid != 0:
            return False
        elif current_file.uname != '':
            return False
        elif current_file.gname != '':
            return False
        return True

    def is_clean(self, list_unsupported=False):
        ''' Check if the file is clean from harmful metadata.
            When list_unsupported is True, the method returns a list
            of all non-supported/archives files contained in the
            archive instead of a boolean.
        '''
        ret_list = []
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        for item in tarin.getmembers():
            if not self.is_file_clean(item) and not list_unsupported:
                logging.debug('%s from %s has compromising tarinfo',
                              item.name, self.filename)
                return False
            tarin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.name)
            if item.isfile():
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    if not cfile.is_clean():
                        logging.debug('%s from %s has metadata',
                                      item.name.decode("utf8"), self.filename)
                        if not list_unsupported:
                            return False
                        # Nested archives are treated like unsupported files
                        elif isinstance(cfile, GenericArchiveStripper):
                            ret_list.append(item.name)
                else:
                    logging.error('%s\'s format is not supported or harmless', item.name)
                    if os.path.splitext(path)[1] not in parser.NOMETA:
                        if not list_unsupported:
                            return False
                        ret_list.append(item.name)
        tarin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return a dict with all the meta of the tarfile
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        metadata = {}
        for item in tarin.getmembers():
            current_meta = {}
            if item.isfile():
                tarin.extract(item, self.tempdir)
                path = os.path.join(self.tempdir, item.name)
                class_file = mat.create_class_file(path, False, add2archive=self.add2archive)
                if class_file is not None:
                    meta = class_file.get_meta()
                    if meta:
                        current_meta['file'] = str(meta)
                else:
                    logging.error('%s\'s format is not supported or harmless', item.name)

                if not self.is_file_clean(item):  # if there is meta
                    current_meta['mtime'] = item.mtime
                    current_meta['uid'] = item.uid
                    current_meta['gid'] = item.gid
                    current_meta['uname'] = item.uname
                    current_meta['gname'] = item.gname
                    metadata[item.name] = str(current_meta)
        tarin.close()
        return metadata


class TerminalZipStripper(ZipStripper):
    ''' Represent a terminal level archive.
        This type of archive can not contain nested archives.
        It is used for formats like docx, which are basically
        zipped xml.
    '''


class GzipStripper(TarStripper):
    ''' Represent a tar.gz archive
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        self.compression = ':gz'


class Bzip2Stripper(TarStripper):
    ''' Represent a tar.bz2 archive
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        self.compression = ':bz2'