From bbe17fd511b5890fb4554447e23d666f6c13b745 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Wed, 15 Jan 2014 02:42:39 +0000
Subject: Add support for zipfiles!

---
 MAT/archive.py   | 140 ++++++++++++++++++++++++-----------------
 MAT/office.py    | 187 ++++++++++++++++---------------------------------------
 MAT/strippers.py |   2 +
 3 files changed, 138 insertions(+), 191 deletions(-)

(limited to 'MAT')

diff --git a/MAT/archive.py b/MAT/archive.py
index 9179e48..53c5e9b 100644
--- a/MAT/archive.py
+++ b/MAT/archive.py
@@ -1,6 +1,7 @@
 ''' Take care of archives formats
 '''
 
+import datetime
 import logging
 import os
 import shutil
@@ -11,12 +12,17 @@ import zipfile
 import mat
 import parser
 
+ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
+ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
+        - datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds()
+
 
 class GenericArchiveStripper(parser.GenericParser):
     ''' Represent a generic archive
     '''
     def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
-        super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
+        super(GenericArchiveStripper, self).__init__(filename,
+                parser, mime, backup, is_writable, **kwargs)
         self.compression = ''
         self.add2archive = kwargs['add2archive']
         self.tempdir = tempfile.mkdtemp()
@@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser):
 class ZipStripper(GenericArchiveStripper):
     ''' Represent a zip file
     '''
-    def is_file_clean(self, fileinfo):
+    def __is_zipfile_clean(self, fileinfo):
         ''' Check if a ZipInfo object is clean of metadatas added
             by zip itself, independently of the corresponding file metadatas
         '''
         if fileinfo.comment != '':
             return False
-        elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0):
+        elif fileinfo.date_time != ZIP_EPOCH:
             return False
         elif fileinfo.create_system != 3:  # 3 is UNIX
             return False
@@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper):
             logging.debug('%s has a comment' % self.filename)
             return False
         for item in zipin.infolist():
-            # I have not found a way to remove the crap added by zipfile :/
-            # if not self.is_file_clean(item):
-            #    logging.debug('%s from %s has compromising zipinfo' %
-            #        (item.filename, self.filename))
-            #    return False
             zipin.extract(item, self.tempdir)
             name = os.path.join(self.tempdir, item.filename)
+            if not self.__is_zipfile_clean(item) and not list_unsupported:
+                logging.debug('%s from %s has compromising zipinfo' %
+                        (item.filename, self.filename))
+                return False
             if os.path.isfile(name):
                 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
                 if cfile:
                     if not cfile.is_clean():
-                        return False
+                        logging.debug('%s from %s has compromising zipinfo' %
+                                (item.filename, self.filename))
+                        if not list_unsupported:
+                            return False
+                        ret_list.append(item.filename)
                 else:
-                    logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename)
+                    logging.info('%s\'s fileformat is not supported or harmless.'
+                            % item.filename)
                     basename, ext = os.path.splitext(name)
-                    bname = os.path.basename(item.filename)
-                    if ext not in parser.NOMETA:
-                        if bname != 'mimetype' and bname != '.rels':
-                            if list_unsupported:
-                                ret_list.append(bname)
-                            else:
+                    if os.path.basename(item.filename) not in ('mimetype', '.rels'):
+                        if ext not in parser.NOMETA:
+                            if not list_unsupported:
                                 return False
+                            ret_list.append(item.filename)
         zipin.close()
         if list_unsupported:
             return ret_list
         return True
 
     def get_meta(self):
-        ''' Return all the metadata of a ZipFile (don't return metadatas
-            of contained files : should it ?)
-        '''
+        ''' Return all the metadata of a zip archive'''
         zipin = zipfile.ZipFile(self.filename, 'r')
         metadata = {}
-        for field in zipin.infolist():
-            zipmeta = {}
-            if field.comment != '':
-                zipmeta['comment'] = field.comment
-            if field.date_time != (1980, 1, 1, 0, 0, 0):
-                zipmeta['modified'] = field.date_time
-            if field.create_system != 3:  # 3 is UNIX
-                zipmeta['system'] = "windows" if field.create_system == 2 else "unknown"
         if zipin.comment != '':
-            metadata["%s comment" % self.filename] = zipin.comment
+            metadata['comment'] = zipin.comment
+        for item in zipin.infolist():
+            zipinfo_meta = self.__get_zipinfo_meta(item)
+            if zipinfo_meta != {}:  # zipinfo metadata
+                metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
+            zipin.extract(item, self.tempdir)
+            name = os.path.join(self.tempdir, item.filename)
+            if os.path.isfile(name):
+                cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
+                if cfile:
+                    cfile_meta = cfile.get_meta()
+                    if cfile_meta != {}:
+                        metadata[item.filename] = str(cfile_meta)
+                else:
+                    logging.info('%s\'s fileformat is not supported or harmless'
+                            % item.filename)
         zipin.close()
         return metadata
 
-    def remove_all(self):
-        ''' So far, the zipfile module does not allow to write a ZipInfo
-            object into a zipfile (and it's a shame !) : so data added
-            by zipfile itself could not be removed. It's a big concern.
-            Is shipping a patched version of zipfile.py a good idea ?
+    def __get_zipinfo_meta(self, zipinfo):
+        ''' Return all the metadata of a ZipInfo
+        '''
+        metadata = {}
+        if zipinfo.comment != '':
+            metadata['comment'] = zipinfo.comment
+        if zipinfo.date_time != ZIP_EPOCH:
+            metadata['modified'] = zipinfo.date_time
+        if zipinfo.create_system != 3:  # 3 is UNIX
+            metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
+        return metadata
+
+    def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]):
+        ''' Remove all metadata from a zip archive, even that
+            added by Python's zipfile itself. It will not add
+            files starting with an entry of "beginning_blacklist", or
+            ending with an entry of "ending_blacklist". This method
+            also adds files present in whitelist to the archive.
         '''
         zipin = zipfile.ZipFile(self.filename, 'r')
         zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
         for item in zipin.infolist():
             zipin.extract(item, self.tempdir)
             name = os.path.join(self.tempdir, item.filename)
-            if os.path.isfile(name):
-                try:
-                    cfile = mat.create_class_file(name, False,
-                        add2archive=self.add2archive)
+
+            beginning = any((True for f in beginning_blacklist if item.filename.startswith(f)))
+            ending = any((True for f in ending_blacklist if item.filename.endswith(f)))
+
+            if os.path.isfile(name) and not beginning and not ending:
+                cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
+                if cfile is not None:
                     cfile.remove_all()
-                    logging.debug('Processing %s from %s' % (item.filename,
-                        self.filename))
-                    zipout.write(name, item.filename)
-                except:
-                    logging.info('%s\'s format is not supported or harmless' %
-                        item.filename)
-                    _, ext = os.path.splitext(name)
-                    if self.add2archive or ext in parser.NOMETA:
-                        zipout.write(name, item.filename)
+                    logging.debug('Processing %s from %s' % (item.filename, self.filename))
+                elif item.filename not in whitelist:
+                    logging.info('%s\'s format is not supported or harmless' % item.filename)
+                    basename, ext = os.path.splitext(name)
+                    if not (self.add2archive or ext in parser.NOMETA):
+                        continue
+                os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
+                zipout.write(name, item.filename)
         zipin.close()
-        for zipFile in zipout.infolist():
-            zipFile.orig_filename = zipFile.filename
-            zipFile.date_time = (1980, 1, 1, 0, 0, 0)
-            zipFile.create_system = 3  # 3 is UNIX
-        zipout.comment = ''
         zipout.close()
 
         logging.info('%s processed' % self.filename)
@@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper):
         current_file.gname = ''
         return current_file
 
-    def remove_all(self, exclude_list=[]):
+    def remove_all(self, whitelist=[]):
         tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
         tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
         for item in tarin.getmembers():
@@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper):
                     cfile.remove_all()
                 elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
                     logging.info('%s\' format is either not supported or harmless' % item.name)
-                elif item.name in exclude_list:
-                    logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name)
+                elif item.name in whitelist:
+                    logging.debug('%s is not supported, but MAT was told to add it anyway.'
+                            % item.name)
                 else:
                     continue
                 tarout.add(complete_name, item.name, filter=self._remove)
@@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper):
         '''
         if list_unsupported:
             ret_list = []
-        tempdir_len = len(self.tempdir) + 1  # trim the tempfile path
         tarin = tarfile.open(self.filename, 'r' + self.compression)
         for item in tarin.getmembers():
             if not self.is_file_clean(item) and not list_unsupported:
@@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper):
             tarin.extract(item, self.tempdir)
             complete_name = os.path.join(self.tempdir, item.name)
             if item.isfile():
-                class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive)
+                class_file = mat.create_class_file(complete_name,
+                        False, add2archive=self.add2archive)
                 if class_file:
                     # We don't support nested archives
                     if not class_file.is_clean():
                         if not list_unsupported:
                             return False
                         elif isinstance(class_file, GenericArchiveStripper):
-                            ret_list.append(complete_name[tempdir_len:])
+                            ret_list.append(item.name)
                 else:
                     logging.error('%s\'s format is not supported or harmless' % item.name)
                     if os.path.splitext(complete_name)[1] not in parser.NOMETA:
                         if not list_unsupported:
                             return False
-                        ret_list.append(complete_name[tempdir_len:])
+                        ret_list.append(item.name)
         tarin.close()
         if list_unsupported:
             return ret_list
diff --git a/MAT/office.py b/MAT/office.py
index f60fc64..97405b3 100644
--- a/MAT/office.py
+++ b/MAT/office.py
@@ -1,13 +1,12 @@
 ''' Care about office's formats
 '''
 
-import os
 import logging
-import zipfile
-import fileinput
-import tempfile
+import os
 import shutil
+import tempfile
 import xml.dom.minidom as minidom
+import zipfile
 
 try:
     import cairo
@@ -16,7 +15,6 @@ except ImportError:
     logging.info('office.py loaded without PDF support')
     pass
 
-import mat
 import parser
 import archive
 
@@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper):
         ''' Return a dict with all the meta of the file by
             trying to read the meta.xml file.
         '''
+        metadata = super(OpenDocumentStripper, self).get_meta()
         zipin = zipfile.ZipFile(self.filename, 'r')
-        metadata = {}
         try:
             content = zipin.read('meta.xml')
             dom1 = minidom.parseString(content)
             elements = dom1.getElementsByTagName('office:meta')
             for i in elements[0].childNodes:
                 if i.tagName != 'meta:document-statistic':
-                    nodename = ''.join([k for k in i.nodeName.split(':')[1:]])
+                    nodename = ''.join(i.nodeName.split(':')[1:])
                     metadata[nodename] = ''.join([j.data for j in i.childNodes])
                 else:
                     # thank you w3c for not providing a nice
                     # method to get all attributes of a node
                     pass
-            zipin.close()
         except KeyError:  # no meta.xml file found
             logging.debug('%s has no opendocument metadata' % self.filename)
+        zipin.close()
         return metadata
 
     def remove_all(self):
+        ''' Removes metadata
         '''
-            FIXME ?
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
+        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
+
+    def is_clean(self):
+        ''' Check if the file is clean from harmful metadatas
         '''
+        clean_super = super(OpenDocumentStripper, self).is_clean()
+        if clean_super is False:
+            return False
+
         zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
+        try:
+            zipin.getinfo('meta.xml')
+        except KeyError:  # no meta.xml in the file
+            return True
+        zipin.close()
+        return False
 
-        for item in zipin.namelist():
-            name = os.path.join(self.tempdir, item)
-            _, ext = os.path.splitext(name)
 
-            if item.endswith('manifest.xml'):
-            # contain the list of all files present in the archive
-                zipin.extract(item, self.tempdir)
-                for line in fileinput.input(name, inplace=1):
-                    # remove the line which contains "meta.xml"
-                    line = line.strip()
-                    if not 'meta.xml' in line:
-                        print line
-                zipout.write(name, item)
+class OpenXmlStripper(archive.ZipStripper):
+    ''' Represent an office openxml document, which is like
+        an opendocument format, with some tricky stuff added.
+        It contains mostly xml, but can have media blobs, crap, ...
+        (I don't like this format.)
+    '''
+    def remove_all(self):  # drop docProps/* (metadata parts), keep '.rels'
+        return super(OpenXmlStripper, self).remove_all(
+                beginning_blacklist=('docProps/',), whitelist=('.rels',))
 
-            elif ext in parser.NOMETA or item == 'mimetype':
-                # keep NOMETA files, and the "manifest" file
-                if item != 'meta.xml':  # contains the metadata
-                    zipin.extract(item, self.tempdir)
-                    zipout.write(name, item)
+    def is_clean(self):
+        ''' Check if the file is clean from harmful metadatas.
+            This implementation is faster than something like
+            "return this.get_meta() == {}".
+        '''
+        clean_super = super(OpenXmlStripper, self).is_clean()
+        if clean_super is False:
+            return False
 
-            else:
-                zipin.extract(item, self.tempdir)
-                if os.path.isfile(name):
-                    try:
-                        cfile = mat.create_class_file(name, False,
-                            add2archive=self.add2archive)
-                        cfile.remove_all()
-                        logging.debug('Processing %s from %s' % (item,
-                            self.filename))
-                        zipout.write(name, item)
-                    except:
-                        logging.info('%s\'s fileformat is not supported' % item)
-                        if self.add2archive:
-                            zipout.write(name, item)
-        zipout.comment = ''
-        logging.info('%s processed' % self.filename)
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        for item in zipin.namelist():
+            if item.startswith('docProps/'):
+                return False
         zipin.close()
-        zipout.close()
-        self.do_backup()
         return True
 
-    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas
+    def get_meta(self):
+        ''' Return a dict with all the meta of the file
         '''
+        metadata = super(OpenXmlStripper, self).get_meta()
+
         zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            zipin.getinfo('meta.xml')
-        except KeyError:  # no meta.xml in the file
-            czf = archive.ZipStripper(self.filename, self.parser,
-                'application/zip', False, True, add2archive=self.add2archive)
-            if czf.is_clean():
-                zipin.close()
-                return True
+        for item in zipin.namelist():
+            if item.startswith('docProps/'):
+                metadata[item] = 'harmful content'
         zipin.close()
-        return False
+        return metadata
 
 
 class PdfStripper(parser.GenericParser):
@@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser):
             self.pdf_quality = False
 
         self.document = Poppler.Document.new_from_file(uri, self.password)
-        self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator',
-            'producer', 'metadata'])
+        self.meta_list = frozenset(['title', 'author', 'subject',
+            'keywords', 'creator', 'producer', 'metadata'])
 
     def is_clean(self):
         ''' Check if the file is clean from harmful metadatas
@@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser):
             surface.finish()
             shutil.move(output, self.output)
         except:
-            logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename)
+            logging.error('Something went wrong when cleaning %s.' % self.filename)
             return False
 
         try:
@@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser):
             writer.write(self.output)
             self.do_backup()
         except:
-            logging.error('Unable to remove all metadata from %s, please install\
-pdfrw' % self.output)
+            logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
             return False
         return True
 
@@ -195,73 +186,3 @@ pdfrw' % self.output)
             if self.document.get_property(key):
                 metadata[key] = self.document.get_property(key)
         return metadata
-
-
-class OpenXmlStripper(archive.GenericArchiveStripper):
-    '''
-        Represent an office openxml document, which is like
-        an opendocument format, with some tricky stuff added.
-        It contains mostly xml, but can have media blobs, crap, ...
-        (I don't like this format.)
-    '''
-    def remove_all(self):
-        '''
-            FIXME ?
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w',
-            allowZip64=True)
-        for item in zipin.namelist():
-            name = os.path.join(self.tempdir, item)
-            _, ext = os.path.splitext(name)
-            if item.startswith('docProps/'):  # metadatas
-                pass
-            elif ext in parser.NOMETA or item == '.rels':
-                # keep parser.NOMETA files, and the file named ".rels"
-                zipin.extract(item, self.tempdir)
-                zipout.write(name, item)
-            else:
-                zipin.extract(item, self.tempdir)
-                if os.path.isfile(name):  # don't care about folders
-                    try:
-                        cfile = mat.create_class_file(name, False,
-                            add2archive=self.add2archive)
-                        cfile.remove_all()
-                        logging.debug('Processing %s from %s' % (item,
-                            self.filename))
-                        zipout.write(name, item)
-                    except:
-                        logging.info('%s\'s fileformat is not supported' % item)
-                        if self.add2archive:
-                            zipout.write(name, item)
-        zipout.comment = ''
-        logging.info('%s processed' % self.filename)
-        zipin.close()
-        zipout.close()
-        self.do_backup()
-        return True
-
-    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                return False
-        zipin.close()
-        czf = archive.ZipStripper(self.filename, self.parser,
-                'application/zip', False, True, add2archive=self.add2archive)
-        return czf.is_clean()
-
-    def get_meta(self):
-        ''' Return a dict with all the meta of the file
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        metadata = {}
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                metadata[item] = 'harmful content'
-        zipin.close()
-        return metadata
diff --git a/MAT/strippers.py b/MAT/strippers.py
index 5fd4e08..aea98da 100644
--- a/MAT/strippers.py
+++ b/MAT/strippers.py
@@ -14,6 +14,8 @@ import subprocess
 STRIPPERS = {
     'application/x-tar': archive.TarStripper,
     'application/x-bzip2': archive.Bzip2Stripper,
+    'application/x-gzip': archive.GzipStripper,
+    'application/zip': archive.ZipStripper,
     'audio/mpeg': audio.MpegAudioStripper,
     'application/x-bittorrent': misc.TorrentStripper,
     'application/opendocument': office.OpenDocumentStripper,
-- 
cgit v1.3