From bbe17fd511b5890fb4554447e23d666f6c13b745 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Wed, 15 Jan 2014 02:42:39 +0000 Subject: Add support for zipfiles! --- MAT/archive.py | 140 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 82 insertions(+), 58 deletions(-) (limited to 'MAT/archive.py') diff --git a/MAT/archive.py b/MAT/archive.py index 9179e48..53c5e9b 100644 --- a/MAT/archive.py +++ b/MAT/archive.py @@ -1,6 +1,7 @@ ''' Take care of archives formats ''' +import datetime import logging import os import shutil @@ -11,12 +12,17 @@ import zipfile import mat import parser +ZIP_EPOCH = (1980, 1, 1, 0, 0, 0) +ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0) + - datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds() + class GenericArchiveStripper(parser.GenericParser): ''' Represent a generic archive ''' def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) + super(GenericArchiveStripper, self).__init__(filename, + parser, mime, backup, is_writable, **kwargs) self.compression = '' self.add2archive = kwargs['add2archive'] self.tempdir = tempfile.mkdtemp() @@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser): class ZipStripper(GenericArchiveStripper): ''' Represent a zip file ''' - def is_file_clean(self, fileinfo): + def __is_zipfile_clean(self, fileinfo): ''' Check if a ZipInfo object is clean of metadatas added by zip itself, independently of the corresponding file metadatas ''' if fileinfo.comment != '': return False - elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0): + elif fileinfo.date_time != ZIP_EPOCH: return False elif fileinfo.create_system != 3: # 3 is UNIX return False @@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper): logging.debug('%s has a comment' % self.filename) return False for item in zipin.infolist(): - # I have not found a way to remove the crap added by 
zipfile :/ - # if not self.is_file_clean(item): - # logging.debug('%s from %s has compromising zipinfo' % - # (item.filename, self.filename)) - # return False zipin.extract(item, self.tempdir) name = os.path.join(self.tempdir, item.filename) + if not self.__is_zipfile_clean(item) and not list_unsupported: + logging.debug('%s from %s has compromising zipinfo' % + (item.filename, self.filename)) + return False if os.path.isfile(name): cfile = mat.create_class_file(name, False, add2archive=self.add2archive) if cfile: if not cfile.is_clean(): - return False + logging.debug('%s from %s has compromising zipinfo' % + (item.filename, self.filename)) + if not list_unsupported: + return False + ret_list.append(item.filename) else: - logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename) + logging.info('%s\'s fileformat is not supported or harmless.' + % item.filename) basename, ext = os.path.splitext(name) - bname = os.path.basename(item.filename) - if ext not in parser.NOMETA: - if bname != 'mimetype' and bname != '.rels': - if list_unsupported: - ret_list.append(bname) - else: + if os.path.basename(item.filename) not in ('mimetype', '.rels'): + if ext not in parser.NOMETA: + if not list_unsupported: return False + ret_list.append(item.filename) zipin.close() if list_unsupported: return ret_list return True def get_meta(self): - ''' Return all the metadata of a ZipFile (don't return metadatas - of contained files : should it ?) 
- ''' + ''' Return all the metadata of a zip archive''' zipin = zipfile.ZipFile(self.filename, 'r') metadata = {} - for field in zipin.infolist(): - zipmeta = {} - if field.comment != '': - zipmeta['comment'] = field.comment - if field.date_time != (1980, 1, 1, 0, 0, 0): - zipmeta['modified'] = field.date_time - if field.create_system != 3: # 3 is UNIX - zipmeta['system'] = "windows" if field.create_system == 2 else "unknown" if zipin.comment != '': - metadata["%s comment" % self.filename] = zipin.comment + metadata['comment'] = zipin.comment + for item in zipin.infolist(): + zipinfo_meta = self.__get_zipinfo_meta(item) + if zipinfo_meta != {}: # zipinfo metadata + metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta) + zipin.extract(item, self.tempdir) + name = os.path.join(self.tempdir, item.filename) + if os.path.isfile(name): + cfile = mat.create_class_file(name, False, add2archive=self.add2archive) + if cfile: + cfile_meta = cfile.get_meta() + if cfile_meta != {}: + metadata[item.filename] = str(cfile_meta) + else: + logging.info('%s\'s fileformat is not supported or harmless' + % item.filename) zipin.close() return metadata - def remove_all(self): - ''' So far, the zipfile module does not allow to write a ZipInfo - object into a zipfile (and it's a shame !) : so data added - by zipfile itself could not be removed. It's a big concern. - Is shipping a patched version of zipfile.py a good idea ? 
+    def __get_zipinfo_meta(self, zipinfo):
+        ''' Return all the metadata of a ZipInfo
+        '''
+        metadata = {}
+        if zipinfo.comment != '':
+            metadata['comment'] = zipinfo.comment
+        if zipinfo.date_time != ZIP_EPOCH:
+            metadata['modified'] = zipinfo.date_time
+        if zipinfo.create_system != 3:  # 3 is UNIX
+            metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
+        return metadata
+
+    def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]):
+        ''' Remove all metadata from a zip archive, even those
+            added by Python's zipfile itself. It will not add
+            files starting with "beginning_blacklist", or ending with
+            "ending_blacklist". This method also adds files present in
+            whitelist to the archive.
         '''
         zipin = zipfile.ZipFile(self.filename, 'r')
         zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
         for item in zipin.infolist():
             zipin.extract(item, self.tempdir)
             name = os.path.join(self.tempdir, item.filename)
-            if os.path.isfile(name):
-                try:
-                    cfile = mat.create_class_file(name, False,
-                        add2archive=self.add2archive)
+
+            beginning = any((True for f in beginning_blacklist if item.filename.startswith(f)))
+            ending = any((True for f in ending_blacklist if item.filename.endswith(f)))
+
+            if os.path.isfile(name) and not beginning and not ending:
+                cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
+                if cfile is not None:
                     cfile.remove_all()
-                    logging.debug('Processing %s from %s' % (item.filename,
-                        self.filename))
-                    zipout.write(name, item.filename)
-                except:
-                    logging.info('%s\'s format is not supported or harmless' %
-                        item.filename)
-                    _, ext = os.path.splitext(name)
-                    if self.add2archive or ext in parser.NOMETA:
-                        zipout.write(name, item.filename)
+                logging.debug('Processing %s from %s' % (item.filename, self.filename))
+            elif item.filename not in whitelist:
+                logging.info('%s\'s format is not supported or harmless' % item.filename)
+                basename, ext = os.path.splitext(name)
+                if not (self.add2archive or ext in 
parser.NOMETA): + continue + os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS)) + zipout.write(name, item.filename) zipin.close() - for zipFile in zipout.infolist(): - zipFile.orig_filename = zipFile.filename - zipFile.date_time = (1980, 1, 1, 0, 0, 0) - zipFile.create_system = 3 # 3 is UNIX - zipout.comment = '' zipout.close() logging.info('%s processed' % self.filename) @@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper): current_file.gname = '' return current_file - def remove_all(self, exclude_list=[]): + def remove_all(self, whitelist=[]): tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') for item in tarin.getmembers(): @@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper): cfile.remove_all() elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: logging.info('%s\' format is either not supported or harmless' % item.name) - elif item.name in exclude_list: - logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name) + elif item.name in whitelist: + logging.debug('%s is not supported, but MAT was told to add it anyway.' 
+ % item.name) else: continue tarout.add(complete_name, item.name, filter=self._remove) @@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper): ''' if list_unsupported: ret_list = [] - tempdir_len = len(self.tempdir) + 1 # trim the tempfile path tarin = tarfile.open(self.filename, 'r' + self.compression) for item in tarin.getmembers(): if not self.is_file_clean(item) and not list_unsupported: @@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper): tarin.extract(item, self.tempdir) complete_name = os.path.join(self.tempdir, item.name) if item.isfile(): - class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive) + class_file = mat.create_class_file(complete_name, + False, add2archive=self.add2archive) if class_file: # We don't support nested archives if not class_file.is_clean(): if not list_unsupported: return False elif isinstance(class_file, GenericArchiveStripper): - ret_list.append(complete_name[tempdir_len:]) + ret_list.append(item.name) else: logging.error('%s\'s format is not supported or harmless' % item.name) if os.path.splitext(complete_name)[1] not in parser.NOMETA: if not list_unsupported: return False - ret_list.append(complete_name[tempdir_len:]) + ret_list.append(item.name) tarin.close() if list_unsupported: return ret_list -- cgit v1.3