1 file changed, 82 insertions, 58 deletions
diff --git a/MAT/archive.py b/MAT/archive.py
index 9179e48..53c5e9b 100644
--- a/MAT/archive.py
+++ b/MAT/archive.py
@@ -1,6 +1,7 @@
 ''' Take care of archives formats
 '''
+import datetime
 import logging
 import os
 import shutil
@@ -11,12 +12,17 @@ import zipfile
 import mat
 import parser
+ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
+ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
+        - datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds()
 class GenericArchiveStripper(parser.GenericParser):
    ''' Represent a generic archive
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
-        super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
+        super(GenericArchiveStripper, self).__init__(filename,
+                parser, mime, backup, is_writable, **kwargs)
        self.compression = ''
        self.add2archive = kwargs['add2archive']
        self.tempdir = tempfile.mkdtemp()
@@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser):
 class ZipStripper(GenericArchiveStripper):
    ''' Represent a zip file
    '''
-    def is_file_clean(self, fileinfo):
+    def __is_zipfile_clean(self, fileinfo):
        ''' Check if a ZipInfo object is clean of metadatas added
            by zip itself, independently of the corresponding file metadatas
        '''
        if fileinfo.comment != '':
            return False
-        elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0):
+        elif fileinfo.date_time != ZIP_EPOCH:
            return False
        elif fileinfo.create_system != 3:  # 3 is UNIX
            return False
@@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper):
            logging.debug('%s has a comment' % self.filename)
            return False
        for item in zipin.infolist():
-            # I have not found a way to remove the crap added by zipfile :/
-            # if not self.is_file_clean(item):
-            #    logging.debug('%s from %s has compromising zipinfo' %
-            #        (item.filename, self.filename))
-            #    return False
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
+            if not self.__is_zipfile_clean(item) and not list_unsupported:
+                logging.debug('%s from %s has compromising zipinfo' %
+                        (item.filename, self.filename))
+                return False
            if os.path.isfile(name):
                cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
                if cfile:
                    if not cfile.is_clean():
-                        return False
+                        logging.debug('%s from %s has compromising zipinfo' %
+                                (item.filename, self.filename))
+                        if not list_unsupported:
+                            return False
+                        ret_list.append(item.filename)
                else:
-                    logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename)
+                    logging.info('%s\'s fileformat is not supported or harmless.'
+                            % item.filename)
                    basename, ext = os.path.splitext(name)
-                    bname = os.path.basename(item.filename)
+                    if os.path.basename(item.filename) not in ('mimetype', '.rels'):
-                    if ext not in parser.NOMETA:
+                        if ext not in parser.NOMETA:
-                        if bname != 'mimetype' and bname != '.rels':
+                            if not list_unsupported:
-                            if list_unsupported:
-                                ret_list.append(bname)
-                            else:
                                return False
+                            ret_list.append(item.filename)
        zipin.close()
        if list_unsupported:
            return ret_list
        return True
    def get_meta(self):
-        ''' Return all the metadata of a ZipFile (don't return metadatas
+        ''' Return all the metadata of a zip archive'''
-            of contained files : should it ?)
-        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
-        for field in zipin.infolist():
-            zipmeta = {}
-            if field.comment != '':
-                zipmeta['comment'] = field.comment
-            if field.date_time != (1980, 1, 1, 0, 0, 0):
-                zipmeta['modified'] = field.date_time
-            if field.create_system != 3:  # 3 is UNIX
-                zipmeta['system'] = "windows" if field.create_system == 2 else "unknown"
        if zipin.comment != '':
-            metadata["%s comment" % self.filename] = zipin.comment
+            metadata['comment'] = zipin.comment
+        for item in zipin.infolist():
+            zipinfo_meta = self.__get_zipinfo_meta(item)
+            if zipinfo_meta != {}:  # zipinfo metadata
+                metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
+            zipin.extract(item, self.tempdir)
+            name = os.path.join(self.tempdir, item.filename)
+            if os.path.isfile(name):
+                cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
+                if cfile:
+                    cfile_meta = cfile.get_meta()
+                    if cfile_meta != {}:
+                        metadata[item.filename] = str(cfile_meta)
+                else:
+                    logging.info('%s\'s fileformat is not supported or harmless'
+                            % item.filename)
        zipin.close()
        return metadata
-    def remove_all(self):
+    def __get_zipinfo_meta(self, zipinfo):
-        ''' So far, the zipfile module does not allow to write a ZipInfo
+        ''' Return all the metadata of a ZipInfo
-            object into a zipfile (and it's a shame !) : so data added
+        '''
-            by zipfile itself could not be removed. It's a big concern.
+        metadata = {}
-            Is shipping a patched version of zipfile.py a good idea ?
+        if zipinfo.comment != '':
+            metadata['comment'] = zipinfo.comment
+        if zipinfo.date_time != ZIP_EPOCH:
+            metadata['modified'] = zipinfo.date_time
+        if zipinfo.create_system != 3:  # 3 is UNIX
+            metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
+        return metadata
+    def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]):
+        ''' Remove all metadata from a zip archive, even those
+            added by Python's zipfile itself. It will not add
+            files starting with "beginning_blacklist", or ending with
+            "ending_blacklist". This method also adds files present in
+            whitelist to the archive.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
-            if os.path.isfile(name):
-                try:
+            beginning = any((True for f in beginning_blacklist if item.filename.startswith(f)))
-                    cfile = mat.create_class_file(name, False,
+            ending = any((True for f in ending_blacklist if item.filename.endswith(f)))
-                        add2archive=self.add2archive)
+            if os.path.isfile(name) and not beginning and not ending:
+                cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
+                if cfile is not None:
                    cfile.remove_all()
-                    logging.debug('Processing %s from %s' % (item.filename,
+                    logging.debug('Processing %s from %s' % (item.filename, self.filename))
-                        self.filename))
+                elif item.filename not in whitelist:
-                    zipout.write(name, item.filename)
+                    logging.info('%s\'s format is not supported or harmless' % item.filename)
-                except:
+                    basename, ext = os.path.splitext(name)
-                    logging.info('%s\'s format is not supported or harmless' %
+                    if not (self.add2archive or ext in parser.NOMETA):
-                        item.filename)
+                        continue
-                    _, ext = os.path.splitext(name)
+                os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
-                    if self.add2archive or ext in parser.NOMETA:
+                zipout.write(name, item.filename)
-                        zipout.write(name, item.filename)
        zipin.close()
-        for zipFile in zipout.infolist():
-            zipFile.orig_filename = zipFile.filename
-            zipFile.date_time = (1980, 1, 1, 0, 0, 0)
-            zipFile.create_system = 3  # 3 is UNIX
-        zipout.comment = ''
        zipout.close()
        logging.info('%s processed' % self.filename)
@@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper):
        current_file.gname = ''
        return current_file
-    def remove_all(self, exclude_list=[]):
+    def remove_all(self, whitelist=[]):
        tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
        tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
        for item in tarin.getmembers():
@@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper):
                    cfile.remove_all()
                elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
                    logging.info('%s\' format is either not supported or harmless' % item.name)
-                elif item.name in exclude_list:
+                elif item.name in whitelist:
-                    logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name)
+                    logging.debug('%s is not supported, but MAT was told to add it anyway.'
+                            % item.name)
                else:
                    continue
                tarout.add(complete_name, item.name, filter=self._remove)
@@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper):
        '''
        if list_unsupported:
            ret_list = []
-        tempdir_len = len(self.tempdir) + 1  # trim the tempfile path
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        for item in tarin.getmembers():
            if not self.is_file_clean(item) and not list_unsupported:
@@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper):
            tarin.extract(item, self.tempdir)
            complete_name = os.path.join(self.tempdir, item.name)
            if item.isfile():
-                class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive)
+                class_file = mat.create_class_file(complete_name,
+                        False, add2archive=self.add2archive)
                if class_file:
                    # We don't support nested archives
                    if not class_file.is_clean():
                        if not list_unsupported:
                            return False
                        elif isinstance(class_file, GenericArchiveStripper):
-                            ret_list.append(complete_name[tempdir_len:])
+                            ret_list.append(item.name)
                else:
                    logging.error('%s\'s format is not supported or harmless' % item.name)
                    if os.path.splitext(complete_name)[1] not in parser.NOMETA:
                        if not list_unsupported:
                            return False
-                        ret_list.append(complete_name[tempdir_len:])
+                        ret_list.append(item.name)
        tarin.close()
        if list_unsupported:
            return ret_list

diff --git a/MAT/archive.py b/MAT/archive.py index 9179e48..53c5e9b 100644 --- a/MAT/archive.py +++ b/MAT/archive.py
@@ -1,6 +1,7 @@
1	''' Take care of archives formats	1	''' Take care of archives formats
2	'''	2	'''
3		3
		4	import datetime
4	import logging	5	import logging
5	import os	6	import os
6	import shutil	7	import shutil
@@ -11,12 +12,17 @@ import zipfile
11	import mat	12	import mat
12	import parser	13	import parser
13		14
		15	ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
		16	ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
		17	- datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds()
		18
14		19
15	class GenericArchiveStripper(parser.GenericParser):	20	class GenericArchiveStripper(parser.GenericParser):
16	''' Represent a generic archive	21	''' Represent a generic archive
17	'''	22	'''
18	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):	23	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
19	super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)	24	super(GenericArchiveStripper, self).__init__(filename,
		25	parser, mime, backup, is_writable, **kwargs)
20	self.compression = ''	26	self.compression = ''
21	self.add2archive = kwargs['add2archive']	27	self.add2archive = kwargs['add2archive']
22	self.tempdir = tempfile.mkdtemp()	28	self.tempdir = tempfile.mkdtemp()
@@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser):
48	class ZipStripper(GenericArchiveStripper):	54	class ZipStripper(GenericArchiveStripper):
49	''' Represent a zip file	55	''' Represent a zip file
50	'''	56	'''
51	def is_file_clean(self, fileinfo):	57	def __is_zipfile_clean(self, fileinfo):
52	''' Check if a ZipInfo object is clean of metadatas added	58	''' Check if a ZipInfo object is clean of metadatas added
53	by zip itself, independently of the corresponding file metadatas	59	by zip itself, independently of the corresponding file metadatas
54	'''	60	'''
55	if fileinfo.comment != '':	61	if fileinfo.comment != '':
56	return False	62	return False
57	elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0):	63	elif fileinfo.date_time != ZIP_EPOCH:
58	return False	64	return False
59	elif fileinfo.create_system != 3: # 3 is UNIX	65	elif fileinfo.create_system != 3: # 3 is UNIX
60	return False	66	return False
@@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper):
70	logging.debug('%s has a comment' % self.filename)	76	logging.debug('%s has a comment' % self.filename)
71	return False	77	return False
72	for item in zipin.infolist():	78	for item in zipin.infolist():
73	# I have not found a way to remove the crap added by zipfile :/
74	# if not self.is_file_clean(item):
75	# logging.debug('%s from %s has compromising zipinfo' %
76	# (item.filename, self.filename))
77	# return False
78	zipin.extract(item, self.tempdir)	79	zipin.extract(item, self.tempdir)
79	name = os.path.join(self.tempdir, item.filename)	80	name = os.path.join(self.tempdir, item.filename)
		81	if not self.__is_zipfile_clean(item) and not list_unsupported:
		82	logging.debug('%s from %s has compromising zipinfo' %
		83	(item.filename, self.filename))
		84	return False
80	if os.path.isfile(name):	85	if os.path.isfile(name):
81	cfile = mat.create_class_file(name, False, add2archive=self.add2archive)	86	cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
82	if cfile:	87	if cfile:
83	if not cfile.is_clean():	88	if not cfile.is_clean():
84	return False	89	logging.debug('%s from %s has compromising zipinfo' %
		90	(item.filename, self.filename))
		91	if not list_unsupported:
		92	return False
		93	ret_list.append(item.filename)
85	else:	94	else:
86	logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename)	95	logging.info('%s\'s fileformat is not supported or harmless.'
		96	% item.filename)
87	basename, ext = os.path.splitext(name)	97	basename, ext = os.path.splitext(name)
88	bname = os.path.basename(item.filename)	98	if os.path.basename(item.filename) not in ('mimetype', '.rels'):
89	if ext not in parser.NOMETA:	99	if ext not in parser.NOMETA:
90	if bname != 'mimetype' and bname != '.rels':	100	if not list_unsupported:
91	if list_unsupported:
92	ret_list.append(bname)
93	else:
94	return False	101	return False
		102	ret_list.append(item.filename)
95	zipin.close()	103	zipin.close()
96	if list_unsupported:	104	if list_unsupported:
97	return ret_list	105	return ret_list
98	return True	106	return True
99		107
100	def get_meta(self):	108	def get_meta(self):
101	''' Return all the metadata of a ZipFile (don't return metadatas	109	''' Return all the metadata of a zip archive'''
102	of contained files : should it ?)
103	'''
104	zipin = zipfile.ZipFile(self.filename, 'r')	110	zipin = zipfile.ZipFile(self.filename, 'r')
105	metadata = {}	111	metadata = {}
106	for field in zipin.infolist():
107	zipmeta = {}
108	if field.comment != '':
109	zipmeta['comment'] = field.comment
110	if field.date_time != (1980, 1, 1, 0, 0, 0):
111	zipmeta['modified'] = field.date_time
112	if field.create_system != 3: # 3 is UNIX
113	zipmeta['system'] = "windows" if field.create_system == 2 else "unknown"
114	if zipin.comment != '':	112	if zipin.comment != '':
115	metadata["%s comment" % self.filename] = zipin.comment	113	metadata['comment'] = zipin.comment
		114	for item in zipin.infolist():
		115	zipinfo_meta = self.__get_zipinfo_meta(item)
		116	if zipinfo_meta != {}: # zipinfo metadata
		117	metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
		118	zipin.extract(item, self.tempdir)
		119	name = os.path.join(self.tempdir, item.filename)
		120	if os.path.isfile(name):
		121	cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
		122	if cfile:
		123	cfile_meta = cfile.get_meta()
		124	if cfile_meta != {}:
		125	metadata[item.filename] = str(cfile_meta)
		126	else:
		127	logging.info('%s\'s fileformat is not supported or harmless'
		128	% item.filename)
116	zipin.close()	129	zipin.close()
117	return metadata	130	return metadata
118		131
119	def remove_all(self):	132	def __get_zipinfo_meta(self, zipinfo):
120	''' So far, the zipfile module does not allow to write a ZipInfo	133	''' Return all the metadata of a ZipInfo
121	object into a zipfile (and it's a shame !) : so data added	134	'''
122	by zipfile itself could not be removed. It's a big concern.	135	metadata = {}
123	Is shipping a patched version of zipfile.py a good idea ?	136	if zipinfo.comment != '':
		137	metadata['comment'] = zipinfo.comment
		138	if zipinfo.date_time != ZIP_EPOCH:
		139	metadata['modified'] = zipinfo.date_time
		140	if zipinfo.create_system != 3: # 3 is UNIX
		141	metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
		142	return metadata
		143
		144	def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]):
		145	''' Remove all metadata from a zip archive, even those
		146	added by Python's zipfile itself. It will not add
		147	files starting with "beginning_blacklist", or ending with
		148	"ending_blacklist". This method also adds files present in
		149	whitelist to the archive.
124	'''	150	'''
125	zipin = zipfile.ZipFile(self.filename, 'r')	151	zipin = zipfile.ZipFile(self.filename, 'r')
126	zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)	152	zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
127	for item in zipin.infolist():	153	for item in zipin.infolist():
128	zipin.extract(item, self.tempdir)	154	zipin.extract(item, self.tempdir)
129	name = os.path.join(self.tempdir, item.filename)	155	name = os.path.join(self.tempdir, item.filename)
130	if os.path.isfile(name):	156
131	try:	157	beginning = any((True for f in beginning_blacklist if item.filename.startswith(f)))
132	cfile = mat.create_class_file(name, False,	158	ending = any((True for f in ending_blacklist if item.filename.endswith(f)))
133	add2archive=self.add2archive)	159
		160	if os.path.isfile(name) and not beginning and not ending:
		161	cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
		162	if cfile is not None:
134	cfile.remove_all()	163	cfile.remove_all()
135	logging.debug('Processing %s from %s' % (item.filename,	164	logging.debug('Processing %s from %s' % (item.filename, self.filename))
136	self.filename))	165	elif item.filename not in whitelist:
137	zipout.write(name, item.filename)	166	logging.info('%s\'s format is not supported or harmless' % item.filename)
138	except:	167	basename, ext = os.path.splitext(name)
139	logging.info('%s\'s format is not supported or harmless' %	168	if not (self.add2archive or ext in parser.NOMETA):
140	item.filename)	169	continue
141	_, ext = os.path.splitext(name)	170	os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
142	if self.add2archive or ext in parser.NOMETA:	171	zipout.write(name, item.filename)
143	zipout.write(name, item.filename)
144	zipin.close()	172	zipin.close()
145	for zipFile in zipout.infolist():
146	zipFile.orig_filename = zipFile.filename
147	zipFile.date_time = (1980, 1, 1, 0, 0, 0)
148	zipFile.create_system = 3 # 3 is UNIX
149	zipout.comment = ''
150	zipout.close()	173	zipout.close()
151		174
152	logging.info('%s processed' % self.filename)	175	logging.info('%s processed' % self.filename)
@@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper):
167	current_file.gname = ''	190	current_file.gname = ''
168	return current_file	191	return current_file
169		192
170	def remove_all(self, exclude_list=[]):	193	def remove_all(self, whitelist=[]):
171	tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')	194	tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
172	tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')	195	tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
173	for item in tarin.getmembers():	196	for item in tarin.getmembers():
@@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper):
179	cfile.remove_all()	202	cfile.remove_all()
180	elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:	203	elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
181	logging.info('%s\' format is either not supported or harmless' % item.name)	204	logging.info('%s\' format is either not supported or harmless' % item.name)
182	elif item.name in exclude_list:	205	elif item.name in whitelist:
183	logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name)	206	logging.debug('%s is not supported, but MAT was told to add it anyway.'
		207	% item.name)
184	else:	208	else:
185	continue	209	continue
186	tarout.add(complete_name, item.name, filter=self._remove)	210	tarout.add(complete_name, item.name, filter=self._remove)
@@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper):
209	'''	233	'''
210	if list_unsupported:	234	if list_unsupported:
211	ret_list = []	235	ret_list = []
212	tempdir_len = len(self.tempdir) + 1 # trim the tempfile path
213	tarin = tarfile.open(self.filename, 'r' + self.compression)	236	tarin = tarfile.open(self.filename, 'r' + self.compression)
214	for item in tarin.getmembers():	237	for item in tarin.getmembers():
215	if not self.is_file_clean(item) and not list_unsupported:	238	if not self.is_file_clean(item) and not list_unsupported:
@@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper):
217	tarin.extract(item, self.tempdir)	240	tarin.extract(item, self.tempdir)
218	complete_name = os.path.join(self.tempdir, item.name)	241	complete_name = os.path.join(self.tempdir, item.name)
219	if item.isfile():	242	if item.isfile():
220	class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive)	243	class_file = mat.create_class_file(complete_name,
		244	False, add2archive=self.add2archive)
221	if class_file:	245	if class_file:
222	# We don't support nested archives	246	# We don't support nested archives
223	if not class_file.is_clean():	247	if not class_file.is_clean():
224	if not list_unsupported:	248	if not list_unsupported:
225	return False	249	return False
226	elif isinstance(class_file, GenericArchiveStripper):	250	elif isinstance(class_file, GenericArchiveStripper):
227	ret_list.append(complete_name[tempdir_len:])	251	ret_list.append(item.name)
228	else:	252	else:
229	logging.error('%s\'s format is not supported or harmless' % item.name)	253	logging.error('%s\'s format is not supported or harmless' % item.name)
230	if os.path.splitext(complete_name)[1] not in parser.NOMETA:	254	if os.path.splitext(complete_name)[1] not in parser.NOMETA:
231	if not list_unsupported:	255	if not list_unsupported:
232	return False	256	return False
233	ret_list.append(complete_name[tempdir_len:])	257	ret_list.append(item.name)
234	tarin.close()	258	tarin.close()
235	if list_unsupported:	259	if list_unsupported:
236	return ret_list	260	return ret_list