summary refs log tree commit diff
path: root/MAT/archive.py
diff options
context:
space:
mode:
author jvoisin 2014-01-15 02:42:39 +0000
committer jvoisin 2014-01-15 02:42:39 +0000
commit bbe17fd511b5890fb4554447e23d666f6c13b745 (patch)
tree 5651c76da1d23ca80b252097ca1eb7880e8cf863 /MAT/archive.py
parent 5e65094084c75a9372f529a3387b072a84bf254a (diff)
Add support for zipfiles!
Diffstat (limited to 'MAT/archive.py')
-rw-r--r-- MAT/archive.py 140
1 files changed, 82 insertions, 58 deletions
diff --git a/MAT/archive.py b/MAT/archive.py
index 9179e48..53c5e9b 100644
--- a/MAT/archive.py
+++ b/MAT/archive.py
@@ -1,6 +1,7 @@
1''' Take care of archives formats 1''' Take care of archives formats
2''' 2'''
3 3
4import datetime
4import logging 5import logging
5import os 6import os
6import shutil 7import shutil
@@ -11,12 +12,17 @@ import zipfile
11import mat 12import mat
12import parser 13import parser
13 14
15ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
16ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
17 - datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds()
18
14 19
15class GenericArchiveStripper(parser.GenericParser): 20class GenericArchiveStripper(parser.GenericParser):
16 ''' Represent a generic archive 21 ''' Represent a generic archive
17 ''' 22 '''
18 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): 23 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
19 super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) 24 super(GenericArchiveStripper, self).__init__(filename,
25 parser, mime, backup, is_writable, **kwargs)
20 self.compression = '' 26 self.compression = ''
21 self.add2archive = kwargs['add2archive'] 27 self.add2archive = kwargs['add2archive']
22 self.tempdir = tempfile.mkdtemp() 28 self.tempdir = tempfile.mkdtemp()
@@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser):
48class ZipStripper(GenericArchiveStripper): 54class ZipStripper(GenericArchiveStripper):
49 ''' Represent a zip file 55 ''' Represent a zip file
50 ''' 56 '''
51 def is_file_clean(self, fileinfo): 57 def __is_zipfile_clean(self, fileinfo):
52 ''' Check if a ZipInfo object is clean of metadatas added 58 ''' Check if a ZipInfo object is clean of metadatas added
53 by zip itself, independently of the corresponding file metadatas 59 by zip itself, independently of the corresponding file metadatas
54 ''' 60 '''
55 if fileinfo.comment != '': 61 if fileinfo.comment != '':
56 return False 62 return False
57 elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0): 63 elif fileinfo.date_time != ZIP_EPOCH:
58 return False 64 return False
59 elif fileinfo.create_system != 3: # 3 is UNIX 65 elif fileinfo.create_system != 3: # 3 is UNIX
60 return False 66 return False
@@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper):
70 logging.debug('%s has a comment' % self.filename) 76 logging.debug('%s has a comment' % self.filename)
71 return False 77 return False
72 for item in zipin.infolist(): 78 for item in zipin.infolist():
73 # I have not found a way to remove the crap added by zipfile :/
74 # if not self.is_file_clean(item):
75 # logging.debug('%s from %s has compromising zipinfo' %
76 # (item.filename, self.filename))
77 # return False
78 zipin.extract(item, self.tempdir) 79 zipin.extract(item, self.tempdir)
79 name = os.path.join(self.tempdir, item.filename) 80 name = os.path.join(self.tempdir, item.filename)
81 if not self.__is_zipfile_clean(item) and not list_unsupported:
82 logging.debug('%s from %s has compromising zipinfo' %
83 (item.filename, self.filename))
84 return False
80 if os.path.isfile(name): 85 if os.path.isfile(name):
81 cfile = mat.create_class_file(name, False, add2archive=self.add2archive) 86 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
82 if cfile: 87 if cfile:
83 if not cfile.is_clean(): 88 if not cfile.is_clean():
84 return False 89 logging.debug('%s from %s has compromising zipinfo' %
90 (item.filename, self.filename))
91 if not list_unsupported:
92 return False
93 ret_list.append(item.filename)
85 else: 94 else:
86 logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename) 95 logging.info('%s\'s fileformat is not supported or harmless.'
96 % item.filename)
87 basename, ext = os.path.splitext(name) 97 basename, ext = os.path.splitext(name)
88 bname = os.path.basename(item.filename) 98 if os.path.basename(item.filename) not in ('mimetype', '.rels'):
89 if ext not in parser.NOMETA: 99 if ext not in parser.NOMETA:
90 if bname != 'mimetype' and bname != '.rels': 100 if not list_unsupported:
91 if list_unsupported:
92 ret_list.append(bname)
93 else:
94 return False 101 return False
102 ret_list.append(item.filename)
95 zipin.close() 103 zipin.close()
96 if list_unsupported: 104 if list_unsupported:
97 return ret_list 105 return ret_list
98 return True 106 return True
99 107
100 def get_meta(self): 108 def get_meta(self):
101 ''' Return all the metadata of a ZipFile (don't return metadatas 109 ''' Return all the metadata of a zip archive'''
102 of contained files : should it ?)
103 '''
104 zipin = zipfile.ZipFile(self.filename, 'r') 110 zipin = zipfile.ZipFile(self.filename, 'r')
105 metadata = {} 111 metadata = {}
106 for field in zipin.infolist():
107 zipmeta = {}
108 if field.comment != '':
109 zipmeta['comment'] = field.comment
110 if field.date_time != (1980, 1, 1, 0, 0, 0):
111 zipmeta['modified'] = field.date_time
112 if field.create_system != 3: # 3 is UNIX
113 zipmeta['system'] = "windows" if field.create_system == 2 else "unknown"
114 if zipin.comment != '': 112 if zipin.comment != '':
115 metadata["%s comment" % self.filename] = zipin.comment 113 metadata['comment'] = zipin.comment
114 for item in zipin.infolist():
115 zipinfo_meta = self.__get_zipinfo_meta(item)
116 if zipinfo_meta != {}: # zipinfo metadata
117 metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
118 zipin.extract(item, self.tempdir)
119 name = os.path.join(self.tempdir, item.filename)
120 if os.path.isfile(name):
121 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
122 if cfile:
123 cfile_meta = cfile.get_meta()
124 if cfile_meta != {}:
125 metadata[item.filename] = str(cfile_meta)
126 else:
127 logging.info('%s\'s fileformat is not supported or harmless'
128 % item.filename)
116 zipin.close() 129 zipin.close()
117 return metadata 130 return metadata
118 131
119 def remove_all(self): 132 def __get_zipinfo_meta(self, zipinfo):
120 ''' So far, the zipfile module does not allow to write a ZipInfo 133 ''' Return all the metadata of a ZipInfo
121 object into a zipfile (and it's a shame !) : so data added 134 '''
122 by zipfile itself could not be removed. It's a big concern. 135 metadata = {}
123 Is shipping a patched version of zipfile.py a good idea ? 136 if zipinfo.comment != '':
137 metadata['comment'] = zipinfo.comment
138 if zipinfo.date_time != ZIP_EPOCH:
139 metadata['modified'] = zipinfo.date_time
140 if zipinfo.create_system != 3: # 3 is UNIX
141 metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
142 return metadata
143
144 def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]):
145 ''' Remove all metadata from a zip archive, even thoses
146 added by Python's zipfile itself. It will not add
147 files starting with "begining_blacklist", or ending with
148 "ending_blacklist". This method also add files present in
149 whitelist to the archive.
124 ''' 150 '''
125 zipin = zipfile.ZipFile(self.filename, 'r') 151 zipin = zipfile.ZipFile(self.filename, 'r')
126 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) 152 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
127 for item in zipin.infolist(): 153 for item in zipin.infolist():
128 zipin.extract(item, self.tempdir) 154 zipin.extract(item, self.tempdir)
129 name = os.path.join(self.tempdir, item.filename) 155 name = os.path.join(self.tempdir, item.filename)
130 if os.path.isfile(name): 156
131 try: 157 beginning = any((True for f in beginning_blacklist if item.filename.startswith(f)))
132 cfile = mat.create_class_file(name, False, 158 ending = any((True for f in ending_blacklist if item.filename.endswith(f)))
133 add2archive=self.add2archive) 159
160 if os.path.isfile(name) and not beginning and not ending:
161 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
162 if cfile is not None:
134 cfile.remove_all() 163 cfile.remove_all()
135 logging.debug('Processing %s from %s' % (item.filename, 164 logging.debug('Processing %s from %s' % (item.filename, self.filename))
136 self.filename)) 165 elif item.filename not in whitelist:
137 zipout.write(name, item.filename) 166 logging.info('%s\'s format is not supported or harmless' % item.filename)
138 except: 167 basename, ext = os.path.splitext(name)
139 logging.info('%s\'s format is not supported or harmless' % 168 if not (self.add2archive or ext in parser.NOMETA):
140 item.filename) 169 continue
141 _, ext = os.path.splitext(name) 170 os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
142 if self.add2archive or ext in parser.NOMETA: 171 zipout.write(name, item.filename)
143 zipout.write(name, item.filename)
144 zipin.close() 172 zipin.close()
145 for zipFile in zipout.infolist():
146 zipFile.orig_filename = zipFile.filename
147 zipFile.date_time = (1980, 1, 1, 0, 0, 0)
148 zipFile.create_system = 3 # 3 is UNIX
149 zipout.comment = ''
150 zipout.close() 173 zipout.close()
151 174
152 logging.info('%s processed' % self.filename) 175 logging.info('%s processed' % self.filename)
@@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper):
167 current_file.gname = '' 190 current_file.gname = ''
168 return current_file 191 return current_file
169 192
170 def remove_all(self, exclude_list=[]): 193 def remove_all(self, whitelist=[]):
171 tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') 194 tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
172 tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') 195 tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
173 for item in tarin.getmembers(): 196 for item in tarin.getmembers():
@@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper):
179 cfile.remove_all() 202 cfile.remove_all()
180 elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: 203 elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
181 logging.info('%s\' format is either not supported or harmless' % item.name) 204 logging.info('%s\' format is either not supported or harmless' % item.name)
182 elif item.name in exclude_list: 205 elif item.name in whitelist:
183 logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name) 206 logging.debug('%s is not supported, but MAT was told to add it anyway.'
207 % item.name)
184 else: 208 else:
185 continue 209 continue
186 tarout.add(complete_name, item.name, filter=self._remove) 210 tarout.add(complete_name, item.name, filter=self._remove)
@@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper):
209 ''' 233 '''
210 if list_unsupported: 234 if list_unsupported:
211 ret_list = [] 235 ret_list = []
212 tempdir_len = len(self.tempdir) + 1 # trim the tempfile path
213 tarin = tarfile.open(self.filename, 'r' + self.compression) 236 tarin = tarfile.open(self.filename, 'r' + self.compression)
214 for item in tarin.getmembers(): 237 for item in tarin.getmembers():
215 if not self.is_file_clean(item) and not list_unsupported: 238 if not self.is_file_clean(item) and not list_unsupported:
@@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper):
217 tarin.extract(item, self.tempdir) 240 tarin.extract(item, self.tempdir)
218 complete_name = os.path.join(self.tempdir, item.name) 241 complete_name = os.path.join(self.tempdir, item.name)
219 if item.isfile(): 242 if item.isfile():
220 class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive) 243 class_file = mat.create_class_file(complete_name,
244 False, add2archive=self.add2archive)
221 if class_file: 245 if class_file:
222 # We don't support nested archives 246 # We don't support nested archives
223 if not class_file.is_clean(): 247 if not class_file.is_clean():
224 if not list_unsupported: 248 if not list_unsupported:
225 return False 249 return False
226 elif isinstance(class_file, GenericArchiveStripper): 250 elif isinstance(class_file, GenericArchiveStripper):
227 ret_list.append(complete_name[tempdir_len:]) 251 ret_list.append(item.name)
228 else: 252 else:
229 logging.error('%s\'s format is not supported or harmless' % item.name) 253 logging.error('%s\'s format is not supported or harmless' % item.name)
230 if os.path.splitext(complete_name)[1] not in parser.NOMETA: 254 if os.path.splitext(complete_name)[1] not in parser.NOMETA:
231 if not list_unsupported: 255 if not list_unsupported:
232 return False 256 return False
233 ret_list.append(complete_name[tempdir_len:]) 257 ret_list.append(item.name)
234 tarin.close() 258 tarin.close()
235 if list_unsupported: 259 if list_unsupported:
236 return ret_list 260 return ret_list