summary refs log tree commit diff
diff options
context:
space:
mode:
author jvoisin 2014-01-02 16:44:59 +0000
committer jvoisin 2014-01-02 16:44:59 +0000
commit 67d7f217587bed6efbf155f5e0e413528443251b (patch)
tree 33e6a1a4be3a1766c638efcd67bf04067adf6a76
parent 5cbcd67aa4d5718fbe33b7af8d7ec8e5756d551a (diff)
Greatly improves tarfiles handling
-rw-r--r-- MAT/archive.py 179
1 files changed, 97 insertions, 82 deletions
diff --git a/MAT/archive.py b/MAT/archive.py
index f07e18c..3a30b66 100644
--- a/MAT/archive.py
+++ b/MAT/archive.py
@@ -4,12 +4,12 @@
4import logging 4import logging
5import os 5import os
6import shutil 6import shutil
7import tarfile
7import tempfile 8import tempfile
8import zipfile 9import zipfile
9 10
10import mat 11import mat
11import parser 12import parser
12import tarfile
13 13
14 14
15class GenericArchiveStripper(parser.GenericParser): 15class GenericArchiveStripper(parser.GenericParser):
@@ -31,6 +31,14 @@ class GenericArchiveStripper(parser.GenericParser):
31 mat.secure_remove(path_file) 31 mat.secure_remove(path_file)
32 shutil.rmtree(self.tempdir) 32 shutil.rmtree(self.tempdir)
33 33
34 def is_clean(self, list_unsupported):
35 raise NotImplementedError
36
37 def list_unsupported(self):
38 ''' Get a list of every non-supported files present in the archive
39 '''
40 return self.is_clean(list_unsupported=True)
41
34 def remove_all(self): 42 def remove_all(self):
35 ''' Virtual method to remove all metadata 43 ''' Virtual method to remove all metadata
36 ''' 44 '''
@@ -44,20 +52,19 @@ class ZipStripper(GenericArchiveStripper):
44 ''' Check if a ZipInfo object is clean of metadatas added 52 ''' Check if a ZipInfo object is clean of metadatas added
45 by zip itself, independently of the corresponding file metadatas 53 by zip itself, independently of the corresponding file metadatas
46 ''' 54 '''
47 if fileinfo.comment: 55 if fileinfo.comment != '':
48 return False
49 elif fileinfo.date_time:
50 return False 56 return False
51 elif fileinfo.create_system: 57 elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0):
52 return False 58 return False
53 elif fileinfo.create_version: 59 elif fileinfo.create_system != 3: # 3 is UNIX
54 return False 60 return False
55 return True 61 return True
56 62
57 def is_clean(self): 63 def is_clean(self, list_unsupported=False):
58 ''' 64 ''' Check if the given file is clean from harmful metadata
59 Check if the given file is clean from harmful metadata
60 ''' 65 '''
66 if list_unsupported:
67 ret_list = []
61 zipin = zipfile.ZipFile(self.filename, 'r') 68 zipin = zipfile.ZipFile(self.filename, 'r')
62 if zipin.comment != '': 69 if zipin.comment != '':
63 logging.debug('%s has a comment' % self.filename) 70 logging.debug('%s has a comment' % self.filename)
@@ -71,44 +78,46 @@ class ZipStripper(GenericArchiveStripper):
71 zipin.extract(item, self.tempdir) 78 zipin.extract(item, self.tempdir)
72 name = os.path.join(self.tempdir, item.filename) 79 name = os.path.join(self.tempdir, item.filename)
73 if os.path.isfile(name): 80 if os.path.isfile(name):
74 try: 81 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
75 cfile = mat.create_class_file(name, False, 82 if cfile:
76 add2archive=self.add2archive)
77 if not cfile.is_clean(): 83 if not cfile.is_clean():
78 return False 84 return False
79 except: 85 else:
80 # best solution I have found 86 logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename)
81 logging.info('%s\'s fileformat is not supported, or is a \ 87 basename, ext = os.path.splitext(name)
82harmless format' % item.filename)
83 _, ext = os.path.splitext(name)
84 bname = os.path.basename(item.filename) 88 bname = os.path.basename(item.filename)
85 if ext not in parser.NOMETA: 89 if ext not in parser.NOMETA:
86 if bname != 'mimetype' and bname != '.rels': 90 if bname != 'mimetype' and bname != '.rels':
87 return False 91 if list_unsupported:
92 ret_list.append(bname)
93 else:
94 return False
88 zipin.close() 95 zipin.close()
96 if list_unsupported:
97 return ret_list
89 return True 98 return True
90 99
91 def get_meta(self): 100 def get_meta(self):
92 ''' 101 ''' Return all the metadata of a ZipFile (don't return metadatas
93 Return all the metadata of a ZipFile (don't return metadatas
94 of contained files : should it ?) 102 of contained files : should it ?)
95 ''' 103 '''
96 zipin = zipfile.ZipFile(self.filename, 'r') 104 zipin = zipfile.ZipFile(self.filename, 'r')
97 metadata = {} 105 metadata = {}
98 for field in zipin.infolist(): 106 for field in zipin.infolist():
99 zipmeta = {} 107 zipmeta = {}
100 zipmeta['comment'] = field.comment 108 if field.comment != '':
101 zipmeta['modified'] = field.date_time 109 zipmeta['comment'] = field.comment
102 zipmeta['system'] = field.create_system 110 if field.date_time != (1980, 1, 1, 0, 0, 0):
103 zipmeta['zip_version'] = field.create_version 111 zipmeta['modified'] = field.date_time
104 metadata[field.filename] = zipmeta 112 if field.create_system != 3: # 3 is UNIX
105 metadata["%s comment" % self.filename] = zipin.comment 113 zipmeta['system'] = "windows" if field.create_system == 2 else "unknown"
114 if zipin.comment != '':
115 metadata["%s comment" % self.filename] = zipin.comment
106 zipin.close() 116 zipin.close()
107 return metadata 117 return metadata
108 118
109 def remove_all(self): 119 def remove_all(self):
110 ''' 120 ''' So far, the zipfile module does not allow to write a ZipInfo
111 So far, the zipfile module does not allow to write a ZipInfo
112 object into a zipfile (and it's a shame !) : so data added 121 object into a zipfile (and it's a shame !) : so data added
113 by zipfile itself could not be removed. It's a big concern. 122 by zipfile itself could not be removed. It's a big concern.
114 Is shipping a patched version of zipfile.py a good idea ? 123 Is shipping a patched version of zipfile.py a good idea ?
@@ -132,21 +141,24 @@ harmless format' % item.filename)
132 _, ext = os.path.splitext(name) 141 _, ext = os.path.splitext(name)
133 if self.add2archive or ext in parser.NOMETA: 142 if self.add2archive or ext in parser.NOMETA:
134 zipout.write(name, item.filename) 143 zipout.write(name, item.filename)
135 zipout.comment = ''
136 zipin.close() 144 zipin.close()
145 for zipfile in zipout.infolist():
146 zipfile.orig_filename = zipfile.filename
147 zipfile.date_time = (1980, 1, 1, 0, 0, 0)
148 zipfile.create_system = 3 # 3 is UNIX
149 zipout.comment = ''
137 zipout.close() 150 zipout.close()
138 logging.info('%s treated' % self.filename) 151
152 logging.info('%s processed' % self.filename)
139 self.do_backup() 153 self.do_backup()
140 return True 154 return True
141 155
142 156
143class TarStripper(GenericArchiveStripper): 157class TarStripper(GenericArchiveStripper):
144 ''' 158 ''' Represent a tarfile archive
145 Represent a tarfile archive
146 ''' 159 '''
147 def _remove(self, current_file): 160 def _remove(self, current_file):
148 ''' 161 ''' Remove the meta added by tar itself to the file
149 remove the meta added by tar itself to the file
150 ''' 162 '''
151 current_file.mtime = 0 163 current_file.mtime = 0
152 current_file.uid = 0 164 current_file.uid = 0
@@ -160,28 +172,24 @@ class TarStripper(GenericArchiveStripper):
160 tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') 172 tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
161 for item in tarin.getmembers(): 173 for item in tarin.getmembers():
162 tarin.extract(item, self.tempdir) 174 tarin.extract(item, self.tempdir)
163 name = os.path.join(self.tempdir, item.name) 175 complete_name = os.path.join(self.tempdir, item.name)
164 if item.type == '0': # is item a regular file ? 176 if item.isfile():
165 # no backup file 177 cfile = mat.create_class_file(complete_name, False, add2archive=self.add2archive)
166 try: 178 if cfile:
167 cfile = mat.create_class_file(name, False,
168 add2archive=self.add2archive)
169 cfile.remove_all() 179 cfile.remove_all()
170 tarout.add(name, item.name, filter=self._remove) 180 tarout.add(complete_name, item.name, filter=self._remove)
171 except: 181 else:
172 logging.info('%s\' format is not supported or harmless' % 182 logging.info('%s\' format is not supported or harmless' % item.name)
173 item.name) 183 basename, ext = os.path.splitext(item.name)
174 _, ext = os.path.splitext(name)
175 if self.add2archive or ext in parser.NOMETA: 184 if self.add2archive or ext in parser.NOMETA:
176 tarout.add(name, item.name, filter=self._remove) 185 tarout.add(complete_name, item.name, filter=self._remove)
177 tarin.close() 186 tarin.close()
178 tarout.close() 187 tarout.close()
179 self.do_backup() 188 self.do_backup()
180 return True 189 return True
181 190
182 def is_file_clean(self, current_file): 191 def is_file_clean(self, current_file):
183 ''' 192 ''' Check metadatas added by tar
184 Check metadatas added by tar
185 ''' 193 '''
186 if current_file.mtime != 0: 194 if current_file.mtime != 0:
187 return False 195 return False
@@ -193,60 +201,68 @@ class TarStripper(GenericArchiveStripper):
193 return False 201 return False
194 elif current_file.gname != '': 202 elif current_file.gname != '':
195 return False 203 return False
196 else: 204 return True
197 return True
198 205
199 def is_clean(self): 206 def is_clean(self, list_unsupported=False):
200 ''' 207 ''' Check if the file is clean from harmful metadatas
201 Check if the file is clean from harmful metadatas
202 ''' 208 '''
209 if list_unsupported:
210 ret_list = []
203 tarin = tarfile.open(self.filename, 'r' + self.compression) 211 tarin = tarfile.open(self.filename, 'r' + self.compression)
204 for item in tarin.getmembers(): 212 for item in tarin.getmembers():
205 if not self.is_file_clean(item): 213 if not self.is_file_clean(item):
206 tarin.close()
207 return False 214 return False
208 tarin.extract(item, self.tempdir) 215 tarin.extract(item, self.tempdir)
209 name = os.path.join(self.tempdir, item.name) 216 complete_name = os.path.join(self.tempdir, item.name)
210 if item.type == '0': # is item a regular file ? 217 if item.isfile():
211 try: 218 class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive)
212 class_file = mat.create_class_file(name, 219 if class_file:
213 False, add2archive=self.add2archive) # no backup file
214 if not class_file.is_clean(): 220 if not class_file.is_clean():
215 tarin.close()
216 return False 221 return False
217 except: 222 else:
218 logging.error('%s\'s format is not supported or harmless' % 223 logging.error('%s\'s format is not supported or harmless' % item.name)
219 item.filename) 224 basename, ext = os.path.splitext(complete_name)
220 _, ext = os.path.splitext(name)
221 if ext not in parser.NOMETA: 225 if ext not in parser.NOMETA:
222 tarin.close() 226 if list_unsupported:
223 return False 227 ret_list.append(complete_name)
228 else:
229 return False
224 tarin.close() 230 tarin.close()
231 if list_unsupported:
232 return ret_list
225 return True 233 return True
226 234
227 def get_meta(self): 235 def get_meta(self):
228 ''' 236 ''' Return a dict with all the meta of the file
229 Return a dict with all the meta of the file
230 ''' 237 '''
231 tarin = tarfile.open(self.filename, 'r' + self.compression) 238 tarin = tarfile.open(self.filename, 'r' + self.compression)
232 metadata = {} 239 metadata = {}
233 for current_file in tarin.getmembers(): 240 for item in tarin.getmembers():
234 if current_file.type == '0': 241 current_meta = {}
235 if not self.is_file_clean(current_file): # if there is meta 242 if item.isfile():
236 current_meta = {} 243 tarin.extract(item, self.tempdir)
237 current_meta['mtime'] = current_file.mtime 244 name = os.path.join(self.tempdir, item.name)
238 current_meta['uid'] = current_file.uid 245 class_file = mat.create_class_file(name, False, add2archive=self.add2archive)
239 current_meta['gid'] = current_file.gid 246 if class_file is not None:
240 current_meta['uname'] = current_file.uname 247 meta = class_file.get_meta()
241 current_meta['gname'] = current_file.gname 248 if meta:
242 metadata[current_file.name] = current_meta 249 current_meta['file'] = meta
250 else:
251 logging.error('%s\'s format is not supported or harmless' % item.name)
252
253 if not self.is_file_clean(item): # if there is meta
254 current_meta['mtime'] = item.mtime
255 current_meta['uid'] = item.uid
256 current_meta['gid'] = item.gid
257 current_meta['uname'] = item.uname
258 current_meta['gname'] = item.gname
259 metadata[item.name] = current_meta
243 tarin.close() 260 tarin.close()
244 return metadata 261 return metadata
245 262
246 263
247class GzipStripper(TarStripper): 264class GzipStripper(TarStripper):
248 ''' 265 ''' Represent a tar.gz archive
249 Represent a tar.gz archive
250 ''' 266 '''
251 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): 267 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
252 super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) 268 super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
@@ -254,8 +270,7 @@ class GzipStripper(TarStripper):
254 270
255 271
256class Bzip2Stripper(TarStripper): 272class Bzip2Stripper(TarStripper):
257 ''' 273 ''' Represent a tar.bz2 archive
258 Represents a tar.bz2 archive
259 ''' 274 '''
260 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): 275 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
261 super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) 276 super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)