summaryrefslogtreecommitdiff
path: root/libmat/archive.py
diff options
context:
space:
mode:
authorjvoisin2014-06-08 13:39:18 +0200
committerjvoisin2014-06-08 13:39:18 +0200
commitaf36529554c39a2eefcc2c8723715e2d25b401b8 (patch)
treef54b964520bab44d1dfac725086211eaf22d3763 /libmat/archive.py
parentef5a32cfd3c0555ffe5ddf413eeaae61622ebb4b (diff)
Rename the MAT folder to libmat.
This commit fixes some issues for dumb operating systems which don't handle capitalization.
Diffstat (limited to 'libmat/archive.py')
-rw-r--r--libmat/archive.py335
1 files changed, 335 insertions, 0 deletions
diff --git a/libmat/archive.py b/libmat/archive.py
new file mode 100644
index 0000000..d483dcc
--- /dev/null
+++ b/libmat/archive.py
@@ -0,0 +1,335 @@
''' Take care of archive formats
'''
3
4import datetime
5import logging
6import os
7import shutil
8import stat
9import tarfile
10import tempfile
11import zipfile
12
13import mat
14import parser
15
# Zip files do not support dates older than 01/01/1980; this tuple is the
# zip "epoch" in ZipInfo.date_time form, used to detect/scrub timestamps.
ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
# Same epoch expressed in seconds, fed to os.utime() when rewriting members.
# NOTE(review): the subtrahend is 1970-01-01 01:00:00 -- one hour past the
# Unix epoch. Presumably a UTC+1 local-time compensation for zip's
# local-time timestamps; confirm behaviour in other timezones.
ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
    - datetime.datetime(1970, 1, 1, 1, 0, 0)).total_seconds()
20
21
class GenericArchiveStripper(parser.GenericParser):
    ''' Represent a generic archive.

    Subclasses implement is_clean()/remove_all() for a concrete
    container format (zip, tar, ...).
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(GenericArchiveStripper, self).__init__(filename,
            parser, mime, backup, is_writable, **kwargs)
        # Mode suffix appended to tarfile's 'r'/'w' (e.g. ':gz'); empty for zip
        self.compression = ''
        # Whether unsupported member files are still copied into the cleaned
        # archive. Mandatory keyword: a missing key raises KeyError early.
        self.add2archive = kwargs['add2archive']
        # Scratch directory where members are extracted for inspection
        self.tempdir = tempfile.mkdtemp()

    def __del__(self):
        ''' Securely remove every file inside the temp dir,
        then remove the temp dir itself.

        Guarded so a partially-initialised instance (e.g. __init__
        raised before mkdtemp ran) or an already-removed tempdir
        does not make the destructor raise.
        '''
        tempdir = getattr(self, 'tempdir', None)
        if tempdir and os.path.exists(tempdir):
            for root, dirs, files in os.walk(tempdir):
                for item in files:
                    path_file = os.path.join(root, item)
                    mat.secure_remove(path_file)
            shutil.rmtree(tempdir)

    def is_clean(self, list_unsupported=False):
        ''' Virtual method to check for harmful metadata
        '''
        raise NotImplementedError

    def list_unsupported(self):
        ''' Get a list of every non-supported file present in the archive
        '''
        return self.is_clean(list_unsupported=True)

    def remove_all(self):
        ''' Virtual method to remove all metadata
        '''
        raise NotImplementedError
56
57
class ZipStripper(GenericArchiveStripper):
    ''' Represent a zip file
    '''
    def __is_zipfile_clean(self, fileinfo):
        ''' Check if a ZipInfo object is clean of metadata added
        by zip itself, independently of the corresponding file metadata
        '''
        if fileinfo.comment != '':
            return False
        elif fileinfo.date_time != ZIP_EPOCH:
            return False
        elif fileinfo.create_system != 3:  # 3 is UNIX
            return False
        return True

    def is_clean(self, list_unsupported=False):
        ''' Check if the given file is clean from harmful metadata.
        When list_unsupported is True, the method returns a list
        of all non-supported/archives files contained in the
        archive instead of a boolean.
        '''
        ret_list = []
        zipin = zipfile.ZipFile(self.filename, 'r')
        if zipin.comment != '' and not list_unsupported:
            logging.debug('%s has a comment' % self.filename)
            return False
        for item in zipin.infolist():
            # Members must be extracted so their own metadata can be checked
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)
            if not self.__is_zipfile_clean(item) and not list_unsupported:
                logging.debug('%s from %s has compromising zipinfo' %
                              (item.filename, self.filename))
                return False
            if os.path.isfile(path):
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    if not cfile.is_clean():
                        logging.debug('%s from %s has metadata' % (item.filename, self.filename))
                        if not list_unsupported:
                            return False
                else:
                    logging.info('%s\'s fileformat is not supported or harmless.'
                                 % item.filename)
                    ext = os.path.splitext(path)[1]
                    # 'mimetype' and '.rels' are harmless structural members
                    # (odf/ooxml); don't flag them
                    if os.path.basename(item.filename) not in ('mimetype', '.rels'):
                        if ext not in parser.NOMETA:
                            if not list_unsupported:
                                return False
                            ret_list.append(item.filename)
        zipin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return all the metadata of a zip archive'''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        if zipin.comment != '':
            metadata['comment'] = zipin.comment
        for item in zipin.infolist():
            zipinfo_meta = self.__get_zipinfo_meta(item)
            if zipinfo_meta != {}:  # zipinfo metadata
                metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(path):
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    cfile_meta = cfile.get_meta()
                    if cfile_meta != {}:
                        metadata[item.filename] = str(cfile_meta)
                else:
                    logging.info('%s\'s fileformat is not supported or harmless'
                                 % item.filename)
        zipin.close()
        return metadata

    def __get_zipinfo_meta(self, zipinfo):
        ''' Return all the metadata of a ZipInfo
        '''
        metadata = {}
        if zipinfo.comment != '':
            metadata['comment'] = zipinfo.comment
        if zipinfo.date_time != ZIP_EPOCH:
            metadata['modified'] = zipinfo.date_time
        if zipinfo.create_system != 3:  # 3 is UNIX
            metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
        return metadata

    def remove_all(self, whitelist=None, beginning_blacklist=None, ending_blacklist=None):
        ''' Remove all metadata from a zip archive, even those
        added by Python's zipfile itself. It will not add
        files starting with "beginning_blacklist", or ending with
        "ending_blacklist". This method also adds files present in
        whitelist to the archive.
        '''
        # Mutable default arguments are shared between calls;
        # normalise None to a fresh list here instead.
        if whitelist is None:
            whitelist = []
        if beginning_blacklist is None:
            beginning_blacklist = []
        if ending_blacklist is None:
            ending_blacklist = []
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)

            beginning = any(item.filename.startswith(f) for f in beginning_blacklist)
            ending = any(item.filename.endswith(f) for f in ending_blacklist)

            if os.path.isfile(path) and not beginning and not ending:
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    # Handle read-only files inside archive
                    old_stat = os.stat(path).st_mode
                    os.chmod(path, old_stat | stat.S_IWUSR)
                    cfile.remove_all()
                    os.chmod(path, old_stat)
                    logging.debug('Processing %s from %s' % (item.filename, self.filename))
                elif item.filename not in whitelist:
                    logging.info('%s\'s format is not supported or harmless' % item.filename)
                    ext = os.path.splitext(path)[1]
                    if not (self.add2archive or ext in parser.NOMETA):
                        continue
                # Force the member's mtime to the zip epoch so the stored
                # date_time carries no information
                os.utime(path, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
                zipout.write(path, item.filename)
        zipin.close()
        zipout.close()

        logging.info('%s processed' % self.filename)
        self.do_backup()
        return True
186
187
class TarStripper(GenericArchiveStripper):
    ''' Represent a tarfile archive
    '''
    def _remove(self, current_file):
        ''' Remove the metadata added by tarfile itself to a member:
        zero mtime/uid/gid and blank the user/group names.
        Used as the `filter` callback of TarFile.add().
        '''
        current_file.mtime = 0
        current_file.uid = 0
        current_file.gid = 0
        current_file.uname = ''
        current_file.gname = ''
        return current_file

    def remove_all(self, whitelist=None):
        ''' Remove all harmful metadata from the tarfile.
        The method will also add every file matching
        whitelist to the produced archive.
        '''
        # Mutable default arguments are shared between calls;
        # normalise None to a fresh list here instead.
        if whitelist is None:
            whitelist = []
        tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
        tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
        for item in tarin.getmembers():
            tarin.extract(item, self.tempdir)
            # Only regular files are re-added to the output archive
            if item.isfile():
                path = os.path.join(self.tempdir, item.name)
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    # Handle read-only files inside archive
                    old_stat = os.stat(path).st_mode
                    os.chmod(path, old_stat | stat.S_IWUSR)
                    cfile.remove_all()
                    os.chmod(path, old_stat)
                elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
                    logging.debug('%s\' format is either not supported or harmless' % item.name)
                elif item.name in whitelist:
                    logging.debug('%s is not supported, but MAT was told to add it anyway.'
                                  % item.name)
                else:  # Don't add the file to the archive
                    logging.debug('%s will not be added' % item.name)
                    continue
                # _remove() scrubs the tarinfo metadata while adding
                tarout.add(path, item.name, filter=self._remove)
        tarin.close()
        tarout.close()
        self.do_backup()
        return True

    def is_file_clean(self, current_file):
        ''' Check the metadata added by tarfile itself
        (owner ids/names and mtime must all be zeroed/blank).
        '''
        if current_file.mtime != 0:
            return False
        elif current_file.uid != 0:
            return False
        elif current_file.gid != 0:
            return False
        elif current_file.uname != '':
            return False
        elif current_file.gname != '':
            return False
        return True

    def is_clean(self, list_unsupported=False):
        ''' Check if the file is clean from harmful metadata.
        When list_unsupported is True, the method returns a list
        of all non-supported/archives files contained in the
        archive instead of a boolean.
        '''
        ret_list = []
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        for item in tarin.getmembers():
            if not self.is_file_clean(item) and not list_unsupported:
                logging.debug('%s from %s has compromising tarinfo' %
                              (item.name, self.filename))
                return False
            tarin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.name)
            if item.isfile():
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    if not cfile.is_clean():
                        logging.debug('%s from %s has metadata' %
                                      (item.name.decode("utf8"), self.filename))
                        if not list_unsupported:
                            return False
                    # Nested archives are treated like unsupported files
                    elif isinstance(cfile, GenericArchiveStripper):
                        ret_list.append(item.name)
                else:
                    logging.error('%s\'s format is not supported or harmless' % item.name)
                    if os.path.splitext(path)[1] not in parser.NOMETA:
                        if not list_unsupported:
                            return False
                        ret_list.append(item.name)
        tarin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return a dict with all the metadata of the tarfile:
        per-member file metadata plus tarinfo metadata (owner, mtime).
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        metadata = {}
        for item in tarin.getmembers():
            current_meta = {}
            if item.isfile():
                tarin.extract(item, self.tempdir)
                path = os.path.join(self.tempdir, item.name)
                class_file = mat.create_class_file(path, False, add2archive=self.add2archive)
                if class_file is not None:
                    meta = class_file.get_meta()
                    if meta:
                        current_meta['file'] = str(meta)
                else:
                    logging.error('%s\'s format is not supported or harmless' % item.name)

            if not self.is_file_clean(item):  # if there is meta
                current_meta['mtime'] = item.mtime
                current_meta['uid'] = item.uid
                current_meta['gid'] = item.gid
                current_meta['uname'] = item.uname
                current_meta['gname'] = item.gname
                metadata[item.name] = str(current_meta)
        tarin.close()
        return metadata
312
313
class TerminalZipStripper(ZipStripper):
    ''' Represent a terminal-level archive.
    This type of archive cannot contain nested archives.
    It is used for formats like docx, which are basically
    zipped xml.
    '''
320
321
class GzipStripper(TarStripper):
    ''' Represent a tar.gz archive
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        # ':gz' is appended to tarfile's 'r'/'w' open modes by TarStripper
        self.compression = ':gz'
328
329
class Bzip2Stripper(TarStripper):
    ''' Represent a tar.bz2 archive
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        # ':bz2' is appended to tarfile's 'r'/'w' open modes by TarStripper
        self.compression = ':bz2'