diff options
| author | jvoisin | 2014-06-08 13:39:18 +0200 |
|---|---|---|
| committer | jvoisin | 2014-06-08 13:39:18 +0200 |
| commit | af36529554c39a2eefcc2c8723715e2d25b401b8 (patch) | |
| tree | f54b964520bab44d1dfac725086211eaf22d3763 /libmat/archive.py | |
| parent | ef5a32cfd3c0555ffe5ddf413eeaae61622ebb4b (diff) | |
Rename the MAT folder to libmat.
This commit fixes some issues on dumb operating
systems that don't handle capitalization.
Diffstat (limited to 'libmat/archive.py')
| -rw-r--r-- | libmat/archive.py | 335 |
1 files changed, 335 insertions, 0 deletions
diff --git a/libmat/archive.py b/libmat/archive.py new file mode 100644 index 0000000..d483dcc --- /dev/null +++ b/libmat/archive.py | |||
| @@ -0,0 +1,335 @@ | |||
| 1 | ''' Take care of archives formats | ||
| 2 | ''' | ||
| 3 | |||
import datetime
import logging
import os
import shutil
import stat
import tarfile
import tempfile
import time
import zipfile

import mat
import parser
| 15 | |||
# Zip files do not support dates older than 01/01/1980
ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
# Unix timestamp of ZIP_EPOCH expressed in the *local* timezone.
# zipfile builds an entry's date with time.localtime(st_mtime), so a file
# whose mtime is set to this value is stored with date_time == ZIP_EPOCH
# whatever the timezone of the machine is (a fixed offset would only be
# right in a single timezone). The trailing -1 is tm_isdst: let mktime()
# work out whether DST applies.
ZIP_EPOCH_SECONDS = time.mktime(ZIP_EPOCH + (0, 0, -1))
| 20 | |||
| 21 | |||
class GenericArchiveStripper(parser.GenericParser):
    ''' Represent a generic archive

        Each instance owns a private temporary directory (self.tempdir)
        into which the members of the archive are extracted; it is wiped
        when the instance is destroyed. Subclasses must implement
        is_clean() and remove_all().
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        ''' Forward every argument to the parent parser, then set up the
            archive specific state.

            kwargs must contain the 'add2archive' key (a KeyError is
            raised otherwise); its value is passed along to
            mat.create_class_file() for every member of the archive.
        '''
        super(GenericArchiveStripper, self).__init__(filename,
                parser, mime, backup, is_writable, **kwargs)
        # Suffix appended to the tarfile opening mode by the tar
        # based subclasses; empty means "no compression".
        self.compression = ''
        self.add2archive = kwargs['add2archive']
        self.tempdir = tempfile.mkdtemp()

    def __del__(self):
        ''' Remove the files inside the temp dir,
            then remove the temp dir
        '''
        # __del__ also runs when __init__ raised before self.tempdir was
        # assigned: there is nothing to clean up in that case.
        tempdir = getattr(self, 'tempdir', None)
        if not tempdir or not os.path.isdir(tempdir):
            return
        for root, _, files in os.walk(tempdir):
            for item in files:
                mat.secure_remove(os.path.join(root, item))
        shutil.rmtree(tempdir)

    def is_clean(self, list_unsupported=False):
        ''' Virtual method to check for harmful metadata
        '''
        raise NotImplementedError

    def list_unsupported(self):
        ''' Get a list of every non-supported files present in the archive
        '''
        return self.is_clean(list_unsupported=True)

    def remove_all(self):
        ''' Virtual method to remove all metadata
        '''
        raise NotImplementedError
| 56 | |||
| 57 | |||
class ZipStripper(GenericArchiveStripper):
    ''' Represent a zip file
    '''
    def __is_zipfile_clean(self, fileinfo):
        ''' Check if a ZipInfo object is clean of metadata added
            by zip itself, independently of the corresponding file metadata
        '''
        # Truthiness instead of "!= ''": the comment may be str or bytes.
        if fileinfo.comment:
            return False
        if fileinfo.date_time != ZIP_EPOCH:
            return False
        return fileinfo.create_system == 3  # 3 is UNIX

    def is_clean(self, list_unsupported=False):
        ''' Check if the given file is clean from harmful metadata
            When list_unsupported is True, the method returns a list
            of all non-supported/archives files contained in the
            archive; otherwise it returns a boolean.
        '''
        ret_list = []
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:  # the archive must be closed on the early returns too
            if zipin.comment and not list_unsupported:
                logging.debug('%s has a comment', self.filename)
                return False
            for item in zipin.infolist():
                # extract() sanitises the member's name (absolute paths,
                # '..' components): always work on the path it returns,
                # never on one rebuilt from the raw name, which could
                # designate a file outside of the temporary directory.
                path = zipin.extract(item, self.tempdir)
                if not self.__is_zipfile_clean(item) and not list_unsupported:
                    logging.debug('%s from %s has compromising zipinfo',
                                  item.filename, self.filename)
                    return False
                if not os.path.isfile(path):
                    continue
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    if not cfile.is_clean():
                        logging.debug('%s from %s has metadata', item.filename, self.filename)
                        if not list_unsupported:
                            return False
                    continue
                logging.info('%s\'s fileformat is not supported or harmless.',
                             item.filename)
                if os.path.basename(item.filename) in ('mimetype', '.rels'):
                    continue
                if os.path.splitext(path)[1] not in parser.NOMETA:
                    if not list_unsupported:
                        return False
                    ret_list.append(item.filename)
        finally:
            zipin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return all the metadata of a zip archive, as a dict.
            The keys are 'comment' for the archive's comment,
            "<member>'s zipinfo" for the metadata stored by zip about a
            member, and "<member>" for the metadata of the member itself.
        '''
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            if zipin.comment:
                metadata['comment'] = zipin.comment
            for item in zipin.infolist():
                zipinfo_meta = self.__get_zipinfo_meta(item)
                if zipinfo_meta:  # zipinfo metadata
                    metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
                # Use the sanitised path returned by extract(), see is_clean()
                path = zipin.extract(item, self.tempdir)
                if not os.path.isfile(path):
                    continue
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is None:
                    logging.info('%s\'s fileformat is not supported or harmless',
                                 item.filename)
                    continue
                cfile_meta = cfile.get_meta()
                if cfile_meta:
                    metadata[item.filename] = str(cfile_meta)
        finally:
            zipin.close()
        return metadata

    def __get_zipinfo_meta(self, zipinfo):
        ''' Return all the metadata of a ZipInfo
        '''
        metadata = {}
        if zipinfo.comment:
            metadata['comment'] = zipinfo.comment
        if zipinfo.date_time != ZIP_EPOCH:
            metadata['modified'] = zipinfo.date_time
        if zipinfo.create_system != 3:  # 3 is UNIX
            # Host system codes of the zip format: 0 is MS-DOS/FAT (what
            # Windows tools, zipfile included, write), 10 is Windows NTFS.
            # 2, which used to be reported as windows, is OpenVMS.
            is_windows = zipinfo.create_system in (0, 10)
            metadata['system'] = "windows" if is_windows else "unknown"
        return metadata

    def remove_all(self, whitelist=None, beginning_blacklist=None, ending_blacklist=None):
        ''' Remove all metadata from a zip archive, even those
            added by Python's zipfile itself. It will not add
            files starting with "beginning_blacklist", or ending with
            "ending_blacklist". This method also add files present in
            whitelist to the archive.
            The cleaned archive is written to self.output.
            Always returns True.
        '''
        whitelist = whitelist or ()
        # str.startswith/endswith accept a tuple of candidates
        beginning_blacklist = tuple(beginning_blacklist or ())
        ending_blacklist = tuple(ending_blacklist or ())

        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
            try:
                for item in zipin.infolist():
                    # Work on the sanitised path returned by extract():
                    # the file is modified in place below, it must live
                    # inside of the temporary directory.
                    path = zipin.extract(item, self.tempdir)
                    if not os.path.isfile(path):
                        continue
                    if (item.filename.startswith(beginning_blacklist)
                            or item.filename.endswith(ending_blacklist)):
                        continue

                    cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                    if cfile is not None:
                        # Handle read-only files inside archive
                        old_stat = os.stat(path).st_mode
                        os.chmod(path, old_stat | stat.S_IWUSR)
                        cfile.remove_all()
                        os.chmod(path, old_stat)
                        logging.debug('Processing %s from %s', item.filename, self.filename)
                    elif item.filename not in whitelist:
                        logging.info('%s\'s format is not supported or harmless', item.filename)
                        ext = os.path.splitext(path)[1]
                        if not (self.add2archive or ext in parser.NOMETA):
                            continue
                    # zipfile stores the mtime of the file in the archive
                    os.utime(path, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
                    zipout.write(path, item.filename)
            finally:
                zipout.close()
        finally:
            zipin.close()

        logging.info('%s processed', self.filename)
        self.do_backup()
        return True
| 186 | |||
| 187 | |||
class TarStripper(GenericArchiveStripper):
    ''' Represent a tarfile archive
    '''
    def _remove(self, current_file):
        ''' Remove the meta added by tarfile itself to the file
            (used as the `filter` callback of TarFile.add)
        '''
        current_file.mtime = 0
        current_file.uid = 0
        current_file.gid = 0
        current_file.uname = ''
        current_file.gname = ''
        return current_file

    def __extract(self, tarin, item):
        ''' Extract the member `item` of `tarin` in the temporary directory.

            Return the path of the extracted member, or None (after
            logging an error) if its name resolves to a location outside
            of the temporary directory (absolute name, '..' components,
            or a path going through a previously extracted symlink).
            Extracting such a member would create, and later modify,
            files outside of the temporary directory.
        '''
        root = os.path.realpath(self.tempdir)
        target = os.path.realpath(os.path.join(root, item.name))
        if target != root and not target.startswith(root + os.sep):
            logging.error('%s from %s would be extracted outside of the '
                          'temporary directory, ignoring it',
                          item.name, self.filename)
            return None
        tarin.extract(item, self.tempdir)
        return os.path.join(self.tempdir, item.name)

    def remove_all(self, whitelist=None):
        ''' Remove all harmful metadata from the tarfile.
            The method will also add every files matching
            whitelist in the produced archive.
            Only regular files are added to the produced archive,
            which is written to self.output. Always returns True.
        '''
        whitelist = whitelist or ()
        tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
        try:
            tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
            try:
                for item in tarin.getmembers():
                    path = self.__extract(tarin, item)
                    if path is None or not item.isfile():
                        continue
                    cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                    if cfile is not None:
                        # Handle read-only files inside archive
                        old_stat = os.stat(path).st_mode
                        os.chmod(path, old_stat | stat.S_IWUSR)
                        cfile.remove_all()
                        os.chmod(path, old_stat)
                    elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
                        logging.debug('%s\' format is either not supported or harmless',
                                      item.name)
                    elif item.name in whitelist:
                        logging.debug('%s is not supported, but MAT was told to add it anyway.',
                                      item.name)
                    else:  # Don't add the file to the archive
                        logging.debug('%s will not be added', item.name)
                        continue
                    tarout.add(path, item.name, filter=self._remove)
            finally:
                tarout.close()
        finally:
            tarin.close()
        self.do_backup()
        return True

    def is_file_clean(self, current_file):
        ''' Check metadatas added by tarfile: return True if the
            given TarInfo carries no date, owner nor group.
        '''
        return (current_file.mtime == 0
                and current_file.uid == 0
                and current_file.gid == 0
                and current_file.uname == ''
                and current_file.gname == '')

    def is_clean(self, list_unsupported=False):
        ''' Check if the file is clean from harmful metadatas
            When list_unsupported is True, the method returns a list
            of all non-supported/archives files contained in the
            archive; otherwise it returns a boolean.
        '''
        ret_list = []
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        try:  # the archive must be closed on the early returns too
            for item in tarin.getmembers():
                if not self.is_file_clean(item) and not list_unsupported:
                    logging.debug('%s from %s has compromising tarinfo',
                                  item.name, self.filename)
                    return False
                path = self.__extract(tarin, item)
                if path is None:
                    # The member can not be inspected: never call it clean
                    if not list_unsupported:
                        return False
                    ret_list.append(item.name)
                    continue
                if not item.isfile():
                    continue
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is None:
                    logging.error('%s\'s format is not supported or harmless', item.name)
                    if os.path.splitext(path)[1] not in parser.NOMETA:
                        if not list_unsupported:
                            return False
                        ret_list.append(item.name)
                elif not cfile.is_clean():
                    # The name is logged as is: decoding it could raise
                    # on names that are not valid utf-8.
                    logging.debug('%s from %s has metadata',
                                  item.name, self.filename)
                    if not list_unsupported:
                        return False
                # Nested archives are treated like unsupported files
                elif isinstance(cfile, GenericArchiveStripper):
                    ret_list.append(item.name)
        finally:
            tarin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return a dict with all the meta of the tarfile.
            Each regular file for which something was found gets an
            entry, whose value is the string representation of a dict
            holding the metadata of the file itself ('file' key) and/or
            those stored by tar ('mtime', 'uid', 'gid', 'uname', 'gname').
        '''
        metadata = {}
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        try:
            for item in tarin.getmembers():
                if not item.isfile():
                    continue
                current_meta = {}
                path = self.__extract(tarin, item)
                if path is not None:
                    class_file = mat.create_class_file(path, False,
                                                       add2archive=self.add2archive)
                    if class_file is not None:
                        meta = class_file.get_meta()
                        if meta:
                            current_meta['file'] = str(meta)
                    else:
                        logging.error('%s\'s format is not supported or harmless',
                                      item.name)

                if not self.is_file_clean(item):  # if there is meta
                    current_meta['mtime'] = item.mtime
                    current_meta['uid'] = item.uid
                    current_meta['gid'] = item.gid
                    current_meta['uname'] = item.uname
                    current_meta['gname'] = item.gname
                # Like for zip archives, only report members having metadata
                if current_meta:
                    metadata[item.name] = str(current_meta)
        finally:
            tarin.close()
        return metadata
| 312 | |||
| 313 | |||
class TerminalZipStripper(ZipStripper):
    ''' Represent a terminal level archive.
        This type of archive cannot contain nested archives.
        It is used for formats like docx, which are basically
        zipped XML.
    '''
| 320 | |||
| 321 | |||
class GzipStripper(TarStripper):
    ''' Tar archive compressed with gzip (tar.gz)
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        ''' Take the same arguments as the parent class.
        '''
        parent = super(GzipStripper, self)
        parent.__init__(filename, parser, mime, backup, is_writable,
                        **kwargs)
        # Set after the parent constructor ran, so that this value wins
        self.compression = ':gz'
| 328 | |||
| 329 | |||
class Bzip2Stripper(TarStripper):
    ''' Tar archive compressed with bzip2 (tar.bz2)
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        ''' Take the same arguments as the parent class.
        '''
        parent = super(Bzip2Stripper, self)
        parent.__init__(filename, parser, mime, backup, is_writable,
                        **kwargs)
        # Set after the parent constructor ran, so that this value wins
        self.compression = ':bz2'
