diff options
Diffstat (limited to 'libmat/archive.py')
| -rw-r--r-- | libmat/archive.py | 128 |
1 files changed, 72 insertions, 56 deletions
diff --git a/libmat/archive.py b/libmat/archive.py index d483dcc..4c62dc8 100644 --- a/libmat/archive.py +++ b/libmat/archive.py | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | ''' Take care of archives formats | 1 | """ Take care of archives formats |
| 2 | ''' | 2 | """ |
| 3 | 3 | ||
| 4 | import datetime | 4 | import datetime |
| 5 | import logging | 5 | import logging |
| @@ -16,23 +16,24 @@ import parser | |||
| 16 | # Zip files do not support dates older than 01/01/1980 | 16 | # Zip files do not support dates older than 01/01/1980 |
| 17 | ZIP_EPOCH = (1980, 1, 1, 0, 0, 0) | 17 | ZIP_EPOCH = (1980, 1, 1, 0, 0, 0) |
| 18 | ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0) | 18 | ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0) |
| 19 | - datetime.datetime(1970, 1, 1, 1, 0, 0)).total_seconds() | 19 | - datetime.datetime(1970, 1, 1, 1, 0, 0)).total_seconds() |
| 20 | 20 | ||
| 21 | 21 | ||
| 22 | class GenericArchiveStripper(parser.GenericParser): | 22 | class GenericArchiveStripper(parser.GenericParser): |
| 23 | ''' Represent a generic archive | 23 | """ Represent a generic archive |
| 24 | ''' | 24 | """ |
| 25 | |||
| 25 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): | 26 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): |
| 26 | super(GenericArchiveStripper, self).__init__(filename, | 27 | super(GenericArchiveStripper, self).__init__(filename, |
| 27 | parser, mime, backup, is_writable, **kwargs) | 28 | parser, mime, backup, is_writable, **kwargs) |
| 28 | self.compression = '' | 29 | self.compression = '' |
| 29 | self.add2archive = kwargs['add2archive'] | 30 | self.add2archive = kwargs['add2archive'] |
| 30 | self.tempdir = tempfile.mkdtemp() | 31 | self.tempdir = tempfile.mkdtemp() |
| 31 | 32 | ||
| 32 | def __del__(self): | 33 | def __del__(self): |
| 33 | ''' Remove the files inside the temp dir, | 34 | """ Remove the files inside the temp dir, |
| 34 | then remove the temp dir | 35 | then remove the temp dir |
| 35 | ''' | 36 | """ |
| 36 | for root, dirs, files in os.walk(self.tempdir): | 37 | for root, dirs, files in os.walk(self.tempdir): |
| 37 | for item in files: | 38 | for item in files: |
| 38 | path_file = os.path.join(root, item) | 39 | path_file = os.path.join(root, item) |
| @@ -40,28 +41,30 @@ class GenericArchiveStripper(parser.GenericParser): | |||
| 40 | shutil.rmtree(self.tempdir) | 41 | shutil.rmtree(self.tempdir) |
| 41 | 42 | ||
| 42 | def is_clean(self, list_unsupported=False): | 43 | def is_clean(self, list_unsupported=False): |
| 43 | ''' Virtual method to check for harmul metadata | 44 | """ Virtual method to check for harmul metadata |
| 44 | ''' | 45 | """ |
| 45 | raise NotImplementedError | 46 | raise NotImplementedError |
| 46 | 47 | ||
| 47 | def list_unsupported(self): | 48 | def list_unsupported(self): |
| 48 | ''' Get a list of every non-supported files present in the archive | 49 | """ Get a list of every non-supported files present in the archive |
| 49 | ''' | 50 | """ |
| 50 | return self.is_clean(list_unsupported=True) | 51 | return self.is_clean(list_unsupported=True) |
| 51 | 52 | ||
| 52 | def remove_all(self): | 53 | def remove_all(self): |
| 53 | ''' Virtual method to remove all metadata | 54 | """ Virtual method to remove all metadata |
| 54 | ''' | 55 | """ |
| 55 | raise NotImplementedError | 56 | raise NotImplementedError |
| 56 | 57 | ||
| 57 | 58 | ||
| 58 | class ZipStripper(GenericArchiveStripper): | 59 | class ZipStripper(GenericArchiveStripper): |
| 59 | ''' Represent a zip file | 60 | """ Represent a zip file |
| 60 | ''' | 61 | """ |
| 61 | def __is_zipfile_clean(self, fileinfo): | 62 | |
| 62 | ''' Check if a ZipInfo object is clean of metadata added | 63 | @staticmethod |
| 64 | def __is_zipfile_clean(fileinfo): | ||
| 65 | """ Check if a ZipInfo object is clean of metadata added | ||
| 63 | by zip itself, independently of the corresponding file metadata | 66 | by zip itself, independently of the corresponding file metadata |
| 64 | ''' | 67 | """ |
| 65 | if fileinfo.comment != '': | 68 | if fileinfo.comment != '': |
| 66 | return False | 69 | return False |
| 67 | elif fileinfo.date_time != ZIP_EPOCH: | 70 | elif fileinfo.date_time != ZIP_EPOCH: |
| @@ -71,11 +74,11 @@ class ZipStripper(GenericArchiveStripper): | |||
| 71 | return True | 74 | return True |
| 72 | 75 | ||
| 73 | def is_clean(self, list_unsupported=False): | 76 | def is_clean(self, list_unsupported=False): |
| 74 | ''' Check if the given file is clean from harmful metadata | 77 | """ Check if the given file is clean from harmful metadata |
| 75 | When list_unsupported is True, the method returns a list | 78 | When list_unsupported is True, the method returns a list |
| 76 | of all non-supported/archives files contained in the | 79 | of all non-supported/archives files contained in the |
| 77 | archive. | 80 | archive. |
| 78 | ''' | 81 | """ |
| 79 | ret_list = [] | 82 | ret_list = [] |
| 80 | zipin = zipfile.ZipFile(self.filename, 'r') | 83 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 81 | if zipin.comment != '' and not list_unsupported: | 84 | if zipin.comment != '' and not list_unsupported: |
| @@ -86,7 +89,7 @@ class ZipStripper(GenericArchiveStripper): | |||
| 86 | path = os.path.join(self.tempdir, item.filename) | 89 | path = os.path.join(self.tempdir, item.filename) |
| 87 | if not self.__is_zipfile_clean(item) and not list_unsupported: | 90 | if not self.__is_zipfile_clean(item) and not list_unsupported: |
| 88 | logging.debug('%s from %s has compromising zipinfo' % | 91 | logging.debug('%s from %s has compromising zipinfo' % |
| 89 | (item.filename, self.filename)) | 92 | (item.filename, self.filename)) |
| 90 | return False | 93 | return False |
| 91 | if os.path.isfile(path): | 94 | if os.path.isfile(path): |
| 92 | cfile = mat.create_class_file(path, False, add2archive=self.add2archive) | 95 | cfile = mat.create_class_file(path, False, add2archive=self.add2archive) |
| @@ -97,7 +100,7 @@ class ZipStripper(GenericArchiveStripper): | |||
| 97 | return False | 100 | return False |
| 98 | else: | 101 | else: |
| 99 | logging.info('%s\'s fileformat is not supported or harmless.' | 102 | logging.info('%s\'s fileformat is not supported or harmless.' |
| 100 | % item.filename) | 103 | % item.filename) |
| 101 | basename, ext = os.path.splitext(path) | 104 | basename, ext = os.path.splitext(path) |
| 102 | if os.path.basename(item.filename) not in ('mimetype', '.rels'): | 105 | if os.path.basename(item.filename) not in ('mimetype', '.rels'): |
| 103 | if ext not in parser.NOMETA: | 106 | if ext not in parser.NOMETA: |
| @@ -110,7 +113,7 @@ class ZipStripper(GenericArchiveStripper): | |||
| 110 | return True | 113 | return True |
| 111 | 114 | ||
| 112 | def get_meta(self): | 115 | def get_meta(self): |
| 113 | ''' Return all the metadata of a zip archive''' | 116 | """ Return all the metadata of a zip archive""" |
| 114 | zipin = zipfile.ZipFile(self.filename, 'r') | 117 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 115 | metadata = {} | 118 | metadata = {} |
| 116 | if zipin.comment != '': | 119 | if zipin.comment != '': |
| @@ -129,13 +132,14 @@ class ZipStripper(GenericArchiveStripper): | |||
| 129 | metadata[item.filename] = str(cfile_meta) | 132 | metadata[item.filename] = str(cfile_meta) |
| 130 | else: | 133 | else: |
| 131 | logging.info('%s\'s fileformat is not supported or harmless' | 134 | logging.info('%s\'s fileformat is not supported or harmless' |
| 132 | % item.filename) | 135 | % item.filename) |
| 133 | zipin.close() | 136 | zipin.close() |
| 134 | return metadata | 137 | return metadata |
| 135 | 138 | ||
| 136 | def __get_zipinfo_meta(self, zipinfo): | 139 | @staticmethod |
| 137 | ''' Return all the metadata of a ZipInfo | 140 | def __get_zipinfo_meta(zipinfo): |
| 138 | ''' | 141 | """ Return all the metadata of a ZipInfo |
| 142 | """ | ||
| 139 | metadata = {} | 143 | metadata = {} |
| 140 | if zipinfo.comment != '': | 144 | if zipinfo.comment != '': |
| 141 | metadata['comment'] = zipinfo.comment | 145 | metadata['comment'] = zipinfo.comment |
| @@ -145,13 +149,19 @@ class ZipStripper(GenericArchiveStripper): | |||
| 145 | metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown" | 149 | metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown" |
| 146 | return metadata | 150 | return metadata |
| 147 | 151 | ||
| 148 | def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]): | 152 | def remove_all(self, whitelist=None, beginning_blacklist=None, ending_blacklist=None): |
| 149 | ''' Remove all metadata from a zip archive, even thoses | 153 | """ Remove all metadata from a zip archive, even thoses |
| 150 | added by Python's zipfile itself. It will not add | 154 | added by Python's zipfile itself. It will not add |
| 151 | files starting with "begining_blacklist", or ending with | 155 | files starting with "begining_blacklist", or ending with |
| 152 | "ending_blacklist". This method also add files present in | 156 | "ending_blacklist". This method also add files present in |
| 153 | whitelist to the archive. | 157 | whitelist to the archive. |
| 154 | ''' | 158 | """ |
| 159 | if not ending_blacklist: | ||
| 160 | ending_blacklist = [] | ||
| 161 | if not beginning_blacklist: | ||
| 162 | beginning_blacklist = [] | ||
| 163 | if not whitelist: | ||
| 164 | whitelist = [] | ||
| 155 | zipin = zipfile.ZipFile(self.filename, 'r') | 165 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 156 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) | 166 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) |
| 157 | for item in zipin.infolist(): | 167 | for item in zipin.infolist(): |
| @@ -166,7 +176,7 @@ class ZipStripper(GenericArchiveStripper): | |||
| 166 | if cfile is not None: | 176 | if cfile is not None: |
| 167 | # Handle read-only files inside archive | 177 | # Handle read-only files inside archive |
| 168 | old_stat = os.stat(path).st_mode | 178 | old_stat = os.stat(path).st_mode |
| 169 | os.chmod(path, old_stat|stat.S_IWUSR) | 179 | os.chmod(path, old_stat | stat.S_IWUSR) |
| 170 | cfile.remove_all() | 180 | cfile.remove_all() |
| 171 | os.chmod(path, old_stat) | 181 | os.chmod(path, old_stat) |
| 172 | logging.debug('Processing %s from %s' % (item.filename, self.filename)) | 182 | logging.debug('Processing %s from %s' % (item.filename, self.filename)) |
| @@ -186,11 +196,12 @@ class ZipStripper(GenericArchiveStripper): | |||
| 186 | 196 | ||
| 187 | 197 | ||
| 188 | class TarStripper(GenericArchiveStripper): | 198 | class TarStripper(GenericArchiveStripper): |
| 189 | ''' Represent a tarfile archive | 199 | """ Represent a tarfile archive |
| 190 | ''' | 200 | """ |
| 201 | |||
| 191 | def _remove(self, current_file): | 202 | def _remove(self, current_file): |
| 192 | ''' Remove the meta added by tarfile itself to the file | 203 | """ Remove the meta added by tarfile itself to the file |
| 193 | ''' | 204 | """ |
| 194 | current_file.mtime = 0 | 205 | current_file.mtime = 0 |
| 195 | current_file.uid = 0 | 206 | current_file.uid = 0 |
| 196 | current_file.gid = 0 | 207 | current_file.gid = 0 |
| @@ -198,11 +209,13 @@ class TarStripper(GenericArchiveStripper): | |||
| 198 | current_file.gname = '' | 209 | current_file.gname = '' |
| 199 | return current_file | 210 | return current_file |
| 200 | 211 | ||
| 201 | def remove_all(self, whitelist=[]): | 212 | def remove_all(self, whitelist=None): |
| 202 | ''' Remove all harmful metadata from the tarfile. | 213 | """ Remove all harmful metadata from the tarfile. |
| 203 | The method will also add every files matching | 214 | The method will also add every files matching |
| 204 | whitelist in the produced archive. | 215 | whitelist in the produced archive. |
| 205 | ''' | 216 | """ |
| 217 | if not whitelist: | ||
| 218 | whitelist = [] | ||
| 206 | tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') | 219 | tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') |
| 207 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') | 220 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') |
| 208 | for item in tarin.getmembers(): | 221 | for item in tarin.getmembers(): |
| @@ -213,14 +226,14 @@ class TarStripper(GenericArchiveStripper): | |||
| 213 | if cfile is not None: | 226 | if cfile is not None: |
| 214 | # Handle read-only files inside archive | 227 | # Handle read-only files inside archive |
| 215 | old_stat = os.stat(path).st_mode | 228 | old_stat = os.stat(path).st_mode |
| 216 | os.chmod(path, old_stat|stat.S_IWUSR) | 229 | os.chmod(path, old_stat | stat.S_IWUSR) |
| 217 | cfile.remove_all() | 230 | cfile.remove_all() |
| 218 | os.chmod(path, old_stat) | 231 | os.chmod(path, old_stat) |
| 219 | elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: | 232 | elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: |
| 220 | logging.debug('%s\' format is either not supported or harmless' % item.name) | 233 | logging.debug('%s\' format is either not supported or harmless' % item.name) |
| 221 | elif item.name in whitelist: | 234 | elif item.name in whitelist: |
| 222 | logging.debug('%s is not supported, but MAT was told to add it anyway.' | 235 | logging.debug('%s is not supported, but MAT was told to add it anyway.' |
| 223 | % item.name) | 236 | % item.name) |
| 224 | else: # Don't add the file to the archive | 237 | else: # Don't add the file to the archive |
| 225 | logging.debug('%s will not be added' % item.name) | 238 | logging.debug('%s will not be added' % item.name) |
| 226 | continue | 239 | continue |
| @@ -230,9 +243,10 @@ class TarStripper(GenericArchiveStripper): | |||
| 230 | self.do_backup() | 243 | self.do_backup() |
| 231 | return True | 244 | return True |
| 232 | 245 | ||
| 233 | def is_file_clean(self, current_file): | 246 | @staticmethod |
| 234 | ''' Check metadatas added by tarfile | 247 | def is_file_clean(current_file): |
| 235 | ''' | 248 | """ Check metadatas added by tarfile |
| 249 | """ | ||
| 236 | if current_file.mtime != 0: | 250 | if current_file.mtime != 0: |
| 237 | return False | 251 | return False |
| 238 | elif current_file.uid != 0: | 252 | elif current_file.uid != 0: |
| @@ -246,17 +260,17 @@ class TarStripper(GenericArchiveStripper): | |||
| 246 | return True | 260 | return True |
| 247 | 261 | ||
| 248 | def is_clean(self, list_unsupported=False): | 262 | def is_clean(self, list_unsupported=False): |
| 249 | ''' Check if the file is clean from harmful metadatas | 263 | """ Check if the file is clean from harmful metadatas |
| 250 | When list_unsupported is True, the method returns a list | 264 | When list_unsupported is True, the method returns a list |
| 251 | of all non-supported/archives files contained in the | 265 | of all non-supported/archives files contained in the |
| 252 | archive. | 266 | archive. |
| 253 | ''' | 267 | """ |
| 254 | ret_list = [] | 268 | ret_list = [] |
| 255 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 269 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 256 | for item in tarin.getmembers(): | 270 | for item in tarin.getmembers(): |
| 257 | if not self.is_file_clean(item) and not list_unsupported: | 271 | if not self.is_file_clean(item) and not list_unsupported: |
| 258 | logging.debug('%s from %s has compromising tarinfo' % | 272 | logging.debug('%s from %s has compromising tarinfo' % |
| 259 | (item.name, self.filename)) | 273 | (item.name, self.filename)) |
| 260 | return False | 274 | return False |
| 261 | tarin.extract(item, self.tempdir) | 275 | tarin.extract(item, self.tempdir) |
| 262 | path = os.path.join(self.tempdir, item.name) | 276 | path = os.path.join(self.tempdir, item.name) |
| @@ -265,7 +279,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 265 | if cfile is not None: | 279 | if cfile is not None: |
| 266 | if not cfile.is_clean(): | 280 | if not cfile.is_clean(): |
| 267 | logging.debug('%s from %s has metadata' % | 281 | logging.debug('%s from %s has metadata' % |
| 268 | (item.name.decode("utf8"), self.filename)) | 282 | (item.name.decode("utf8"), self.filename)) |
| 269 | if not list_unsupported: | 283 | if not list_unsupported: |
| 270 | return False | 284 | return False |
| 271 | # Nested archives are treated like unsupported files | 285 | # Nested archives are treated like unsupported files |
| @@ -283,8 +297,8 @@ class TarStripper(GenericArchiveStripper): | |||
| 283 | return True | 297 | return True |
| 284 | 298 | ||
| 285 | def get_meta(self): | 299 | def get_meta(self): |
| 286 | ''' Return a dict with all the meta of the tarfile | 300 | """ Return a dict with all the meta of the tarfile |
| 287 | ''' | 301 | """ |
| 288 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 302 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 289 | metadata = {} | 303 | metadata = {} |
| 290 | for item in tarin.getmembers(): | 304 | for item in tarin.getmembers(): |
| @@ -312,24 +326,26 @@ class TarStripper(GenericArchiveStripper): | |||
| 312 | 326 | ||
| 313 | 327 | ||
| 314 | class TerminalZipStripper(ZipStripper): | 328 | class TerminalZipStripper(ZipStripper): |
| 315 | ''' Represent a terminal level archive. | 329 | """ Represent a terminal level archive. |
| 316 | This type of archive can not contain nested archives. | 330 | This type of archive can not contain nested archives. |
| 317 | It is used for formats like docx, which are basically | 331 | It is used for formats like docx, which are basically |
| 318 | ziped xml. | 332 | ziped xml. |
| 319 | ''' | 333 | """ |
| 320 | 334 | ||
| 321 | 335 | ||
| 322 | class GzipStripper(TarStripper): | 336 | class GzipStripper(TarStripper): |
| 323 | ''' Represent a tar.gz archive | 337 | """ Represent a tar.gz archive |
| 324 | ''' | 338 | """ |
| 339 | |||
| 325 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): | 340 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): |
| 326 | super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) | 341 | super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) |
| 327 | self.compression = ':gz' | 342 | self.compression = ':gz' |
| 328 | 343 | ||
| 329 | 344 | ||
| 330 | class Bzip2Stripper(TarStripper): | 345 | class Bzip2Stripper(TarStripper): |
| 331 | ''' Represent a tar.bz2 archive | 346 | """ Represent a tar.bz2 archive |
| 332 | ''' | 347 | """ |
| 348 | |||
| 333 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): | 349 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): |
| 334 | super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) | 350 | super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) |
| 335 | self.compression = ':bz2' | 351 | self.compression = ':bz2' |
