diff options
| author | jvoisin | 2014-01-02 16:44:59 +0000 |
|---|---|---|
| committer | jvoisin | 2014-01-02 16:44:59 +0000 |
| commit | 67d7f217587bed6efbf155f5e0e413528443251b (patch) | |
| tree | 33e6a1a4be3a1766c638efcd67bf04067adf6a76 /MAT | |
| parent | 5cbcd67aa4d5718fbe33b7af8d7ec8e5756d551a (diff) | |
Greatly improves tarfile handling
Diffstat (limited to 'MAT')
| -rw-r--r-- | MAT/archive.py | 179 |
1 file changed, 97 insertions, 82 deletions
diff --git a/MAT/archive.py b/MAT/archive.py index f07e18c..3a30b66 100644 --- a/MAT/archive.py +++ b/MAT/archive.py | |||
| @@ -4,12 +4,12 @@ | |||
| 4 | import logging | 4 | import logging |
| 5 | import os | 5 | import os |
| 6 | import shutil | 6 | import shutil |
| 7 | import tarfile | ||
| 7 | import tempfile | 8 | import tempfile |
| 8 | import zipfile | 9 | import zipfile |
| 9 | 10 | ||
| 10 | import mat | 11 | import mat |
| 11 | import parser | 12 | import parser |
| 12 | import tarfile | ||
| 13 | 13 | ||
| 14 | 14 | ||
| 15 | class GenericArchiveStripper(parser.GenericParser): | 15 | class GenericArchiveStripper(parser.GenericParser): |
| @@ -31,6 +31,14 @@ class GenericArchiveStripper(parser.GenericParser): | |||
| 31 | mat.secure_remove(path_file) | 31 | mat.secure_remove(path_file) |
| 32 | shutil.rmtree(self.tempdir) | 32 | shutil.rmtree(self.tempdir) |
| 33 | 33 | ||
| 34 | def is_clean(self, list_unsupported): | ||
| 35 | raise NotImplementedError | ||
| 36 | |||
| 37 | def list_unsupported(self): | ||
| 38 | ''' Get a list of every non-supported files present in the archive | ||
| 39 | ''' | ||
| 40 | return self.is_clean(list_unsupported=True) | ||
| 41 | |||
| 34 | def remove_all(self): | 42 | def remove_all(self): |
| 35 | ''' Virtual method to remove all metadata | 43 | ''' Virtual method to remove all metadata |
| 36 | ''' | 44 | ''' |
| @@ -44,20 +52,19 @@ class ZipStripper(GenericArchiveStripper): | |||
| 44 | ''' Check if a ZipInfo object is clean of metadatas added | 52 | ''' Check if a ZipInfo object is clean of metadatas added |
| 45 | by zip itself, independently of the corresponding file metadatas | 53 | by zip itself, independently of the corresponding file metadatas |
| 46 | ''' | 54 | ''' |
| 47 | if fileinfo.comment: | 55 | if fileinfo.comment != '': |
| 48 | return False | ||
| 49 | elif fileinfo.date_time: | ||
| 50 | return False | 56 | return False |
| 51 | elif fileinfo.create_system: | 57 | elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0): |
| 52 | return False | 58 | return False |
| 53 | elif fileinfo.create_version: | 59 | elif fileinfo.create_system != 3: # 3 is UNIX |
| 54 | return False | 60 | return False |
| 55 | return True | 61 | return True |
| 56 | 62 | ||
| 57 | def is_clean(self): | 63 | def is_clean(self, list_unsupported=False): |
| 58 | ''' | 64 | ''' Check if the given file is clean from harmful metadata |
| 59 | Check if the given file is clean from harmful metadata | ||
| 60 | ''' | 65 | ''' |
| 66 | if list_unsupported: | ||
| 67 | ret_list = [] | ||
| 61 | zipin = zipfile.ZipFile(self.filename, 'r') | 68 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 62 | if zipin.comment != '': | 69 | if zipin.comment != '': |
| 63 | logging.debug('%s has a comment' % self.filename) | 70 | logging.debug('%s has a comment' % self.filename) |
| @@ -71,44 +78,46 @@ class ZipStripper(GenericArchiveStripper): | |||
| 71 | zipin.extract(item, self.tempdir) | 78 | zipin.extract(item, self.tempdir) |
| 72 | name = os.path.join(self.tempdir, item.filename) | 79 | name = os.path.join(self.tempdir, item.filename) |
| 73 | if os.path.isfile(name): | 80 | if os.path.isfile(name): |
| 74 | try: | 81 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) |
| 75 | cfile = mat.create_class_file(name, False, | 82 | if cfile: |
| 76 | add2archive=self.add2archive) | ||
| 77 | if not cfile.is_clean(): | 83 | if not cfile.is_clean(): |
| 78 | return False | 84 | return False |
| 79 | except: | 85 | else: |
| 80 | # best solution I have found | 86 | logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename) |
| 81 | logging.info('%s\'s fileformat is not supported, or is a \ | 87 | basename, ext = os.path.splitext(name) |
| 82 | harmless format' % item.filename) | ||
| 83 | _, ext = os.path.splitext(name) | ||
| 84 | bname = os.path.basename(item.filename) | 88 | bname = os.path.basename(item.filename) |
| 85 | if ext not in parser.NOMETA: | 89 | if ext not in parser.NOMETA: |
| 86 | if bname != 'mimetype' and bname != '.rels': | 90 | if bname != 'mimetype' and bname != '.rels': |
| 87 | return False | 91 | if list_unsupported: |
| 92 | ret_list.append(bname) | ||
| 93 | else: | ||
| 94 | return False | ||
| 88 | zipin.close() | 95 | zipin.close() |
| 96 | if list_unsupported: | ||
| 97 | return ret_list | ||
| 89 | return True | 98 | return True |
| 90 | 99 | ||
| 91 | def get_meta(self): | 100 | def get_meta(self): |
| 92 | ''' | 101 | ''' Return all the metadata of a ZipFile (don't return metadatas |
| 93 | Return all the metadata of a ZipFile (don't return metadatas | ||
| 94 | of contained files : should it ?) | 102 | of contained files : should it ?) |
| 95 | ''' | 103 | ''' |
| 96 | zipin = zipfile.ZipFile(self.filename, 'r') | 104 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 97 | metadata = {} | 105 | metadata = {} |
| 98 | for field in zipin.infolist(): | 106 | for field in zipin.infolist(): |
| 99 | zipmeta = {} | 107 | zipmeta = {} |
| 100 | zipmeta['comment'] = field.comment | 108 | if field.comment != '': |
| 101 | zipmeta['modified'] = field.date_time | 109 | zipmeta['comment'] = field.comment |
| 102 | zipmeta['system'] = field.create_system | 110 | if field.date_time != (1980, 1, 1, 0, 0, 0): |
| 103 | zipmeta['zip_version'] = field.create_version | 111 | zipmeta['modified'] = field.date_time |
| 104 | metadata[field.filename] = zipmeta | 112 | if field.create_system != 3: # 3 is UNIX |
| 105 | metadata["%s comment" % self.filename] = zipin.comment | 113 | zipmeta['system'] = "windows" if field.create_system == 2 else "unknown" |
| 114 | if zipin.comment != '': | ||
| 115 | metadata["%s comment" % self.filename] = zipin.comment | ||
| 106 | zipin.close() | 116 | zipin.close() |
| 107 | return metadata | 117 | return metadata |
| 108 | 118 | ||
| 109 | def remove_all(self): | 119 | def remove_all(self): |
| 110 | ''' | 120 | ''' So far, the zipfile module does not allow to write a ZipInfo |
| 111 | So far, the zipfile module does not allow to write a ZipInfo | ||
| 112 | object into a zipfile (and it's a shame !) : so data added | 121 | object into a zipfile (and it's a shame !) : so data added |
| 113 | by zipfile itself could not be removed. It's a big concern. | 122 | by zipfile itself could not be removed. It's a big concern. |
| 114 | Is shipping a patched version of zipfile.py a good idea ? | 123 | Is shipping a patched version of zipfile.py a good idea ? |
| @@ -132,21 +141,24 @@ harmless format' % item.filename) | |||
| 132 | _, ext = os.path.splitext(name) | 141 | _, ext = os.path.splitext(name) |
| 133 | if self.add2archive or ext in parser.NOMETA: | 142 | if self.add2archive or ext in parser.NOMETA: |
| 134 | zipout.write(name, item.filename) | 143 | zipout.write(name, item.filename) |
| 135 | zipout.comment = '' | ||
| 136 | zipin.close() | 144 | zipin.close() |
| 145 | for zipfile in zipout.infolist(): | ||
| 146 | zipfile.orig_filename = zipfile.filename | ||
| 147 | zipfile.date_time = (1980, 1, 1, 0, 0, 0) | ||
| 148 | zipfile.create_system = 3 # 3 is UNIX | ||
| 149 | zipout.comment = '' | ||
| 137 | zipout.close() | 150 | zipout.close() |
| 138 | logging.info('%s treated' % self.filename) | 151 | |
| 152 | logging.info('%s processed' % self.filename) | ||
| 139 | self.do_backup() | 153 | self.do_backup() |
| 140 | return True | 154 | return True |
| 141 | 155 | ||
| 142 | 156 | ||
| 143 | class TarStripper(GenericArchiveStripper): | 157 | class TarStripper(GenericArchiveStripper): |
| 144 | ''' | 158 | ''' Represent a tarfile archive |
| 145 | Represent a tarfile archive | ||
| 146 | ''' | 159 | ''' |
| 147 | def _remove(self, current_file): | 160 | def _remove(self, current_file): |
| 148 | ''' | 161 | ''' Remove the meta added by tar itself to the file |
| 149 | remove the meta added by tar itself to the file | ||
| 150 | ''' | 162 | ''' |
| 151 | current_file.mtime = 0 | 163 | current_file.mtime = 0 |
| 152 | current_file.uid = 0 | 164 | current_file.uid = 0 |
| @@ -160,28 +172,24 @@ class TarStripper(GenericArchiveStripper): | |||
| 160 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') | 172 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') |
| 161 | for item in tarin.getmembers(): | 173 | for item in tarin.getmembers(): |
| 162 | tarin.extract(item, self.tempdir) | 174 | tarin.extract(item, self.tempdir) |
| 163 | name = os.path.join(self.tempdir, item.name) | 175 | complete_name = os.path.join(self.tempdir, item.name) |
| 164 | if item.type == '0': # is item a regular file ? | 176 | if item.isfile(): |
| 165 | # no backup file | 177 | cfile = mat.create_class_file(complete_name, False, add2archive=self.add2archive) |
| 166 | try: | 178 | if cfile: |
| 167 | cfile = mat.create_class_file(name, False, | ||
| 168 | add2archive=self.add2archive) | ||
| 169 | cfile.remove_all() | 179 | cfile.remove_all() |
| 170 | tarout.add(name, item.name, filter=self._remove) | 180 | tarout.add(complete_name, item.name, filter=self._remove) |
| 171 | except: | 181 | else: |
| 172 | logging.info('%s\' format is not supported or harmless' % | 182 | logging.info('%s\' format is not supported or harmless' % item.name) |
| 173 | item.name) | 183 | basename, ext = os.path.splitext(item.name) |
| 174 | _, ext = os.path.splitext(name) | ||
| 175 | if self.add2archive or ext in parser.NOMETA: | 184 | if self.add2archive or ext in parser.NOMETA: |
| 176 | tarout.add(name, item.name, filter=self._remove) | 185 | tarout.add(complete_name, item.name, filter=self._remove) |
| 177 | tarin.close() | 186 | tarin.close() |
| 178 | tarout.close() | 187 | tarout.close() |
| 179 | self.do_backup() | 188 | self.do_backup() |
| 180 | return True | 189 | return True |
| 181 | 190 | ||
| 182 | def is_file_clean(self, current_file): | 191 | def is_file_clean(self, current_file): |
| 183 | ''' | 192 | ''' Check metadatas added by tar |
| 184 | Check metadatas added by tar | ||
| 185 | ''' | 193 | ''' |
| 186 | if current_file.mtime != 0: | 194 | if current_file.mtime != 0: |
| 187 | return False | 195 | return False |
| @@ -193,60 +201,68 @@ class TarStripper(GenericArchiveStripper): | |||
| 193 | return False | 201 | return False |
| 194 | elif current_file.gname != '': | 202 | elif current_file.gname != '': |
| 195 | return False | 203 | return False |
| 196 | else: | 204 | return True |
| 197 | return True | ||
| 198 | 205 | ||
| 199 | def is_clean(self): | 206 | def is_clean(self, list_unsupported=False): |
| 200 | ''' | 207 | ''' Check if the file is clean from harmful metadatas |
| 201 | Check if the file is clean from harmful metadatas | ||
| 202 | ''' | 208 | ''' |
| 209 | if list_unsupported: | ||
| 210 | ret_list = [] | ||
| 203 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 211 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 204 | for item in tarin.getmembers(): | 212 | for item in tarin.getmembers(): |
| 205 | if not self.is_file_clean(item): | 213 | if not self.is_file_clean(item): |
| 206 | tarin.close() | ||
| 207 | return False | 214 | return False |
| 208 | tarin.extract(item, self.tempdir) | 215 | tarin.extract(item, self.tempdir) |
| 209 | name = os.path.join(self.tempdir, item.name) | 216 | complete_name = os.path.join(self.tempdir, item.name) |
| 210 | if item.type == '0': # is item a regular file ? | 217 | if item.isfile(): |
| 211 | try: | 218 | class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive) |
| 212 | class_file = mat.create_class_file(name, | 219 | if class_file: |
| 213 | False, add2archive=self.add2archive) # no backup file | ||
| 214 | if not class_file.is_clean(): | 220 | if not class_file.is_clean(): |
| 215 | tarin.close() | ||
| 216 | return False | 221 | return False |
| 217 | except: | 222 | else: |
| 218 | logging.error('%s\'s format is not supported or harmless' % | 223 | logging.error('%s\'s format is not supported or harmless' % item.name) |
| 219 | item.filename) | 224 | basename, ext = os.path.splitext(complete_name) |
| 220 | _, ext = os.path.splitext(name) | ||
| 221 | if ext not in parser.NOMETA: | 225 | if ext not in parser.NOMETA: |
| 222 | tarin.close() | 226 | if list_unsupported: |
| 223 | return False | 227 | ret_list.append(complete_name) |
| 228 | else: | ||
| 229 | return False | ||
| 224 | tarin.close() | 230 | tarin.close() |
| 231 | if list_unsupported: | ||
| 232 | return ret_list | ||
| 225 | return True | 233 | return True |
| 226 | 234 | ||
| 227 | def get_meta(self): | 235 | def get_meta(self): |
| 228 | ''' | 236 | ''' Return a dict with all the meta of the file |
| 229 | Return a dict with all the meta of the file | ||
| 230 | ''' | 237 | ''' |
| 231 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 238 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 232 | metadata = {} | 239 | metadata = {} |
| 233 | for current_file in tarin.getmembers(): | 240 | for item in tarin.getmembers(): |
| 234 | if current_file.type == '0': | 241 | current_meta = {} |
| 235 | if not self.is_file_clean(current_file): # if there is meta | 242 | if item.isfile(): |
| 236 | current_meta = {} | 243 | tarin.extract(item, self.tempdir) |
| 237 | current_meta['mtime'] = current_file.mtime | 244 | name = os.path.join(self.tempdir, item.name) |
| 238 | current_meta['uid'] = current_file.uid | 245 | class_file = mat.create_class_file(name, False, add2archive=self.add2archive) |
| 239 | current_meta['gid'] = current_file.gid | 246 | if class_file is not None: |
| 240 | current_meta['uname'] = current_file.uname | 247 | meta = class_file.get_meta() |
| 241 | current_meta['gname'] = current_file.gname | 248 | if meta: |
| 242 | metadata[current_file.name] = current_meta | 249 | current_meta['file'] = meta |
| 250 | else: | ||
| 251 | logging.error('%s\'s format is not supported or harmless' % item.name) | ||
| 252 | |||
| 253 | if not self.is_file_clean(item): # if there is meta | ||
| 254 | current_meta['mtime'] = item.mtime | ||
| 255 | current_meta['uid'] = item.uid | ||
| 256 | current_meta['gid'] = item.gid | ||
| 257 | current_meta['uname'] = item.uname | ||
| 258 | current_meta['gname'] = item.gname | ||
| 259 | metadata[item.name] = current_meta | ||
| 243 | tarin.close() | 260 | tarin.close() |
| 244 | return metadata | 261 | return metadata |
| 245 | 262 | ||
| 246 | 263 | ||
| 247 | class GzipStripper(TarStripper): | 264 | class GzipStripper(TarStripper): |
| 248 | ''' | 265 | ''' Represent a tar.gz archive |
| 249 | Represent a tar.gz archive | ||
| 250 | ''' | 266 | ''' |
| 251 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): | 267 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): |
| 252 | super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) | 268 | super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) |
| @@ -254,8 +270,7 @@ class GzipStripper(TarStripper): | |||
| 254 | 270 | ||
| 255 | 271 | ||
| 256 | class Bzip2Stripper(TarStripper): | 272 | class Bzip2Stripper(TarStripper): |
| 257 | ''' | 273 | ''' Represent a tar.bz2 archive |
| 258 | Represents a tar.bz2 archive | ||
| 259 | ''' | 274 | ''' |
| 260 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): | 275 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): |
| 261 | super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) | 276 | super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) |
