diff options
Diffstat (limited to 'MAT/archive.py')
| -rw-r--r-- | MAT/archive.py | 140 |
1 files changed, 82 insertions, 58 deletions
diff --git a/MAT/archive.py b/MAT/archive.py index 9179e48..53c5e9b 100644 --- a/MAT/archive.py +++ b/MAT/archive.py | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | ''' Take care of archives formats | 1 | ''' Take care of archives formats |
| 2 | ''' | 2 | ''' |
| 3 | 3 | ||
| 4 | import datetime | ||
| 4 | import logging | 5 | import logging |
| 5 | import os | 6 | import os |
| 6 | import shutil | 7 | import shutil |
| @@ -11,12 +12,17 @@ import zipfile | |||
| 11 | import mat | 12 | import mat |
| 12 | import parser | 13 | import parser |
| 13 | 14 | ||
| 15 | ZIP_EPOCH = (1980, 1, 1, 0, 0, 0) | ||
| 16 | ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0) | ||
| 17 | - datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds() | ||
| 18 | |||
| 14 | 19 | ||
| 15 | class GenericArchiveStripper(parser.GenericParser): | 20 | class GenericArchiveStripper(parser.GenericParser): |
| 16 | ''' Represent a generic archive | 21 | ''' Represent a generic archive |
| 17 | ''' | 22 | ''' |
| 18 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): | 23 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): |
| 19 | super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) | 24 | super(GenericArchiveStripper, self).__init__(filename, |
| 25 | parser, mime, backup, is_writable, **kwargs) | ||
| 20 | self.compression = '' | 26 | self.compression = '' |
| 21 | self.add2archive = kwargs['add2archive'] | 27 | self.add2archive = kwargs['add2archive'] |
| 22 | self.tempdir = tempfile.mkdtemp() | 28 | self.tempdir = tempfile.mkdtemp() |
| @@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser): | |||
| 48 | class ZipStripper(GenericArchiveStripper): | 54 | class ZipStripper(GenericArchiveStripper): |
| 49 | ''' Represent a zip file | 55 | ''' Represent a zip file |
| 50 | ''' | 56 | ''' |
| 51 | def is_file_clean(self, fileinfo): | 57 | def __is_zipfile_clean(self, fileinfo): |
| 52 | ''' Check if a ZipInfo object is clean of metadatas added | 58 | ''' Check if a ZipInfo object is clean of metadatas added |
| 53 | by zip itself, independently of the corresponding file metadatas | 59 | by zip itself, independently of the corresponding file metadatas |
| 54 | ''' | 60 | ''' |
| 55 | if fileinfo.comment != '': | 61 | if fileinfo.comment != '': |
| 56 | return False | 62 | return False |
| 57 | elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0): | 63 | elif fileinfo.date_time != ZIP_EPOCH: |
| 58 | return False | 64 | return False |
| 59 | elif fileinfo.create_system != 3: # 3 is UNIX | 65 | elif fileinfo.create_system != 3: # 3 is UNIX |
| 60 | return False | 66 | return False |
| @@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper): | |||
| 70 | logging.debug('%s has a comment' % self.filename) | 76 | logging.debug('%s has a comment' % self.filename) |
| 71 | return False | 77 | return False |
| 72 | for item in zipin.infolist(): | 78 | for item in zipin.infolist(): |
| 73 | # I have not found a way to remove the crap added by zipfile :/ | ||
| 74 | # if not self.is_file_clean(item): | ||
| 75 | # logging.debug('%s from %s has compromising zipinfo' % | ||
| 76 | # (item.filename, self.filename)) | ||
| 77 | # return False | ||
| 78 | zipin.extract(item, self.tempdir) | 79 | zipin.extract(item, self.tempdir) |
| 79 | name = os.path.join(self.tempdir, item.filename) | 80 | name = os.path.join(self.tempdir, item.filename) |
| 81 | if not self.__is_zipfile_clean(item) and not list_unsupported: | ||
| 82 | logging.debug('%s from %s has compromising zipinfo' % | ||
| 83 | (item.filename, self.filename)) | ||
| 84 | return False | ||
| 80 | if os.path.isfile(name): | 85 | if os.path.isfile(name): |
| 81 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) | 86 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) |
| 82 | if cfile: | 87 | if cfile: |
| 83 | if not cfile.is_clean(): | 88 | if not cfile.is_clean(): |
| 84 | return False | 89 | logging.debug('%s from %s has compromising zipinfo' % |
| 90 | (item.filename, self.filename)) | ||
| 91 | if not list_unsupported: | ||
| 92 | return False | ||
| 93 | ret_list.append(item.filename) | ||
| 85 | else: | 94 | else: |
| 86 | logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename) | 95 | logging.info('%s\'s fileformat is not supported or harmless.' |
| 96 | % item.filename) | ||
| 87 | basename, ext = os.path.splitext(name) | 97 | basename, ext = os.path.splitext(name) |
| 88 | bname = os.path.basename(item.filename) | 98 | if os.path.basename(item.filename) not in ('mimetype', '.rels'): |
| 89 | if ext not in parser.NOMETA: | 99 | if ext not in parser.NOMETA: |
| 90 | if bname != 'mimetype' and bname != '.rels': | 100 | if not list_unsupported: |
| 91 | if list_unsupported: | ||
| 92 | ret_list.append(bname) | ||
| 93 | else: | ||
| 94 | return False | 101 | return False |
| 102 | ret_list.append(item.filename) | ||
| 95 | zipin.close() | 103 | zipin.close() |
| 96 | if list_unsupported: | 104 | if list_unsupported: |
| 97 | return ret_list | 105 | return ret_list |
| 98 | return True | 106 | return True |
| 99 | 107 | ||
| 100 | def get_meta(self): | 108 | def get_meta(self): |
| 101 | ''' Return all the metadata of a ZipFile (don't return metadatas | 109 | ''' Return all the metadata of a zip archive''' |
| 102 | of contained files : should it ?) | ||
| 103 | ''' | ||
| 104 | zipin = zipfile.ZipFile(self.filename, 'r') | 110 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 105 | metadata = {} | 111 | metadata = {} |
| 106 | for field in zipin.infolist(): | ||
| 107 | zipmeta = {} | ||
| 108 | if field.comment != '': | ||
| 109 | zipmeta['comment'] = field.comment | ||
| 110 | if field.date_time != (1980, 1, 1, 0, 0, 0): | ||
| 111 | zipmeta['modified'] = field.date_time | ||
| 112 | if field.create_system != 3: # 3 is UNIX | ||
| 113 | zipmeta['system'] = "windows" if field.create_system == 2 else "unknown" | ||
| 114 | if zipin.comment != '': | 112 | if zipin.comment != '': |
| 115 | metadata["%s comment" % self.filename] = zipin.comment | 113 | metadata['comment'] = zipin.comment |
| 114 | for item in zipin.infolist(): | ||
| 115 | zipinfo_meta = self.__get_zipinfo_meta(item) | ||
| 116 | if zipinfo_meta != {}: # zipinfo metadata | ||
| 117 | metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta) | ||
| 118 | zipin.extract(item, self.tempdir) | ||
| 119 | name = os.path.join(self.tempdir, item.filename) | ||
| 120 | if os.path.isfile(name): | ||
| 121 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) | ||
| 122 | if cfile: | ||
| 123 | cfile_meta = cfile.get_meta() | ||
| 124 | if cfile_meta != {}: | ||
| 125 | metadata[item.filename] = str(cfile_meta) | ||
| 126 | else: | ||
| 127 | logging.info('%s\'s fileformat is not supported or harmless' | ||
| 128 | % item.filename) | ||
| 116 | zipin.close() | 129 | zipin.close() |
| 117 | return metadata | 130 | return metadata |
| 118 | 131 | ||
| 119 | def remove_all(self): | 132 | def __get_zipinfo_meta(self, zipinfo): |
| 120 | ''' So far, the zipfile module does not allow to write a ZipInfo | 133 | ''' Return all the metadata of a ZipInfo |
| 121 | object into a zipfile (and it's a shame !) : so data added | 134 | ''' |
| 122 | by zipfile itself could not be removed. It's a big concern. | 135 | metadata = {} |
| 123 | Is shipping a patched version of zipfile.py a good idea ? | 136 | if zipinfo.comment != '': |
| 137 | metadata['comment'] = zipinfo.comment | ||
| 138 | if zipinfo.date_time != ZIP_EPOCH: | ||
| 139 | metadata['modified'] = zipinfo.date_time | ||
| 140 | if zipinfo.create_system != 3: # 3 is UNIX | ||
| 141 | metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown" | ||
| 142 | return metadata | ||
| 143 | |||
| 144 | def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]): | ||
| 145 | ''' Remove all metadata from a zip archive, even thoses | ||
| 146 | added by Python's zipfile itself. It will not add | ||
| 147 | files starting with "begining_blacklist", or ending with | ||
| 148 | "ending_blacklist". This method also add files present in | ||
| 149 | whitelist to the archive. | ||
| 124 | ''' | 150 | ''' |
| 125 | zipin = zipfile.ZipFile(self.filename, 'r') | 151 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 126 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) | 152 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) |
| 127 | for item in zipin.infolist(): | 153 | for item in zipin.infolist(): |
| 128 | zipin.extract(item, self.tempdir) | 154 | zipin.extract(item, self.tempdir) |
| 129 | name = os.path.join(self.tempdir, item.filename) | 155 | name = os.path.join(self.tempdir, item.filename) |
| 130 | if os.path.isfile(name): | 156 | |
| 131 | try: | 157 | beginning = any((True for f in beginning_blacklist if item.filename.startswith(f))) |
| 132 | cfile = mat.create_class_file(name, False, | 158 | ending = any((True for f in ending_blacklist if item.filename.endswith(f))) |
| 133 | add2archive=self.add2archive) | 159 | |
| 160 | if os.path.isfile(name) and not beginning and not ending: | ||
| 161 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) | ||
| 162 | if cfile is not None: | ||
| 134 | cfile.remove_all() | 163 | cfile.remove_all() |
| 135 | logging.debug('Processing %s from %s' % (item.filename, | 164 | logging.debug('Processing %s from %s' % (item.filename, self.filename)) |
| 136 | self.filename)) | 165 | elif item.filename not in whitelist: |
| 137 | zipout.write(name, item.filename) | 166 | logging.info('%s\'s format is not supported or harmless' % item.filename) |
| 138 | except: | 167 | basename, ext = os.path.splitext(name) |
| 139 | logging.info('%s\'s format is not supported or harmless' % | 168 | if not (self.add2archive or ext in parser.NOMETA): |
| 140 | item.filename) | 169 | continue |
| 141 | _, ext = os.path.splitext(name) | 170 | os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS)) |
| 142 | if self.add2archive or ext in parser.NOMETA: | 171 | zipout.write(name, item.filename) |
| 143 | zipout.write(name, item.filename) | ||
| 144 | zipin.close() | 172 | zipin.close() |
| 145 | for zipFile in zipout.infolist(): | ||
| 146 | zipFile.orig_filename = zipFile.filename | ||
| 147 | zipFile.date_time = (1980, 1, 1, 0, 0, 0) | ||
| 148 | zipFile.create_system = 3 # 3 is UNIX | ||
| 149 | zipout.comment = '' | ||
| 150 | zipout.close() | 173 | zipout.close() |
| 151 | 174 | ||
| 152 | logging.info('%s processed' % self.filename) | 175 | logging.info('%s processed' % self.filename) |
| @@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 167 | current_file.gname = '' | 190 | current_file.gname = '' |
| 168 | return current_file | 191 | return current_file |
| 169 | 192 | ||
| 170 | def remove_all(self, exclude_list=[]): | 193 | def remove_all(self, whitelist=[]): |
| 171 | tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') | 194 | tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') |
| 172 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') | 195 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') |
| 173 | for item in tarin.getmembers(): | 196 | for item in tarin.getmembers(): |
| @@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper): | |||
| 179 | cfile.remove_all() | 202 | cfile.remove_all() |
| 180 | elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: | 203 | elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: |
| 181 | logging.info('%s\' format is either not supported or harmless' % item.name) | 204 | logging.info('%s\' format is either not supported or harmless' % item.name) |
| 182 | elif item.name in exclude_list: | 205 | elif item.name in whitelist: |
| 183 | logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name) | 206 | logging.debug('%s is not supported, but MAT was told to add it anyway.' |
| 207 | % item.name) | ||
| 184 | else: | 208 | else: |
| 185 | continue | 209 | continue |
| 186 | tarout.add(complete_name, item.name, filter=self._remove) | 210 | tarout.add(complete_name, item.name, filter=self._remove) |
| @@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper): | |||
| 209 | ''' | 233 | ''' |
| 210 | if list_unsupported: | 234 | if list_unsupported: |
| 211 | ret_list = [] | 235 | ret_list = [] |
| 212 | tempdir_len = len(self.tempdir) + 1 # trim the tempfile path | ||
| 213 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 236 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 214 | for item in tarin.getmembers(): | 237 | for item in tarin.getmembers(): |
| 215 | if not self.is_file_clean(item) and not list_unsupported: | 238 | if not self.is_file_clean(item) and not list_unsupported: |
| @@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper): | |||
| 217 | tarin.extract(item, self.tempdir) | 240 | tarin.extract(item, self.tempdir) |
| 218 | complete_name = os.path.join(self.tempdir, item.name) | 241 | complete_name = os.path.join(self.tempdir, item.name) |
| 219 | if item.isfile(): | 242 | if item.isfile(): |
| 220 | class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive) | 243 | class_file = mat.create_class_file(complete_name, |
| 244 | False, add2archive=self.add2archive) | ||
| 221 | if class_file: | 245 | if class_file: |
| 222 | # We don't support nested archives | 246 | # We don't support nested archives |
| 223 | if not class_file.is_clean(): | 247 | if not class_file.is_clean(): |
| 224 | if not list_unsupported: | 248 | if not list_unsupported: |
| 225 | return False | 249 | return False |
| 226 | elif isinstance(class_file, GenericArchiveStripper): | 250 | elif isinstance(class_file, GenericArchiveStripper): |
| 227 | ret_list.append(complete_name[tempdir_len:]) | 251 | ret_list.append(item.name) |
| 228 | else: | 252 | else: |
| 229 | logging.error('%s\'s format is not supported or harmless' % item.name) | 253 | logging.error('%s\'s format is not supported or harmless' % item.name) |
| 230 | if os.path.splitext(complete_name)[1] not in parser.NOMETA: | 254 | if os.path.splitext(complete_name)[1] not in parser.NOMETA: |
| 231 | if not list_unsupported: | 255 | if not list_unsupported: |
| 232 | return False | 256 | return False |
| 233 | ret_list.append(complete_name[tempdir_len:]) | 257 | ret_list.append(item.name) |
| 234 | tarin.close() | 258 | tarin.close() |
| 235 | if list_unsupported: | 259 | if list_unsupported: |
| 236 | return ret_list | 260 | return ret_list |
