diff options
| author | jvoisin | 2014-01-15 02:42:39 +0000 |
|---|---|---|
| committer | jvoisin | 2014-01-15 02:42:39 +0000 |
| commit | bbe17fd511b5890fb4554447e23d666f6c13b745 (patch) | |
| tree | 5651c76da1d23ca80b252097ca1eb7880e8cf863 | |
| parent | 5e65094084c75a9372f529a3387b072a84bf254a (diff) | |
Add support for zipfiles!
| -rw-r--r-- | MAT/archive.py | 140 | ||||
| -rw-r--r-- | MAT/office.py | 187 | ||||
| -rw-r--r-- | MAT/strippers.py | 2 | ||||
| -rwxr-xr-x | mat-gui | 2 | ||||
| -rw-r--r-- | test/TODO/dirty.zip | bin | 6433 -> 0 bytes | |||
| -rw-r--r-- | test/clean é.docx | bin | 5842 -> 6520 bytes | |||
| -rw-r--r-- | test/clean é.odt | bin | 33130 -> 33140 bytes | |||
| -rw-r--r-- | test/clean é.tar.gz | bin | 0 -> 5656 bytes | |||
| -rw-r--r-- | test/clean é.zip (renamed from test/TODO/clean.zip) | bin | 5885 -> 5885 bytes | |||
| -rw-r--r-- | test/dirty é.tar.gz | bin | 0 -> 5994 bytes | |||
| -rw-r--r-- | test/dirty é.zip | bin | 0 -> 6206 bytes | |||
| -rw-r--r-- | test/libtest.py | 8 |
12 files changed, 144 insertions, 195 deletions
diff --git a/MAT/archive.py b/MAT/archive.py index 9179e48..53c5e9b 100644 --- a/MAT/archive.py +++ b/MAT/archive.py | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | ''' Take care of archives formats | 1 | ''' Take care of archives formats |
| 2 | ''' | 2 | ''' |
| 3 | 3 | ||
| 4 | import datetime | ||
| 4 | import logging | 5 | import logging |
| 5 | import os | 6 | import os |
| 6 | import shutil | 7 | import shutil |
| @@ -11,12 +12,17 @@ import zipfile | |||
| 11 | import mat | 12 | import mat |
| 12 | import parser | 13 | import parser |
| 13 | 14 | ||
| 15 | ZIP_EPOCH = (1980, 1, 1, 0, 0, 0) | ||
| 16 | ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0) | ||
| 17 | - datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds() | ||
| 18 | |||
| 14 | 19 | ||
| 15 | class GenericArchiveStripper(parser.GenericParser): | 20 | class GenericArchiveStripper(parser.GenericParser): |
| 16 | ''' Represent a generic archive | 21 | ''' Represent a generic archive |
| 17 | ''' | 22 | ''' |
| 18 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): | 23 | def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): |
| 19 | super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) | 24 | super(GenericArchiveStripper, self).__init__(filename, |
| 25 | parser, mime, backup, is_writable, **kwargs) | ||
| 20 | self.compression = '' | 26 | self.compression = '' |
| 21 | self.add2archive = kwargs['add2archive'] | 27 | self.add2archive = kwargs['add2archive'] |
| 22 | self.tempdir = tempfile.mkdtemp() | 28 | self.tempdir = tempfile.mkdtemp() |
| @@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser): | |||
| 48 | class ZipStripper(GenericArchiveStripper): | 54 | class ZipStripper(GenericArchiveStripper): |
| 49 | ''' Represent a zip file | 55 | ''' Represent a zip file |
| 50 | ''' | 56 | ''' |
| 51 | def is_file_clean(self, fileinfo): | 57 | def __is_zipfile_clean(self, fileinfo): |
| 52 | ''' Check if a ZipInfo object is clean of metadatas added | 58 | ''' Check if a ZipInfo object is clean of metadatas added |
| 53 | by zip itself, independently of the corresponding file metadatas | 59 | by zip itself, independently of the corresponding file metadatas |
| 54 | ''' | 60 | ''' |
| 55 | if fileinfo.comment != '': | 61 | if fileinfo.comment != '': |
| 56 | return False | 62 | return False |
| 57 | elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0): | 63 | elif fileinfo.date_time != ZIP_EPOCH: |
| 58 | return False | 64 | return False |
| 59 | elif fileinfo.create_system != 3: # 3 is UNIX | 65 | elif fileinfo.create_system != 3: # 3 is UNIX |
| 60 | return False | 66 | return False |
| @@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper): | |||
| 70 | logging.debug('%s has a comment' % self.filename) | 76 | logging.debug('%s has a comment' % self.filename) |
| 71 | return False | 77 | return False |
| 72 | for item in zipin.infolist(): | 78 | for item in zipin.infolist(): |
| 73 | # I have not found a way to remove the crap added by zipfile :/ | ||
| 74 | # if not self.is_file_clean(item): | ||
| 75 | # logging.debug('%s from %s has compromising zipinfo' % | ||
| 76 | # (item.filename, self.filename)) | ||
| 77 | # return False | ||
| 78 | zipin.extract(item, self.tempdir) | 79 | zipin.extract(item, self.tempdir) |
| 79 | name = os.path.join(self.tempdir, item.filename) | 80 | name = os.path.join(self.tempdir, item.filename) |
| 81 | if not self.__is_zipfile_clean(item) and not list_unsupported: | ||
| 82 | logging.debug('%s from %s has compromising zipinfo' % | ||
| 83 | (item.filename, self.filename)) | ||
| 84 | return False | ||
| 80 | if os.path.isfile(name): | 85 | if os.path.isfile(name): |
| 81 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) | 86 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) |
| 82 | if cfile: | 87 | if cfile: |
| 83 | if not cfile.is_clean(): | 88 | if not cfile.is_clean(): |
| 84 | return False | 89 | logging.debug('%s from %s has compromising zipinfo' % |
| 90 | (item.filename, self.filename)) | ||
| 91 | if not list_unsupported: | ||
| 92 | return False | ||
| 93 | ret_list.append(item.filename) | ||
| 85 | else: | 94 | else: |
| 86 | logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename) | 95 | logging.info('%s\'s fileformat is not supported or harmless.' |
| 96 | % item.filename) | ||
| 87 | basename, ext = os.path.splitext(name) | 97 | basename, ext = os.path.splitext(name) |
| 88 | bname = os.path.basename(item.filename) | 98 | if os.path.basename(item.filename) not in ('mimetype', '.rels'): |
| 89 | if ext not in parser.NOMETA: | 99 | if ext not in parser.NOMETA: |
| 90 | if bname != 'mimetype' and bname != '.rels': | 100 | if not list_unsupported: |
| 91 | if list_unsupported: | ||
| 92 | ret_list.append(bname) | ||
| 93 | else: | ||
| 94 | return False | 101 | return False |
| 102 | ret_list.append(item.filename) | ||
| 95 | zipin.close() | 103 | zipin.close() |
| 96 | if list_unsupported: | 104 | if list_unsupported: |
| 97 | return ret_list | 105 | return ret_list |
| 98 | return True | 106 | return True |
| 99 | 107 | ||
| 100 | def get_meta(self): | 108 | def get_meta(self): |
| 101 | ''' Return all the metadata of a ZipFile (don't return metadatas | 109 | ''' Return all the metadata of a zip archive''' |
| 102 | of contained files : should it ?) | ||
| 103 | ''' | ||
| 104 | zipin = zipfile.ZipFile(self.filename, 'r') | 110 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 105 | metadata = {} | 111 | metadata = {} |
| 106 | for field in zipin.infolist(): | ||
| 107 | zipmeta = {} | ||
| 108 | if field.comment != '': | ||
| 109 | zipmeta['comment'] = field.comment | ||
| 110 | if field.date_time != (1980, 1, 1, 0, 0, 0): | ||
| 111 | zipmeta['modified'] = field.date_time | ||
| 112 | if field.create_system != 3: # 3 is UNIX | ||
| 113 | zipmeta['system'] = "windows" if field.create_system == 2 else "unknown" | ||
| 114 | if zipin.comment != '': | 112 | if zipin.comment != '': |
| 115 | metadata["%s comment" % self.filename] = zipin.comment | 113 | metadata['comment'] = zipin.comment |
| 114 | for item in zipin.infolist(): | ||
| 115 | zipinfo_meta = self.__get_zipinfo_meta(item) | ||
| 116 | if zipinfo_meta != {}: # zipinfo metadata | ||
| 117 | metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta) | ||
| 118 | zipin.extract(item, self.tempdir) | ||
| 119 | name = os.path.join(self.tempdir, item.filename) | ||
| 120 | if os.path.isfile(name): | ||
| 121 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) | ||
| 122 | if cfile: | ||
| 123 | cfile_meta = cfile.get_meta() | ||
| 124 | if cfile_meta != {}: | ||
| 125 | metadata[item.filename] = str(cfile_meta) | ||
| 126 | else: | ||
| 127 | logging.info('%s\'s fileformat is not supported or harmless' | ||
| 128 | % item.filename) | ||
| 116 | zipin.close() | 129 | zipin.close() |
| 117 | return metadata | 130 | return metadata |
| 118 | 131 | ||
| 119 | def remove_all(self): | 132 | def __get_zipinfo_meta(self, zipinfo): |
| 120 | ''' So far, the zipfile module does not allow to write a ZipInfo | 133 | ''' Return all the metadata of a ZipInfo |
| 121 | object into a zipfile (and it's a shame !) : so data added | 134 | ''' |
| 122 | by zipfile itself could not be removed. It's a big concern. | 135 | metadata = {} |
| 123 | Is shipping a patched version of zipfile.py a good idea ? | 136 | if zipinfo.comment != '': |
| 137 | metadata['comment'] = zipinfo.comment | ||
| 138 | if zipinfo.date_time != ZIP_EPOCH: | ||
| 139 | metadata['modified'] = zipinfo.date_time | ||
| 140 | if zipinfo.create_system != 3: # 3 is UNIX | ||
| 141 | metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown" | ||
| 142 | return metadata | ||
| 143 | |||
| 144 | def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]): | ||
| 145 | ''' Remove all metadata from a zip archive, even those | ||
| 146 | added by Python's zipfile itself. It will not add | ||
| 147 | files starting with "beginning_blacklist", or ending with | ||
| 148 | "ending_blacklist". This method also adds files present in | ||
| 149 | whitelist to the archive. | ||
| 124 | ''' | 150 | ''' |
| 125 | zipin = zipfile.ZipFile(self.filename, 'r') | 151 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 126 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) | 152 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) |
| 127 | for item in zipin.infolist(): | 153 | for item in zipin.infolist(): |
| 128 | zipin.extract(item, self.tempdir) | 154 | zipin.extract(item, self.tempdir) |
| 129 | name = os.path.join(self.tempdir, item.filename) | 155 | name = os.path.join(self.tempdir, item.filename) |
| 130 | if os.path.isfile(name): | 156 | |
| 131 | try: | 157 | beginning = any((True for f in beginning_blacklist if item.filename.startswith(f))) |
| 132 | cfile = mat.create_class_file(name, False, | 158 | ending = any((True for f in ending_blacklist if item.filename.endswith(f))) |
| 133 | add2archive=self.add2archive) | 159 | |
| 160 | if os.path.isfile(name) and not beginning and not ending: | ||
| 161 | cfile = mat.create_class_file(name, False, add2archive=self.add2archive) | ||
| 162 | if cfile is not None: | ||
| 134 | cfile.remove_all() | 163 | cfile.remove_all() |
| 135 | logging.debug('Processing %s from %s' % (item.filename, | 164 | logging.debug('Processing %s from %s' % (item.filename, self.filename)) |
| 136 | self.filename)) | 165 | elif item.filename not in whitelist: |
| 137 | zipout.write(name, item.filename) | 166 | logging.info('%s\'s format is not supported or harmless' % item.filename) |
| 138 | except: | 167 | basename, ext = os.path.splitext(name) |
| 139 | logging.info('%s\'s format is not supported or harmless' % | 168 | if not (self.add2archive or ext in parser.NOMETA): |
| 140 | item.filename) | 169 | continue |
| 141 | _, ext = os.path.splitext(name) | 170 | os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS)) |
| 142 | if self.add2archive or ext in parser.NOMETA: | 171 | zipout.write(name, item.filename) |
| 143 | zipout.write(name, item.filename) | ||
| 144 | zipin.close() | 172 | zipin.close() |
| 145 | for zipFile in zipout.infolist(): | ||
| 146 | zipFile.orig_filename = zipFile.filename | ||
| 147 | zipFile.date_time = (1980, 1, 1, 0, 0, 0) | ||
| 148 | zipFile.create_system = 3 # 3 is UNIX | ||
| 149 | zipout.comment = '' | ||
| 150 | zipout.close() | 173 | zipout.close() |
| 151 | 174 | ||
| 152 | logging.info('%s processed' % self.filename) | 175 | logging.info('%s processed' % self.filename) |
| @@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 167 | current_file.gname = '' | 190 | current_file.gname = '' |
| 168 | return current_file | 191 | return current_file |
| 169 | 192 | ||
| 170 | def remove_all(self, exclude_list=[]): | 193 | def remove_all(self, whitelist=[]): |
| 171 | tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') | 194 | tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') |
| 172 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') | 195 | tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') |
| 173 | for item in tarin.getmembers(): | 196 | for item in tarin.getmembers(): |
| @@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper): | |||
| 179 | cfile.remove_all() | 202 | cfile.remove_all() |
| 180 | elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: | 203 | elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: |
| 181 | logging.info('%s\' format is either not supported or harmless' % item.name) | 204 | logging.info('%s\' format is either not supported or harmless' % item.name) |
| 182 | elif item.name in exclude_list: | 205 | elif item.name in whitelist: |
| 183 | logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name) | 206 | logging.debug('%s is not supported, but MAT was told to add it anyway.' |
| 207 | % item.name) | ||
| 184 | else: | 208 | else: |
| 185 | continue | 209 | continue |
| 186 | tarout.add(complete_name, item.name, filter=self._remove) | 210 | tarout.add(complete_name, item.name, filter=self._remove) |
| @@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper): | |||
| 209 | ''' | 233 | ''' |
| 210 | if list_unsupported: | 234 | if list_unsupported: |
| 211 | ret_list = [] | 235 | ret_list = [] |
| 212 | tempdir_len = len(self.tempdir) + 1 # trim the tempfile path | ||
| 213 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 236 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 214 | for item in tarin.getmembers(): | 237 | for item in tarin.getmembers(): |
| 215 | if not self.is_file_clean(item) and not list_unsupported: | 238 | if not self.is_file_clean(item) and not list_unsupported: |
| @@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper): | |||
| 217 | tarin.extract(item, self.tempdir) | 240 | tarin.extract(item, self.tempdir) |
| 218 | complete_name = os.path.join(self.tempdir, item.name) | 241 | complete_name = os.path.join(self.tempdir, item.name) |
| 219 | if item.isfile(): | 242 | if item.isfile(): |
| 220 | class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive) | 243 | class_file = mat.create_class_file(complete_name, |
| 244 | False, add2archive=self.add2archive) | ||
| 221 | if class_file: | 245 | if class_file: |
| 222 | # We don't support nested archives | 246 | # We don't support nested archives |
| 223 | if not class_file.is_clean(): | 247 | if not class_file.is_clean(): |
| 224 | if not list_unsupported: | 248 | if not list_unsupported: |
| 225 | return False | 249 | return False |
| 226 | elif isinstance(class_file, GenericArchiveStripper): | 250 | elif isinstance(class_file, GenericArchiveStripper): |
| 227 | ret_list.append(complete_name[tempdir_len:]) | 251 | ret_list.append(item.name) |
| 228 | else: | 252 | else: |
| 229 | logging.error('%s\'s format is not supported or harmless' % item.name) | 253 | logging.error('%s\'s format is not supported or harmless' % item.name) |
| 230 | if os.path.splitext(complete_name)[1] not in parser.NOMETA: | 254 | if os.path.splitext(complete_name)[1] not in parser.NOMETA: |
| 231 | if not list_unsupported: | 255 | if not list_unsupported: |
| 232 | return False | 256 | return False |
| 233 | ret_list.append(complete_name[tempdir_len:]) | 257 | ret_list.append(item.name) |
| 234 | tarin.close() | 258 | tarin.close() |
| 235 | if list_unsupported: | 259 | if list_unsupported: |
| 236 | return ret_list | 260 | return ret_list |
diff --git a/MAT/office.py b/MAT/office.py index f60fc64..97405b3 100644 --- a/MAT/office.py +++ b/MAT/office.py | |||
| @@ -1,13 +1,12 @@ | |||
| 1 | ''' Care about office's formats | 1 | ''' Care about office's formats |
| 2 | ''' | 2 | ''' |
| 3 | 3 | ||
| 4 | import os | ||
| 5 | import logging | 4 | import logging |
| 6 | import zipfile | 5 | import os |
| 7 | import fileinput | ||
| 8 | import tempfile | ||
| 9 | import shutil | 6 | import shutil |
| 7 | import tempfile | ||
| 10 | import xml.dom.minidom as minidom | 8 | import xml.dom.minidom as minidom |
| 9 | import zipfile | ||
| 11 | 10 | ||
| 12 | try: | 11 | try: |
| 13 | import cairo | 12 | import cairo |
| @@ -16,7 +15,6 @@ except ImportError: | |||
| 16 | logging.info('office.py loaded without PDF support') | 15 | logging.info('office.py loaded without PDF support') |
| 17 | pass | 16 | pass |
| 18 | 17 | ||
| 19 | import mat | ||
| 20 | import parser | 18 | import parser |
| 21 | import archive | 19 | import archive |
| 22 | 20 | ||
| @@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper): | |||
| 30 | ''' Return a dict with all the meta of the file by | 28 | ''' Return a dict with all the meta of the file by |
| 31 | trying to read the meta.xml file. | 29 | trying to read the meta.xml file. |
| 32 | ''' | 30 | ''' |
| 31 | metadata = super(OpenDocumentStripper, self).get_meta() | ||
| 33 | zipin = zipfile.ZipFile(self.filename, 'r') | 32 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 34 | metadata = {} | ||
| 35 | try: | 33 | try: |
| 36 | content = zipin.read('meta.xml') | 34 | content = zipin.read('meta.xml') |
| 37 | dom1 = minidom.parseString(content) | 35 | dom1 = minidom.parseString(content) |
| 38 | elements = dom1.getElementsByTagName('office:meta') | 36 | elements = dom1.getElementsByTagName('office:meta') |
| 39 | for i in elements[0].childNodes: | 37 | for i in elements[0].childNodes: |
| 40 | if i.tagName != 'meta:document-statistic': | 38 | if i.tagName != 'meta:document-statistic': |
| 41 | nodename = ''.join([k for k in i.nodeName.split(':')[1:]]) | 39 | nodename = ''.join(i.nodeName.split(':')[1:]) |
| 42 | metadata[nodename] = ''.join([j.data for j in i.childNodes]) | 40 | metadata[nodename] = ''.join([j.data for j in i.childNodes]) |
| 43 | else: | 41 | else: |
| 44 | # thank you w3c for not providing a nice | 42 | # thank you w3c for not providing a nice |
| 45 | # method to get all attributes of a node | 43 | # method to get all attributes of a node |
| 46 | pass | 44 | pass |
| 47 | zipin.close() | ||
| 48 | except KeyError: # no meta.xml file found | 45 | except KeyError: # no meta.xml file found |
| 49 | logging.debug('%s has no opendocument metadata' % self.filename) | 46 | logging.debug('%s has no opendocument metadata' % self.filename) |
| 47 | zipin.close() | ||
| 50 | return metadata | 48 | return metadata |
| 51 | 49 | ||
| 52 | def remove_all(self): | 50 | def remove_all(self): |
| 51 | ''' Removes metadata | ||
| 53 | ''' | 52 | ''' |
| 54 | FIXME ? | 53 | return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) |
| 55 | There is a patch implementing the Zipfile.remove() | 54 | |
| 56 | method here : http://bugs.python.org/issue6818 | 55 | def is_clean(self): |
| 56 | ''' Check if the file is clean from harmful metadatas | ||
| 57 | ''' | 57 | ''' |
| 58 | clean_super = super(OpenDocumentStripper, self).is_clean() | ||
| 59 | if clean_super is False: | ||
| 60 | return False | ||
| 61 | |||
| 58 | zipin = zipfile.ZipFile(self.filename, 'r') | 62 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 59 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) | 63 | try: |
| 64 | zipin.getinfo('meta.xml') | ||
| 65 | except KeyError: # no meta.xml in the file | ||
| 66 | return True | ||
| 67 | zipin.close() | ||
| 68 | return False | ||
| 60 | 69 | ||
| 61 | for item in zipin.namelist(): | ||
| 62 | name = os.path.join(self.tempdir, item) | ||
| 63 | _, ext = os.path.splitext(name) | ||
| 64 | 70 | ||
| 65 | if item.endswith('manifest.xml'): | 71 | class OpenXmlStripper(archive.ZipStripper): |
| 66 | # contain the list of all files present in the archive | 72 | ''' Represent an office openxml document, which is like |
| 67 | zipin.extract(item, self.tempdir) | 73 | an opendocument format, with some tricky stuff added. |
| 68 | for line in fileinput.input(name, inplace=1): | 74 | It contains mostly xml, but can have media blobs, crap, ... |
| 69 | # remove the line which contains "meta.xml" | 75 | (I don't like this format.) |
| 70 | line = line.strip() | 76 | ''' |
| 71 | if not 'meta.xml' in line: | 77 | def remove_all(self): |
| 72 | print line | 78 | return super(OpenXmlStripper, self).remove_all( |
| 73 | zipout.write(name, item) | 79 | beginning_blacklist=('docProps/'), whitelist=('.rels')) |
| 74 | 80 | ||
| 75 | elif ext in parser.NOMETA or item == 'mimetype': | 81 | def is_clean(self): |
| 76 | # keep NOMETA files, and the "manifest" file | 82 | ''' Check if the file is clean from harmful metadatas. |
| 77 | if item != 'meta.xml': # contains the metadata | 83 | This implementation is faster than something like |
| 78 | zipin.extract(item, self.tempdir) | 84 | "return this.get_meta() == {}". |
| 79 | zipout.write(name, item) | 85 | ''' |
| 86 | clean_super = super(OpenXmlStripper, self).is_clean() | ||
| 87 | if clean_super is False: | ||
| 88 | return False | ||
| 80 | 89 | ||
| 81 | else: | 90 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 82 | zipin.extract(item, self.tempdir) | 91 | for item in zipin.namelist(): |
| 83 | if os.path.isfile(name): | 92 | if item.startswith('docProps/'): |
| 84 | try: | 93 | return False |
| 85 | cfile = mat.create_class_file(name, False, | ||
| 86 | add2archive=self.add2archive) | ||
| 87 | cfile.remove_all() | ||
| 88 | logging.debug('Processing %s from %s' % (item, | ||
| 89 | self.filename)) | ||
| 90 | zipout.write(name, item) | ||
| 91 | except: | ||
| 92 | logging.info('%s\'s fileformat is not supported' % item) | ||
| 93 | if self.add2archive: | ||
| 94 | zipout.write(name, item) | ||
| 95 | zipout.comment = '' | ||
| 96 | logging.info('%s processed' % self.filename) | ||
| 97 | zipin.close() | 94 | zipin.close() |
| 98 | zipout.close() | ||
| 99 | self.do_backup() | ||
| 100 | return True | 95 | return True |
| 101 | 96 | ||
| 102 | def is_clean(self): | 97 | def get_meta(self): |
| 103 | ''' Check if the file is clean from harmful metadatas | 98 | ''' Return a dict with all the meta of the file |
| 104 | ''' | 99 | ''' |
| 100 | metadata = super(OpenXmlStripper, self).get_meta() | ||
| 101 | |||
| 105 | zipin = zipfile.ZipFile(self.filename, 'r') | 102 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 106 | try: | 103 | for item in zipin.namelist(): |
| 107 | zipin.getinfo('meta.xml') | 104 | if item.startswith('docProps/'): |
| 108 | except KeyError: # no meta.xml in the file | 105 | metadata[item] = 'harmful content' |
| 109 | czf = archive.ZipStripper(self.filename, self.parser, | ||
| 110 | 'application/zip', False, True, add2archive=self.add2archive) | ||
| 111 | if czf.is_clean(): | ||
| 112 | zipin.close() | ||
| 113 | return True | ||
| 114 | zipin.close() | 106 | zipin.close() |
| 115 | return False | 107 | return metadata |
| 116 | 108 | ||
| 117 | 109 | ||
| 118 | class PdfStripper(parser.GenericParser): | 110 | class PdfStripper(parser.GenericParser): |
| @@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser): | |||
| 128 | self.pdf_quality = False | 120 | self.pdf_quality = False |
| 129 | 121 | ||
| 130 | self.document = Poppler.Document.new_from_file(uri, self.password) | 122 | self.document = Poppler.Document.new_from_file(uri, self.password) |
| 131 | self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator', | 123 | self.meta_list = frozenset(['title', 'author', 'subject', |
| 132 | 'producer', 'metadata']) | 124 | 'keywords', 'creator', 'producer', 'metadata']) |
| 133 | 125 | ||
| 134 | def is_clean(self): | 126 | def is_clean(self): |
| 135 | ''' Check if the file is clean from harmful metadatas | 127 | ''' Check if the file is clean from harmful metadatas |
| @@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser): | |||
| 168 | surface.finish() | 160 | surface.finish() |
| 169 | shutil.move(output, self.output) | 161 | shutil.move(output, self.output) |
| 170 | except: | 162 | except: |
| 171 | logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename) | 163 | logging.error('Something went wrong when cleaning %s.' % self.filename) |
| 172 | return False | 164 | return False |
| 173 | 165 | ||
| 174 | try: | 166 | try: |
| @@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser): | |||
| 182 | writer.write(self.output) | 174 | writer.write(self.output) |
| 183 | self.do_backup() | 175 | self.do_backup() |
| 184 | except: | 176 | except: |
| 185 | logging.error('Unable to remove all metadata from %s, please install\ | 177 | logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output) |
| 186 | pdfrw' % self.output) | ||
| 187 | return False | 178 | return False |
| 188 | return True | 179 | return True |
| 189 | 180 | ||
| @@ -195,73 +186,3 @@ pdfrw' % self.output) | |||
| 195 | if self.document.get_property(key): | 186 | if self.document.get_property(key): |
| 196 | metadata[key] = self.document.get_property(key) | 187 | metadata[key] = self.document.get_property(key) |
| 197 | return metadata | 188 | return metadata |
| 198 | |||
| 199 | |||
| 200 | class OpenXmlStripper(archive.GenericArchiveStripper): | ||
| 201 | ''' | ||
| 202 | Represent an office openxml document, which is like | ||
| 203 | an opendocument format, with some tricky stuff added. | ||
| 204 | It contains mostly xml, but can have media blobs, crap, ... | ||
| 205 | (I don't like this format.) | ||
| 206 | ''' | ||
| 207 | def remove_all(self): | ||
| 208 | ''' | ||
| 209 | FIXME ? | ||
| 210 | There is a patch implementing the Zipfile.remove() | ||
| 211 | method here : http://bugs.python.org/issue6818 | ||
| 212 | ''' | ||
| 213 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 214 | zipout = zipfile.ZipFile(self.output, 'w', | ||
| 215 | allowZip64=True) | ||
| 216 | for item in zipin.namelist(): | ||
| 217 | name = os.path.join(self.tempdir, item) | ||
| 218 | _, ext = os.path.splitext(name) | ||
| 219 | if item.startswith('docProps/'): # metadatas | ||
| 220 | pass | ||
| 221 | elif ext in parser.NOMETA or item == '.rels': | ||
| 222 | # keep parser.NOMETA files, and the file named ".rels" | ||
| 223 | zipin.extract(item, self.tempdir) | ||
| 224 | zipout.write(name, item) | ||
| 225 | else: | ||
| 226 | zipin.extract(item, self.tempdir) | ||
| 227 | if os.path.isfile(name): # don't care about folders | ||
| 228 | try: | ||
| 229 | cfile = mat.create_class_file(name, False, | ||
| 230 | add2archive=self.add2archive) | ||
| 231 | cfile.remove_all() | ||
| 232 | logging.debug('Processing %s from %s' % (item, | ||
| 233 | self.filename)) | ||
| 234 | zipout.write(name, item) | ||
| 235 | except: | ||
| 236 | logging.info('%s\'s fileformat is not supported' % item) | ||
| 237 | if self.add2archive: | ||
| 238 | zipout.write(name, item) | ||
| 239 | zipout.comment = '' | ||
| 240 | logging.info('%s processed' % self.filename) | ||
| 241 | zipin.close() | ||
| 242 | zipout.close() | ||
| 243 | self.do_backup() | ||
| 244 | return True | ||
| 245 | |||
| 246 | def is_clean(self): | ||
| 247 | ''' Check if the file is clean from harmful metadatas | ||
| 248 | ''' | ||
| 249 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 250 | for item in zipin.namelist(): | ||
| 251 | if item.startswith('docProps/'): | ||
| 252 | return False | ||
| 253 | zipin.close() | ||
| 254 | czf = archive.ZipStripper(self.filename, self.parser, | ||
| 255 | 'application/zip', False, True, add2archive=self.add2archive) | ||
| 256 | return czf.is_clean() | ||
| 257 | |||
| 258 | def get_meta(self): | ||
| 259 | ''' Return a dict with all the meta of the file | ||
| 260 | ''' | ||
| 261 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 262 | metadata = {} | ||
| 263 | for item in zipin.namelist(): | ||
| 264 | if item.startswith('docProps/'): | ||
| 265 | metadata[item] = 'harmful content' | ||
| 266 | zipin.close() | ||
| 267 | return metadata | ||
diff --git a/MAT/strippers.py b/MAT/strippers.py index 5fd4e08..aea98da 100644 --- a/MAT/strippers.py +++ b/MAT/strippers.py | |||
| @@ -14,6 +14,8 @@ import subprocess | |||
| 14 | STRIPPERS = { | 14 | STRIPPERS = { |
| 15 | 'application/x-tar': archive.TarStripper, | 15 | 'application/x-tar': archive.TarStripper, |
| 16 | 'application/x-bzip2': archive.Bzip2Stripper, | 16 | 'application/x-bzip2': archive.Bzip2Stripper, |
| 17 | 'application/x-gzip': archive.GzipStripper, | ||
| 18 | 'application/zip': archive.ZipStripper, | ||
| 17 | 'audio/mpeg': audio.MpegAudioStripper, | 19 | 'audio/mpeg': audio.MpegAudioStripper, |
| 18 | 'application/x-bittorrent': misc.TorrentStripper, | 20 | 'application/x-bittorrent': misc.TorrentStripper, |
| 19 | 'application/opendocument': office.OpenDocumentStripper, | 21 | 'application/opendocument': office.OpenDocumentStripper, |
| @@ -410,7 +410,7 @@ non-anonymised) file to output archive')) | |||
| 410 | unsupported_list = self.liststore[line][0].file.list_unsupported() | 410 | unsupported_list = self.liststore[line][0].file.list_unsupported() |
| 411 | if unsupported_list: | 411 | if unsupported_list: |
| 412 | list_to_add = self.__popup_archive(unsupported_list) | 412 | list_to_add = self.__popup_archive(unsupported_list) |
| 413 | if self.liststore[line][0].file.remove_all(list_to_add): | 413 | if self.liststore[line][0].file.remove_all(whitelist=list_to_add): |
| 414 | self.liststore[line][2] = _('Clean') | 414 | self.liststore[line][2] = _('Clean') |
| 415 | elif self.liststore[line][0].file.remove_all(): | 415 | elif self.liststore[line][0].file.remove_all(): |
| 416 | self.liststore[line][2] = _('Clean') | 416 | self.liststore[line][2] = _('Clean') |
diff --git a/test/TODO/dirty.zip b/test/TODO/dirty.zip deleted file mode 100644 index a8eb59b..0000000 --- a/test/TODO/dirty.zip +++ /dev/null | |||
| Binary files differ | |||
diff --git a/test/clean é.docx b/test/clean é.docx index 0f1470c..738eb6c 100644 --- a/test/clean é.docx +++ b/test/clean é.docx | |||
| Binary files differ | |||
diff --git a/test/clean é.odt b/test/clean é.odt index e7a550c..a06d816 100644 --- a/test/clean é.odt +++ b/test/clean é.odt | |||
| Binary files differ | |||
diff --git a/test/clean é.tar.gz b/test/clean é.tar.gz new file mode 100644 index 0000000..1ab4407 --- /dev/null +++ b/test/clean é.tar.gz | |||
| Binary files differ | |||
diff --git a/test/TODO/clean.zip b/test/clean é.zip index bf46419..b2805c4 100644 --- a/test/TODO/clean.zip +++ b/test/clean é.zip | |||
| Binary files differ | |||
diff --git a/test/dirty é.tar.gz b/test/dirty é.tar.gz new file mode 100644 index 0000000..8bb392b --- /dev/null +++ b/test/dirty é.tar.gz | |||
| Binary files differ | |||
diff --git a/test/dirty é.zip b/test/dirty é.zip new file mode 100644 index 0000000..e272162 --- /dev/null +++ b/test/dirty é.zip | |||
| Binary files differ | |||
diff --git a/test/libtest.py b/test/libtest.py index 0b45505..f052b6e 100644 --- a/test/libtest.py +++ b/test/libtest.py | |||
| @@ -99,6 +99,7 @@ class TestSecureRemove(unittest.TestCase): | |||
| 99 | ''' | 99 | ''' |
| 100 | self.assertRaises(MAT.exceptions.UnableToRemoveFile, MAT.mat.secure_remove, '/NOTREMOVABLE') | 100 | self.assertRaises(MAT.exceptions.UnableToRemoveFile, MAT.mat.secure_remove, '/NOTREMOVABLE') |
| 101 | 101 | ||
| 102 | |||
| 102 | class TestArchiveProcessing(test.MATTest): | 103 | class TestArchiveProcessing(test.MATTest): |
| 103 | ''' Test archives processing | 104 | ''' Test archives processing |
| 104 | ''' | 105 | ''' |
| @@ -107,7 +108,7 @@ class TestArchiveProcessing(test.MATTest): | |||
| 107 | ''' | 108 | ''' |
| 108 | tarpath = os.path.join(self.tmpdir, "test.tar.bz2") | 109 | tarpath = os.path.join(self.tmpdir, "test.tar.bz2") |
| 109 | tar = tarfile.open(tarpath, "w:bz2") | 110 | tar = tarfile.open(tarpath, "w:bz2") |
| 110 | for clean,dirty in self.file_list: | 111 | for clean, dirty in self.file_list: |
| 111 | tar.add(dirty) | 112 | tar.add(dirty) |
| 112 | tar.add(clean) | 113 | tar.add(clean) |
| 113 | tar.close() | 114 | tar.close() |
| @@ -121,7 +122,7 @@ class TestArchiveProcessing(test.MATTest): | |||
| 121 | ''' | 122 | ''' |
| 122 | tarpath = os.path.join(self.tmpdir, "test.tar") | 123 | tarpath = os.path.join(self.tmpdir, "test.tar") |
| 123 | tar = tarfile.open(tarpath, "w") | 124 | tar = tarfile.open(tarpath, "w") |
| 124 | for clean,dirty in self.file_list: | 125 | for clean, dirty in self.file_list: |
| 125 | tar.add(dirty) | 126 | tar.add(dirty) |
| 126 | tar.add(clean) | 127 | tar.add(clean) |
| 127 | tar.close() | 128 | tar.close() |
| @@ -135,7 +136,7 @@ class TestArchiveProcessing(test.MATTest): | |||
| 135 | ''' | 136 | ''' |
| 136 | tarpath = os.path.join(self.tmpdir, "test.tar.gz") | 137 | tarpath = os.path.join(self.tmpdir, "test.tar.gz") |
| 137 | tar = tarfile.open(tarpath, "w") | 138 | tar = tarfile.open(tarpath, "w") |
| 138 | for clean,dirty in self.file_list: | 139 | for clean, dirty in self.file_list: |
| 139 | tar.add(dirty) | 140 | tar.add(dirty) |
| 140 | tar.add(clean) | 141 | tar.add(clean) |
| 141 | tar.close() | 142 | tar.close() |
| @@ -156,6 +157,7 @@ class TestArchiveProcessing(test.MATTest): | |||
| 156 | unsupported_files = set(current_file.is_clean(list_unsupported=True)) | 157 | unsupported_files = set(current_file.is_clean(list_unsupported=True)) |
| 157 | self.assertEqual(unsupported_files, set(('mat.desktop', 'README.security', 'setup.py'))) | 158 | self.assertEqual(unsupported_files, set(('mat.desktop', 'README.security', 'setup.py'))) |
| 158 | 159 | ||
| 160 | |||
| 159 | def get_tests(): | 161 | def get_tests(): |
| 160 | ''' Returns every libtests''' | 162 | ''' Returns every libtests''' |
| 161 | suite = unittest.TestSuite() | 163 | suite = unittest.TestSuite() |
