summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAT/archive.py140
-rw-r--r--MAT/office.py187
-rw-r--r--MAT/strippers.py2
-rwxr-xr-xmat-gui2
-rw-r--r--test/TODO/dirty.zipbin6433 -> 0 bytes
-rw-r--r--test/clean é.docxbin5842 -> 6520 bytes
-rw-r--r--test/clean é.odtbin33130 -> 33140 bytes
-rw-r--r--test/clean é.tar.gzbin0 -> 5656 bytes
-rw-r--r--test/clean é.zip (renamed from test/TODO/clean.zip)bin5885 -> 5885 bytes
-rw-r--r--test/dirty é.tar.gzbin0 -> 5994 bytes
-rw-r--r--test/dirty é.zipbin0 -> 6206 bytes
-rw-r--r--test/libtest.py8
12 files changed, 144 insertions, 195 deletions
diff --git a/MAT/archive.py b/MAT/archive.py
index 9179e48..53c5e9b 100644
--- a/MAT/archive.py
+++ b/MAT/archive.py
@@ -1,6 +1,7 @@
1''' Take care of archives formats 1''' Take care of archives formats
2''' 2'''
3 3
4import datetime
4import logging 5import logging
5import os 6import os
6import shutil 7import shutil
@@ -11,12 +12,17 @@ import zipfile
11import mat 12import mat
12import parser 13import parser
13 14
15ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
16ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
17 - datetime.datetime(1970, 1, 1, 0, 0, 0)).total_seconds()
18
14 19
15class GenericArchiveStripper(parser.GenericParser): 20class GenericArchiveStripper(parser.GenericParser):
16 ''' Represent a generic archive 21 ''' Represent a generic archive
17 ''' 22 '''
18 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): 23 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
19 super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) 24 super(GenericArchiveStripper, self).__init__(filename,
25 parser, mime, backup, is_writable, **kwargs)
20 self.compression = '' 26 self.compression = ''
21 self.add2archive = kwargs['add2archive'] 27 self.add2archive = kwargs['add2archive']
22 self.tempdir = tempfile.mkdtemp() 28 self.tempdir = tempfile.mkdtemp()
@@ -48,13 +54,13 @@ class GenericArchiveStripper(parser.GenericParser):
48class ZipStripper(GenericArchiveStripper): 54class ZipStripper(GenericArchiveStripper):
49 ''' Represent a zip file 55 ''' Represent a zip file
50 ''' 56 '''
51 def is_file_clean(self, fileinfo): 57 def __is_zipfile_clean(self, fileinfo):
52 ''' Check if a ZipInfo object is clean of metadatas added 58 ''' Check if a ZipInfo object is clean of metadatas added
53 by zip itself, independently of the corresponding file metadatas 59 by zip itself, independently of the corresponding file metadatas
54 ''' 60 '''
55 if fileinfo.comment != '': 61 if fileinfo.comment != '':
56 return False 62 return False
57 elif fileinfo.date_time != (1980, 1, 1, 0, 0, 0): 63 elif fileinfo.date_time != ZIP_EPOCH:
58 return False 64 return False
59 elif fileinfo.create_system != 3: # 3 is UNIX 65 elif fileinfo.create_system != 3: # 3 is UNIX
60 return False 66 return False
@@ -70,83 +76,100 @@ class ZipStripper(GenericArchiveStripper):
70 logging.debug('%s has a comment' % self.filename) 76 logging.debug('%s has a comment' % self.filename)
71 return False 77 return False
72 for item in zipin.infolist(): 78 for item in zipin.infolist():
73 # I have not found a way to remove the crap added by zipfile :/
74 # if not self.is_file_clean(item):
75 # logging.debug('%s from %s has compromising zipinfo' %
76 # (item.filename, self.filename))
77 # return False
78 zipin.extract(item, self.tempdir) 79 zipin.extract(item, self.tempdir)
79 name = os.path.join(self.tempdir, item.filename) 80 name = os.path.join(self.tempdir, item.filename)
81 if not self.__is_zipfile_clean(item) and not list_unsupported:
82 logging.debug('%s from %s has compromising zipinfo' %
83 (item.filename, self.filename))
84 return False
80 if os.path.isfile(name): 85 if os.path.isfile(name):
81 cfile = mat.create_class_file(name, False, add2archive=self.add2archive) 86 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
82 if cfile: 87 if cfile:
83 if not cfile.is_clean(): 88 if not cfile.is_clean():
84 return False 89 logging.debug('%s from %s has compromising zipinfo' %
90 (item.filename, self.filename))
91 if not list_unsupported:
92 return False
93 ret_list.append(item.filename)
85 else: 94 else:
86 logging.info('%s\'s fileformat is not supported, or is harmless' % item.filename) 95 logging.info('%s\'s fileformat is not supported or harmless.'
96 % item.filename)
87 basename, ext = os.path.splitext(name) 97 basename, ext = os.path.splitext(name)
88 bname = os.path.basename(item.filename) 98 if os.path.basename(item.filename) not in ('mimetype', '.rels'):
89 if ext not in parser.NOMETA: 99 if ext not in parser.NOMETA:
90 if bname != 'mimetype' and bname != '.rels': 100 if not list_unsupported:
91 if list_unsupported:
92 ret_list.append(bname)
93 else:
94 return False 101 return False
102 ret_list.append(item.filename)
95 zipin.close() 103 zipin.close()
96 if list_unsupported: 104 if list_unsupported:
97 return ret_list 105 return ret_list
98 return True 106 return True
99 107
100 def get_meta(self): 108 def get_meta(self):
101 ''' Return all the metadata of a ZipFile (don't return metadatas 109 ''' Return all the metadata of a zip archive'''
102 of contained files : should it ?)
103 '''
104 zipin = zipfile.ZipFile(self.filename, 'r') 110 zipin = zipfile.ZipFile(self.filename, 'r')
105 metadata = {} 111 metadata = {}
106 for field in zipin.infolist():
107 zipmeta = {}
108 if field.comment != '':
109 zipmeta['comment'] = field.comment
110 if field.date_time != (1980, 1, 1, 0, 0, 0):
111 zipmeta['modified'] = field.date_time
112 if field.create_system != 3: # 3 is UNIX
113 zipmeta['system'] = "windows" if field.create_system == 2 else "unknown"
114 if zipin.comment != '': 112 if zipin.comment != '':
115 metadata["%s comment" % self.filename] = zipin.comment 113 metadata['comment'] = zipin.comment
114 for item in zipin.infolist():
115 zipinfo_meta = self.__get_zipinfo_meta(item)
116 if zipinfo_meta != {}: # zipinfo metadata
117 metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
118 zipin.extract(item, self.tempdir)
119 name = os.path.join(self.tempdir, item.filename)
120 if os.path.isfile(name):
121 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
122 if cfile:
123 cfile_meta = cfile.get_meta()
124 if cfile_meta != {}:
125 metadata[item.filename] = str(cfile_meta)
126 else:
127 logging.info('%s\'s fileformat is not supported or harmless'
128 % item.filename)
116 zipin.close() 129 zipin.close()
117 return metadata 130 return metadata
118 131
119 def remove_all(self): 132 def __get_zipinfo_meta(self, zipinfo):
120 ''' So far, the zipfile module does not allow to write a ZipInfo 133 ''' Return all the metadata of a ZipInfo
121 object into a zipfile (and it's a shame !) : so data added 134 '''
122 by zipfile itself could not be removed. It's a big concern. 135 metadata = {}
123 Is shipping a patched version of zipfile.py a good idea ? 136 if zipinfo.comment != '':
137 metadata['comment'] = zipinfo.comment
138 if zipinfo.date_time != ZIP_EPOCH:
139 metadata['modified'] = zipinfo.date_time
140 if zipinfo.create_system != 3: # 3 is UNIX
141 metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
142 return metadata
143
144 def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]):
145 ''' Remove all metadata from a zip archive, even those
146 added by Python's zipfile itself. It will not add
147 files starting with "beginning_blacklist", or ending with
148 "ending_blacklist". This method also adds files present in
149 whitelist to the archive.
124 ''' 150 '''
125 zipin = zipfile.ZipFile(self.filename, 'r') 151 zipin = zipfile.ZipFile(self.filename, 'r')
126 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) 152 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
127 for item in zipin.infolist(): 153 for item in zipin.infolist():
128 zipin.extract(item, self.tempdir) 154 zipin.extract(item, self.tempdir)
129 name = os.path.join(self.tempdir, item.filename) 155 name = os.path.join(self.tempdir, item.filename)
130 if os.path.isfile(name): 156
131 try: 157 beginning = any((True for f in beginning_blacklist if item.filename.startswith(f)))
132 cfile = mat.create_class_file(name, False, 158 ending = any((True for f in ending_blacklist if item.filename.endswith(f)))
133 add2archive=self.add2archive) 159
160 if os.path.isfile(name) and not beginning and not ending:
161 cfile = mat.create_class_file(name, False, add2archive=self.add2archive)
162 if cfile is not None:
134 cfile.remove_all() 163 cfile.remove_all()
135 logging.debug('Processing %s from %s' % (item.filename, 164 logging.debug('Processing %s from %s' % (item.filename, self.filename))
136 self.filename)) 165 elif item.filename not in whitelist:
137 zipout.write(name, item.filename) 166 logging.info('%s\'s format is not supported or harmless' % item.filename)
138 except: 167 basename, ext = os.path.splitext(name)
139 logging.info('%s\'s format is not supported or harmless' % 168 if not (self.add2archive or ext in parser.NOMETA):
140 item.filename) 169 continue
141 _, ext = os.path.splitext(name) 170 os.utime(name, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
142 if self.add2archive or ext in parser.NOMETA: 171 zipout.write(name, item.filename)
143 zipout.write(name, item.filename)
144 zipin.close() 172 zipin.close()
145 for zipFile in zipout.infolist():
146 zipFile.orig_filename = zipFile.filename
147 zipFile.date_time = (1980, 1, 1, 0, 0, 0)
148 zipFile.create_system = 3 # 3 is UNIX
149 zipout.comment = ''
150 zipout.close() 173 zipout.close()
151 174
152 logging.info('%s processed' % self.filename) 175 logging.info('%s processed' % self.filename)
@@ -167,7 +190,7 @@ class TarStripper(GenericArchiveStripper):
167 current_file.gname = '' 190 current_file.gname = ''
168 return current_file 191 return current_file
169 192
170 def remove_all(self, exclude_list=[]): 193 def remove_all(self, whitelist=[]):
171 tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8') 194 tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
172 tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8') 195 tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
173 for item in tarin.getmembers(): 196 for item in tarin.getmembers():
@@ -179,8 +202,9 @@ class TarStripper(GenericArchiveStripper):
179 cfile.remove_all() 202 cfile.remove_all()
180 elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA: 203 elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
181 logging.info('%s\' format is either not supported or harmless' % item.name) 204 logging.info('%s\' format is either not supported or harmless' % item.name)
182 elif item.name in exclude_list: 205 elif item.name in whitelist:
183 logging.debug('%s is not supported, but MAt was told to add it anyway.' % item.name) 206 logging.debug('%s is not supported, but MAT was told to add it anyway.'
207 % item.name)
184 else: 208 else:
185 continue 209 continue
186 tarout.add(complete_name, item.name, filter=self._remove) 210 tarout.add(complete_name, item.name, filter=self._remove)
@@ -209,7 +233,6 @@ class TarStripper(GenericArchiveStripper):
209 ''' 233 '''
210 if list_unsupported: 234 if list_unsupported:
211 ret_list = [] 235 ret_list = []
212 tempdir_len = len(self.tempdir) + 1 # trim the tempfile path
213 tarin = tarfile.open(self.filename, 'r' + self.compression) 236 tarin = tarfile.open(self.filename, 'r' + self.compression)
214 for item in tarin.getmembers(): 237 for item in tarin.getmembers():
215 if not self.is_file_clean(item) and not list_unsupported: 238 if not self.is_file_clean(item) and not list_unsupported:
@@ -217,20 +240,21 @@ class TarStripper(GenericArchiveStripper):
217 tarin.extract(item, self.tempdir) 240 tarin.extract(item, self.tempdir)
218 complete_name = os.path.join(self.tempdir, item.name) 241 complete_name = os.path.join(self.tempdir, item.name)
219 if item.isfile(): 242 if item.isfile():
220 class_file = mat.create_class_file(complete_name, False, add2archive=self.add2archive) 243 class_file = mat.create_class_file(complete_name,
244 False, add2archive=self.add2archive)
221 if class_file: 245 if class_file:
222 # We don't support nested archives 246 # We don't support nested archives
223 if not class_file.is_clean(): 247 if not class_file.is_clean():
224 if not list_unsupported: 248 if not list_unsupported:
225 return False 249 return False
226 elif isinstance(class_file, GenericArchiveStripper): 250 elif isinstance(class_file, GenericArchiveStripper):
227 ret_list.append(complete_name[tempdir_len:]) 251 ret_list.append(item.name)
228 else: 252 else:
229 logging.error('%s\'s format is not supported or harmless' % item.name) 253 logging.error('%s\'s format is not supported or harmless' % item.name)
230 if os.path.splitext(complete_name)[1] not in parser.NOMETA: 254 if os.path.splitext(complete_name)[1] not in parser.NOMETA:
231 if not list_unsupported: 255 if not list_unsupported:
232 return False 256 return False
233 ret_list.append(complete_name[tempdir_len:]) 257 ret_list.append(item.name)
234 tarin.close() 258 tarin.close()
235 if list_unsupported: 259 if list_unsupported:
236 return ret_list 260 return ret_list
diff --git a/MAT/office.py b/MAT/office.py
index f60fc64..97405b3 100644
--- a/MAT/office.py
+++ b/MAT/office.py
@@ -1,13 +1,12 @@
1''' Care about office's formats 1''' Care about office's formats
2''' 2'''
3 3
4import os
5import logging 4import logging
6import zipfile 5import os
7import fileinput
8import tempfile
9import shutil 6import shutil
7import tempfile
10import xml.dom.minidom as minidom 8import xml.dom.minidom as minidom
9import zipfile
11 10
12try: 11try:
13 import cairo 12 import cairo
@@ -16,7 +15,6 @@ except ImportError:
16 logging.info('office.py loaded without PDF support') 15 logging.info('office.py loaded without PDF support')
17 pass 16 pass
18 17
19import mat
20import parser 18import parser
21import archive 19import archive
22 20
@@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper):
30 ''' Return a dict with all the meta of the file by 28 ''' Return a dict with all the meta of the file by
31 trying to read the meta.xml file. 29 trying to read the meta.xml file.
32 ''' 30 '''
31 metadata = super(OpenDocumentStripper, self).get_meta()
33 zipin = zipfile.ZipFile(self.filename, 'r') 32 zipin = zipfile.ZipFile(self.filename, 'r')
34 metadata = {}
35 try: 33 try:
36 content = zipin.read('meta.xml') 34 content = zipin.read('meta.xml')
37 dom1 = minidom.parseString(content) 35 dom1 = minidom.parseString(content)
38 elements = dom1.getElementsByTagName('office:meta') 36 elements = dom1.getElementsByTagName('office:meta')
39 for i in elements[0].childNodes: 37 for i in elements[0].childNodes:
40 if i.tagName != 'meta:document-statistic': 38 if i.tagName != 'meta:document-statistic':
41 nodename = ''.join([k for k in i.nodeName.split(':')[1:]]) 39 nodename = ''.join(i.nodeName.split(':')[1:])
42 metadata[nodename] = ''.join([j.data for j in i.childNodes]) 40 metadata[nodename] = ''.join([j.data for j in i.childNodes])
43 else: 41 else:
44 # thank you w3c for not providing a nice 42 # thank you w3c for not providing a nice
45 # method to get all attributes of a node 43 # method to get all attributes of a node
46 pass 44 pass
47 zipin.close()
48 except KeyError: # no meta.xml file found 45 except KeyError: # no meta.xml file found
49 logging.debug('%s has no opendocument metadata' % self.filename) 46 logging.debug('%s has no opendocument metadata' % self.filename)
47 zipin.close()
50 return metadata 48 return metadata
51 49
52 def remove_all(self): 50 def remove_all(self):
51 ''' Removes metadata
53 ''' 52 '''
54 FIXME ? 53 return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
55 There is a patch implementing the Zipfile.remove() 54
56 method here : http://bugs.python.org/issue6818 55 def is_clean(self):
56 ''' Check if the file is clean from harmful metadatas
57 ''' 57 '''
58 clean_super = super(OpenDocumentStripper, self).is_clean()
59 if clean_super is False:
60 return False
61
58 zipin = zipfile.ZipFile(self.filename, 'r') 62 zipin = zipfile.ZipFile(self.filename, 'r')
59 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) 63 try:
64 zipin.getinfo('meta.xml')
65 except KeyError: # no meta.xml in the file
66 return True
67 zipin.close()
68 return False
60 69
61 for item in zipin.namelist():
62 name = os.path.join(self.tempdir, item)
63 _, ext = os.path.splitext(name)
64 70
65 if item.endswith('manifest.xml'): 71class OpenXmlStripper(archive.ZipStripper):
66 # contain the list of all files present in the archive 72 ''' Represent an office openxml document, which is like
67 zipin.extract(item, self.tempdir) 73 an opendocument format, with some tricky stuff added.
68 for line in fileinput.input(name, inplace=1): 74 It contains mostly xml, but can have media blobs, crap, ...
69 # remove the line which contains "meta.xml" 75 (I don't like this format.)
70 line = line.strip() 76 '''
71 if not 'meta.xml' in line: 77 def remove_all(self):
72 print line 78 return super(OpenXmlStripper, self).remove_all(
73 zipout.write(name, item) 79 beginning_blacklist=('docProps/'), whitelist=('.rels'))
74 80
75 elif ext in parser.NOMETA or item == 'mimetype': 81 def is_clean(self):
76 # keep NOMETA files, and the "manifest" file 82 ''' Check if the file is clean from harmful metadatas.
77 if item != 'meta.xml': # contains the metadata 83 This implementation is faster than something like
78 zipin.extract(item, self.tempdir) 84 "return this.get_meta() == {}".
79 zipout.write(name, item) 85 '''
86 clean_super = super(OpenXmlStripper, self).is_clean()
87 if clean_super is False:
88 return False
80 89
81 else: 90 zipin = zipfile.ZipFile(self.filename, 'r')
82 zipin.extract(item, self.tempdir) 91 for item in zipin.namelist():
83 if os.path.isfile(name): 92 if item.startswith('docProps/'):
84 try: 93 return False
85 cfile = mat.create_class_file(name, False,
86 add2archive=self.add2archive)
87 cfile.remove_all()
88 logging.debug('Processing %s from %s' % (item,
89 self.filename))
90 zipout.write(name, item)
91 except:
92 logging.info('%s\'s fileformat is not supported' % item)
93 if self.add2archive:
94 zipout.write(name, item)
95 zipout.comment = ''
96 logging.info('%s processed' % self.filename)
97 zipin.close() 94 zipin.close()
98 zipout.close()
99 self.do_backup()
100 return True 95 return True
101 96
102 def is_clean(self): 97 def get_meta(self):
103 ''' Check if the file is clean from harmful metadatas 98 ''' Return a dict with all the meta of the file
104 ''' 99 '''
100 metadata = super(OpenXmlStripper, self).get_meta()
101
105 zipin = zipfile.ZipFile(self.filename, 'r') 102 zipin = zipfile.ZipFile(self.filename, 'r')
106 try: 103 for item in zipin.namelist():
107 zipin.getinfo('meta.xml') 104 if item.startswith('docProps/'):
108 except KeyError: # no meta.xml in the file 105 metadata[item] = 'harmful content'
109 czf = archive.ZipStripper(self.filename, self.parser,
110 'application/zip', False, True, add2archive=self.add2archive)
111 if czf.is_clean():
112 zipin.close()
113 return True
114 zipin.close() 106 zipin.close()
115 return False 107 return metadata
116 108
117 109
118class PdfStripper(parser.GenericParser): 110class PdfStripper(parser.GenericParser):
@@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser):
128 self.pdf_quality = False 120 self.pdf_quality = False
129 121
130 self.document = Poppler.Document.new_from_file(uri, self.password) 122 self.document = Poppler.Document.new_from_file(uri, self.password)
131 self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator', 123 self.meta_list = frozenset(['title', 'author', 'subject',
132 'producer', 'metadata']) 124 'keywords', 'creator', 'producer', 'metadata'])
133 125
134 def is_clean(self): 126 def is_clean(self):
135 ''' Check if the file is clean from harmful metadatas 127 ''' Check if the file is clean from harmful metadatas
@@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser):
168 surface.finish() 160 surface.finish()
169 shutil.move(output, self.output) 161 shutil.move(output, self.output)
170 except: 162 except:
171 logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename) 163 logging.error('Something went wrong when cleaning %s.' % self.filename)
172 return False 164 return False
173 165
174 try: 166 try:
@@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser):
182 writer.write(self.output) 174 writer.write(self.output)
183 self.do_backup() 175 self.do_backup()
184 except: 176 except:
185 logging.error('Unable to remove all metadata from %s, please install\ 177 logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
186pdfrw' % self.output)
187 return False 178 return False
188 return True 179 return True
189 180
@@ -195,73 +186,3 @@ pdfrw' % self.output)
195 if self.document.get_property(key): 186 if self.document.get_property(key):
196 metadata[key] = self.document.get_property(key) 187 metadata[key] = self.document.get_property(key)
197 return metadata 188 return metadata
198
199
200class OpenXmlStripper(archive.GenericArchiveStripper):
201 '''
202 Represent an office openxml document, which is like
203 an opendocument format, with some tricky stuff added.
204 It contains mostly xml, but can have media blobs, crap, ...
205 (I don't like this format.)
206 '''
207 def remove_all(self):
208 '''
209 FIXME ?
210 There is a patch implementing the Zipfile.remove()
211 method here : http://bugs.python.org/issue6818
212 '''
213 zipin = zipfile.ZipFile(self.filename, 'r')
214 zipout = zipfile.ZipFile(self.output, 'w',
215 allowZip64=True)
216 for item in zipin.namelist():
217 name = os.path.join(self.tempdir, item)
218 _, ext = os.path.splitext(name)
219 if item.startswith('docProps/'): # metadatas
220 pass
221 elif ext in parser.NOMETA or item == '.rels':
222 # keep parser.NOMETA files, and the file named ".rels"
223 zipin.extract(item, self.tempdir)
224 zipout.write(name, item)
225 else:
226 zipin.extract(item, self.tempdir)
227 if os.path.isfile(name): # don't care about folders
228 try:
229 cfile = mat.create_class_file(name, False,
230 add2archive=self.add2archive)
231 cfile.remove_all()
232 logging.debug('Processing %s from %s' % (item,
233 self.filename))
234 zipout.write(name, item)
235 except:
236 logging.info('%s\'s fileformat is not supported' % item)
237 if self.add2archive:
238 zipout.write(name, item)
239 zipout.comment = ''
240 logging.info('%s processed' % self.filename)
241 zipin.close()
242 zipout.close()
243 self.do_backup()
244 return True
245
246 def is_clean(self):
247 ''' Check if the file is clean from harmful metadatas
248 '''
249 zipin = zipfile.ZipFile(self.filename, 'r')
250 for item in zipin.namelist():
251 if item.startswith('docProps/'):
252 return False
253 zipin.close()
254 czf = archive.ZipStripper(self.filename, self.parser,
255 'application/zip', False, True, add2archive=self.add2archive)
256 return czf.is_clean()
257
258 def get_meta(self):
259 ''' Return a dict with all the meta of the file
260 '''
261 zipin = zipfile.ZipFile(self.filename, 'r')
262 metadata = {}
263 for item in zipin.namelist():
264 if item.startswith('docProps/'):
265 metadata[item] = 'harmful content'
266 zipin.close()
267 return metadata
diff --git a/MAT/strippers.py b/MAT/strippers.py
index 5fd4e08..aea98da 100644
--- a/MAT/strippers.py
+++ b/MAT/strippers.py
@@ -14,6 +14,8 @@ import subprocess
14STRIPPERS = { 14STRIPPERS = {
15 'application/x-tar': archive.TarStripper, 15 'application/x-tar': archive.TarStripper,
16 'application/x-bzip2': archive.Bzip2Stripper, 16 'application/x-bzip2': archive.Bzip2Stripper,
17 'application/x-gzip': archive.GzipStripper,
18 'application/zip': archive.ZipStripper,
17 'audio/mpeg': audio.MpegAudioStripper, 19 'audio/mpeg': audio.MpegAudioStripper,
18 'application/x-bittorrent': misc.TorrentStripper, 20 'application/x-bittorrent': misc.TorrentStripper,
19 'application/opendocument': office.OpenDocumentStripper, 21 'application/opendocument': office.OpenDocumentStripper,
diff --git a/mat-gui b/mat-gui
index de0da83..ba252b6 100755
--- a/mat-gui
+++ b/mat-gui
@@ -410,7 +410,7 @@ non-anonymised) file to output archive'))
410 unsupported_list = self.liststore[line][0].file.list_unsupported() 410 unsupported_list = self.liststore[line][0].file.list_unsupported()
411 if unsupported_list: 411 if unsupported_list:
412 list_to_add = self.__popup_archive(unsupported_list) 412 list_to_add = self.__popup_archive(unsupported_list)
413 if self.liststore[line][0].file.remove_all(list_to_add): 413 if self.liststore[line][0].file.remove_all(whitelist=list_to_add):
414 self.liststore[line][2] = _('Clean') 414 self.liststore[line][2] = _('Clean')
415 elif self.liststore[line][0].file.remove_all(): 415 elif self.liststore[line][0].file.remove_all():
416 self.liststore[line][2] = _('Clean') 416 self.liststore[line][2] = _('Clean')
diff --git a/test/TODO/dirty.zip b/test/TODO/dirty.zip
deleted file mode 100644
index a8eb59b..0000000
--- a/test/TODO/dirty.zip
+++ /dev/null
Binary files differ
diff --git a/test/clean é.docx b/test/clean é.docx
index 0f1470c..738eb6c 100644
--- a/test/clean é.docx
+++ b/test/clean é.docx
Binary files differ
diff --git a/test/clean é.odt b/test/clean é.odt
index e7a550c..a06d816 100644
--- a/test/clean é.odt
+++ b/test/clean é.odt
Binary files differ
diff --git a/test/clean é.tar.gz b/test/clean é.tar.gz
new file mode 100644
index 0000000..1ab4407
--- /dev/null
+++ b/test/clean é.tar.gz
Binary files differ
diff --git a/test/TODO/clean.zip b/test/clean é.zip
index bf46419..b2805c4 100644
--- a/test/TODO/clean.zip
+++ b/test/clean é.zip
Binary files differ
diff --git a/test/dirty é.tar.gz b/test/dirty é.tar.gz
new file mode 100644
index 0000000..8bb392b
--- /dev/null
+++ b/test/dirty é.tar.gz
Binary files differ
diff --git a/test/dirty é.zip b/test/dirty é.zip
new file mode 100644
index 0000000..e272162
--- /dev/null
+++ b/test/dirty é.zip
Binary files differ
diff --git a/test/libtest.py b/test/libtest.py
index 0b45505..f052b6e 100644
--- a/test/libtest.py
+++ b/test/libtest.py
@@ -99,6 +99,7 @@ class TestSecureRemove(unittest.TestCase):
99 ''' 99 '''
100 self.assertRaises(MAT.exceptions.UnableToRemoveFile, MAT.mat.secure_remove, '/NOTREMOVABLE') 100 self.assertRaises(MAT.exceptions.UnableToRemoveFile, MAT.mat.secure_remove, '/NOTREMOVABLE')
101 101
102
102class TestArchiveProcessing(test.MATTest): 103class TestArchiveProcessing(test.MATTest):
103 ''' Test archives processing 104 ''' Test archives processing
104 ''' 105 '''
@@ -107,7 +108,7 @@ class TestArchiveProcessing(test.MATTest):
107 ''' 108 '''
108 tarpath = os.path.join(self.tmpdir, "test.tar.bz2") 109 tarpath = os.path.join(self.tmpdir, "test.tar.bz2")
109 tar = tarfile.open(tarpath, "w:bz2") 110 tar = tarfile.open(tarpath, "w:bz2")
110 for clean,dirty in self.file_list: 111 for clean, dirty in self.file_list:
111 tar.add(dirty) 112 tar.add(dirty)
112 tar.add(clean) 113 tar.add(clean)
113 tar.close() 114 tar.close()
@@ -121,7 +122,7 @@ class TestArchiveProcessing(test.MATTest):
121 ''' 122 '''
122 tarpath = os.path.join(self.tmpdir, "test.tar") 123 tarpath = os.path.join(self.tmpdir, "test.tar")
123 tar = tarfile.open(tarpath, "w") 124 tar = tarfile.open(tarpath, "w")
124 for clean,dirty in self.file_list: 125 for clean, dirty in self.file_list:
125 tar.add(dirty) 126 tar.add(dirty)
126 tar.add(clean) 127 tar.add(clean)
127 tar.close() 128 tar.close()
@@ -135,7 +136,7 @@ class TestArchiveProcessing(test.MATTest):
135 ''' 136 '''
136 tarpath = os.path.join(self.tmpdir, "test.tar.gz") 137 tarpath = os.path.join(self.tmpdir, "test.tar.gz")
137 tar = tarfile.open(tarpath, "w") 138 tar = tarfile.open(tarpath, "w")
138 for clean,dirty in self.file_list: 139 for clean, dirty in self.file_list:
139 tar.add(dirty) 140 tar.add(dirty)
140 tar.add(clean) 141 tar.add(clean)
141 tar.close() 142 tar.close()
@@ -156,6 +157,7 @@ class TestArchiveProcessing(test.MATTest):
156 unsupported_files = set(current_file.is_clean(list_unsupported=True)) 157 unsupported_files = set(current_file.is_clean(list_unsupported=True))
157 self.assertEqual(unsupported_files, set(('mat.desktop', 'README.security', 'setup.py'))) 158 self.assertEqual(unsupported_files, set(('mat.desktop', 'README.security', 'setup.py')))
158 159
160
159def get_tests(): 161def get_tests():
160 ''' Returns every libtests''' 162 ''' Returns every libtests'''
161 suite = unittest.TestSuite() 163 suite = unittest.TestSuite()