1 files changed, 54 insertions, 133 deletions
diff --git a/MAT/office.py b/MAT/office.py
index f60fc64..97405b3 100644
--- a/MAT/office.py
+++ b/MAT/office.py
@@ -1,13 +1,12 @@
 ''' Care about office's formats
 '''
-import os
 import logging
-import zipfile
+import os
-import fileinput
-import tempfile
 import shutil
+import tempfile
 import xml.dom.minidom as minidom
+import zipfile
 try:
    import cairo
@@ -16,7 +15,6 @@ except ImportError:
    logging.info('office.py loaded without PDF support')
    pass
-import mat
 import parser
 import archive
@@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper):
        ''' Return a dict with all the meta of the file by
            trying to read the meta.xml file.
        '''
+        metadata = super(OpenDocumentStripper, self).get_meta()
        zipin = zipfile.ZipFile(self.filename, 'r')
-        metadata = {}
        try:
            content = zipin.read('meta.xml')
            dom1 = minidom.parseString(content)
            elements = dom1.getElementsByTagName('office:meta')
            for i in elements[0].childNodes:
                if i.tagName != 'meta:document-statistic':
-                    nodename = ''.join([k for k in i.nodeName.split(':')[1:]])
+                    nodename = ''.join(i.nodeName.split(':')[1:])
                    metadata[nodename] = ''.join([j.data for j in i.childNodes])
                else:
                    # thank you w3c for not providing a nice
                    # method to get all attributes of a node
                    pass
-            zipin.close()
        except KeyError:  # no meta.xml file found
            logging.debug('%s has no opendocument metadata' % self.filename)
+        zipin.close()
        return metadata
    def remove_all(self):
+        ''' Removes metadata
        '''
-            FIXME ?
+        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
+    def is_clean(self):
+        ''' Check if the file is clean from harmful metadatas
        '''
+        clean_super = super(OpenDocumentStripper, self).is_clean()
+        if clean_super is False:
+            return False
        zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
+        try:
+            zipin.getinfo('meta.xml')
+        except KeyError:  # no meta.xml in the file
+            return True
+        zipin.close()
+        return False
-        for item in zipin.namelist():
-            name = os.path.join(self.tempdir, item)
-            _, ext = os.path.splitext(name)
-            if item.endswith('manifest.xml'):
+class OpenXmlStripper(archive.ZipStripper):
-            # contain the list of all files present in the archive
+    ''' Represent an office openxml document, which is like
-                zipin.extract(item, self.tempdir)
+        an opendocument format, with some tricky stuff added.
-                for line in fileinput.input(name, inplace=1):
+        It contains mostly xml, but can have media blobs, crap, ...
-                    # remove the line which contains "meta.xml"
+        (I don't like this format.)
-                    line = line.strip()
+    '''
-                    if not 'meta.xml' in line:
+    def remove_all(self):
-                        print line
+        return super(OpenXmlStripper, self).remove_all(
-                zipout.write(name, item)
+                beginning_blacklist=('docProps/'), whitelist=('.rels'))
-            elif ext in parser.NOMETA or item == 'mimetype':
+    def is_clean(self):
-                # keep NOMETA files, and the "manifest" file
+        ''' Check if the file is clean from harmful metadatas.
-                if item != 'meta.xml':  # contains the metadata
+            This implementation is faster than something like
-                    zipin.extract(item, self.tempdir)
+            "return this.get_meta() == {}".
-                    zipout.write(name, item)
+        '''
+        clean_super = super(OpenXmlStripper, self).is_clean()
+        if clean_super is False:
+            return False
-            else:
+        zipin = zipfile.ZipFile(self.filename, 'r')
-                zipin.extract(item, self.tempdir)
+        for item in zipin.namelist():
-                if os.path.isfile(name):
+            if item.startswith('docProps/'):
-                    try:
+                return False
-                        cfile = mat.create_class_file(name, False,
-                            add2archive=self.add2archive)
-                        cfile.remove_all()
-                        logging.debug('Processing %s from %s' % (item,
-                            self.filename))
-                        zipout.write(name, item)
-                    except:
-                        logging.info('%s\'s fileformat is not supported' % item)
-                        if self.add2archive:
-                            zipout.write(name, item)
-        zipout.comment = ''
-        logging.info('%s processed' % self.filename)
        zipin.close()
-        zipout.close()
-        self.do_backup()
        return True
-    def is_clean(self):
+    def get_meta(self):
-        ''' Check if the file is clean from harmful metadatas
+        ''' Return a dict with all the meta of the file
        '''
+        metadata = super(OpenXmlStripper, self).get_meta()
        zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
+        for item in zipin.namelist():
-            zipin.getinfo('meta.xml')
+            if item.startswith('docProps/'):
-        except KeyError:  # no meta.xml in the file
+                metadata[item] = 'harmful content'
-            czf = archive.ZipStripper(self.filename, self.parser,
-                'application/zip', False, True, add2archive=self.add2archive)
-            if czf.is_clean():
-                zipin.close()
-                return True
        zipin.close()
-        return False
+        return metadata
 class PdfStripper(parser.GenericParser):
@@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser):
            self.pdf_quality = False
        self.document = Poppler.Document.new_from_file(uri, self.password)
-        self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator',
+        self.meta_list = frozenset(['title', 'author', 'subject',
-            'producer', 'metadata'])
+            'keywords', 'creator', 'producer', 'metadata'])
    def is_clean(self):
        ''' Check if the file is clean from harmful metadatas
@@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser):
            surface.finish()
            shutil.move(output, self.output)
        except:
-            logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename)
+            logging.error('Something went wrong when cleaning %s.' % self.filename)
            return False
        try:
@@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser):
            writer.write(self.output)
            self.do_backup()
        except:
-            logging.error('Unable to remove all metadata from %s, please install\
+            logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
-pdfrw' % self.output)
            return False
        return True
@@ -195,73 +186,3 @@ pdfrw' % self.output)
            if self.document.get_property(key):
                metadata[key] = self.document.get_property(key)
        return metadata
-class OpenXmlStripper(archive.GenericArchiveStripper):
-    '''
-        Represent an office openxml document, which is like
-        an opendocument format, with some tricky stuff added.
-        It contains mostly xml, but can have media blobs, crap, ...
-        (I don't like this format.)
-    '''
-    def remove_all(self):
-        '''
-            FIXME ?
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w',
-            allowZip64=True)
-        for item in zipin.namelist():
-            name = os.path.join(self.tempdir, item)
-            _, ext = os.path.splitext(name)
-            if item.startswith('docProps/'):  # metadatas
-                pass
-            elif ext in parser.NOMETA or item == '.rels':
-                # keep parser.NOMETA files, and the file named ".rels"
-                zipin.extract(item, self.tempdir)
-                zipout.write(name, item)
-            else:
-                zipin.extract(item, self.tempdir)
-                if os.path.isfile(name):  # don't care about folders
-                    try:
-                        cfile = mat.create_class_file(name, False,
-                            add2archive=self.add2archive)
-                        cfile.remove_all()
-                        logging.debug('Processing %s from %s' % (item,
-                            self.filename))
-                        zipout.write(name, item)
-                    except:
-                        logging.info('%s\'s fileformat is not supported' % item)
-                        if self.add2archive:
-                            zipout.write(name, item)
-        zipout.comment = ''
-        logging.info('%s processed' % self.filename)
-        zipin.close()
-        zipout.close()
-        self.do_backup()
-        return True
-    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                return False
-        zipin.close()
-        czf = archive.ZipStripper(self.filename, self.parser,
-                'application/zip', False, True, add2archive=self.add2archive)
-        return czf.is_clean()
-    def get_meta(self):
-        ''' Return a dict with all the meta of the file
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        metadata = {}
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                metadata[item] = 'harmful content'
-        zipin.close()
-        return metadata

diff --git a/MAT/office.py b/MAT/office.py index f60fc64..97405b3 100644 --- a/MAT/office.py +++ b/MAT/office.py
@@ -1,13 +1,12 @@
1	''' Care about office's formats	1	''' Care about office's formats
2	'''	2	'''
3		3
4	import os
5	import logging	4	import logging
6	import zipfile	5	import os
7	import fileinput
8	import tempfile
9	import shutil	6	import shutil
		7	import tempfile
10	import xml.dom.minidom as minidom	8	import xml.dom.minidom as minidom
		9	import zipfile
11		10
12	try:	11	try:
13	import cairo	12	import cairo
@@ -16,7 +15,6 @@ except ImportError:
16	logging.info('office.py loaded without PDF support')	15	logging.info('office.py loaded without PDF support')
17	pass	16	pass
18		17
19	import mat
20	import parser	18	import parser
21	import archive	19	import archive
22		20
@@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper):
30	''' Return a dict with all the meta of the file by	28	''' Return a dict with all the meta of the file by
31	trying to read the meta.xml file.	29	trying to read the meta.xml file.
32	'''	30	'''
		31	metadata = super(OpenDocumentStripper, self).get_meta()
33	zipin = zipfile.ZipFile(self.filename, 'r')	32	zipin = zipfile.ZipFile(self.filename, 'r')
34	metadata = {}
35	try:	33	try:
36	content = zipin.read('meta.xml')	34	content = zipin.read('meta.xml')
37	dom1 = minidom.parseString(content)	35	dom1 = minidom.parseString(content)
38	elements = dom1.getElementsByTagName('office:meta')	36	elements = dom1.getElementsByTagName('office:meta')
39	for i in elements[0].childNodes:	37	for i in elements[0].childNodes:
40	if i.tagName != 'meta:document-statistic':	38	if i.tagName != 'meta:document-statistic':
41	nodename = ''.join([k for k in i.nodeName.split(':')[1:]])	39	nodename = ''.join(i.nodeName.split(':')[1:])
42	metadata[nodename] = ''.join([j.data for j in i.childNodes])	40	metadata[nodename] = ''.join([j.data for j in i.childNodes])
43	else:	41	else:
44	# thank you w3c for not providing a nice	42	# thank you w3c for not providing a nice
45	# method to get all attributes of a node	43	# method to get all attributes of a node
46	pass	44	pass
47	zipin.close()
48	except KeyError: # no meta.xml file found	45	except KeyError: # no meta.xml file found
49	logging.debug('%s has no opendocument metadata' % self.filename)	46	logging.debug('%s has no opendocument metadata' % self.filename)
		47	zipin.close()
50	return metadata	48	return metadata
51		49
52	def remove_all(self):	50	def remove_all(self):
		51	''' Removes metadata
53	'''	52	'''
54	FIXME ?	53	return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
55	There is a patch implementing the Zipfile.remove()	54
56	method here : http://bugs.python.org/issue6818	55	def is_clean(self):
		56	''' Check if the file is clean from harmful metadatas
57	'''	57	'''
		58	clean_super = super(OpenDocumentStripper, self).is_clean()
		59	if clean_super is False:
		60	return False
		61
58	zipin = zipfile.ZipFile(self.filename, 'r')	62	zipin = zipfile.ZipFile(self.filename, 'r')
59	zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)	63	try:
		64	zipin.getinfo('meta.xml')
		65	except KeyError: # no meta.xml in the file
		66	return True
		67	zipin.close()
		68	return False
60		69
61	for item in zipin.namelist():
62	name = os.path.join(self.tempdir, item)
63	_, ext = os.path.splitext(name)
64		70
65	if item.endswith('manifest.xml'):	71	class OpenXmlStripper(archive.ZipStripper):
66	# contain the list of all files present in the archive	72	''' Represent an office openxml document, which is like
67	zipin.extract(item, self.tempdir)	73	an opendocument format, with some tricky stuff added.
68	for line in fileinput.input(name, inplace=1):	74	It contains mostly xml, but can have media blobs, crap, ...
69	# remove the line which contains "meta.xml"	75	(I don't like this format.)
70	line = line.strip()	76	'''
71	if not 'meta.xml' in line:	77	def remove_all(self):
72	print line	78	return super(OpenXmlStripper, self).remove_all(
73	zipout.write(name, item)	79	beginning_blacklist=('docProps/'), whitelist=('.rels'))
74		80
75	elif ext in parser.NOMETA or item == 'mimetype':	81	def is_clean(self):
76	# keep NOMETA files, and the "manifest" file	82	''' Check if the file is clean from harmful metadatas.
77	if item != 'meta.xml': # contains the metadata	83	This implementation is faster than something like
78	zipin.extract(item, self.tempdir)	84	"return this.get_meta() == {}".
79	zipout.write(name, item)	85	'''
		86	clean_super = super(OpenXmlStripper, self).is_clean()
		87	if clean_super is False:
		88	return False
80		89
81	else:	90	zipin = zipfile.ZipFile(self.filename, 'r')
82	zipin.extract(item, self.tempdir)	91	for item in zipin.namelist():
83	if os.path.isfile(name):	92	if item.startswith('docProps/'):
84	try:	93	return False
85	cfile = mat.create_class_file(name, False,
86	add2archive=self.add2archive)
87	cfile.remove_all()
88	logging.debug('Processing %s from %s' % (item,
89	self.filename))
90	zipout.write(name, item)
91	except:
92	logging.info('%s\'s fileformat is not supported' % item)
93	if self.add2archive:
94	zipout.write(name, item)
95	zipout.comment = ''
96	logging.info('%s processed' % self.filename)
97	zipin.close()	94	zipin.close()
98	zipout.close()
99	self.do_backup()
100	return True	95	return True
101		96
102	def is_clean(self):	97	def get_meta(self):
103	''' Check if the file is clean from harmful metadatas	98	''' Return a dict with all the meta of the file
104	'''	99	'''
		100	metadata = super(OpenXmlStripper, self).get_meta()
		101
105	zipin = zipfile.ZipFile(self.filename, 'r')	102	zipin = zipfile.ZipFile(self.filename, 'r')
106	try:	103	for item in zipin.namelist():
107	zipin.getinfo('meta.xml')	104	if item.startswith('docProps/'):
108	except KeyError: # no meta.xml in the file	105	metadata[item] = 'harmful content'
109	czf = archive.ZipStripper(self.filename, self.parser,
110	'application/zip', False, True, add2archive=self.add2archive)
111	if czf.is_clean():
112	zipin.close()
113	return True
114	zipin.close()	106	zipin.close()
115	return False	107	return metadata
116		108
117		109
118	class PdfStripper(parser.GenericParser):	110	class PdfStripper(parser.GenericParser):
@@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser):
128	self.pdf_quality = False	120	self.pdf_quality = False
129		121
130	self.document = Poppler.Document.new_from_file(uri, self.password)	122	self.document = Poppler.Document.new_from_file(uri, self.password)
131	self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator',	123	self.meta_list = frozenset(['title', 'author', 'subject',
132	'producer', 'metadata'])	124	'keywords', 'creator', 'producer', 'metadata'])
133		125
134	def is_clean(self):	126	def is_clean(self):
135	''' Check if the file is clean from harmful metadatas	127	''' Check if the file is clean from harmful metadatas
@@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser):
168	surface.finish()	160	surface.finish()
169	shutil.move(output, self.output)	161	shutil.move(output, self.output)
170	except:	162	except:
171	logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename)	163	logging.error('Something went wrong when cleaning %s.' % self.filename)
172	return False	164	return False
173		165
174	try:	166	try:
@@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser):
182	writer.write(self.output)	174	writer.write(self.output)
183	self.do_backup()	175	self.do_backup()
184	except:	176	except:
185	logging.error('Unable to remove all metadata from %s, please install\	177	logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
186	pdfrw' % self.output)
187	return False	178	return False
188	return True	179	return True
189		180
@@ -195,73 +186,3 @@ pdfrw' % self.output)
195	if self.document.get_property(key):	186	if self.document.get_property(key):
196	metadata[key] = self.document.get_property(key)	187	metadata[key] = self.document.get_property(key)
197	return metadata	188	return metadata
198
199
200	class OpenXmlStripper(archive.GenericArchiveStripper):
201	'''
202	Represent an office openxml document, which is like
203	an opendocument format, with some tricky stuff added.
204	It contains mostly xml, but can have media blobs, crap, ...
205	(I don't like this format.)
206	'''
207	def remove_all(self):
208	'''
209	FIXME ?
210	There is a patch implementing the Zipfile.remove()
211	method here : http://bugs.python.org/issue6818
212	'''
213	zipin = zipfile.ZipFile(self.filename, 'r')
214	zipout = zipfile.ZipFile(self.output, 'w',
215	allowZip64=True)
216	for item in zipin.namelist():
217	name = os.path.join(self.tempdir, item)
218	_, ext = os.path.splitext(name)
219	if item.startswith('docProps/'): # metadatas
220	pass
221	elif ext in parser.NOMETA or item == '.rels':
222	# keep parser.NOMETA files, and the file named ".rels"
223	zipin.extract(item, self.tempdir)
224	zipout.write(name, item)
225	else:
226	zipin.extract(item, self.tempdir)
227	if os.path.isfile(name): # don't care about folders
228	try:
229	cfile = mat.create_class_file(name, False,
230	add2archive=self.add2archive)
231	cfile.remove_all()
232	logging.debug('Processing %s from %s' % (item,
233	self.filename))
234	zipout.write(name, item)
235	except:
236	logging.info('%s\'s fileformat is not supported' % item)
237	if self.add2archive:
238	zipout.write(name, item)
239	zipout.comment = ''
240	logging.info('%s processed' % self.filename)
241	zipin.close()
242	zipout.close()
243	self.do_backup()
244	return True
245
246	def is_clean(self):
247	''' Check if the file is clean from harmful metadatas
248	'''
249	zipin = zipfile.ZipFile(self.filename, 'r')
250	for item in zipin.namelist():
251	if item.startswith('docProps/'):
252	return False
253	zipin.close()
254	czf = archive.ZipStripper(self.filename, self.parser,
255	'application/zip', False, True, add2archive=self.add2archive)
256	return czf.is_clean()
257
258	def get_meta(self):
259	''' Return a dict with all the meta of the file
260	'''
261	zipin = zipfile.ZipFile(self.filename, 'r')
262	metadata = {}
263	for item in zipin.namelist():
264	if item.startswith('docProps/'):
265	metadata[item] = 'harmful content'
266	zipin.close()
267	return metadata