summaryrefslogtreecommitdiff
path: root/MAT/office.py
diff options
context:
space:
mode:
Diffstat (limited to 'MAT/office.py')
-rw-r--r--MAT/office.py187
1 files changed, 54 insertions, 133 deletions
diff --git a/MAT/office.py b/MAT/office.py
index f60fc64..97405b3 100644
--- a/MAT/office.py
+++ b/MAT/office.py
@@ -1,13 +1,12 @@
1''' Care about office's formats 1''' Care about office's formats
2''' 2'''
3 3
4import os
5import logging 4import logging
6import zipfile 5import os
7import fileinput
8import tempfile
9import shutil 6import shutil
7import tempfile
10import xml.dom.minidom as minidom 8import xml.dom.minidom as minidom
9import zipfile
11 10
12try: 11try:
13 import cairo 12 import cairo
@@ -16,7 +15,6 @@ except ImportError:
16 logging.info('office.py loaded without PDF support') 15 logging.info('office.py loaded without PDF support')
17 pass 16 pass
18 17
19import mat
20import parser 18import parser
21import archive 19import archive
22 20
@@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper):
30 ''' Return a dict with all the meta of the file by 28 ''' Return a dict with all the meta of the file by
31 trying to read the meta.xml file. 29 trying to read the meta.xml file.
32 ''' 30 '''
31 metadata = super(OpenDocumentStripper, self).get_meta()
33 zipin = zipfile.ZipFile(self.filename, 'r') 32 zipin = zipfile.ZipFile(self.filename, 'r')
34 metadata = {}
35 try: 33 try:
36 content = zipin.read('meta.xml') 34 content = zipin.read('meta.xml')
37 dom1 = minidom.parseString(content) 35 dom1 = minidom.parseString(content)
38 elements = dom1.getElementsByTagName('office:meta') 36 elements = dom1.getElementsByTagName('office:meta')
39 for i in elements[0].childNodes: 37 for i in elements[0].childNodes:
40 if i.tagName != 'meta:document-statistic': 38 if i.tagName != 'meta:document-statistic':
41 nodename = ''.join([k for k in i.nodeName.split(':')[1:]]) 39 nodename = ''.join(i.nodeName.split(':')[1:])
42 metadata[nodename] = ''.join([j.data for j in i.childNodes]) 40 metadata[nodename] = ''.join([j.data for j in i.childNodes])
43 else: 41 else:
44 # thank you w3c for not providing a nice 42 # thank you w3c for not providing a nice
45 # method to get all attributes of a node 43 # method to get all attributes of a node
46 pass 44 pass
47 zipin.close()
48 except KeyError: # no meta.xml file found 45 except KeyError: # no meta.xml file found
49 logging.debug('%s has no opendocument metadata' % self.filename) 46 logging.debug('%s has no opendocument metadata' % self.filename)
47 zipin.close()
50 return metadata 48 return metadata
51 49
52 def remove_all(self): 50 def remove_all(self):
51 ''' Removes metadata
53 ''' 52 '''
54 FIXME ? 53 return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
55 There is a patch implementing the Zipfile.remove() 54
56 method here : http://bugs.python.org/issue6818 55 def is_clean(self):
56 ''' Check if the file is clean from harmful metadatas
57 ''' 57 '''
58 clean_super = super(OpenDocumentStripper, self).is_clean()
59 if clean_super is False:
60 return False
61
58 zipin = zipfile.ZipFile(self.filename, 'r') 62 zipin = zipfile.ZipFile(self.filename, 'r')
59 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) 63 try:
64 zipin.getinfo('meta.xml')
65 except KeyError: # no meta.xml in the file
66 return True
67 zipin.close()
68 return False
60 69
61 for item in zipin.namelist():
62 name = os.path.join(self.tempdir, item)
63 _, ext = os.path.splitext(name)
64 70
65 if item.endswith('manifest.xml'): 71class OpenXmlStripper(archive.ZipStripper):
66 # contain the list of all files present in the archive 72 ''' Represent an office openxml document, which is like
67 zipin.extract(item, self.tempdir) 73 an opendocument format, with some tricky stuff added.
68 for line in fileinput.input(name, inplace=1): 74 It contains mostly xml, but can have media blobs, crap, ...
69 # remove the line which contains "meta.xml" 75 (I don't like this format.)
70 line = line.strip() 76 '''
71 if not 'meta.xml' in line: 77 def remove_all(self):
72 print line 78 return super(OpenXmlStripper, self).remove_all(
73 zipout.write(name, item) 79 beginning_blacklist=('docProps/'), whitelist=('.rels'))
74 80
75 elif ext in parser.NOMETA or item == 'mimetype': 81 def is_clean(self):
76 # keep NOMETA files, and the "manifest" file 82 ''' Check if the file is clean from harmful metadatas.
77 if item != 'meta.xml': # contains the metadata 83 This implementation is faster than something like
78 zipin.extract(item, self.tempdir) 84 "return this.get_meta() == {}".
79 zipout.write(name, item) 85 '''
86 clean_super = super(OpenXmlStripper, self).is_clean()
87 if clean_super is False:
88 return False
80 89
81 else: 90 zipin = zipfile.ZipFile(self.filename, 'r')
82 zipin.extract(item, self.tempdir) 91 for item in zipin.namelist():
83 if os.path.isfile(name): 92 if item.startswith('docProps/'):
84 try: 93 return False
85 cfile = mat.create_class_file(name, False,
86 add2archive=self.add2archive)
87 cfile.remove_all()
88 logging.debug('Processing %s from %s' % (item,
89 self.filename))
90 zipout.write(name, item)
91 except:
92 logging.info('%s\'s fileformat is not supported' % item)
93 if self.add2archive:
94 zipout.write(name, item)
95 zipout.comment = ''
96 logging.info('%s processed' % self.filename)
97 zipin.close() 94 zipin.close()
98 zipout.close()
99 self.do_backup()
100 return True 95 return True
101 96
102 def is_clean(self): 97 def get_meta(self):
103 ''' Check if the file is clean from harmful metadatas 98 ''' Return a dict with all the meta of the file
104 ''' 99 '''
100 metadata = super(OpenXmlStripper, self).get_meta()
101
105 zipin = zipfile.ZipFile(self.filename, 'r') 102 zipin = zipfile.ZipFile(self.filename, 'r')
106 try: 103 for item in zipin.namelist():
107 zipin.getinfo('meta.xml') 104 if item.startswith('docProps/'):
108 except KeyError: # no meta.xml in the file 105 metadata[item] = 'harmful content'
109 czf = archive.ZipStripper(self.filename, self.parser,
110 'application/zip', False, True, add2archive=self.add2archive)
111 if czf.is_clean():
112 zipin.close()
113 return True
114 zipin.close() 106 zipin.close()
115 return False 107 return metadata
116 108
117 109
118class PdfStripper(parser.GenericParser): 110class PdfStripper(parser.GenericParser):
@@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser):
128 self.pdf_quality = False 120 self.pdf_quality = False
129 121
130 self.document = Poppler.Document.new_from_file(uri, self.password) 122 self.document = Poppler.Document.new_from_file(uri, self.password)
131 self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator', 123 self.meta_list = frozenset(['title', 'author', 'subject',
132 'producer', 'metadata']) 124 'keywords', 'creator', 'producer', 'metadata'])
133 125
134 def is_clean(self): 126 def is_clean(self):
135 ''' Check if the file is clean from harmful metadatas 127 ''' Check if the file is clean from harmful metadatas
@@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser):
168 surface.finish() 160 surface.finish()
169 shutil.move(output, self.output) 161 shutil.move(output, self.output)
170 except: 162 except:
171 logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename) 163 logging.error('Something went wrong when cleaning %s.' % self.filename)
172 return False 164 return False
173 165
174 try: 166 try:
@@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser):
182 writer.write(self.output) 174 writer.write(self.output)
183 self.do_backup() 175 self.do_backup()
184 except: 176 except:
185 logging.error('Unable to remove all metadata from %s, please install\ 177 logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
186pdfrw' % self.output)
187 return False 178 return False
188 return True 179 return True
189 180
@@ -195,73 +186,3 @@ pdfrw' % self.output)
195 if self.document.get_property(key): 186 if self.document.get_property(key):
196 metadata[key] = self.document.get_property(key) 187 metadata[key] = self.document.get_property(key)
197 return metadata 188 return metadata
198
199
200class OpenXmlStripper(archive.GenericArchiveStripper):
201 '''
202 Represent an office openxml document, which is like
203 an opendocument format, with some tricky stuff added.
204 It contains mostly xml, but can have media blobs, crap, ...
205 (I don't like this format.)
206 '''
207 def remove_all(self):
208 '''
209 FIXME ?
210 There is a patch implementing the Zipfile.remove()
211 method here : http://bugs.python.org/issue6818
212 '''
213 zipin = zipfile.ZipFile(self.filename, 'r')
214 zipout = zipfile.ZipFile(self.output, 'w',
215 allowZip64=True)
216 for item in zipin.namelist():
217 name = os.path.join(self.tempdir, item)
218 _, ext = os.path.splitext(name)
219 if item.startswith('docProps/'): # metadatas
220 pass
221 elif ext in parser.NOMETA or item == '.rels':
222 # keep parser.NOMETA files, and the file named ".rels"
223 zipin.extract(item, self.tempdir)
224 zipout.write(name, item)
225 else:
226 zipin.extract(item, self.tempdir)
227 if os.path.isfile(name): # don't care about folders
228 try:
229 cfile = mat.create_class_file(name, False,
230 add2archive=self.add2archive)
231 cfile.remove_all()
232 logging.debug('Processing %s from %s' % (item,
233 self.filename))
234 zipout.write(name, item)
235 except:
236 logging.info('%s\'s fileformat is not supported' % item)
237 if self.add2archive:
238 zipout.write(name, item)
239 zipout.comment = ''
240 logging.info('%s processed' % self.filename)
241 zipin.close()
242 zipout.close()
243 self.do_backup()
244 return True
245
246 def is_clean(self):
247 ''' Check if the file is clean from harmful metadatas
248 '''
249 zipin = zipfile.ZipFile(self.filename, 'r')
250 for item in zipin.namelist():
251 if item.startswith('docProps/'):
252 return False
253 zipin.close()
254 czf = archive.ZipStripper(self.filename, self.parser,
255 'application/zip', False, True, add2archive=self.add2archive)
256 return czf.is_clean()
257
258 def get_meta(self):
259 ''' Return a dict with all the meta of the file
260 '''
261 zipin = zipfile.ZipFile(self.filename, 'r')
262 metadata = {}
263 for item in zipin.namelist():
264 if item.startswith('docProps/'):
265 metadata[item] = 'harmful content'
266 zipin.close()
267 return metadata