summaryrefslogtreecommitdiff
path: root/lib/office.py
diff options
context:
space:
mode:
authorjvoisin2012-02-01 22:56:04 +0100
committerjvoisin2012-02-01 22:56:04 +0100
commit544fe9bf1782a027b3f31bf4c10a050d783e32ac (patch)
treea8dd60b9ae45efea4875fdb827070531f0199717 /lib/office.py
parent9ea6dc6960cebfa70d18ba8ee49d775ea91c9b34 (diff)
Rename mat-cli to mat-gui
Diffstat (limited to 'lib/office.py')
-rw-r--r--lib/office.py305
1 files changed, 305 insertions, 0 deletions
diff --git a/lib/office.py b/lib/office.py
new file mode 100644
index 0000000..e1d738e
--- /dev/null
+++ b/lib/office.py
@@ -0,0 +1,305 @@
1'''
2 Care about office's formats
3'''
4
5import os
6import logging
7import zipfile
8import fileinput
9import subprocess
10import xml.dom.minidom as minidom
11
12try:
13 import cairo
14 import poppler
15except ImportError:
16 pass
17
18import mat
19import parser
20import archive
21
22class OpenDocumentStripper(archive.GenericArchiveStripper):
23 '''
24 An open document file is a zip, with xml file into.
25 The one that interest us is meta.xml
26 '''
27
28 def get_meta(self):
29 '''
30 Return a dict with all the meta of the file by
31 trying to read the meta.xml file.
32 '''
33 zipin = zipfile.ZipFile(self.filename, 'r')
34 metadata = {}
35 try:
36 content = zipin.read('meta.xml')
37 dom1 = minidom.parseString(content)
38 elements = dom1.getElementsByTagName('office:meta')
39 for i in elements[0].childNodes:
40 if i.tagName != 'meta:document-statistic':
41 nodename = ''.join([k for k in i.nodeName.split(':')[1:]])
42 metadata[nodename] = ''.join([j.data for j in i.childNodes])
43 else:
44 # thank you w3c for not providing a nice
45 # method to get all attributes from a node
46 pass
47 zipin.close()
48 except KeyError: # no meta.xml file found
49 logging.debug('%s has no opendocument metadata' % self.filename)
50 return metadata
51
52 def _remove_all(self, method):
53 '''
54 FIXME ?
55 There is a patch implementing the Zipfile.remove()
56 method here : http://bugs.python.org/issue6818
57 '''
58 zipin = zipfile.ZipFile(self.filename, 'r')
59 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
60
61 for item in zipin.namelist():
62 name = os.path.join(self.tempdir, item)
63 _, ext = os.path.splitext(name)
64
65 if item.endswith('manifest.xml'):
66 # contain the list of all files present in the archive
67 zipin.extract(item, self.tempdir)
68 for line in fileinput.input(name, inplace=1):
69 #remove the line which contains "meta.xml"
70 line = line.strip()
71 if not 'meta.xml' in line:
72 print line
73 zipout.write(name, item)
74
75 elif ext in parser.NOMETA or item == 'mimetype':
76 #keep NOMETA files, and the "manifest" file
77 if item != 'meta.xml': # contains the metadata
78 zipin.extract(item, self.tempdir)
79 zipout.write(name, item)
80
81 else:
82 zipin.extract(item, self.tempdir)
83 if os.path.isfile(name):
84 try:
85 cfile = mat.create_class_file(name, False,
86 self.add2archive)
87 if method == 'normal':
88 cfile.remove_all()
89 else:
90 cfile.remove_all_strict()
91 logging.debug('Processing %s from %s' % (item,
92 self.filename))
93 zipout.write(name, item)
94 except:
95 logging.info('%s\' fileformat is not supported' % item)
96 if self.add2archive:
97 zipout.write(name, item)
98 zipout.comment = ''
99 logging.info('%s treated' % self.filename)
100 zipin.close()
101 zipout.close()
102 self.do_backup()
103 return True
104
105 def is_clean(self):
106 '''
107 Check if the file is clean from harmful metadatas
108 '''
109 zipin = zipfile.ZipFile(self.filename, 'r')
110 try:
111 zipin.getinfo('meta.xml')
112 except KeyError: # no meta.xml in the file
113 czf = archive.ZipStripper(self.filename, self.parser,
114 'application/zip', self.backup, self.add2archive)
115 if czf.is_clean():
116 zipin.close()
117 return True
118 zipin.close()
119 return False
120
121
class PdfStripper(parser.GenericParser):
    '''
    Represent a PDF file
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(PdfStripper, self).__init__(filename, parser, mime, backup,
            add2archive)
        uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.document = poppler.document_new_from_file(uri, self.password)
        # poppler document properties that may carry harmful metadata
        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
            'producer', 'metadata')

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        for key in self.meta_list:
            value = self.document.get_property(key)
            if value is not None and value != '':
                return False
        return True

    def remove_all(self):
        '''
        Remove superficial metadata only
        '''
        return self._remove_meta()

    def remove_all_strict(self):
        '''
        Opening the PDF with poppler, then doing a render
        on a cairo pdfsurface for each pages.
        Thanks to Lunar for the idea.
        http://cairographics.org/documentation/pycairo/2/
        python-poppler is not documented at all : have fun ;)
        '''
        page = self.document.get_page(0)
        page_width, page_height = page.get_size()
        surface = cairo.PDFSurface(self.output, page_width, page_height)
        context = cairo.Context(surface)  # context draws on the surface
        logging.debug('PDF rendering of %s' % self.filename)
        for pagenum in xrange(self.document.get_n_pages()):
            page = self.document.get_page(pagenum)
            context.translate(0, 0)
            page.render(context)  # render the page on context
            context.show_page()  # draw context on surface
        surface.finish()
        return self._remove_meta()

    def _remove_meta(self):
        '''
        Remove superficial/external metadata
        from a PDF file, using pdfrw,
        or exiftool if pdfrw is not installed
        (the original docstring had the order backwards)
        '''
        processed = False
        try:  # try with pdfrw
            import pdfrw
            # For now, poppler cannot write meta, so we must use pdfrw
            logging.debug('Removing %s\'s superficial metadata' % self.filename)
            trailer = pdfrw.PdfReader(self.output)
            # FIX: the original assigned trailer.Author instead of
            # trailer.Info.Author, so the Author field was never cleared
            trailer.Info.Producer = trailer.Info.Author = \
                trailer.Info.Creator = None
            writer = pdfrw.PdfWriter()
            writer.trailer = trailer
            writer.write(self.output)
            self.do_backup()
            processed = True
        except Exception:
            pass

        try:  # try with exiftool
            # probe: raises OSError if the exiftool binary is not installed
            # FIX: os.devnull must be opened for *writing* to absorb stdout;
            # the original opened it read-only ('r'), so the child could not
            # write to it
            subprocess.Popen('exiftool', stdout=open(os.devnull, 'w'))
            import exiftool  # NOTE(review): presumably project-local; verify
            # Note: '-All=' must be followed by a known exiftool option.
            if self.backup:
                process = subprocess.Popen(['exiftool', '-m', '-All=',
                    '-out', self.output, self.filename],
                    stdout=open(os.devnull, 'w'))
                process.wait()
            else:
                process = subprocess.Popen(
                    ['exiftool', '-All=', '-overwrite_original', self.filename],
                    stdout=open(os.devnull, 'w'))
                process.wait()
            processed = True
        except Exception:
            pass

        if processed is False:
            # FIX: the original backslash-continued string embedded a run of
            # leading spaces in the middle of the message
            logging.error('Please install either pdfrw, or exiftool to '
                'fully handle PDF files')
        return processed

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        for key in self.meta_list:
            value = self.document.get_property(key)
            if value is not None and value != '':
                metadata[key] = value
        return metadata
229
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
    Represent an office openxml document, which is like
    an opendocument format, with some tricky stuff added.
    It contains mostly xml, but can have media blobs, crap, ...
    (I don't like this format.)
    '''
    def _remove_all(self, method):
        '''
        Write a cleaned copy of the archive to self.output:
        the whole docProps/ tree (metadata) is dropped, and every other
        member is cleaned according to `method`
        ('normal' -> remove_all(), anything else -> remove_all_strict()).

        FIXME ?
        There is a patch implementing the Zipfile.remove()
        method here : http://bugs.python.org/issue6818
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w',
            allowZip64=True)
        for item in zipin.namelist():
            name = os.path.join(self.tempdir, item)
            _, ext = os.path.splitext(name)
            if item.startswith('docProps/'):  # metadatas
                pass
            elif ext in parser.NOMETA or item == '.rels':
                # keep parser.NOMETA files, and the file named ".rels"
                zipin.extract(item, self.tempdir)
                zipout.write(name, item)
            else:
                zipin.extract(item, self.tempdir)
                if os.path.isfile(name):  # don't care about folders
                    try:
                        cfile = mat.create_class_file(name, False,
                            self.add2archive)
                        if method == 'normal':
                            cfile.remove_all()
                        else:
                            cfile.remove_all_strict()
                        logging.debug('Processing %s from %s' % (item,
                            self.filename))
                        zipout.write(name, item)
                    except Exception:
                        # FIX: narrowed the bare "except:" so that
                        # KeyboardInterrupt/SystemExit are not swallowed
                        logging.info('%s\' fileformat is not supported' % item)
                        if self.add2archive:
                            zipout.write(name, item)
        zipout.comment = ''
        logging.info('%s treated' % self.filename)
        zipin.close()
        zipout.close()
        self.do_backup()
        return True

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for item in zipin.namelist():
                if item.startswith('docProps/'):  # harmful metadata
                    return False
        finally:
            # FIX: the original returned from inside the loop without
            # closing the archive, leaking the file handle
            zipin.close()
        # no docProps/ entries: fall back to a generic zip check
        czf = archive.ZipStripper(self.filename, self.parser,
            'application/zip', self.backup, self.add2archive)
        return czf.is_clean()

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        for item in zipin.namelist():
            if item.startswith('docProps/'):
                metadata[item] = 'harmful content'
        zipin.close()
        return metadata