summaryrefslogtreecommitdiff
path: root/MAT/office.py
diff options
context:
space:
mode:
authorjvoisin2012-12-08 02:02:25 +0100
committerjvoisin2012-12-13 14:24:01 +0100
commitcbf8a2a65928694202e19b6bcf56ec84bcbf613c (patch)
treee106475b0d5c003505336b5ae6416e4508bb768b /MAT/office.py
parent67d5c1fa6b9ab6e1e7328ee57b15d8e46526d72a (diff)
Reorganize source tree and files installation location, cleanup setup.py (Closes: #689409)
Diffstat (limited to 'MAT/office.py')
-rw-r--r--MAT/office.py265
1 files changed, 265 insertions, 0 deletions
diff --git a/MAT/office.py b/MAT/office.py
new file mode 100644
index 0000000..d14125b
--- /dev/null
+++ b/MAT/office.py
@@ -0,0 +1,265 @@
1'''
2 Handle office document formats (OpenDocument, PDF, Office Open XML)
3'''
4
5import os
6import logging
7import zipfile
8import fileinput
9import xml.dom.minidom as minidom
10
11try:
12 import cairo
13 import poppler
14except ImportError:
15 pass
16
17import mat
18import parser
19import archive
20
21
class OpenDocumentStripper(archive.GenericArchiveStripper):
    '''
        An OpenDocument file is a zip archive containing xml files.
        The member holding the metadata is meta.xml.
    '''

    def get_meta(self):
        '''
            Return a dict with the metadata read from meta.xml.

            Keys are the element names of the children of <office:meta>
            with their namespace prefix removed; values are the
            concatenated character data of each element. The
            <meta:document-statistic> element is skipped.
            An empty dict is returned when the archive has no meta.xml
            or when meta.xml has no <office:meta> element.
        '''
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            content = zipin.read('meta.xml')
        except KeyError:  # no meta.xml file found
            logging.debug('%s has no opendocument metadata' % self.filename)
            return metadata
        finally:
            # release the handle on every path, including the KeyError one
            zipin.close()

        dom1 = minidom.parseString(content)
        elements = dom1.getElementsByTagName('office:meta')
        if not elements:
            return metadata

        for node in elements[0].childNodes:
            # whitespace/text/comment nodes between the elements have no
            # tagName: only real elements carry a piece of metadata
            if node.nodeType != node.ELEMENT_NODE:
                continue
            if node.tagName == 'meta:document-statistic':
                # thank you w3c for not providing a nice
                # method to get all attributes from a node
                continue
            nodename = ''.join(node.nodeName.split(':')[1:])
            # nested elements have no .data attribute, skip them
            metadata[nodename] = ''.join(
                [child.data for child in node.childNodes
                    if child.nodeType != child.ELEMENT_NODE])
        return metadata

    def _rewrite_manifest(self, name):
        '''
            Rewrite in place the extracted manifest file `name`,
            dropping every line that mentions "meta.xml".
            Remaining lines are written back stripped of their
            surrounding whitespace.
        '''
        try:
            # with inplace=1, standard output is redirected to the file
            for line in fileinput.input(name, inplace=1):
                line = line.strip()
                if 'meta.xml' not in line:
                    print(line)
        finally:
            # make sure stdout is restored even if the loop failed
            fileinput.close()

    def _add_cleaned_member(self, zipout, name, item):
        '''
            Clean the extracted file `name` with the matching stripper
            and store it in `zipout` under the archive name `item`.
            When the file can not be cleaned, it is stored untouched
            only if self.add2archive is set. Folders are ignored.
        '''
        if not os.path.isfile(name):  # don't care about folders
            return
        try:
            cfile = mat.create_class_file(name, False, self.add2archive)
            cfile.remove_all()
            logging.debug('Processing %s from %s' % (item, self.filename))
            zipout.write(name, item)
        except Exception:
            logging.info('%s\' fileformat is not supported' % item)
            if self.add2archive:
                zipout.write(name, item)

    def _remove_all(self):
        '''
            Write to self.output a copy of the archive without meta.xml,
            then call self.do_backup(). Return True on success.

            FIXME ?
            There is a patch implementing the Zipfile.remove()
            method here : http://bugs.python.org/issue6818
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
            try:
                for item in zipin.namelist():
                    name = os.path.join(self.tempdir, item)
                    _, ext = os.path.splitext(name)

                    if item.endswith('manifest.xml'):
                        # contains the list of all files present
                        # in the archive
                        zipin.extract(item, self.tempdir)
                        self._rewrite_manifest(name)
                        zipout.write(name, item)
                    elif ext in parser.NOMETA or item == 'mimetype':
                        # keep NOMETA files, and the "mimetype" file
                        if item != 'meta.xml':  # contains the metadata
                            zipin.extract(item, self.tempdir)
                            zipout.write(name, item)
                    else:
                        zipin.extract(item, self.tempdir)
                        self._add_cleaned_member(zipout, name, item)
                zipout.comment = ''
                logging.info('%s treated' % self.filename)
            finally:
                zipout.close()
        finally:
            zipin.close()
        self.do_backup()
        return True

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadatas:
            it must not contain a meta.xml member, and must be
            reported clean by archive.ZipStripper.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            zipin.getinfo('meta.xml')
        except KeyError:  # no meta.xml in the file
            pass
        else:
            return False
        finally:
            zipin.close()
        czf = archive.ZipStripper(self.filename, self.parser,
                'application/zip', self.backup, self.add2archive)
        return bool(czf.is_clean())
117
118
class PdfStripper(parser.GenericParser):
    '''
        Represent a PDF file
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        '''
            Open the document with poppler (without password) and
            define the list of document properties treated as metadata.
        '''
        super(PdfStripper, self).__init__(filename, parser, mime, backup,
                add2archive)
        # NOTE(review): the path is not percent-encoded, filenames with
        # special characters may not form a valid URI - confirm
        uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.document = poppler.document_new_from_file(uri, self.password)
        self.meta_list = frozenset(['title', 'author', 'subject',
            'keywords', 'creator', 'producer', 'metadata'])

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadatas:
            return False as soon as one property of self.meta_list is set.
        '''
        for key in self.meta_list:
            if self.document.get_property(key):
                return False
        return True

    def remove_all(self):
        '''
            Remove metadata
        '''
        return self._remove_meta()

    def _remove_meta(self):
        '''
            Opening the PDF with poppler, then doing a render
            on a cairo pdfsurface for each pages. The Producer and
            Creator fields of the rendered file are then blanked
            with pdfrw.

            Return True on success, False if pdfrw is not available
            or if the second pass failed.

            http://cairographics.org/documentation/pycairo/2/
            python-poppler is not documented at all : have fun ;)
        '''
        page = self.document.get_page(0)
        page_width, page_height = page.get_size()
        surface = cairo.PDFSurface(self.output, page_width, page_height)
        context = cairo.Context(surface)  # context draws on the surface
        logging.debug('PDF rendering of %s' % self.filename)
        for pagenum in range(self.document.get_n_pages()):
            page = self.document.get_page(pagenum)
            # pages may differ in size: resize the surface before
            # anything is drawn on the current page
            page_width, page_height = page.get_size()
            surface.set_size(page_width, page_height)
            page.render_for_printing(context)  # render the page on context
            context.show_page()  # draw context on surface
        surface.finish()

        try:
            # For now, poppler cannot write meta, so we must use pdfrw
            import pdfrw
        except ImportError:
            print('Unable to remove all metadata from %s, please install '
                    'pdfrw' % self.output)
            return False

        try:
            logging.debug('Removing %s\'s superficial metadata' %
                    self.filename)
            trailer = pdfrw.PdfReader(self.output)
            trailer.Info.Producer = None
            trailer.Info.Creator = None
            writer = pdfrw.PdfWriter()
            writer.trailer = trailer
            writer.write(self.output)
            self.do_backup()
        except Exception:
            # pdfrw is installed, so report the real failure instead of
            # asking the user to install it
            logging.exception('Unable to remove all metadata from %s' %
                    self.output)
            return False
        return True

    def get_meta(self):
        '''
            Return a dict with all the meta of the file: one entry
            for each property of self.meta_list which is set.
        '''
        metadata = {}
        for key in self.meta_list:
            value = self.document.get_property(key)
            if value:
                metadata[key] = value
        return metadata
194
195
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
        Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, ...
        The metadata is stored in the members under docProps/.
    '''

    def _add_cleaned_member(self, zipout, name, item):
        '''
            Clean the extracted file `name` with the matching stripper
            and store it in `zipout` under the archive name `item`.
            When the file can not be cleaned, it is stored untouched
            only if self.add2archive is set. Folders are ignored.
        '''
        if not os.path.isfile(name):  # don't care about folders
            return
        try:
            cfile = mat.create_class_file(name, False, self.add2archive)
            cfile.remove_all()
            logging.debug('Processing %s from %s' % (item, self.filename))
            zipout.write(name, item)
        except Exception:
            logging.info('%s\' fileformat is not supported' % item)
            if self.add2archive:
                zipout.write(name, item)

    def _remove_all(self):
        '''
            Write to self.output a copy of the archive without the
            docProps/ members, then call self.do_backup().
            Return True on success.

            FIXME ?
            There is a patch implementing the Zipfile.remove()
            method here : http://bugs.python.org/issue6818
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
            try:
                for item in zipin.namelist():
                    if item.startswith('docProps/'):  # metadatas
                        continue
                    name = os.path.join(self.tempdir, item)
                    _, ext = os.path.splitext(name)
                    # NOTE(review): a member stored as "_rels/.rels" does
                    # not equal ".rels" - confirm which name is expected
                    if ext in parser.NOMETA or item == '.rels':
                        # keep parser.NOMETA files,
                        # and the file named ".rels"
                        zipin.extract(item, self.tempdir)
                        zipout.write(name, item)
                    else:
                        zipin.extract(item, self.tempdir)
                        self._add_cleaned_member(zipout, name, item)
                zipout.comment = ''
                logging.info('%s treated' % self.filename)
            finally:
                zipout.close()
        finally:
            zipin.close()
        self.do_backup()
        return True

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadatas:
            it must not contain any member under docProps/, and must
            be reported clean by archive.ZipStripper.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    return False
        finally:
            # the handle is released on the early return too
            zipin.close()
        czf = archive.ZipStripper(self.filename, self.parser,
                'application/zip', self.backup, self.add2archive)
        return czf.is_clean()

    def get_meta(self):
        '''
            Return a dict with all the meta of the file: every member
            under docProps/ is reported as 'harmful content'.
        '''
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    metadata[item] = 'harmful content'
        finally:
            zipin.close()
        return metadata
265 return metadata