summaryrefslogtreecommitdiff
path: root/lib/office.py
diff options
context:
space:
mode:
authorjvoisin2011-08-16 18:11:24 +0200
committerjvoisin2011-08-16 18:11:24 +0200
commit4bd3e47da02fde08acfada1795cc55170abdb00a (patch)
treef8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /lib/office.py
parentbaf8e080125614326ba9c96ca8f2404fd12b050e (diff)
setup.py now works !
Diffstat (limited to 'lib/office.py')
-rw-r--r--lib/office.py257
1 files changed, 0 insertions, 257 deletions
diff --git a/lib/office.py b/lib/office.py
deleted file mode 100644
index 33af48e..0000000
--- a/lib/office.py
+++ /dev/null
@@ -1,257 +0,0 @@
1'''
2 Care about office's formats
3'''
4
5import os
6import logging
7import zipfile
8import fileinput
9
10try:
11 import cairo
12 import poppler
13except ImportError:
14 pass
15
16import mat
17import parser
18import archive
19import pdfrw
20
21
class OpenDocumentStripper(archive.GenericArchiveStripper):
    '''
        Strip the metadata of OpenDocument files.

        An OpenDocument file is a zip archive holding xml files;
        the member of interest here is meta.xml.
    '''

    def get_meta(self):
        '''
            Return a dict describing the metadata of the file.

            The dict holds one entry, keyed by the file name, when the
            archive has a meta.xml member; it is empty otherwise.
        '''
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            # getinfo() raises KeyError when the member is missing and,
            # unlike read(), does not decompress it for nothing.
            zipin.getinfo('meta.xml')
            # NOTE(review): 'harful' (sic) is kept byte-for-byte in case
            # a caller displays or compares this value.
            metadata[self.filename] = 'harful meta'
        except KeyError:  # no meta.xml file found
            logging.debug('%s has no opendocument metadata', self.filename)
        finally:
            zipin.close()
        return metadata

    def _remove_all(self, method):
        '''
            Write a cleaned copy of the archive to self.output, then
            call self.do_backup().

            method -- 'normal' cleans embedded files with remove_all(),
                      any other value uses remove_all_ugly().

            FIXME ?
            There is a patch implementing the Zipfile.remove()
            method here : http://bugs.python.org/issue6818
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
            try:
                for item in zipin.namelist():
                    self._copy_member(zipin, zipout, item, method)
                zipout.comment = ''
            finally:
                # close both archives even if a member failed
                zipout.close()
        finally:
            zipin.close()
        logging.info('%s treated', self.filename)
        self.do_backup()

    def _copy_member(self, zipin, zipout, item, method):
        '''
            Copy one member of zipin to zipout, cleaning it on the way.
        '''
        if item == 'meta.xml':
            # the metadata itself: never copied to the output
            return

        name = os.path.join(self.tempdir, item)
        _, ext = os.path.splitext(name)

        if item.endswith('manifest.xml'):
            # contains the list of all files present in the archive
            zipin.extract(item, self.tempdir)
            self._drop_meta_from_manifest(name)
            zipout.write(name, item)
        elif ext in parser.NOMETA or item == 'mimetype':
            # keep NOMETA files and the "mimetype" file untouched
            zipin.extract(item, self.tempdir)
            zipout.write(name, item)
        else:
            zipin.extract(item, self.tempdir)
            if os.path.isfile(name):  # don't care about folders
                self._clean_member(zipout, name, item, method)

    @staticmethod
    def _drop_meta_from_manifest(name):
        '''
            Rewrite the file "name" in place: every line is stripped of
            its surrounding whitespace, and the lines mentioning
            "meta.xml" are removed.
        '''
        with open(name) as manifest:
            kept = [line.strip() for line in manifest
                    if 'meta.xml' not in line]
        with open(name, 'w') as manifest:
            manifest.writelines(line + '\n' for line in kept)

    def _clean_member(self, zipout, name, item, method):
        '''
            Clean the extracted file "name" and add it to zipout as
            "item". A file that can not be cleaned is only added when
            self.add2archive is set.
        '''
        try:
            cfile = mat.create_class_file(name, False, self.add2archive)
            if method == 'normal':
                cfile.remove_all()
            else:
                cfile.remove_all_ugly()
            logging.debug('Processing %s from %s', item, self.filename)
            zipout.write(name, item)
        except Exception:
            # best effort: any failure is reported as an unsupported
            # format, but KeyboardInterrupt/SystemExit now propagate
            logging.info('%s\' fileformat is not supported', item)
            if self.add2archive:
                zipout.write(name, item)

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadata.

            Return False when the archive has a meta.xml member;
            otherwise return the verdict of archive.ZipStripper
            on the same file.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            zipin.getinfo('meta.xml')
        except KeyError:  # no meta.xml in the file
            czf = archive.ZipStripper(self.filename, self.parser,
                'application/zip', self.backup, self.add2archive)
            return bool(czf.is_clean())
        finally:
            zipin.close()
        return False
110
111
class PdfStripper(parser.GenericParser):
    '''
        Represent a pdf file, opened through poppler.
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(PdfStripper, self).__init__(filename, parser, mime, backup,
            add2archive)
        # NOTE(review): the path is not percent-encoded; names holding
        # characters such as '%' or '#' may not form a valid URI - confirm.
        uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.document = poppler.document_new_from_file(uri, self.password)
        # names of the document properties considered as metadata
        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
            'producer', 'creation-date', 'mod-date', 'metadata')

    def _iter_meta(self):
        '''
            Yield a (key, value) pair for every property of
            self.meta_list that holds a value.

            The two date properties are considered unset when equal
            to -1; the other ones when None or the empty string.
        '''
        for key in self.meta_list:
            value = self.document.get_property(key)
            if key in ('creation-date', 'mod-date'):
                if value != -1:
                    yield key, value
            elif value is not None and value != '':
                yield key, value

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadata.

            Return True when no property of self.meta_list is set.
        '''
        return not any(True for _ in self._iter_meta())

    def remove_all(self):
        '''
            Open the pdf with poppler, then render each page
            on a cairo pdfsurface written to self.output.
            Thanks to Lunar^ for the idea.
            http://cairographics.org/documentation/pycairo/2/
            python-poppler is not documented at all : have fun ;)

            The Producer and Creator entries of the rendered file are
            then removed with pdfrw, and self.do_backup() is called.
        '''
        # the surface needs an initial size: use the first page's one
        page = self.document.get_page(0)
        page_width, page_height = page.get_size()
        surface = cairo.PDFSurface(self.output, page_width, page_height)
        context = cairo.Context(surface)  # context draws on the surface
        logging.debug('Pdf rendering of %s', self.filename)
        for pagenum in range(self.document.get_n_pages()):
            page = self.document.get_page(pagenum)
            # pages may differ in size: resize the surface before any
            # drawing happens on the current page
            page_width, page_height = page.get_size()
            surface.set_size(page_width, page_height)
            page.render(context)  # render the page on context
            context.show_page()  # draw context on surface
        surface.finish()

        # For now, poppler cannot write meta, so we must use pdfrw
        logging.debug('Removing %s\'s superficial metadata', self.filename)
        trailer = pdfrw.PdfReader(self.output)
        trailer.Info.Producer = trailer.Info.Creator = None
        writer = pdfrw.PdfWriter()
        writer.trailer = trailer
        writer.write(self.output)
        self.do_backup()

    def get_meta(self):
        '''
            Return a dict with all the meta of the file, mapping
            each set property of self.meta_list to its value.
        '''
        return dict(self._iter_meta())
181
182
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
        Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, crap, ...
        (I don't like this format.)

        Every member stored under docProps/ is treated as metadata.
    '''
    def _remove_all(self, method):
        '''
            Write a cleaned copy of the archive to self.output, then
            call self.do_backup().

            method -- 'normal' cleans embedded files with remove_all(),
                      any other value uses remove_all_ugly().

            FIXME ?
            There is a patch implementing the Zipfile.remove()
            method here : http://bugs.python.org/issue6818
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
            try:
                for item in zipin.namelist():
                    self._copy_member(zipin, zipout, item, method)
                zipout.comment = ''
            finally:
                # close both archives even if a member failed
                zipout.close()
        finally:
            zipin.close()
        logging.info('%s treated', self.filename)
        self.do_backup()

    def _copy_member(self, zipin, zipout, item, method):
        '''
            Copy one member of zipin to zipout, cleaning it on the way.
        '''
        if item.startswith('docProps/'):  # metadata: never copied
            return

        name = os.path.join(self.tempdir, item)
        _, ext = os.path.splitext(name)

        # NOTE(review): the relationships member is usually stored as
        # "_rels/.rels", which would not equal '.rels' - confirm that
        # this comparison can match.
        if ext in parser.NOMETA or item == '.rels':
            # keep parser.NOMETA files, and the file named ".rels"
            zipin.extract(item, self.tempdir)
            zipout.write(name, item)
        else:
            zipin.extract(item, self.tempdir)
            if os.path.isfile(name):  # don't care about folders
                self._clean_member(zipout, name, item, method)

    def _clean_member(self, zipout, name, item, method):
        '''
            Clean the extracted file "name" and add it to zipout as
            "item". A file that can not be cleaned is only added when
            self.add2archive is set.
        '''
        try:
            cfile = mat.create_class_file(name, False, self.add2archive)
            if method == 'normal':
                cfile.remove_all()
            else:
                cfile.remove_all_ugly()
            logging.debug('Processing %s from %s', item, self.filename)
            zipout.write(name, item)
        except Exception:
            # best effort: any failure is reported as an unsupported
            # format, but KeyboardInterrupt/SystemExit now propagate
            logging.info('%s\' fileformat is not supported', item)
            if self.add2archive:
                zipout.write(name, item)

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadata.

            Return False when the archive has a member under
            docProps/; otherwise return the verdict of
            archive.ZipStripper on the same file.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    return False
        finally:
            # also runs on the early return above
            zipin.close()
        czf = archive.ZipStripper(self.filename, self.parser,
            'application/zip', self.backup, self.add2archive)
        return bool(czf.is_clean())

    def get_meta(self):
        '''
            Return a dict with all the meta of the file: one
            'harmful content' entry per member under docProps/.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            names = zipin.namelist()
        finally:
            zipin.close()
        return dict((item, 'harmful content') for item in names
            if item.startswith('docProps/'))