summaryrefslogtreecommitdiff
path: root/lib/office.py
diff options
context:
space:
mode:
authorjvoisin2011-08-05 11:11:26 +0200
committerjvoisin2011-08-05 11:11:26 +0200
commit325baae32eb114ff65274faa9bf58c0b9f415927 (patch)
tree81b4ffe793d00642b00d543b6e0de38a635146a6 /lib/office.py
parentad31d77e6a199295ba44832abec35b054d04bced (diff)
Preliminary support for openxml office format
Diffstat (limited to 'lib/office.py')
-rw-r--r--lib/office.py31
1 files changed, 31 insertions, 0 deletions
diff --git a/lib/office.py b/lib/office.py
index f236d09..3cbc566 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -178,3 +178,34 @@ class PdfStripper(parser.GenericParser):
178 self.document.get_property(key) != '': 178 self.document.get_property(key) != '':
179 metadata[key] = self.document.get_property(key) 179 metadata[key] = self.document.get_property(key)
180 return metadata 180 return metadata
181
182
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
        Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, crap, ...
        (I don't like this format.)
    '''
    def is_clean(self):
        '''
            Tell whether the document is free of metadata.

            No real check is performed here: the method
            unconditionally returns False.
        '''
        return False

    def get_meta(self):
        '''
            Return a dict with all the meta of the file

            The archive at self.filename is inspected for the members
            'docProps/app.xml' and 'docProps/core.xml'. For each one
            that is present, a placeholder entry ('app' / 'core') is
            added to the result; the xml itself is not parsed.
            A missing member is only logged at debug level.
        '''
        # (key in the returned dict, member name inside the archive)
        members = (
            ('app', 'docProps/app.xml'),
            ('core', 'docProps/core.xml'),
        )
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for key, member in members:
                try:
                    # getinfo() raises KeyError for an absent member,
                    # without decompressing content we don't use.
                    zipin.getinfo(member)
                except KeyError:  # no such file in the archive
                    logging.debug('%s has no %s.xml metadata',
                                  self.filename, key)
                else:
                    metadata[key] = 'harmful meta'
        finally:
            # Release the file handle even if something unexpected
            # is raised while inspecting the archive.
            zipin.close()

        return metadata