summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: jvoisin 2011-08-05 11:11:26 +0200
committer: jvoisin 2011-08-05 11:11:26 +0200
commit: 325baae32eb114ff65274faa9bf58c0b9f415927 (patch)
tree: 81b4ffe793d00642b00d543b6e0de38a635146a6 /lib
parent: ad31d77e6a199295ba44832abec35b054d04bced (diff)
Preliminary support for openxml office format
Diffstat (limited to 'lib')
-rw-r--r-- lib/mat.py 12
-rw-r--r-- lib/office.py 31
2 files changed, 40 insertions, 3 deletions
diff --git a/lib/mat.py b/lib/mat.py
index 8fe6fb4..9f3f6c5 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -7,6 +7,7 @@
7import os 7import os
8import subprocess 8import subprocess
9import logging 9import logging
10import mimetypes
10import xml.sax 11import xml.sax
11 12
12import hachoir_core.cmd_line 13import hachoir_core.cmd_line
@@ -30,9 +31,11 @@ STRIPPERS = {
30 'application/x-bzip2': archive.Bzip2Stripper, 31 'application/x-bzip2': archive.Bzip2Stripper,
31 'application/zip': archive.ZipStripper, 32 'application/zip': archive.ZipStripper,
32 'audio/mpeg': audio.MpegAudioStripper, 33 'audio/mpeg': audio.MpegAudioStripper,
34 'image/gif': images.GifStripper,
33 'image/jpeg': images.JpegStripper, 35 'image/jpeg': images.JpegStripper,
34 'image/png': images.PngStripper, 36 'image/png': images.PngStripper,
35 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, 37 'application/vnd.oasis.opendocument': office.OpenDocumentStripper,
38 'application/vnd.openxmlformats-officedocument': office.OpenXmlStripper,
36} 39}
37 40
38try: 41try:
@@ -140,15 +143,18 @@ def create_class_file(name, backup, add2archive):
140 143
141 mime = parser.mime_type 144 mime = parser.mime_type
142 145
146 if mime == 'application/zip': # some formats are zipped stuff
147 mime = mimetypes.guess_type(name)[0]
148
143 if mime.startswith('application/vnd.oasis.opendocument'): 149 if mime.startswith('application/vnd.oasis.opendocument'):
144 mime = 'application/vnd.oasis.opendocument' # opendocument fileformat 150 mime = 'application/vnd.oasis.opendocument' # opendocument fileformat
145 151 elif mime.startswith('application/vnd.openxmlformats-officedocument'):
146 #stripper_class = STRIPPERS[mime] 152 mime = 'application/vnd.openxmlformats-officedocument'
147 153
148 try: 154 try:
149 stripper_class = STRIPPERS[mime] 155 stripper_class = STRIPPERS[mime]
150 except KeyError: 156 except KeyError:
151 logging.info('Don\'t have stripper for %s\'s format' % name) 157 logging.info('Don\'t have stripper for %s format' % mime)
152 return 158 return
153 159
154 return stripper_class(filename, parser, mime, backup, add2archive) 160 return stripper_class(filename, parser, mime, backup, add2archive)
diff --git a/lib/office.py b/lib/office.py
index f236d09..3cbc566 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -178,3 +178,34 @@ class PdfStripper(parser.GenericParser):
178 self.document.get_property(key) != '': 178 self.document.get_property(key) != '':
179 metadata[key] = self.document.get_property(key) 179 metadata[key] = self.document.get_property(key)
180 return metadata 180 return metadata
181
182
183class OpenXmlStripper(archive.GenericArchiveStripper):
184 '''
185 Represent an office openxml document, which is like
186 an opendocument format, with some tricky stuff added.
187 It contains mostly xml, but can have media blobs, crap, ...
188 (I don't like this format.)
189 '''
190 def is_clean(self):
191 return False
192
193 def get_meta(self):
194 '''
195 Return a dict with all the meta of the file
196 '''
197 zipin = zipfile.ZipFile(self.filename, 'r')
198 metadata = {}
199 try:
200 content = zipin.read('docProps/app.xml')
201 metadata['app'] = 'harful meta'
202 except KeyError: # no app.xml file found
203 logging.debug('%s has no app.xml metadata' % self.filename)
204 try:
205 content = zipin.read('docProps/core.xml')
206 metadata['core'] = 'harmful meta'
207 except KeyError: # no core.xml found
208 logging.debug('%s has no core.xml metadata' % self.filename)
209 zipin.close()
210
211 return metadata