diff options
| author | jvoisin | 2011-08-05 11:11:26 +0200 |
|---|---|---|
| committer | jvoisin | 2011-08-05 11:11:26 +0200 |
| commit | 325baae32eb114ff65274faa9bf58c0b9f415927 (patch) | |
| tree | 81b4ffe793d00642b00d543b6e0de38a635146a6 /lib | |
| parent | ad31d77e6a199295ba44832abec35b054d04bced (diff) | |
Preliminary support for openxml office format
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/mat.py | 12 | ||||
| -rw-r--r-- | lib/office.py | 31 |
2 files changed, 40 insertions, 3 deletions
| @@ -7,6 +7,7 @@ | |||
| 7 | import os | 7 | import os |
| 8 | import subprocess | 8 | import subprocess |
| 9 | import logging | 9 | import logging |
| 10 | import mimetypes | ||
| 10 | import xml.sax | 11 | import xml.sax |
| 11 | 12 | ||
| 12 | import hachoir_core.cmd_line | 13 | import hachoir_core.cmd_line |
| @@ -30,9 +31,11 @@ STRIPPERS = { | |||
| 30 | 'application/x-bzip2': archive.Bzip2Stripper, | 31 | 'application/x-bzip2': archive.Bzip2Stripper, |
| 31 | 'application/zip': archive.ZipStripper, | 32 | 'application/zip': archive.ZipStripper, |
| 32 | 'audio/mpeg': audio.MpegAudioStripper, | 33 | 'audio/mpeg': audio.MpegAudioStripper, |
| 34 | 'image/gif': images.GifStripper, | ||
| 33 | 'image/jpeg': images.JpegStripper, | 35 | 'image/jpeg': images.JpegStripper, |
| 34 | 'image/png': images.PngStripper, | 36 | 'image/png': images.PngStripper, |
| 35 | 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, | 37 | 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, |
| 38 | 'application/vnd.openxmlformats-officedocument': office.OpenXmlStripper, | ||
| 36 | } | 39 | } |
| 37 | 40 | ||
| 38 | try: | 41 | try: |
| @@ -140,15 +143,18 @@ def create_class_file(name, backup, add2archive): | |||
| 140 | 143 | ||
| 141 | mime = parser.mime_type | 144 | mime = parser.mime_type |
| 142 | 145 | ||
| 146 | if mime == 'application/zip': # some formats are zipped stuff | ||
| 147 | mime = mimetypes.guess_type(name)[0] | ||
| 148 | |||
| 143 | if mime.startswith('application/vnd.oasis.opendocument'): | 149 | if mime.startswith('application/vnd.oasis.opendocument'): |
| 144 | mime = 'application/vnd.oasis.opendocument' # opendocument fileformat | 150 | mime = 'application/vnd.oasis.opendocument' # opendocument fileformat |
| 145 | 151 | elif mime.startswith('application/vnd.openxmlformats-officedocument'): | |
| 146 | #stripper_class = STRIPPERS[mime] | 152 | mime = 'application/vnd.openxmlformats-officedocument' |
| 147 | 153 | ||
| 148 | try: | 154 | try: |
| 149 | stripper_class = STRIPPERS[mime] | 155 | stripper_class = STRIPPERS[mime] |
| 150 | except KeyError: | 156 | except KeyError: |
| 151 | logging.info('Don\'t have stripper for %s\'s format' % name) | 157 | logging.info('Don\'t have stripper for %s format' % mime) |
| 152 | return | 158 | return |
| 153 | 159 | ||
| 154 | return stripper_class(filename, parser, mime, backup, add2archive) | 160 | return stripper_class(filename, parser, mime, backup, add2archive) |
diff --git a/lib/office.py b/lib/office.py index f236d09..3cbc566 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -178,3 +178,34 @@ class PdfStripper(parser.GenericParser): | |||
| 178 | self.document.get_property(key) != '': | 178 | self.document.get_property(key) != '': |
| 179 | metadata[key] = self.document.get_property(key) | 179 | metadata[key] = self.document.get_property(key) |
| 180 | return metadata | 180 | return metadata |
| 181 | |||
| 182 | |||
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
        Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, crap, ...
        (I don't like this format.)

        Preliminary support only: metadata is detected, not parsed.
    '''
    def is_clean(self):
        '''
            Unconditionally return False: no real cleanliness check
            is implemented for this format yet.
        '''
        return False

    def get_meta(self):
        '''
            Return a dict with all the meta of the file

            The dict has one entry per metadata part found in the
            archive ('app' for docProps/app.xml, 'core' for
            docProps/core.xml). Values are the placeholder string
            'harmful meta'; the xml content itself is not parsed.
            A missing part is logged at debug level and skipped.
        '''
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for key in ('app', 'core'):
                try:
                    # getinfo() raises KeyError when the member is absent,
                    # without decompressing data that would go unused
                    zipin.getinfo('docProps/%s.xml' % key)
                except KeyError:  # no such metadata file in the archive
                    logging.debug('%s has no %s.xml metadata'
                        % (self.filename, key))
                else:
                    metadata[key] = 'harmful meta'
        finally:
            # always release the archive handle, even on unexpected errors
            zipin.close()

        return metadata
