diff options
| author | jvoisin | 2011-08-05 11:11:26 +0200 |
|---|---|---|
| committer | jvoisin | 2011-08-05 11:11:26 +0200 |
| commit | 325baae32eb114ff65274faa9bf58c0b9f415927 (patch) | |
| tree | 81b4ffe793d00642b00d543b6e0de38a635146a6 /lib/office.py | |
| parent | ad31d77e6a199295ba44832abec35b054d04bced (diff) | |
Preliminary support for openxml office format
Diffstat (limited to 'lib/office.py')
| -rw-r--r-- | lib/office.py | 31 |
1 files changed, 31 insertions, 0 deletions
diff --git a/lib/office.py b/lib/office.py index f236d09..3cbc566 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -178,3 +178,34 @@ class PdfStripper(parser.GenericParser): | |||
| 178 | self.document.get_property(key) != '': | 178 | self.document.get_property(key) != '': |
| 179 | metadata[key] = self.document.get_property(key) | 179 | metadata[key] = self.document.get_property(key) |
| 180 | return metadata | 180 | return metadata |
| 181 | |||
| 182 | |||
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
        Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, crap, ...
        (I don't like this format.)
    '''
    def is_clean(self):
        '''
            Report whether the file is free of metadata.

            Returns False unconditionally: no inspection of the
            file is performed here.
        '''
        return False

    def get_meta(self):
        '''
            Return a dict with all the meta of the file

            The zip archive at self.filename is probed for the
            members docProps/app.xml and docProps/core.xml. For each
            member that is present, its short name ('app' or 'core')
            is mapped to a placeholder string; the xml content itself
            is not parsed. A missing member is only logged at debug
            level.
        '''
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for part in ('app', 'core'):
                try:
                    # read() raises KeyError when the member is absent
                    zipin.read('docProps/%s.xml' % part)
                except KeyError:
                    logging.debug('%s has no %s.xml metadata',
                                  self.filename, part)
                else:
                    metadata[part] = 'harmful meta'
        finally:
            # Release the file handle even when read() fails with
            # something other than KeyError.
            zipin.close()

        return metadata
