From 325baae32eb114ff65274faa9bf58c0b9f415927 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Fri, 5 Aug 2011 11:11:26 +0200 Subject: Preliminary support for openxml office format --- lib/mat.py | 12 +++++++++--- lib/office.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/mat.py b/lib/mat.py index 8fe6fb4..9f3f6c5 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -7,6 +7,7 @@ import os import subprocess import logging +import mimetypes import xml.sax import hachoir_core.cmd_line @@ -30,9 +31,11 @@ STRIPPERS = { 'application/x-bzip2': archive.Bzip2Stripper, 'application/zip': archive.ZipStripper, 'audio/mpeg': audio.MpegAudioStripper, + 'image/gif': images.GifStripper, 'image/jpeg': images.JpegStripper, 'image/png': images.PngStripper, 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, + 'application/vnd.openxmlformats-officedocument': office.OpenXmlStripper, } try: @@ -140,15 +143,18 @@ def create_class_file(name, backup, add2archive): mime = parser.mime_type + if mime == 'application/zip': # some formats are zipped stuff + mime = mimetypes.guess_type(name)[0] + if mime.startswith('application/vnd.oasis.opendocument'): mime = 'application/vnd.oasis.opendocument' # opendocument fileformat - - #stripper_class = STRIPPERS[mime] + elif mime.startswith('application/vnd.openxmlformats-officedocument'): + mime = 'application/vnd.openxmlformats-officedocument' try: stripper_class = STRIPPERS[mime] except KeyError: - logging.info('Don\'t have stripper for %s\'s format' % name) + logging.info('Don\'t have stripper for %s format' % mime) return return stripper_class(filename, parser, mime, backup, add2archive) diff --git a/lib/office.py b/lib/office.py index f236d09..3cbc566 100644 --- a/lib/office.py +++ b/lib/office.py @@ -178,3 +178,34 @@ class PdfStripper(parser.GenericParser): self.document.get_property(key) != '': metadata[key] = self.document.get_property(key) return metadata + + +class 
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
        Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can also carry media blobs
        and other arbitrary files.
    '''
    def is_clean(self):
        '''
            Always return False: no actual check is performed,
            so every document is reported as not clean.
        '''
        return False

    def get_meta(self):
        '''
            Return a dict with all the meta of the file

            The dict gets an 'app' and/or a 'core' key, one for each
            of docProps/app.xml and docProps/core.xml that is present
            in the archive. The values are only the placeholder
            string 'harmful meta': the xml itself is not parsed.
        '''
        metadata = {}
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for part in ('app', 'core'):
                try:
                    # getinfo() raises KeyError for a missing member,
                    # like read() does, but without decompressing it
                    zipin.getinfo('docProps/%s.xml' % part)
                except KeyError:  # no such file in the archive
                    logging.debug('%s has no %s.xml metadata',
                                  self.filename, part)
                else:
                    metadata[part] = 'harmful meta'
        finally:
            # release the file handle even if something unexpected
            # is raised while inspecting the archive
            zipin.close()

        return metadata