From ace3d8213921a9308d30afc057fc21221420e12e Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 24 Jul 2011 02:30:50 +0200 Subject: First implementation of open document format --- lib/mat.py | 36 +++++++++++++++++++------ lib/office.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/mat.py b/lib/mat.py index 2903ed9..5dcdbc2 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -7,6 +7,7 @@ import os import subprocess import logging +import mimetypes import hachoir_core.cmd_line import hachoir_parser @@ -14,7 +15,7 @@ import hachoir_editor import images import audio -import misc +import office import archive __version__ = "0.1" @@ -29,7 +30,7 @@ strippers = { hachoir_parser.image.PngFile: images.PngStripper, hachoir_parser.image.bmp.BmpFile: images.BmpStripper, hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, - hachoir_parser.misc.PDFDocument: misc.PdfStripper, + hachoir_parser.misc.PDFDocument: office.PdfStripper, hachoir_parser.archive.TarFile: archive.TarStripper, hachoir_parser.archive.gzip_parser.GzipParser: archive.GzipStripper, hachoir_parser.archive.bzip2_parser.Bzip2Parser: archive.Bzip2Stripper, @@ -61,12 +62,14 @@ def create_class_file(name, backup, add2archive): corresponding to the filetype of the given file ''' if is_secure(name): - print 'a' return filename = "" realname = name - filename = hachoir_core.cmd_line.unicodeFilename(name) + try: + filename = hachoir_core.cmd_line.unicodeFilename(name) + except TypeError:# get rid of "TypeError: decoding Unicode is not supported" + filename = name parser = hachoir_parser.createParser(filename) if not parser: logging.error("Unable to parse %s" % filename) @@ -82,9 +85,26 @@ def create_class_file(name, backup, add2archive): stripper_class = strippers[editor.input.__class__] except KeyError: #Place for another lib than hachoir - logging.error("Don't have stripper for file type %s" % editor.description) + logging.error("Don't have stripper for format %s" % editor.description) return - if editor.input.__class__ == hachoir_parser.misc.PDFDocument: + + if editor.input.__class__ == hachoir_parser.misc.PDFDocument:#pdf return stripper_class(filename, realname, backup) - return stripper_class(realname, filename, parser, editor, backup, - add2archive) + + elif editor.input.__class__ == hachoir_parser.archive.zip.ZipFile: + #zip based format + mime = mimetypes.guess_type(filename)[0] + try:#Ugly workaround, cleaning open document delete mime (wtf?) + if mime.startswith(#Open document format + 'application/vnd.oasis.opendocument'): + return office.OpenDocumentStripper(realname, filename, parser, + editor, backup, add2archive) + else:#normal zip + return stripper_class(realname, filename, parser, editor, + backup, add2archive) + except:#normal zip file + return stripper_class(realname, filename, parser, editor, backup, + add2archive) + else:#normal handling + return stripper_class(realname, filename, parser, editor, backup, + add2archive) diff --git a/lib/office.py b/lib/office.py index de38129..5d62732 100644 --- a/lib/office.py +++ b/lib/office.py @@ -3,12 +3,98 @@ import mimetypes import subprocess import tempfile import glob +import logging +import zipfile +import shutil import hachoir_core import pdfrw import mat import parser +import archive + +class OpenDocumentStripper(archive.GenericArchiveStripper): + ''' + An open document file is a zip, with xml file into. + The one that interest us is meta.xml + ''' + + def remove_folder(self, folder_list): + for folder in folder_list: + dirname = folder.split('/')[0] + try: + shutil.rmtree(dirname) + except:#Some folder or open document format are buggies + pass + self.folder_list = [] + + def _remove_all(self, method): + ''' + FIXME ? + There is a patch implementing the Zipfile.remove() + method here : http://bugs.python.org/issue6818 + ''' + zipin = zipfile.ZipFile(self.filename, 'r') + zipout = zipfile.ZipFile(self.filename + parser.POSTFIX, 'w', + allowZip64=True) + folder_list = [] + for item in zipin.namelist(): + if os.path.basename(item) is not item:#add folders to folder_list + folder_list.insert(0, os.path.dirname(item)) + if item.endswith('.xml') or item.startswith('manifest'): + if item != 'meta.xml':#contains the metadata + zipin.extract(item) + zipout.write(item) + mat.secure_remove(item) + elif item == 'mimetype': + zipin.extract(item) + #remove line meta.xml + zipout.write(item) + mat.secure_remove(item) + else: + zipin.extract(item) + if os.path.isfile(item): + try: + cfile = mat.create_class_file(item, False, + self.add2archive) + if method == 'normal': + cfile.remove_all() + else: + cfile.remove_all_ugly() + logging.debug('Processing %s from %s' % (item, + self.filename)) + zipout.write(item) + except: + logging.info('%s\' fileformat is not supported' % + item) + if self.add2archive: + zipout.write(item) + mat.secure_remove(item) + zipout.comment = '' + logging.info('%s treated' % self.filename) + zipin.close() + zipout.close() + self.remove_folder(folder_list) + + if self.backup is False: + mat.secure_remove(self.filename) #remove the old file + os.rename(self.filename + parser.POSTFIX, self.filename) + + def is_clean(self): + zipin = zipfile.ZipFile(self.filename, 'r') + try: + zipin.getinfo('meta.xml') + except KeyError:#no meta.xml in the file + zipin.close() + czf = archive.ZipStripper(self.realname, self.filename, + self.parser, self.editor, self.backup, self.add2archive) + if czf.is_clean(): + return True + else: + return False + return False + class TorrentStripper(parser.Generic_parser): ''' -- cgit v1.3