From 5a6bd3a9312f1d3444ebb9343353812bde7702da Mon Sep 17 00:00:00 2001 From: jvoisin Date: Fri, 5 Aug 2011 12:06:47 +0200 Subject: Tests for openxml format, and some improvement for this format support --- lib/mat.py | 1 - lib/office.py | 20 ++++++-------------- lib/parser.py | 8 ++++++-- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/mat.py b/lib/mat.py index 9f3f6c5..80d5c66 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -31,7 +31,6 @@ STRIPPERS = { 'application/x-bzip2': archive.Bzip2Stripper, 'application/zip': archive.ZipStripper, 'audio/mpeg': audio.MpegAudioStripper, - 'image/gif': images.GifStripper, 'image/jpeg': images.JpegStripper, 'image/png': images.PngStripper, 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, diff --git a/lib/office.py b/lib/office.py index b7c607f..03e386b 100644 --- a/lib/office.py +++ b/lib/office.py @@ -146,13 +146,13 @@ class PdfStripper(parser.GenericParser): page = self.document.get_page(0) page_width, page_height = page.get_size() surface = cairo.PDFSurface(self.output, page_width, page_height) - context = cairo.Context(surface) # context draws on the surface + context = cairo.Context(surface) # context draws on the surface logging.debug('Pdf rendering of %s' % self.filename) for pagenum in xrange(self.document.get_n_pages()): page = self.document.get_page(pagenum) context.translate(0, 0) - page.render(context) # render the page on context - context.show_page() # draw context on surface + page.render(context) # render the page on context + context.show_page() # draw context on surface surface.finish() #For now, poppler cannot write meta, so we must use pdfrw @@ -253,16 +253,8 @@ class OpenXmlStripper(archive.GenericArchiveStripper): ''' zipin = zipfile.ZipFile(self.filename, 'r') metadata = {} - try: - content = zipin.read('docProps/app.xml') - metadata['app'] = 'harful meta' - except KeyError: # no app.xml file found - logging.debug('%s has no app.xml metadata' % 
self.filename) - try: - content = zipin.read('docProps/core.xml') - metadata['core'] = 'harmful meta' - except KeyError: # no core.xml found - logging.debug('%s has no core.xml metadata' % self.filename) + for item in zipin.namelist(): + if item.startswith('docProps/'): + metadata[item] = 'harmful content' zipin.close() - return metadata diff --git a/lib/parser.py b/lib/parser.py index 1bdca57..fd0ed13 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -9,8 +9,12 @@ import os import mat -NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') - +NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels') +#bmp : image +#rdf : text +#txt : plain text +#xml : formatted text +#rels : openxml formatted text class GenericParser(object): ''' -- cgit v1.3