summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author jvoisin 2011-08-05 12:06:47 +0200
committer jvoisin 2011-08-05 12:06:47 +0200
commit 5a6bd3a9312f1d3444ebb9343353812bde7702da (patch)
tree 29d9b8456b9da67201c74dbc2616de8ed890003f /lib
parent 503e926812d35032ed527c81e78444f362a5d527 (diff)
Tests for openxml format, and some improvement for this format support
Diffstat (limited to 'lib')
-rw-r--r-- lib/mat.py 1
-rw-r--r-- lib/office.py 20
-rw-r--r-- lib/parser.py 8
3 files changed, 12 insertions, 17 deletions
diff --git a/lib/mat.py b/lib/mat.py
index 9f3f6c5..80d5c66 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -31,7 +31,6 @@ STRIPPERS = {
31 'application/x-bzip2': archive.Bzip2Stripper, 31 'application/x-bzip2': archive.Bzip2Stripper,
32 'application/zip': archive.ZipStripper, 32 'application/zip': archive.ZipStripper,
33 'audio/mpeg': audio.MpegAudioStripper, 33 'audio/mpeg': audio.MpegAudioStripper,
34 'image/gif': images.GifStripper,
35 'image/jpeg': images.JpegStripper, 34 'image/jpeg': images.JpegStripper,
36 'image/png': images.PngStripper, 35 'image/png': images.PngStripper,
37 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, 36 'application/vnd.oasis.opendocument': office.OpenDocumentStripper,
diff --git a/lib/office.py b/lib/office.py
index b7c607f..03e386b 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -146,13 +146,13 @@ class PdfStripper(parser.GenericParser):
146 page = self.document.get_page(0) 146 page = self.document.get_page(0)
147 page_width, page_height = page.get_size() 147 page_width, page_height = page.get_size()
148 surface = cairo.PDFSurface(self.output, page_width, page_height) 148 surface = cairo.PDFSurface(self.output, page_width, page_height)
149 context = cairo.Context(surface) # context draws on the surface 149 context = cairo.Context(surface) # context draws on the surface
150 logging.debug('Pdf rendering of %s' % self.filename) 150 logging.debug('Pdf rendering of %s' % self.filename)
151 for pagenum in xrange(self.document.get_n_pages()): 151 for pagenum in xrange(self.document.get_n_pages()):
152 page = self.document.get_page(pagenum) 152 page = self.document.get_page(pagenum)
153 context.translate(0, 0) 153 context.translate(0, 0)
154 page.render(context) # render the page on context 154 page.render(context) # render the page on context
155 context.show_page() # draw context on surface 155 context.show_page() # draw context on surface
156 surface.finish() 156 surface.finish()
157 157
158 #For now, poppler cannot write meta, so we must use pdfrw 158 #For now, poppler cannot write meta, so we must use pdfrw
@@ -253,16 +253,8 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
253 ''' 253 '''
254 zipin = zipfile.ZipFile(self.filename, 'r') 254 zipin = zipfile.ZipFile(self.filename, 'r')
255 metadata = {} 255 metadata = {}
256 try: 256 for item in zipin.namelist():
257 content = zipin.read('docProps/app.xml') 257 if item.startswith('docProps/'):
258 metadata['app'] = 'harful meta' 258 metadata[item] = 'harmful content'
259 except KeyError: # no app.xml file found
260 logging.debug('%s has no app.xml metadata' % self.filename)
261 try:
262 content = zipin.read('docProps/core.xml')
263 metadata['core'] = 'harmful meta'
264 except KeyError: # no core.xml found
265 logging.debug('%s has no core.xml metadata' % self.filename)
266 zipin.close() 259 zipin.close()
267
268 return metadata 260 return metadata
diff --git a/lib/parser.py b/lib/parser.py
index 1bdca57..fd0ed13 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -9,8 +9,12 @@ import os
9 9
10import mat 10import mat
11 11
12NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') 12NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels')
13 13#bmp : image
14#rdf : text
15#txt : plain text
16#xml : formated text
17#rels : openxml foramted text
14 18
15class GenericParser(object): 19class GenericParser(object):
16 ''' 20 '''