diff options
| author | jvoisin | 2011-08-05 12:06:47 +0200 |
|---|---|---|
| committer | jvoisin | 2011-08-05 12:06:47 +0200 |
| commit | 5a6bd3a9312f1d3444ebb9343353812bde7702da (patch) | |
| tree | 29d9b8456b9da67201c74dbc2616de8ed890003f /lib | |
| parent | 503e926812d35032ed527c81e78444f362a5d527 (diff) | |
Tests for the OpenXML format, and some improvements to the support for this format
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/mat.py | 1 | ||||
| -rw-r--r-- | lib/office.py | 20 | ||||
| -rw-r--r-- | lib/parser.py | 8 |
3 files changed, 12 insertions, 17 deletions
| @@ -31,7 +31,6 @@ STRIPPERS = { | |||
| 31 | 'application/x-bzip2': archive.Bzip2Stripper, | 31 | 'application/x-bzip2': archive.Bzip2Stripper, |
| 32 | 'application/zip': archive.ZipStripper, | 32 | 'application/zip': archive.ZipStripper, |
| 33 | 'audio/mpeg': audio.MpegAudioStripper, | 33 | 'audio/mpeg': audio.MpegAudioStripper, |
| 34 | 'image/gif': images.GifStripper, | ||
| 35 | 'image/jpeg': images.JpegStripper, | 34 | 'image/jpeg': images.JpegStripper, |
| 36 | 'image/png': images.PngStripper, | 35 | 'image/png': images.PngStripper, |
| 37 | 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, | 36 | 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, |
diff --git a/lib/office.py b/lib/office.py index b7c607f..03e386b 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -146,13 +146,13 @@ class PdfStripper(parser.GenericParser): | |||
| 146 | page = self.document.get_page(0) | 146 | page = self.document.get_page(0) |
| 147 | page_width, page_height = page.get_size() | 147 | page_width, page_height = page.get_size() |
| 148 | surface = cairo.PDFSurface(self.output, page_width, page_height) | 148 | surface = cairo.PDFSurface(self.output, page_width, page_height) |
| 149 | context = cairo.Context(surface) # context draws on the surface | 149 | context = cairo.Context(surface) # context draws on the surface |
| 150 | logging.debug('Pdf rendering of %s' % self.filename) | 150 | logging.debug('Pdf rendering of %s' % self.filename) |
| 151 | for pagenum in xrange(self.document.get_n_pages()): | 151 | for pagenum in xrange(self.document.get_n_pages()): |
| 152 | page = self.document.get_page(pagenum) | 152 | page = self.document.get_page(pagenum) |
| 153 | context.translate(0, 0) | 153 | context.translate(0, 0) |
| 154 | page.render(context) # render the page on context | 154 | page.render(context) # render the page on context |
| 155 | context.show_page() # draw context on surface | 155 | context.show_page() # draw context on surface |
| 156 | surface.finish() | 156 | surface.finish() |
| 157 | 157 | ||
| 158 | #For now, poppler cannot write meta, so we must use pdfrw | 158 | #For now, poppler cannot write meta, so we must use pdfrw |
| @@ -253,16 +253,8 @@ class OpenXmlStripper(archive.GenericArchiveStripper): | |||
| 253 | ''' | 253 | ''' |
| 254 | zipin = zipfile.ZipFile(self.filename, 'r') | 254 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 255 | metadata = {} | 255 | metadata = {} |
| 256 | try: | 256 | for item in zipin.namelist(): |
| 257 | content = zipin.read('docProps/app.xml') | 257 | if item.startswith('docProps/'): |
| 258 | metadata['app'] = 'harful meta' | 258 | metadata[item] = 'harmful content' |
| 259 | except KeyError: # no app.xml file found | ||
| 260 | logging.debug('%s has no app.xml metadata' % self.filename) | ||
| 261 | try: | ||
| 262 | content = zipin.read('docProps/core.xml') | ||
| 263 | metadata['core'] = 'harmful meta' | ||
| 264 | except KeyError: # no core.xml found | ||
| 265 | logging.debug('%s has no core.xml metadata' % self.filename) | ||
| 266 | zipin.close() | 259 | zipin.close() |
| 267 | |||
| 268 | return metadata | 260 | return metadata |
diff --git a/lib/parser.py b/lib/parser.py index 1bdca57..fd0ed13 100644 --- a/lib/parser.py +++ b/lib/parser.py | |||
| @@ -9,8 +9,12 @@ import os | |||
| 9 | 9 | ||
| 10 | import mat | 10 | import mat |
| 11 | 11 | ||
| 12 | NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') | 12 | NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels') |
| 13 | 13 | #bmp : image | |
| 14 | #rdf : text | ||
| 15 | #txt : plain text | ||
| 16 | #xml : formatted text | ||
| 17 | #rels : openxml formatted text | ||
| 14 | 18 | ||
| 15 | class GenericParser(object): | 19 | class GenericParser(object): |
| 16 | ''' | 20 | ''' |
