From 5a6bd3a9312f1d3444ebb9343353812bde7702da Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Fri, 5 Aug 2011 12:06:47 +0200
Subject: Tests for openxml format, and some improvement for this format
support
---
FORMATS | 8 ++++++++
lib/mat.py | 1 -
lib/office.py | 20 ++++++--------------
lib/parser.py | 8 ++++++--
test/clean.docx | Bin 0 -> 2619 bytes
test/dirty.docx | Bin 0 -> 3415 bytes
6 files changed, 20 insertions(+), 17 deletions(-)
create mode 100644 test/clean.docx
create mode 100644 test/dirty.docx
diff --git a/FORMATS b/FORMATS
index cc38bae..c497524 100644
--- a/FORMATS
+++ b/FORMATS
@@ -23,6 +23,14 @@
removal of the meta.xml file
+
+ Office Openxml
+ .docx, .pptx, .xlsx, ...
+ full
+ a docProps folder containing xml metadata files
+ removal of the docProps folder
+
+
Portable Document Fileformat
.pdf
diff --git a/lib/mat.py b/lib/mat.py
index 9f3f6c5..80d5c66 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -31,7 +31,6 @@ STRIPPERS = {
'application/x-bzip2': archive.Bzip2Stripper,
'application/zip': archive.ZipStripper,
'audio/mpeg': audio.MpegAudioStripper,
- 'image/gif': images.GifStripper,
'image/jpeg': images.JpegStripper,
'image/png': images.PngStripper,
'application/vnd.oasis.opendocument': office.OpenDocumentStripper,
diff --git a/lib/office.py b/lib/office.py
index b7c607f..03e386b 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -146,13 +146,13 @@ class PdfStripper(parser.GenericParser):
page = self.document.get_page(0)
page_width, page_height = page.get_size()
surface = cairo.PDFSurface(self.output, page_width, page_height)
- context = cairo.Context(surface) # context draws on the surface
+ context = cairo.Context(surface) # context draws on the surface
logging.debug('Pdf rendering of %s' % self.filename)
for pagenum in xrange(self.document.get_n_pages()):
page = self.document.get_page(pagenum)
context.translate(0, 0)
- page.render(context) # render the page on context
- context.show_page() # draw context on surface
+ page.render(context) # render the page on context
+ context.show_page() # draw context on surface
surface.finish()
#For now, poppler cannot write meta, so we must use pdfrw
@@ -253,16 +253,8 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
'''
zipin = zipfile.ZipFile(self.filename, 'r')
metadata = {}
- try:
- content = zipin.read('docProps/app.xml')
- metadata['app'] = 'harful meta'
- except KeyError: # no app.xml file found
- logging.debug('%s has no app.xml metadata' % self.filename)
- try:
- content = zipin.read('docProps/core.xml')
- metadata['core'] = 'harmful meta'
- except KeyError: # no core.xml found
- logging.debug('%s has no core.xml metadata' % self.filename)
+ for item in zipin.namelist():
+ if item.startswith('docProps/'):
+ metadata[item] = 'harmful content'
zipin.close()
-
return metadata
diff --git a/lib/parser.py b/lib/parser.py
index 1bdca57..fd0ed13 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -9,8 +9,12 @@ import os
import mat
-NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml')
-
+NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels')
+#bmp : image
+#rdf : text
+#txt : plain text
+#xml : formatted text
+#rels : openxml formatted text
class GenericParser(object):
'''
diff --git a/test/clean.docx b/test/clean.docx
new file mode 100644
index 0000000..0220399
Binary files /dev/null and b/test/clean.docx differ
diff --git a/test/dirty.docx b/test/dirty.docx
new file mode 100644
index 0000000..404ecc0
Binary files /dev/null and b/test/dirty.docx differ
--
cgit v1.3