From 5a6bd3a9312f1d3444ebb9343353812bde7702da Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Fri, 5 Aug 2011 12:06:47 +0200
Subject: Tests for openxml format, and some improvement for this format
 support

---
 FORMATS         |   8 ++++++++
 lib/mat.py      |   1 -
 lib/office.py   |  20 ++++++--------------
 lib/parser.py   |   8 ++++++--
 test/clean.docx | Bin 0 -> 2619 bytes
 test/dirty.docx | Bin 0 -> 3415 bytes
 6 files changed, 20 insertions(+), 17 deletions(-)
 create mode 100644 test/clean.docx
 create mode 100644 test/dirty.docx
diff --git a/FORMATS b/FORMATS
index cc38bae..c497524 100644
--- a/FORMATS
+++ b/FORMATS
@@ -23,6 +23,14 @@
         <method>removal of the meta.xml file</method>
     </format>
 
+    <format>
+        <name>Office Openxml</name>
+        <extension>.docx, .pptx, .xlsx, ...</extension>
+        <support>full</support>
+        <metadata>a docProps folder containing xml metadata files</metadata>
+        <method>removal of the docProps folder</method>
+    </format>
+
     <format>
         <name>Portable Document Fileformat</name>
         <extension>.pdf</extension>
diff --git a/lib/mat.py b/lib/mat.py
index 9f3f6c5..80d5c66 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -31,7 +31,6 @@ STRIPPERS = {
     'application/x-bzip2': archive.Bzip2Stripper,
     'application/zip': archive.ZipStripper,
     'audio/mpeg': audio.MpegAudioStripper,
-    'image/gif': images.GifStripper,
     'image/jpeg': images.JpegStripper,
     'image/png': images.PngStripper,
     'application/vnd.oasis.opendocument': office.OpenDocumentStripper,
diff --git a/lib/office.py b/lib/office.py
index b7c607f..03e386b 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -146,13 +146,13 @@ class PdfStripper(parser.GenericParser):
         page = self.document.get_page(0)
         page_width, page_height = page.get_size()
         surface = cairo.PDFSurface(self.output, page_width, page_height)
-        context = cairo.Context(surface) #  context draws on the surface
+        context = cairo.Context(surface)  # context draws on the surface
         logging.debug('Pdf rendering of %s' % self.filename)
         for pagenum in xrange(self.document.get_n_pages()):
             page = self.document.get_page(pagenum)
             context.translate(0, 0)
-            page.render(context) #  render the page on context
-            context.show_page() #  draw context on surface
+            page.render(context)  # render the page on context
+            context.show_page()  # draw context on surface
         surface.finish()
 
         #For now, poppler cannot write meta, so we must use pdfrw
@@ -253,16 +253,8 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
         '''
         zipin = zipfile.ZipFile(self.filename, 'r')
         metadata = {}
-        try:
-            content = zipin.read('docProps/app.xml')
-            metadata['app'] = 'harful meta'
-        except KeyError:  # no app.xml file found
-            logging.debug('%s has no app.xml metadata' % self.filename)
-        try:
-            content = zipin.read('docProps/core.xml')
-            metadata['core'] = 'harmful meta'
-        except KeyError:  # no core.xml found
-            logging.debug('%s has no core.xml metadata' % self.filename)
+        for item in zipin.namelist():
+            if item.startswith('docProps/'):
+                metadata[item] = 'harmful content'
         zipin.close()
-
         return metadata
diff --git a/lib/parser.py b/lib/parser.py
index 1bdca57..fd0ed13 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -9,8 +9,12 @@ import os
 
 import mat
 
-NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml')
-
+NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels')
+#bmp : image
+#rdf : text
+#txt : plain text
+#xml : formatted text
+#rels : openxml formatted text
 
 class GenericParser(object):
     '''
diff --git a/test/clean.docx b/test/clean.docx
new file mode 100644
index 0000000..0220399
Binary files /dev/null and b/test/clean.docx differ
diff --git a/test/dirty.docx b/test/dirty.docx
new file mode 100644
index 0000000..404ecc0
Binary files /dev/null and b/test/dirty.docx differ
-- 
cgit v1.3