From bbe17fd511b5890fb4554447e23d666f6c13b745 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Wed, 15 Jan 2014 02:42:39 +0000
Subject: Add support for zipfiles!

---
 MAT/office.py | 187 +++++++++++++++++-----------------------------------------
 1 file changed, 54 insertions(+), 133 deletions(-)

(limited to 'MAT/office.py')

diff --git a/MAT/office.py b/MAT/office.py
index f60fc64..97405b3 100644
--- a/MAT/office.py
+++ b/MAT/office.py
@@ -1,13 +1,12 @@
 ''' Care about office's formats
 '''
 
-import os
 import logging
-import zipfile
-import fileinput
-import tempfile
+import os
 import shutil
+import tempfile
 import xml.dom.minidom as minidom
+import zipfile
 
 try:
     import cairo
@@ -16,7 +15,6 @@ except ImportError:
     logging.info('office.py loaded without PDF support')
     pass
 
-import mat
 import parser
 import archive
 
@@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper):
         ''' Return a dict with all the meta of the file by
             trying to read the meta.xml file.
         '''
+        metadata = super(OpenDocumentStripper, self).get_meta()
         zipin = zipfile.ZipFile(self.filename, 'r')
-        metadata = {}
         try:
             content = zipin.read('meta.xml')
             dom1 = minidom.parseString(content)
             elements = dom1.getElementsByTagName('office:meta')
             for i in elements[0].childNodes:
                 if i.tagName != 'meta:document-statistic':
-                    nodename = ''.join([k for k in i.nodeName.split(':')[1:]])
+                    nodename = ''.join(i.nodeName.split(':')[1:])
                     metadata[nodename] = ''.join([j.data for j in i.childNodes])
                 else:
                     # thank you w3c for not providing a nice
                     # method to get all attributes of a node
                     pass
-            zipin.close()
         except KeyError:  # no meta.xml file found
             logging.debug('%s has no opendocument metadata' % self.filename)
+        zipin.close()
         return metadata
 
     def remove_all(self):
+        ''' Removes metadata
         '''
-            FIXME ?
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
+        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
+
+    def is_clean(self):
+        ''' Check if the file is clean from harmful metadatas
         '''
+        clean_super = super(OpenDocumentStripper, self).is_clean()
+        if clean_super is False:
+            return False
+
         zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
+        try:
+            zipin.getinfo('meta.xml')
+        except KeyError:  # no meta.xml in the file
+            return True
+        zipin.close()
+        return False
 
-        for item in zipin.namelist():
-            name = os.path.join(self.tempdir, item)
-            _, ext = os.path.splitext(name)
 
-            if item.endswith('manifest.xml'):
-            # contain the list of all files present in the archive
-                zipin.extract(item, self.tempdir)
-                for line in fileinput.input(name, inplace=1):
-                    # remove the line which contains "meta.xml"
-                    line = line.strip()
-                    if not 'meta.xml' in line:
-                        print line
-                zipout.write(name, item)
+class OpenXmlStripper(archive.ZipStripper):
+    ''' Represent an office openxml document, which is like
+        an opendocument format, with some tricky stuff added.
+        It contains mostly xml, but can have media blobs, crap, ...
+        (I don't like this format.)
+    '''
+    def remove_all(self):
+        return super(OpenXmlStripper, self).remove_all(
+                beginning_blacklist=('docProps/'), whitelist=('.rels'))
 
-            elif ext in parser.NOMETA or item == 'mimetype':
-                # keep NOMETA files, and the "manifest" file
-                if item != 'meta.xml':  # contains the metadata
-                    zipin.extract(item, self.tempdir)
-                    zipout.write(name, item)
+    def is_clean(self):
+        ''' Check if the file is clean from harmful metadatas.
+            This implementation is faster than something like
+            "return this.get_meta() == {}".
+        '''
+        clean_super = super(OpenXmlStripper, self).is_clean()
+        if clean_super is False:
+            return False
 
-            else:
-                zipin.extract(item, self.tempdir)
-                if os.path.isfile(name):
-                    try:
-                        cfile = mat.create_class_file(name, False,
-                            add2archive=self.add2archive)
-                        cfile.remove_all()
-                        logging.debug('Processing %s from %s' % (item,
-                            self.filename))
-                        zipout.write(name, item)
-                    except:
-                        logging.info('%s\'s fileformat is not supported' % item)
-                        if self.add2archive:
-                            zipout.write(name, item)
-        zipout.comment = ''
-        logging.info('%s processed' % self.filename)
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        for item in zipin.namelist():
+            if item.startswith('docProps/'):
+                return False
         zipin.close()
-        zipout.close()
-        self.do_backup()
         return True
 
-    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas
+    def get_meta(self):
+        ''' Return a dict with all the meta of the file
         '''
+        metadata = super(OpenXmlStripper, self).get_meta()
+
         zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            zipin.getinfo('meta.xml')
-        except KeyError:  # no meta.xml in the file
-            czf = archive.ZipStripper(self.filename, self.parser,
-                'application/zip', False, True, add2archive=self.add2archive)
-            if czf.is_clean():
-                zipin.close()
-                return True
+        for item in zipin.namelist():
+            if item.startswith('docProps/'):
+                metadata[item] = 'harmful content'
         zipin.close()
-        return False
+        return metadata
 
 
 class PdfStripper(parser.GenericParser):
@@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser):
             self.pdf_quality = False
 
         self.document = Poppler.Document.new_from_file(uri, self.password)
-        self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator',
-            'producer', 'metadata'])
+        self.meta_list = frozenset(['title', 'author', 'subject',
+            'keywords', 'creator', 'producer', 'metadata'])
 
     def is_clean(self):
         ''' Check if the file is clean from harmful metadatas
@@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser):
             surface.finish()
             shutil.move(output, self.output)
         except:
-            logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename)
+            logging.error('Something went wrong when cleaning %s.' % self.filename)
             return False
 
         try:
@@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser):
             writer.write(self.output)
             self.do_backup()
         except:
-            logging.error('Unable to remove all metadata from %s, please install\
-pdfrw' % self.output)
+            logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
             return False
         return True
 
@@ -195,73 +186,3 @@ pdfrw' % self.output)
             if self.document.get_property(key):
                 metadata[key] = self.document.get_property(key)
         return metadata
-
-
-class OpenXmlStripper(archive.GenericArchiveStripper):
-    '''
-        Represent an office openxml document, which is like
-        an opendocument format, with some tricky stuff added.
-        It contains mostly xml, but can have media blobs, crap, ...
-        (I don't like this format.)
-    '''
-    def remove_all(self):
-        '''
-            FIXME ?
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w',
-            allowZip64=True)
-        for item in zipin.namelist():
-            name = os.path.join(self.tempdir, item)
-            _, ext = os.path.splitext(name)
-            if item.startswith('docProps/'):  # metadatas
-                pass
-            elif ext in parser.NOMETA or item == '.rels':
-                # keep parser.NOMETA files, and the file named ".rels"
-                zipin.extract(item, self.tempdir)
-                zipout.write(name, item)
-            else:
-                zipin.extract(item, self.tempdir)
-                if os.path.isfile(name):  # don't care about folders
-                    try:
-                        cfile = mat.create_class_file(name, False,
-                            add2archive=self.add2archive)
-                        cfile.remove_all()
-                        logging.debug('Processing %s from %s' % (item,
-                            self.filename))
-                        zipout.write(name, item)
-                    except:
-                        logging.info('%s\'s fileformat is not supported' % item)
-                        if self.add2archive:
-                            zipout.write(name, item)
-        zipout.comment = ''
-        logging.info('%s processed' % self.filename)
-        zipin.close()
-        zipout.close()
-        self.do_backup()
-        return True
-
-    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                return False
-        zipin.close()
-        czf = archive.ZipStripper(self.filename, self.parser,
-                'application/zip', False, True, add2archive=self.add2archive)
-        return czf.is_clean()
-
-    def get_meta(self):
-        ''' Return a dict with all the meta of the file
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        metadata = {}
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                metadata[item] = 'harmful content'
-        zipin.close()
-        return metadata
-- 
cgit v1.3