From 544fe9bf1782a027b3f31bf4c10a050d783e32ac Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Wed, 1 Feb 2012 22:56:04 +0100
Subject: Rename mat-cli to mat-gui

---
 lib/office.py | 305 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 305 insertions(+)
 create mode 100644 lib/office.py

(limited to 'lib/office.py')

diff --git a/lib/office.py b/lib/office.py
new file mode 100644
index 0000000..e1d738e
--- /dev/null
+++ b/lib/office.py
@@ -0,0 +1,305 @@
+'''
+    Care about office's formats
+'''
+
+import os
+import logging
+import zipfile
+import fileinput
+import subprocess
+import xml.dom.minidom as minidom
+
+try:
+    import cairo
+    import poppler
+except ImportError:
+    pass
+
+import mat
+import parser
+import archive
+
+class OpenDocumentStripper(archive.GenericArchiveStripper):
+    '''
+        An open document file is a zip, with xml file into.
+        The one that interest us is meta.xml
+    '''
+
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file by
+            trying to read the meta.xml file.
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        metadata = {}
+        try:
+            content = zipin.read('meta.xml')
+            dom1 = minidom.parseString(content)
+            elements = dom1.getElementsByTagName('office:meta')
+            for i in elements[0].childNodes:
+                if i.tagName != 'meta:document-statistic':
+                    nodename = ''.join([k for k in i.nodeName.split(':')[1:]])
+                    metadata[nodename] = ''.join([j.data for j in i.childNodes])
+                else:
+                    # thank you w3c for not providing a nice
+                    # method to get all attributes from a node
+                    pass
+            zipin.close()
+        except KeyError:  # no meta.xml file found
+            logging.debug('%s has no opendocument metadata' % self.filename)
+        return metadata
+
+    def _remove_all(self, method):
+        '''
+            FIXME ?
+            There is a patch implementing the Zipfile.remove()
+            method here : http://bugs.python.org/issue6818
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
+
+        for item in zipin.namelist():
+            name = os.path.join(self.tempdir, item)
+            _, ext = os.path.splitext(name)
+
+            if item.endswith('manifest.xml'):
+            # contain the list of all files present in the archive
+                zipin.extract(item, self.tempdir)
+                for line in fileinput.input(name, inplace=1):
+                    #remove the line which contains "meta.xml"
+                    line = line.strip()
+                    if not 'meta.xml' in line:
+                        print line
+                zipout.write(name, item)
+
+            elif ext in parser.NOMETA or item == 'mimetype':
+                #keep NOMETA files, and the "manifest" file
+                if item != 'meta.xml':  # contains the metadata
+                    zipin.extract(item, self.tempdir)
+                    zipout.write(name, item)
+
+            else:
+                zipin.extract(item, self.tempdir)
+                if os.path.isfile(name):
+                    try:
+                        cfile = mat.create_class_file(name, False,
+                            self.add2archive)
+                        if method == 'normal':
+                            cfile.remove_all()
+                        else:
+                            cfile.remove_all_strict()
+                        logging.debug('Processing %s from %s' % (item,
+                            self.filename))
+                        zipout.write(name, item)
+                    except:
+                        logging.info('%s\' fileformat is not supported' % item)
+                        if self.add2archive:
+                            zipout.write(name, item)
+        zipout.comment = ''
+        logging.info('%s treated' % self.filename)
+        zipin.close()
+        zipout.close()
+        self.do_backup()
+        return True
+
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            zipin.getinfo('meta.xml')
+        except KeyError:  # no meta.xml in the file
+            czf = archive.ZipStripper(self.filename, self.parser,
+                'application/zip', self.backup, self.add2archive)
+            if czf.is_clean():
+                zipin.close()
+                return True
+        zipin.close()
+        return False
+
+
+class PdfStripper(parser.GenericParser):
+    '''
+        Represent a PDF file
+    '''
+    def __init__(self, filename, parser, mime, backup, add2archive):
+        super(PdfStripper, self).__init__(filename, parser, mime, backup,
+            add2archive)
+        uri = 'file://' + os.path.abspath(self.filename)
+        self.password = None
+        self.document = poppler.document_new_from_file(uri, self.password)
+        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
+            'producer', 'metadata')
+
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas
+        '''
+        for key in self.meta_list:
+            if self.document.get_property(key) is not None and \
+                self.document.get_property(key) != '':
+                return False
+        return True
+
+
+    def remove_all(self):
+        '''
+            Remove supperficial
+        '''
+        return self._remove_meta()
+
+
+    def remove_all_strict(self):
+        '''
+            Opening the PDF with poppler, then doing a render
+            on a cairo pdfsurface for each pages.
+            Thanks to Lunar^for the idea.
+            http://cairographics.org/documentation/pycairo/2/
+            python-poppler is not documented at all : have fun ;)
+        '''
+        page = self.document.get_page(0)
+        page_width, page_height = page.get_size()
+        surface = cairo.PDFSurface(self.output, page_width, page_height)
+        context = cairo.Context(surface)  # context draws on the surface
+        logging.debug('PDF rendering of %s' % self.filename)
+        for pagenum in xrange(self.document.get_n_pages()):
+            page = self.document.get_page(pagenum)
+            context.translate(0, 0)
+            page.render(context)  # render the page on context
+            context.show_page()  # draw context on surface
+        surface.finish()
+        return self._remove_meta()
+
+    def _remove_meta(self):
+        '''
+            Remove superficial/external metadata
+            from a PDF file, using exiftool,
+            of pdfrw if exiftool is not installed
+        '''
+        processed = False
+        try:# try with pdfrw
+            import pdfrw
+            #For now, poppler cannot write meta, so we must use pdfrw
+            logging.debug('Removing %s\'s superficial metadata' % self.filename)
+            trailer = pdfrw.PdfReader(self.output)
+            trailer.Info.Producer = trailer.Author = trailer.Info.Creator = None
+            writer = pdfrw.PdfWriter()
+            writer.trailer = trailer
+            writer.write(self.output)
+            self.do_backup()
+            processed = True
+        except:
+            pass
+
+        try:  # try with exiftool
+            subprocess.Popen('exiftool', stdout=open('/dev/null'))
+            import exiftool
+            # Note: '-All=' must be followed by a known exiftool option.
+            if self.backup:
+                process = subprocess.Popen(['exiftool', '-m', '-All=',
+                    '-out', self.output, self.filename], stdout=open('/dev/null'))
+                process.wait()
+            else:
+                # Note: '-All=' must be followed by a known exiftool option.
+                process = subprocess.Popen(
+                    ['exiftool', '-All=', '-overwrite_original', self.filename],
+                    stdout=open('/dev/null'))
+                process.wait()
+            processed = True
+        except:
+            pass
+
+        if processed is False:
+            logging.error('Please install either pdfrw, or exiftool to\
+                    fully handle PDF files')
+        return processed
+
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file
+        '''
+        metadata = {}
+        for key in self.meta_list:
+            if self.document.get_property(key) is not None and \
+                self.document.get_property(key) != '':
+                metadata[key] = self.document.get_property(key)
+        return metadata
+
+
+class OpenXmlStripper(archive.GenericArchiveStripper):
+    '''
+        Represent an office openxml document, which is like
+        an opendocument format, with some tricky stuff added.
+        It contains mostly xml, but can have media blobs, crap, ...
+        (I don't like this format.)
+    '''
+    def _remove_all(self, method):
+        '''
+            FIXME ?
+            There is a patch implementing the Zipfile.remove()
+            method here : http://bugs.python.org/issue6818
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        zipout = zipfile.ZipFile(self.output, 'w',
+            allowZip64=True)
+        for item in zipin.namelist():
+            name = os.path.join(self.tempdir, item)
+            _, ext = os.path.splitext(name)
+            if item.startswith('docProps/'):  # metadatas
+                pass
+            elif ext in parser.NOMETA or item == '.rels':
+                #keep parser.NOMETA files, and the file named ".rels"
+                zipin.extract(item, self.tempdir)
+                zipout.write(name, item)
+            else:
+                zipin.extract(item, self.tempdir)
+                if os.path.isfile(name):  # don't care about folders
+                    try:
+                        cfile = mat.create_class_file(name, False,
+                            self.add2archive)
+                        if method == 'normal':
+                            cfile.remove_all()
+                        else:
+                            cfile.remove_all_strict()
+                        logging.debug('Processing %s from %s' % (item,
+                            self.filename))
+                        zipout.write(name, item)
+                    except:
+                        logging.info('%s\' fileformat is not supported' % item)
+                        if self.add2archive:
+                            zipout.write(name, item)
+        zipout.comment = ''
+        logging.info('%s treated' % self.filename)
+        zipin.close()
+        zipout.close()
+        self.do_backup()
+        return True
+
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        for item in zipin.namelist():
+            if item.startswith('docProps/'):
+                return False
+        zipin.close()
+        czf = archive.ZipStripper(self.filename, self.parser,
+                'application/zip', self.backup, self.add2archive)
+        if not czf.is_clean():
+            return False
+        else:
+            return True
+
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        metadata = {}
+        for item in zipin.namelist():
+            if item.startswith('docProps/'):
+                metadata[item] = 'harmful content'
+        zipin.close()
+        return metadata
-- 
cgit v1.3