From af36529554c39a2eefcc2c8723715e2d25b401b8 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 8 Jun 2014 13:39:18 +0200 Subject: Rename the MAT folder to libmat. This commit fixes some issues for dump operating systems who doesn't handle capitalization. --- libmat/office.py | 191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 libmat/office.py (limited to 'libmat/office.py') diff --git a/libmat/office.py b/libmat/office.py new file mode 100644 index 0000000..0ca1ff1 --- /dev/null +++ b/libmat/office.py @@ -0,0 +1,191 @@ +''' Care about office's formats + +''' + +import logging +import os +import shutil +import tempfile +import xml.dom.minidom as minidom +import zipfile + +try: + import cairo + from gi.repository import Poppler +except ImportError: + logging.info('office.py loaded without PDF support') + pass + +import parser +import archive + + +class OpenDocumentStripper(archive.TerminalZipStripper): + ''' An open document file is a zip, with xml file into. + The one that interest us is meta.xml + ''' + + def get_meta(self): + ''' Return a dict with all the meta of the file by + trying to read the meta.xml file. + ''' + metadata = super(OpenDocumentStripper, self).get_meta() + zipin = zipfile.ZipFile(self.filename, 'r') + try: + content = zipin.read('meta.xml') + dom1 = minidom.parseString(content) + elements = dom1.getElementsByTagName('office:meta') + for i in elements[0].childNodes: + if i.tagName != 'meta:document-statistic': + nodename = ''.join(i.nodeName.split(':')[1:]) + metadata[nodename] = ''.join([j.data for j in i.childNodes]) + else: + # thank you w3c for not providing a nice + # method to get all attributes of a node + pass + except KeyError: # no meta.xml file found + logging.debug('%s has no opendocument metadata' % self.filename) + zipin.close() + return metadata + + def remove_all(self): + ''' Removes metadata + ''' + return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) + + def is_clean(self): + ''' Check if the file is clean from harmful metadatas + ''' + clean_super = super(OpenDocumentStripper, self).is_clean() + if clean_super is False: + return False + + zipin = zipfile.ZipFile(self.filename, 'r') + try: + zipin.getinfo('meta.xml') + except KeyError: # no meta.xml in the file + return True + zipin.close() + return False + + +class OpenXmlStripper(archive.TerminalZipStripper): + ''' Represent an office openxml document, which is like + an opendocument format, with some tricky stuff added. + It contains mostly xml, but can have media blobs, crap, ... + (I don't like this format.) + ''' + def remove_all(self): + return super(OpenXmlStripper, self).remove_all( + beginning_blacklist=('docProps/'), whitelist=('.rels')) + + def is_clean(self): + ''' Check if the file is clean from harmful metadatas. + This implementation is faster than something like + "return this.get_meta() == {}". + ''' + clean_super = super(OpenXmlStripper, self).is_clean() + if clean_super is False: + return False + + zipin = zipfile.ZipFile(self.filename, 'r') + for item in zipin.namelist(): + if item.startswith('docProps/'): + return False + zipin.close() + return True + + def get_meta(self): + ''' Return a dict with all the meta of the file + ''' + metadata = super(OpenXmlStripper, self).get_meta() + + zipin = zipfile.ZipFile(self.filename, 'r') + for item in zipin.namelist(): + if item.startswith('docProps/'): + metadata[item] = 'harmful content' + zipin.close() + return metadata + + +class PdfStripper(parser.GenericParser): + ''' Represent a PDF file + ''' + def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): + super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) + self.uri = 'file://' + os.path.abspath(self.filename) + self.password = None + try: + self.pdf_quality = kwargs['low_pdf_quality'] + except KeyError: + self.pdf_quality = False + + self.meta_list = frozenset(['title', 'author', 'subject', + 'keywords', 'creator', 'producer', 'metadata']) + + def is_clean(self): + ''' Check if the file is clean from harmful metadatas + ''' + document = Poppler.Document.new_from_file(self.uri, self.password) + for key in self.meta_list: + if document.get_property(key): + return False + return True + + def remove_all(self): + ''' Opening the PDF with poppler, then doing a render + on a cairo pdfsurface for each pages. + + http://cairographics.org/documentation/pycairo/2/ + + The use of an intermediate tempfile is necessary because + python-cairo segfaults on unicode. + See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457 + ''' + document = Poppler.Document.new_from_file(self.uri, self.password) + try: + output = tempfile.mkstemp()[1] + page = document.get_page(0) + # assume that every pages are the same size + page_width, page_height = page.get_size() + surface = cairo.PDFSurface(output, page_width, page_height) + context = cairo.Context(surface) # context draws on the surface + logging.debug('PDF rendering of %s' % self.filename) + for pagenum in range(document.get_n_pages()): + page = document.get_page(pagenum) + context.translate(0, 0) + if self.pdf_quality: + page.render(context) # render the page on context + else: + page.render_for_printing(context) # render the page on context + context.show_page() # draw context on surface + surface.finish() + shutil.move(output, self.output) + except: + logging.error('Something went wrong when cleaning %s.' % self.filename) + return False + + try: + import pdfrw # For now, poppler cannot write meta, so we must use pdfrw + logging.debug('Removing %s\'s superficial metadata' % self.filename) + trailer = pdfrw.PdfReader(self.output) + trailer.Info.Producer = None + trailer.Info.Creator = None + writer = pdfrw.PdfWriter() + writer.trailer = trailer + writer.write(self.output) + self.do_backup() + except: + logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output) + return False + return True + + def get_meta(self): + ''' Return a dict with all the meta of the file + ''' + document = Poppler.Document.new_from_file(self.uri, self.password) + metadata = {} + for key in self.meta_list: + if document.get_property(key): + metadata[key] = document.get_property(key) + return metadata -- cgit v1.3