From af36529554c39a2eefcc2c8723715e2d25b401b8 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 8 Jun 2014 13:39:18 +0200 Subject: Rename the MAT folder to libmat. This commit fixes some issues for dumb operating systems that don't handle capitalization. --- MAT/office.py | 191 ---------------------------------------------------------- 1 file changed, 191 deletions(-) delete mode 100644 MAT/office.py (limited to 'MAT/office.py') diff --git a/MAT/office.py b/MAT/office.py deleted file mode 100644 index 0ca1ff1..0000000 --- a/MAT/office.py +++ /dev/null @@ -1,191 +0,0 @@ -''' Care about office's formats - -''' - -import logging -import os -import shutil -import tempfile -import xml.dom.minidom as minidom -import zipfile - -try: - import cairo - from gi.repository import Poppler -except ImportError: - logging.info('office.py loaded without PDF support') - pass - -import parser -import archive - - -class OpenDocumentStripper(archive.TerminalZipStripper): - ''' An open document file is a zip, with xml file into. - The one that interest us is meta.xml - ''' - - def get_meta(self): - ''' Return a dict with all the meta of the file by - trying to read the meta.xml file. 
- ''' - metadata = super(OpenDocumentStripper, self).get_meta() - zipin = zipfile.ZipFile(self.filename, 'r') - try: - content = zipin.read('meta.xml') - dom1 = minidom.parseString(content) - elements = dom1.getElementsByTagName('office:meta') - for i in elements[0].childNodes: - if i.tagName != 'meta:document-statistic': - nodename = ''.join(i.nodeName.split(':')[1:]) - metadata[nodename] = ''.join([j.data for j in i.childNodes]) - else: - # thank you w3c for not providing a nice - # method to get all attributes of a node - pass - except KeyError: # no meta.xml file found - logging.debug('%s has no opendocument metadata' % self.filename) - zipin.close() - return metadata - - def remove_all(self): - ''' Removes metadata - ''' - return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) - - def is_clean(self): - ''' Check if the file is clean from harmful metadatas - ''' - clean_super = super(OpenDocumentStripper, self).is_clean() - if clean_super is False: - return False - - zipin = zipfile.ZipFile(self.filename, 'r') - try: - zipin.getinfo('meta.xml') - except KeyError: # no meta.xml in the file - return True - zipin.close() - return False - - -class OpenXmlStripper(archive.TerminalZipStripper): - ''' Represent an office openxml document, which is like - an opendocument format, with some tricky stuff added. - It contains mostly xml, but can have media blobs, crap, ... - (I don't like this format.) - ''' - def remove_all(self): - return super(OpenXmlStripper, self).remove_all( - beginning_blacklist=('docProps/'), whitelist=('.rels')) - - def is_clean(self): - ''' Check if the file is clean from harmful metadatas. - This implementation is faster than something like - "return this.get_meta() == {}". 
- ''' - clean_super = super(OpenXmlStripper, self).is_clean() - if clean_super is False: - return False - - zipin = zipfile.ZipFile(self.filename, 'r') - for item in zipin.namelist(): - if item.startswith('docProps/'): - return False - zipin.close() - return True - - def get_meta(self): - ''' Return a dict with all the meta of the file - ''' - metadata = super(OpenXmlStripper, self).get_meta() - - zipin = zipfile.ZipFile(self.filename, 'r') - for item in zipin.namelist(): - if item.startswith('docProps/'): - metadata[item] = 'harmful content' - zipin.close() - return metadata - - -class PdfStripper(parser.GenericParser): - ''' Represent a PDF file - ''' - def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): - super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) - self.uri = 'file://' + os.path.abspath(self.filename) - self.password = None - try: - self.pdf_quality = kwargs['low_pdf_quality'] - except KeyError: - self.pdf_quality = False - - self.meta_list = frozenset(['title', 'author', 'subject', - 'keywords', 'creator', 'producer', 'metadata']) - - def is_clean(self): - ''' Check if the file is clean from harmful metadatas - ''' - document = Poppler.Document.new_from_file(self.uri, self.password) - for key in self.meta_list: - if document.get_property(key): - return False - return True - - def remove_all(self): - ''' Opening the PDF with poppler, then doing a render - on a cairo pdfsurface for each pages. - - http://cairographics.org/documentation/pycairo/2/ - - The use of an intermediate tempfile is necessary because - python-cairo segfaults on unicode. 
- See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457 - ''' - document = Poppler.Document.new_from_file(self.uri, self.password) - try: - output = tempfile.mkstemp()[1] - page = document.get_page(0) - # assume that every pages are the same size - page_width, page_height = page.get_size() - surface = cairo.PDFSurface(output, page_width, page_height) - context = cairo.Context(surface) # context draws on the surface - logging.debug('PDF rendering of %s' % self.filename) - for pagenum in range(document.get_n_pages()): - page = document.get_page(pagenum) - context.translate(0, 0) - if self.pdf_quality: - page.render(context) # render the page on context - else: - page.render_for_printing(context) # render the page on context - context.show_page() # draw context on surface - surface.finish() - shutil.move(output, self.output) - except: - logging.error('Something went wrong when cleaning %s.' % self.filename) - return False - - try: - import pdfrw # For now, poppler cannot write meta, so we must use pdfrw - logging.debug('Removing %s\'s superficial metadata' % self.filename) - trailer = pdfrw.PdfReader(self.output) - trailer.Info.Producer = None - trailer.Info.Creator = None - writer = pdfrw.PdfWriter() - writer.trailer = trailer - writer.write(self.output) - self.do_backup() - except: - logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output) - return False - return True - - def get_meta(self): - ''' Return a dict with all the meta of the file - ''' - document = Poppler.Document.new_from_file(self.uri, self.password) - metadata = {} - for key in self.meta_list: - if document.get_property(key): - metadata[key] = document.get_property(key) - return metadata -- cgit v1.3