From cbf8a2a65928694202e19b6bcf56ec84bcbf613c Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sat, 8 Dec 2012 02:02:25 +0100 Subject: Reorganize source tree and files installation location, cleanup setup.py (Closes: #689409) --- MAT/office.py | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 MAT/office.py (limited to 'MAT/office.py') diff --git a/MAT/office.py b/MAT/office.py new file mode 100644 index 0000000..d14125b --- /dev/null +++ b/MAT/office.py @@ -0,0 +1,265 @@ +''' + Care about office's formats +''' + +import os +import logging +import zipfile +import fileinput +import xml.dom.minidom as minidom + +try: + import cairo + import poppler +except ImportError: + pass + +import mat +import parser +import archive + + +class OpenDocumentStripper(archive.GenericArchiveStripper): + ''' + An open document file is a zip, with xml file into. + The one that interest us is meta.xml + ''' + + def get_meta(self): + ''' + Return a dict with all the meta of the file by + trying to read the meta.xml file. + ''' + zipin = zipfile.ZipFile(self.filename, 'r') + metadata = {} + try: + content = zipin.read('meta.xml') + dom1 = minidom.parseString(content) + elements = dom1.getElementsByTagName('office:meta') + for i in elements[0].childNodes: + if i.tagName != 'meta:document-statistic': + nodename = ''.join([k for k in i.nodeName.split(':')[1:]]) + metadata[nodename] = ''.join([j.data for j in i.childNodes]) + else: + # thank you w3c for not providing a nice + # method to get all attributes from a node + pass + zipin.close() + except KeyError: # no meta.xml file found + logging.debug('%s has no opendocument metadata' % self.filename) + return metadata + + def _remove_all(self): + ''' + FIXME ? + There is a patch implementing the Zipfile.remove() + method here : http://bugs.python.org/issue6818 + ''' + zipin = zipfile.ZipFile(self.filename, 'r') + zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) + + for item in zipin.namelist(): + name = os.path.join(self.tempdir, item) + _, ext = os.path.splitext(name) + + if item.endswith('manifest.xml'): + # contain the list of all files present in the archive + zipin.extract(item, self.tempdir) + for line in fileinput.input(name, inplace=1): + #remove the line which contains "meta.xml" + line = line.strip() + if not 'meta.xml' in line: + print line + zipout.write(name, item) + + elif ext in parser.NOMETA or item == 'mimetype': + #keep NOMETA files, and the "manifest" file + if item != 'meta.xml': # contains the metadata + zipin.extract(item, self.tempdir) + zipout.write(name, item) + + else: + zipin.extract(item, self.tempdir) + if os.path.isfile(name): + try: + cfile = mat.create_class_file(name, False, + self.add2archive) + cfile.remove_all() + logging.debug('Processing %s from %s' % (item, + self.filename)) + zipout.write(name, item) + except: + logging.info('%s\' fileformat is not supported' % item) + if self.add2archive: + zipout.write(name, item) + zipout.comment = '' + logging.info('%s treated' % self.filename) + zipin.close() + zipout.close() + self.do_backup() + return True + + def is_clean(self): + ''' + Check if the file is clean from harmful metadatas + ''' + zipin = zipfile.ZipFile(self.filename, 'r') + try: + zipin.getinfo('meta.xml') + except KeyError: # no meta.xml in the file + czf = archive.ZipStripper(self.filename, self.parser, + 'application/zip', self.backup, self.add2archive) + if czf.is_clean(): + zipin.close() + return True + zipin.close() + return False + + +class PdfStripper(parser.GenericParser): + ''' + Represent a PDF file + ''' + def __init__(self, filename, parser, mime, backup, add2archive): + super(PdfStripper, self).__init__(filename, parser, mime, backup, + add2archive) + uri = 'file://' + os.path.abspath(self.filename) + self.password = None + self.document = poppler.document_new_from_file(uri, self.password) + self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator', + 'producer', 'metadata']) + + def is_clean(self): + ''' + Check if the file is clean from harmful metadatas + ''' + for key in self.meta_list: + if self.document.get_property(key): + return False + return True + + def remove_all(self): + ''' + Remove metadata + ''' + return self._remove_meta() + + def _remove_meta(self): + ''' + Opening the PDF with poppler, then doing a render + on a cairo pdfsurface for each pages. + + http://cairographics.org/documentation/pycairo/2/ + python-poppler is not documented at all : have fun ;) + ''' + page = self.document.get_page(0) + # assume that every pages are the same size + page_width, page_height = page.get_size() + surface = cairo.PDFSurface(self.output, page_width, page_height) + context = cairo.Context(surface) # context draws on the surface + logging.debug('PDF rendering of %s' % self.filename) + for pagenum in xrange(self.document.get_n_pages()): + page = self.document.get_page(pagenum) + context.translate(0, 0) + page.render_for_printing(context) # render the page on context + context.show_page() # draw context on surface + surface.finish() + + try: + import pdfrw # For now, poppler cannot write meta, so we must use pdfrw + logging.debug('Removing %s\'s superficial metadata' % self.filename) + trailer = pdfrw.PdfReader(self.output) + trailer.Info.Producer = None + trailer.Info.Creator = None + writer = pdfrw.PdfWriter() + writer.trailer = trailer + writer.write(self.output) + self.do_backup() + return True + except: + print('Unable to remove all metadata from %s, please install\ +pdfrw' % self.output) + return False + return True + + def get_meta(self): + ''' + Return a dict with all the meta of the file + ''' + metadata = {} + for key in self.meta_list: + if self.document.get_property(key): + metadata[key] = self.document.get_property(key) + return metadata + + +class OpenXmlStripper(archive.GenericArchiveStripper): + ''' + Represent an office openxml document, which is like + an opendocument format, with some tricky stuff added. + It contains mostly xml, but can have media blobs, crap, ... + (I don't like this format.) + ''' + def _remove_all(self): + ''' + FIXME ? + There is a patch implementing the Zipfile.remove() + method here : http://bugs.python.org/issue6818 + ''' + zipin = zipfile.ZipFile(self.filename, 'r') + zipout = zipfile.ZipFile(self.output, 'w', + allowZip64=True) + for item in zipin.namelist(): + name = os.path.join(self.tempdir, item) + _, ext = os.path.splitext(name) + if item.startswith('docProps/'): # metadatas + pass + elif ext in parser.NOMETA or item == '.rels': + #keep parser.NOMETA files, and the file named ".rels" + zipin.extract(item, self.tempdir) + zipout.write(name, item) + else: + zipin.extract(item, self.tempdir) + if os.path.isfile(name): # don't care about folders + try: + cfile = mat.create_class_file(name, False, + self.add2archive) + cfile.remove_all() + logging.debug('Processing %s from %s' % (item, + self.filename)) + zipout.write(name, item) + except: + logging.info('%s\' fileformat is not supported' % item) + if self.add2archive: + zipout.write(name, item) + zipout.comment = '' + logging.info('%s treated' % self.filename) + zipin.close() + zipout.close() + self.do_backup() + return True + + def is_clean(self): + ''' + Check if the file is clean from harmful metadatas + ''' + zipin = zipfile.ZipFile(self.filename, 'r') + for item in zipin.namelist(): + if item.startswith('docProps/'): + return False + zipin.close() + czf = archive.ZipStripper(self.filename, self.parser, + 'application/zip', self.backup, self.add2archive) + return czf.is_clean() + + def get_meta(self): + ''' + Return a dict with all the meta of the file + ''' + zipin = zipfile.ZipFile(self.filename, 'r') + metadata = {} + for item in zipin.namelist(): + if item.startswith('docProps/'): + metadata[item] = 'harmful content' + zipin.close() + return metadata -- cgit v1.3