# Reconstructed from the patch that deleted lib/office.py:
#   commit cbf8a2a65928694202e19b6bcf56ec84bcbf613c
#   From: jvoisin, Sat, 8 Dec 2012 02:02:25 +0100
#   Subject: Reorganize source tree and files installation location,
#            cleanup setup.py (Closes: #689409)
#   lib/office.py | 265 deletions, file removed at index d14125b

'''
    Care about office's formats
'''

import os
import logging
import zipfile
import fileinput
import xml.dom.minidom as minidom

try:
    # In this module only PdfStripper uses cairo and poppler; a missing
    # binding is tolerated at import time and surfaces as a NameError
    # when a PdfStripper is actually built.
    import cairo
    import poppler
except ImportError:
    pass

import mat
import parser
import archive


def _drop_lines_containing(path, needle):
    '''
        Rewrite the file `path` in place, keeping only the lines that
        do not contain `needle`.

        `needle` must be a byte string: the file is handled in binary
        mode so that no text encoding has to be assumed.
        Every kept line is stripped of its surrounding whitespace and
        written back followed by a single newline.
    '''
    with open(path, 'rb') as handle:
        kept = [line.strip() for line in handle if needle not in line]
    with open(path, 'wb') as handle:
        for line in kept:
            handle.write(line + b'\n')


def _strip_member(zipout, name, item, archive_name, add2archive):
    '''
        Clean one extracted archive member and add it to `zipout`.

        zipout       -- zipfile.ZipFile opened for writing
        name         -- path of the extracted member on disk
        item         -- name of the member inside the archive
        archive_name -- name of the source archive (only used in logs)
        add2archive  -- when true, a member that could not be cleaned
                        is added to `zipout` as it is

        Any failure while cleaning is logged as an unsupported format;
        the member is then dropped unless `add2archive` is set.
    '''
    try:
        cfile = mat.create_class_file(name, False, add2archive)
        cfile.remove_all()
        logging.debug('Processing %s from %s', item, archive_name)
        zipout.write(name, item)
    except Exception:  # best effort, but let Ctrl-C and exit through
        logging.info('%s\' fileformat is not supported', item)
        if add2archive:
            zipout.write(name, item)


class OpenDocumentStripper(archive.GenericArchiveStripper):
    '''
        An open document file is a zip, with xml files into.
        The one that interests us is meta.xml
    '''

    def get_meta(self):
        '''
            Return a dict with all the meta of the file by
            trying to read the meta.xml file.

            Keys are the child tag names of <office:meta> without their
            namespace prefix, values are the text they contain.
            An empty dict is returned when the archive has no meta.xml
            or when meta.xml has no <office:meta> element.
        '''
        metadata = {}
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            try:
                content = zipin.read('meta.xml')
            except KeyError:  # no meta.xml file found
                logging.debug('%s has no opendocument metadata',
                              self.filename)
                return metadata

        dom1 = minidom.parseString(content)
        elements = dom1.getElementsByTagName('office:meta')
        if not elements:
            return metadata

        for node in elements[0].childNodes:
            if node.nodeType != node.ELEMENT_NODE:
                # text and comment nodes have no tagName
                continue
            if node.tagName == 'meta:document-statistic':
                # thank you w3c for not providing a nice
                # method to get all attributes from a node
                continue
            nodename = ''.join(node.nodeName.split(':')[1:])
            metadata[nodename] = ''.join(
                child.data for child in node.childNodes
                if child.nodeType in (child.TEXT_NODE,
                                      child.CDATA_SECTION_NODE))
        return metadata

    def _remove_all(self):
        '''
            Write a copy of the archive to self.output, without the
            comment of the zip and without the reference to meta.xml
            in the manifest, then call self.do_backup().
            Always return True.

            FIXME ?
            There is a patch implementing the Zipfile.remove()
            method here : http://bugs.python.org/issue6818
        '''
        with zipfile.ZipFile(self.filename, 'r') as zipin, \
                zipfile.ZipFile(self.output, 'w',
                                allowZip64=True) as zipout:
            for item in zipin.namelist():
                name = os.path.join(self.tempdir, item)
                ext = os.path.splitext(name)[1]

                if item.endswith('manifest.xml'):
                    # contains the list of all files present in the
                    # archive: remove the lines mentioning "meta.xml"
                    zipin.extract(item, self.tempdir)
                    _drop_lines_containing(name, b'meta.xml')
                    zipout.write(name, item)

                elif ext in parser.NOMETA or item == 'mimetype':
                    # keep NOMETA files, and the "mimetype" file
                    if item != 'meta.xml':  # contains the metadata
                        zipin.extract(item, self.tempdir)
                        zipout.write(name, item)

                else:
                    zipin.extract(item, self.tempdir)
                    if os.path.isfile(name):  # don't care about folders
                        _strip_member(zipout, name, item, self.filename,
                                      self.add2archive)
            zipout.comment = b''
        logging.info('%s treated', self.filename)
        self.do_backup()
        return True

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadatas

            The file is clean when it has no meta.xml and when
            archive.ZipStripper also reports it as clean.
        '''
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            try:
                zipin.getinfo('meta.xml')
            except KeyError:  # no meta.xml in the file
                pass
            else:
                return False
        czf = archive.ZipStripper(self.filename, self.parser,
                                  'application/zip', self.backup,
                                  self.add2archive)
        return bool(czf.is_clean())


class PdfStripper(parser.GenericParser):
    '''
        Represent a PDF file
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(PdfStripper, self).__init__(filename, parser, mime, backup,
                                          add2archive)
        uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.document = poppler.document_new_from_file(uri, self.password)
        # document properties that are looked at by is_clean/get_meta
        self.meta_list = frozenset(['title', 'author', 'subject',
                                    'keywords', 'creator', 'producer',
                                    'metadata'])

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadatas

            Return False as soon as one property of self.meta_list
            is set on the document.
        '''
        return not any(self.document.get_property(key)
                       for key in self.meta_list)

    def remove_all(self):
        '''
            Remove metadata
        '''
        return self._remove_meta()

    def _remove_meta(self):
        '''
            Opening the PDF with poppler, then doing a render
            on a cairo pdfsurface for each pages.
            The Producer and Creator fields of the rendered file are
            then blanked with pdfrw.

            Return True on success, False when pdfrw is missing or
            when it failed to rewrite the file.

            http://cairographics.org/documentation/pycairo/2/
            python-poppler is not documented at all : have fun ;)
        '''
        page = self.document.get_page(0)
        # assume that every pages are the same size
        page_width, page_height = page.get_size()
        surface = cairo.PDFSurface(self.output, page_width, page_height)
        context = cairo.Context(surface)  # context draws on the surface
        logging.debug('PDF rendering of %s', self.filename)
        for pagenum in range(self.document.get_n_pages()):
            page = self.document.get_page(pagenum)
            context.translate(0, 0)
            page.render_for_printing(context)  # render the page on context
            context.show_page()  # draw context on surface
        surface.finish()

        try:
            # For now, poppler cannot write meta, so we must use pdfrw
            import pdfrw
        except ImportError:
            print('Unable to remove all metadata from %s, please install '
                  'pdfrw' % self.output)
            return False

        try:
            logging.debug('Removing %s\'s superficial metadata',
                          self.filename)
            trailer = pdfrw.PdfReader(self.output)
            trailer.Info.Producer = None
            trailer.Info.Creator = None
            writer = pdfrw.PdfWriter()
            writer.trailer = trailer
            writer.write(self.output)
            self.do_backup()
        except Exception as err:  # pdfrw is there, but the rewrite failed
            print('Unable to remove all metadata from %s: %s'
                  % (self.output, err))
            return False
        return True

    def get_meta(self):
        '''
            Return a dict with all the meta of the file

            Only the properties of self.meta_list that are set on the
            document are reported.
        '''
        metadata = {}
        for key in self.meta_list:
            value = self.document.get_property(key)
            if value:
                metadata[key] = value
        return metadata


class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
        Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, crap, ...
        (I don't like this format.)
    '''
    def _remove_all(self):
        '''
            Write a copy of the archive to self.output, without the
            comment of the zip and without the docProps/ members,
            then call self.do_backup().
            Always return True.

            FIXME ?
            There is a patch implementing the Zipfile.remove()
            method here : http://bugs.python.org/issue6818
        '''
        with zipfile.ZipFile(self.filename, 'r') as zipin, \
                zipfile.ZipFile(self.output, 'w',
                                allowZip64=True) as zipout:
            for item in zipin.namelist():
                if item.startswith('docProps/'):  # metadatas
                    continue
                name = os.path.join(self.tempdir, item)
                ext = os.path.splitext(name)[1]
                zipin.extract(item, self.tempdir)
                if ext in parser.NOMETA or item == '.rels':
                    # keep parser.NOMETA files, and the file named ".rels"
                    zipout.write(name, item)
                elif os.path.isfile(name):  # don't care about folders
                    _strip_member(zipout, name, item, self.filename,
                                  self.add2archive)
            zipout.comment = b''
        logging.info('%s treated', self.filename)
        self.do_backup()
        return True

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadatas

            The file is not clean as soon as it has a docProps/ member;
            otherwise the answer of archive.ZipStripper is returned.
        '''
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            if any(item.startswith('docProps/')
                   for item in zipin.namelist()):
                return False
        czf = archive.ZipStripper(self.filename, self.parser,
                                  'application/zip', self.backup,
                                  self.add2archive)
        return czf.is_clean()

    def get_meta(self):
        '''
            Return a dict with all the meta of the file

            Every docProps/ member of the archive is reported, with
            the fixed value 'harmful content'.
        '''
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            return {item: 'harmful content'
                    for item in zipin.namelist()
                    if item.startswith('docProps/')}