From 64b667be5d6b36d17839482593ccf2207af14ac9 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 29 Aug 2016 22:12:40 +0200 Subject: Python3, now with less features I want to release a new version ASAP, so lets ditch some features for now. --- libmat/office.py | 89 +------------------------------------------------------- 1 file changed, 1 insertion(+), 88 deletions(-) (limited to 'libmat/office.py') diff --git a/libmat/office.py b/libmat/office.py index b23ec84..b4a05a7 100644 --- a/libmat/office.py +++ b/libmat/office.py @@ -18,94 +18,7 @@ except ImportError: logging.info('office.py loaded without PDF support') from libmat import parser -from libmat import archive - - -class OpenDocumentStripper(archive.TerminalZipStripper): - """ An open document file is a zip, with xml file into. - The one that interest us is meta.xml - """ - - def get_meta(self): - """ Return a dict with all the meta of the file by - trying to read the meta.xml file. - """ - metadata = super(OpenDocumentStripper, self).get_meta() - zipin = zipfile.ZipFile(self.filename, 'r') - try: - content = zipin.read('meta.xml') - dom1 = minidom.parseString(content) - elements = dom1.getElementsByTagName('office:meta') - for i in elements[0].childNodes: - if i.tagName != 'meta:document-statistic': - nodename = ''.join(i.nodeName.split(':')[1:]) - metadata[nodename] = ''.join([j.data for j in i.childNodes]) - except KeyError: # no meta.xml file found - logging.debug('%s has no opendocument metadata', self.filename) - zipin.close() - return metadata - - def remove_all(self): - """ Removes metadata - """ - return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) - - def is_clean(self): - """ Check if the file is clean from harmful metadatas - """ - clean_super = super(OpenDocumentStripper, self).is_clean() - if clean_super is False: - return False - - zipin = zipfile.ZipFile(self.filename, 'r') - try: - zipin.getinfo('meta.xml') - except KeyError: # no meta.xml in the file - return True - zipin.close() - return False - - -class OpenXmlStripper(archive.TerminalZipStripper): - """ Represent an office openxml document, which is like - an opendocument format, with some tricky stuff added. - It contains mostly xml, but can have media blobs, crap, ... - (I don't like this format.) - """ - - def remove_all(self): - """ Remove harmful metadata, by deleting everything that doesn't end with '.rels' in the - 'docProps' folder. """ - return super(OpenXmlStripper, self).remove_all( - beginning_blacklist=['docProps/'], whitelist=['.rels']) - - def is_clean(self): - """ Check if the file is clean from harmful metadatas. - This implementation is faster than something like - "return this.get_meta() == {}". - """ - clean_super = super(OpenXmlStripper, self).is_clean() - if clean_super is False: - return False - - zipin = zipfile.ZipFile(self.filename) - for item in zipin.namelist(): - if item.startswith('docProps/'): - return False - zipin.close() - return True - - def get_meta(self): - """ Return a dict with all the meta of the file - """ - metadata = super(OpenXmlStripper, self).get_meta() - - zipin = zipfile.ZipFile(self.filename) - for item in zipin.namelist(): - if item.startswith('docProps/'): - metadata[item] = 'harmful content' - zipin.close() - return metadata +#from libmat import archive class PdfStripper(parser.GenericParser): -- cgit v1.3