Python3, now with fewer features

I want to release a new version ASAP, so let's ditch some features for now.
author: jvoisin 2016-08-29 22:12:40 +0200
committer: jvoisin 2016-08-29 22:12:40 +0200
commit: 64b667be5d6b36d17839482593ccf2207af14ac9 (patch)
tree: 8ab14777fc5d6a8d9793c2a460ae9e4ea14c2909 /libmat/office.py
parent: a3c289dea1ceebcc2e624d002ab31deb851a7e3a (diff)
1 files changed, 1 insertions, 88 deletions
diff --git a/libmat/office.py b/libmat/office.py
index b23ec84..b4a05a7 100644
--- a/libmat/office.py
+++ b/libmat/office.py
@@ -18,94 +18,7 @@ except ImportError:
    logging.info('office.py loaded without PDF support')
 from libmat import parser
-from libmat import archive
+#from libmat import archive
-class OpenDocumentStripper(archive.TerminalZipStripper):
-    """ An open document file is a zip, with xml file into.
-        The one that interest us is meta.xml
-    """
-    def get_meta(self):
-        """ Return a dict with all the meta of the file by
-            trying to read the meta.xml file.
-        """
-        metadata = super(OpenDocumentStripper, self).get_meta()
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            content = zipin.read('meta.xml')
-            dom1 = minidom.parseString(content)
-            elements = dom1.getElementsByTagName('office:meta')
-            for i in elements[0].childNodes:
-                if i.tagName != 'meta:document-statistic':
-                    nodename = ''.join(i.nodeName.split(':')[1:])
-                    metadata[nodename] = ''.join([j.data for j in i.childNodes])
-        except KeyError:  # no meta.xml file found
-            logging.debug('%s has no opendocument metadata', self.filename)
-        zipin.close()
-        return metadata
-    def remove_all(self):
-        """ Removes metadata
-        """
-        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
-    def is_clean(self):
-        """ Check if the file is clean from harmful metadatas
-        """
-        clean_super = super(OpenDocumentStripper, self).is_clean()
-        if clean_super is False:
-            return False
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            zipin.getinfo('meta.xml')
-        except KeyError:  # no meta.xml in the file
-            return True
-        zipin.close()
-        return False
-class OpenXmlStripper(archive.TerminalZipStripper):
-    """ Represent an office openxml document, which is like
-        an opendocument format, with some tricky stuff added.
-        It contains mostly xml, but can have media blobs, crap, ...
-        (I don't like this format.)
-    """
-    def remove_all(self):
-        """ Remove harmful metadata, by deleting everything that doesn't end with '.rels' in the
-        'docProps' folder. """
-        return super(OpenXmlStripper, self).remove_all(
-            beginning_blacklist=['docProps/'], whitelist=['.rels'])
-    def is_clean(self):
-        """ Check if the file is clean from harmful metadatas.
-            This implementation is faster than something like
-            "return this.get_meta() == {}".
-        """
-        clean_super = super(OpenXmlStripper, self).is_clean()
-        if clean_super is False:
-            return False
-        zipin = zipfile.ZipFile(self.filename)
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                return False
-        zipin.close()
-        return True
-    def get_meta(self):
-        """ Return a dict with all the meta of the file
-        """
-        metadata = super(OpenXmlStripper, self).get_meta()
-        zipin = zipfile.ZipFile(self.filename)
-        for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                metadata[item] = 'harmful content'
-        zipin.close()
-        return metadata
 class PdfStripper(parser.GenericParser):
author	jvoisin	2016-08-29 22:12:40 +0200
committer	jvoisin	2016-08-29 22:12:40 +0200
commit	64b667be5d6b36d17839482593ccf2207af14ac9 (patch)
tree	8ab14777fc5d6a8d9793c2a460ae9e4ea14c2909 /libmat/office.py
parent	a3c289dea1ceebcc2e624d002ab31deb851a7e3a (diff)

diff --git a/libmat/office.py b/libmat/office.py index b23ec84..b4a05a7 100644 --- a/libmat/office.py +++ b/libmat/office.py
@@ -18,94 +18,7 @@ except ImportError:
18	logging.info('office.py loaded without PDF support')	18	logging.info('office.py loaded without PDF support')
19		19
20	from libmat import parser	20	from libmat import parser
21	from libmat import archive	21	#from libmat import archive
22
23
24	class OpenDocumentStripper(archive.TerminalZipStripper):
25	""" An open document file is a zip, with xml file into.
26	The one that interest us is meta.xml
27	"""
28
29	def get_meta(self):
30	""" Return a dict with all the meta of the file by
31	trying to read the meta.xml file.
32	"""
33	metadata = super(OpenDocumentStripper, self).get_meta()
34	zipin = zipfile.ZipFile(self.filename, 'r')
35	try:
36	content = zipin.read('meta.xml')
37	dom1 = minidom.parseString(content)
38	elements = dom1.getElementsByTagName('office:meta')
39	for i in elements[0].childNodes:
40	if i.tagName != 'meta:document-statistic':
41	nodename = ''.join(i.nodeName.split(':')[1:])
42	metadata[nodename] = ''.join([j.data for j in i.childNodes])
43	except KeyError: # no meta.xml file found
44	logging.debug('%s has no opendocument metadata', self.filename)
45	zipin.close()
46	return metadata
47
48	def remove_all(self):
49	""" Removes metadata
50	"""
51	return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
52
53	def is_clean(self):
54	""" Check if the file is clean from harmful metadatas
55	"""
56	clean_super = super(OpenDocumentStripper, self).is_clean()
57	if clean_super is False:
58	return False
59
60	zipin = zipfile.ZipFile(self.filename, 'r')
61	try:
62	zipin.getinfo('meta.xml')
63	except KeyError: # no meta.xml in the file
64	return True
65	zipin.close()
66	return False
67
68
69	class OpenXmlStripper(archive.TerminalZipStripper):
70	""" Represent an office openxml document, which is like
71	an opendocument format, with some tricky stuff added.
72	It contains mostly xml, but can have media blobs, crap, ...
73	(I don't like this format.)
74	"""
75
76	def remove_all(self):
77	""" Remove harmful metadata, by deleting everything that doesn't end with '.rels' in the
78	'docProps' folder. """
79	return super(OpenXmlStripper, self).remove_all(
80	beginning_blacklist=['docProps/'], whitelist=['.rels'])
81
82	def is_clean(self):
83	""" Check if the file is clean from harmful metadatas.
84	This implementation is faster than something like
85	"return this.get_meta() == {}".
86	"""
87	clean_super = super(OpenXmlStripper, self).is_clean()
88	if clean_super is False:
89	return False
90
91	zipin = zipfile.ZipFile(self.filename)
92	for item in zipin.namelist():
93	if item.startswith('docProps/'):
94	return False
95	zipin.close()
96	return True
97
98	def get_meta(self):
99	""" Return a dict with all the meta of the file
100	"""
101	metadata = super(OpenXmlStripper, self).get_meta()
102
103	zipin = zipfile.ZipFile(self.filename)
104	for item in zipin.namelist():
105	if item.startswith('docProps/'):
106	metadata[item] = 'harmful content'
107	zipin.close()
108	return metadata
109		22
110		23
111	class PdfStripper(parser.GenericParser):	24	class PdfStripper(parser.GenericParser):