summaryrefslogtreecommitdiff
path: root/libmat/office.py
diff options
context:
space:
mode:
authorjvoisin2016-08-29 22:12:40 +0200
committerjvoisin2016-08-29 22:12:40 +0200
commit64b667be5d6b36d17839482593ccf2207af14ac9 (patch)
tree8ab14777fc5d6a8d9793c2a460ae9e4ea14c2909 /libmat/office.py
parenta3c289dea1ceebcc2e624d002ab31deb851a7e3a (diff)
Python3, now with fewer features
I want to release a new version ASAP, so let's ditch some features for now.
Diffstat (limited to 'libmat/office.py')
-rw-r--r--libmat/office.py89
1 files changed, 1 insertions, 88 deletions
diff --git a/libmat/office.py b/libmat/office.py
index b23ec84..b4a05a7 100644
--- a/libmat/office.py
+++ b/libmat/office.py
@@ -18,94 +18,7 @@ except ImportError:
18 logging.info('office.py loaded without PDF support') 18 logging.info('office.py loaded without PDF support')
19 19
20from libmat import parser 20from libmat import parser
21from libmat import archive 21#from libmat import archive
22
23
class OpenDocumentStripper(archive.TerminalZipStripper):
    """ An OpenDocument file is a zip archive containing xml files.
        The one that interests us is meta.xml
    """

    def get_meta(self):
        """ Return a dict with all the meta of the file by
            trying to read the meta.xml file.

            The dict returned by the parent class is extended with one
            entry per child element of <office:meta>, except for
            <meta:document-statistic>. Keys are the element names without
            their namespace prefix; values are the concatenated data of
            the element's child nodes.
        """
        metadata = super(OpenDocumentStripper, self).get_meta()
        # The context manager closes the archive on every exit path,
        # including when reading raises.
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            try:
                content = zipin.read('meta.xml')
            except KeyError:  # no meta.xml file found
                logging.debug('%s has no opendocument metadata', self.filename)
                return metadata

        dom1 = minidom.parseString(content)
        elements = dom1.getElementsByTagName('office:meta')
        for i in elements[0].childNodes:
            # Text/comment nodes (e.g. whitespace in a pretty-printed
            # meta.xml) have no tagName; only elements carry metadata.
            if i.nodeType != i.ELEMENT_NODE:
                continue
            if i.tagName != 'meta:document-statistic':
                nodename = ''.join(i.nodeName.split(':')[1:])
                metadata[nodename] = ''.join([j.data for j in i.childNodes])
        return metadata

    def remove_all(self):
        """ Removes metadata, by asking the parent class to blacklist
            every archive member ending with 'meta.xml'.
        """
        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])

    def is_clean(self):
        """ Check if the file is clean from harmful metadatas

            Returns False if the parent class reports the file as not
            clean, or if the archive contains a meta.xml member;
            True otherwise.
        """
        clean_super = super(OpenDocumentStripper, self).is_clean()
        if clean_super is False:
            return False

        # 'with' guarantees the archive is closed on both return paths.
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            try:
                zipin.getinfo('meta.xml')
            except KeyError:  # no meta.xml in the file
                return True
        return False
67
68
class OpenXmlStripper(archive.TerminalZipStripper):
    """ Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, crap, ...
        (I don't like this format.)
    """

    def remove_all(self):
        """ Remove harmful metadata, by deleting everything that doesn't end with '.rels' in the
            'docProps' folder. """
        return super(OpenXmlStripper, self).remove_all(
            beginning_blacklist=['docProps/'], whitelist=['.rels'])

    def is_clean(self):
        """ Check if the file is clean from harmful metadatas.
            This implementation is faster than something like
            "return self.get_meta() == {}".

            Returns False if the parent class reports the file as not
            clean, or if any archive member name starts with 'docProps/';
            True otherwise.
        """
        clean_super = super(OpenXmlStripper, self).is_clean()
        if clean_super is False:
            return False

        # 'with' closes the archive even when a docProps/ member is found
        # and we return early.
        with zipfile.ZipFile(self.filename) as zipin:
            return not any(item.startswith('docProps/')
                           for item in zipin.namelist())

    def get_meta(self):
        """ Return a dict with all the meta of the file

            The dict returned by the parent class is extended with one
            entry per archive member whose name starts with 'docProps/',
            each mapped to the string 'harmful content'.
        """
        metadata = super(OpenXmlStripper, self).get_meta()

        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    metadata[item] = 'harmful content'
        return metadata
109 22
110 23
111class PdfStripper(parser.GenericParser): 24class PdfStripper(parser.GenericParser):