summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libmat2/office.py49
1 files changed, 44 insertions, 5 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 50b776e..5c2c996 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,3 +1,4 @@
1import logging
1import os 2import os
2import re 3import re
3import zipfile 4import zipfile
@@ -12,16 +13,38 @@ assert Set
12assert Pattern 13assert Pattern
13 14
14def _parse_xml(full_path: str): 15def _parse_xml(full_path: str):
15 """ This function parse XML, with namespace support. """ 16 """ This function parses XML, with namespace support. """
16 17
18 cpt = 0
17 namespace_map = dict() 19 namespace_map = dict()
18 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): 20 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
21 # The ns[0-9]+ namespaces are reserved for internal usage, so
22 # we have to use another nomenclature.
23 if re.match('^ns[0-9]+$', key):
24 key = 'mat%d' % cpt
25 cpt += 1
26
19 namespace_map[key] = value 27 namespace_map[key] = value
20 ET.register_namespace(key, value) 28 ET.register_namespace(key, value)
21 29
22 return ET.parse(full_path), namespace_map 30 return ET.parse(full_path), namespace_map
23 31
24 32
33def _sort_xml_attributes(full_path: str) -> bool:
34 """ Sort xml attributes lexicographically,
35 because it's possible to fingerprint producers (MS Office, Libreoffice, …)
36 since they are all using different orders.
37 """
38 tree = ET.parse(full_path)
39 root = tree.getroot()
40
41 for c in root:
42 c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
43
44 tree.write(full_path, xml_declaration=True)
45 return True
46
47
25class MSOfficeParser(ArchiveBasedAbstractParser): 48class MSOfficeParser(ArchiveBasedAbstractParser):
26 mimetypes = { 49 mimetypes = {
27 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 50 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
@@ -49,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
49 """ 72 """
50 try: 73 try:
51 tree, namespace = _parse_xml(full_path) 74 tree, namespace = _parse_xml(full_path)
52 except ET.ParseError: 75 except ET.ParseError as e:
76 logging.error("Unable to parse %s: %s", full_path, e)
53 return False 77 return False
54 78
55 # Revisions are either deletions (`w:del`) or 79 # Revisions are either deletions (`w:del`) or
@@ -83,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
83 return True 107 return True
84 108
85 def _specific_cleanup(self, full_path: str) -> bool: 109 def _specific_cleanup(self, full_path: str) -> bool:
110 if os.stat(full_path).st_size == 0: # Don't process empty files
111 return True
112
86 if full_path.endswith('/word/document.xml'): 113 if full_path.endswith('/word/document.xml'):
87 # this file contains the revisions 114 # this file contains the revisions
88 return self.__remove_revisions(full_path) 115 return self.__remove_revisions(full_path)
@@ -139,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
139 def __remove_revisions(full_path: str) -> bool: 166 def __remove_revisions(full_path: str) -> bool:
140 try: 167 try:
141 tree, namespace = _parse_xml(full_path) 168 tree, namespace = _parse_xml(full_path)
142 except ET.ParseError: 169 except ET.ParseError as e:
170 logging.error("Unable to parse %s: %s", full_path, e)
143 return False 171 return False
144 172
145 if 'office' not in namespace.keys(): # no revisions in the current file 173 if 'office' not in namespace.keys(): # no revisions in the current file
@@ -154,8 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
154 return True 182 return True
155 183
156 def _specific_cleanup(self, full_path: str) -> bool: 184 def _specific_cleanup(self, full_path: str) -> bool:
157 if os.path.basename(full_path) == 'content.xml': 185 if os.stat(full_path).st_size == 0: # Don't process empty files
158 return self.__remove_revisions(full_path) 186 return True
187
188 if os.path.basename(full_path).endswith('.xml'):
189 if os.path.basename(full_path) == 'content.xml':
190 if self.__remove_revisions(full_path) is False:
191 return False
192
193 try:
194 _sort_xml_attributes(full_path)
195 except ET.ParseError as e:
196 logging.error("Unable to parse %s: %s", full_path, e)
197 return False
159 return True 198 return True
160 199
161 def get_meta(self) -> Dict[str, str]: 200 def get_meta(self) -> Dict[str, str]: