summary | refs | log | tree | commit | diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author: jvoisin, 2018-10-01 12:25:37 -0700
committer: jvoisin, 2018-10-01 12:25:37 -0700
commit: 652b8e519fbd11da051f40ecde5b814e1e8fc013 (patch)
tree: d510e6543d61806facf99a70174c98145823f323 /libmat2/office.py
parent: c14be47f95e3a0fe6ecddfc236329bc3e76f5eec (diff)
Files processed via MAT2 are now accepted without warnings by MS Office
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 63
1 files changed, 50 insertions, 13 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index a8a2c94..4348d9b 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -36,9 +36,8 @@ def _sort_xml_attributes(full_path: str) -> bool:
36 since they are all using different orders. 36 since they are all using different orders.
37 """ 37 """
38 tree = ET.parse(full_path) 38 tree = ET.parse(full_path)
39 root = tree.getroot()
40 39
41 for c in root: 40 for c in tree.getroot():
42 c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) 41 c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
43 42
44 tree.write(full_path, xml_declaration=True) 43 tree.write(full_path, xml_declaration=True)
@@ -59,6 +58,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
59 'word/fontTable.xml', 58 'word/fontTable.xml',
60 'word/settings.xml', 59 'word/settings.xml',
61 'word/styles.xml', 60 'word/styles.xml',
61 'docProps/app.xml',
62 'docProps/core.xml',
62 63
63 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx 64 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
64 'word/stylesWithEffects.xml', 65 'word/stylesWithEffects.xml',
@@ -66,7 +67,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
66 files_to_omit = set(map(re.compile, { # type: ignore 67 files_to_omit = set(map(re.compile, { # type: ignore
67 'word/webSettings.xml', 68 'word/webSettings.xml',
68 'word/theme', 69 'word/theme',
69 '^docProps/',
70 })) 70 }))
71 71
72 @staticmethod 72 @staticmethod
@@ -95,7 +95,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
95 95
96 elements_to_remove = list() 96 elements_to_remove = list()
97 for item in tree.iterfind('.//', namespace): 97 for item in tree.iterfind('.//', namespace):
98 if '}rsid' in item.tag.strip().lower(): # resi as tag 98 if '}rsid' in item.tag.strip().lower(): # rsid as tag
99 elements_to_remove.append(item) 99 elements_to_remove.append(item)
100 continue 100 continue
101 for key in list(item.attrib.keys()): # rsid as attribute 101 for key in list(item.attrib.keys()): # rsid as attribute
@@ -106,7 +106,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
106 parent_map[element].remove(element) 106 parent_map[element].remove(element)
107 107
108 tree.write(full_path, xml_declaration=True) 108 tree.write(full_path, xml_declaration=True)
109
110 return True 109 return True
111 110
112 @staticmethod 111 @staticmethod
@@ -148,7 +147,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
148 parent_map[element].remove(element) 147 parent_map[element].remove(element)
149 148
150 tree.write(full_path, xml_declaration=True) 149 tree.write(full_path, xml_declaration=True)
151
152 return True 150 return True
153 151
154 def __remove_content_type_members(self, full_path: str) -> bool: 152 def __remove_content_type_members(self, full_path: str) -> bool:
@@ -176,27 +174,67 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
176 root.remove(item) 174 root.remove(item)
177 175
178 tree.write(full_path, xml_declaration=True) 176 tree.write(full_path, xml_declaration=True)
179
180 return True 177 return True
181 178
182 def _specific_cleanup(self, full_path: str) -> bool: 179 def _specific_cleanup(self, full_path: str) -> bool:
180 # pylint: disable=too-many-return-statements
183 if os.stat(full_path).st_size == 0: # Don't process empty files 181 if os.stat(full_path).st_size == 0: # Don't process empty files
184 return True 182 return True
185 183
184 if not full_path.endswith('.xml'):
185 return True
186
186 if full_path.endswith('/[Content_Types].xml'): 187 if full_path.endswith('/[Content_Types].xml'):
187 # this file contains references to files that we might 188 # this file contains references to files that we might
188 # remove, and MS Office doesn't like dangling references 189 # remove, and MS Office doesn't like dangling references
189 if self.__remove_content_type_members(full_path) is False: 190 if self.__remove_content_type_members(full_path) is False:
190 return False 191 return False
191 192 elif full_path.endswith('/word/document.xml'):
192 if full_path.endswith('/word/document.xml'):
193 # this file contains the revisions 193 # this file contains the revisions
194 if self.__remove_revisions(full_path) is False: 194 if self.__remove_revisions(full_path) is False:
195 return False 195 return False
196 elif full_path.endswith('/docProps/app.xml'):
197 # This file must be present and valid,
198 # so we're removing as much as we can.
199 with open(full_path, 'wb') as f:
200 f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
201 f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">')
202 f.write(b'</Properties>')
203 elif full_path.endswith('/docProps/core.xml'):
204 # This file must be present and valid,
205 # so we're removing as much as we can.
206 with open(full_path, 'wb') as f:
207 f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
208 f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
209 f.write(b'</cp:coreProperties>')
196 210
197 if full_path.endswith('.xml'): 211
198 if self.__remove_rsid(full_path) is False: 212 if self.__remove_rsid(full_path) is False:
199 return False 213 return False
214
215 try:
216 _sort_xml_attributes(full_path)
217 except ET.ParseError as e: # pragma: no cover
218 logging.error("Unable to parse %s: %s", full_path, e)
219 return False
220
221 # This is awful, I'm sorry.
222 #
223 # Microsoft Office isn't happy when we have the `mc:Ignorable`
224 # tag containing namespaces that aren't present in the xml file,
225 # so instead of trying to remove this specific tag with etree,
226 # we're removing it, with a regexp.
227 #
228 # Since we're the ones producing this file, via the call to
229 # _sort_xml_attributes, there won't be any "funny tricks".
230 # Worst case, the tag isn't present, and everything is fine.
231 #
232 # see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
233 with open(full_path, 'rb') as f:
234 text = f.read()
235 out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
236 with open(full_path, 'wb') as f:
237 f.write(out)
200 238
201 return True 239 return True
202 240
@@ -262,7 +300,6 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
262 text.remove(changes) 300 text.remove(changes)
263 301
264 tree.write(full_path, xml_declaration=True) 302 tree.write(full_path, xml_declaration=True)
265
266 return True 303 return True
267 304
268 def _specific_cleanup(self, full_path: str) -> bool: 305 def _specific_cleanup(self, full_path: str) -> bool: