summaryrefslogtreecommitdiff
path: root/libmat2/office.py
diff options
context:
space:
mode:
authorAlex Marchant2024-04-05 18:45:58 +0200
committerjvoisin2024-04-05 18:45:58 +0200
commit156855ab7e79a311c1d19e9c937c41aed12b7506 (patch)
tree639c0d46875b05ed066e926c7e807a4fb66cca53 /libmat2/office.py
parent09672a2dccb2fea0035278c7014f319b85e89c31 (diff)
Remove dangling references from document.xml.rels
The file `word/_rels/document.xml.rels` is similar to `[Content_Types].xml` and has references to other files in the archive. If those references aren't removed, Word refuses to open the document.
Diffstat (limited to 'libmat2/office.py')
-rw-r--r--libmat2/office.py38
1 files changed, 37 insertions, 1 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index f182277..fc83a20 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -323,6 +323,38 @@ class MSOfficeParser(ZipParser):
323 tree.write(full_path, xml_declaration=True, encoding='utf-8') 323 tree.write(full_path, xml_declaration=True, encoding='utf-8')
324 return True 324 return True
325 325
def __remove_document_xml_rels_members(self, full_path: str) -> bool:
    """ Remove the dangling references from the word/_rels/document.xml.rels
    file, since MS Office doesn't like them.

    A relationship is dropped when its `Target`, resolved relative to the
    `word/` directory, names an archive member that matches one of
    `self.files_to_omit` and none of `self.files_to_keep`.

    :param full_path: path of the extracted document.xml.rels file; it is
        rewritten in place.
    :return: False if the file can't be parsed as XML, True otherwise.
    """
    # Local import: zip member names always use '/' as separator, so the
    # posix flavour is used regardless of the host platform.
    import posixpath

    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if len(namespace) != 1:  # pragma: no cover
        logging.debug("Got several namespaces for Relationships: %s", namespace.items())

    # Archive members matching an omit pattern, unless they are allowlisted.
    with zipfile.ZipFile(self.filename) as zin:
        removed_fnames = {
            fname for fname in zin.namelist()
            if any(r.search(fname) for r in self.files_to_omit)
            and not any(r.search(fname) for r in self.files_to_keep)
        }

    root = tree.getroot()

    default_ns = namespace.get('')
    if default_ns is not None:
        rel_tag = '{%s}Relationship' % default_ns
    else:
        # No default namespace declared: reuse whatever namespace the root
        # element lives in instead of failing with a KeyError.
        ns_prefix = root.tag.rpartition('}')[0]  # '{uri' or ''
        rel_tag = ns_prefix + '}Relationship' if ns_prefix else 'Relationship'

    for item in root.findall(rel_tag):
        target = item.attrib.get('Target')
        if not target or item.attrib.get('TargetMode') == 'External':
            # Malformed entry, or a reference to something outside the archive
            continue
        # Targets are relative to the word/ directory, but may contain `..`
        # segments or be absolute (leading '/'); normalise them so they can
        # be compared with the archive member names.
        name = posixpath.normpath(posixpath.join('word', target)).lstrip('/')
        if name in removed_fnames:
            root.remove(item)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
357
326 @staticmethod 358 @staticmethod
327 def __remove_document_comment_meta(full_path: str) -> bool: 359 def __remove_document_comment_meta(full_path: str) -> bool:
328 try: 360 try:
@@ -445,7 +477,7 @@ class MSOfficeParser(ZipParser):
445 if os.stat(full_path).st_size == 0: # Don't process empty files 477 if os.stat(full_path).st_size == 0: # Don't process empty files
446 return True 478 return True
447 479
448 if not full_path.endswith('.xml'): 480 if not full_path.endswith(('.xml', '.xml.rels')):
449 return True 481 return True
450 482
451 if self.__randomize_creationId(full_path) is False: 483 if self.__randomize_creationId(full_path) is False:
@@ -465,6 +497,10 @@ class MSOfficeParser(ZipParser):
465 # remove comment references and ranges 497 # remove comment references and ranges
466 if self.__remove_document_comment_meta(full_path) is False: 498 if self.__remove_document_comment_meta(full_path) is False:
467 return False # pragma: no cover 499 return False # pragma: no cover
500 elif full_path.endswith('/word/_rels/document.xml.rels'):
501 # similar to the above, but for the document.xml.rels file
502 if self.__remove_document_xml_rels_members(full_path) is False: # pragma: no cover
503 return False
468 elif full_path.endswith('/docProps/app.xml'): 504 elif full_path.endswith('/docProps/app.xml'):
469 # This file must be present and valid, 505 # This file must be present and valid,
470 # so we're removing as much as we can. 506 # so we're removing as much as we can.