summary refs log tree commit diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author Alex Marchant 2024-04-03 15:20:00 -0400
committer Alex Marchant 2024-04-03 15:20:00 -0400
commit 61f39c4bd0b51be6371fb2973c14054a2772352e (patch)
tree 3f5a8c11eb8c37fbc229a213a6e89ade3b5272b0 /libmat2/office.py
parent 1b9ce34e2c3da718e79137e2c2210ccdcd299486 (diff)
Strip comment references from document.xml
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 36
1 files changed, 36 insertions, 0 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 6f69e4a..66f462b 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
290 tree.write(full_path, xml_declaration=True) 290 tree.write(full_path, xml_declaration=True)
291 return True 291 return True
292 292
@staticmethod
def __remove_document_comment_meta(full_path: str) -> bool:
    """ Strip the comment anchors from the XML file at `full_path`.

    Every `w:commentRangeStart`, `w:commentRangeEnd` and
    `w:commentReference` element is detached from its parent, and the
    file is rewritten in place only when at least one of them was found.

    Return False when the file can't be parsed as XML, True otherwise
    (including when there was nothing to remove).
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    # ElementTree raises SyntaxError when a path uses a prefix that is
    # missing from the namespace map, so bail out before querying.
    # NOTE(review): a document binding the same namespace to another
    # prefix would be left untouched here -- confirm this can't happen.
    if 'w' not in namespace:
        return True  # pragma: no cover

    # A single pass both collects the targets and tells us whether
    # there is anything to do at all.
    tags = ('commentRangeStart', 'commentRangeEnd', 'commentReference')
    elements_del = [
        element
        for tag in tags
        for element in tree.iterfind('.//w:' + tag, namespace)
    ]
    if not elements_del:
        return True  # No comment meta tags are present

    # ElementTree nodes don't know their parent, so build a reverse map
    # before mutating the tree.
    parent_map = {child: parent for parent in tree.iter() for child in parent}
    for element in elements_del:
        parent_map[element].remove(element)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
325
293 def __remove_content_type_members(self, full_path: str) -> bool: 326 def __remove_content_type_members(self, full_path: str) -> bool:
294 """ The method will remove the dangling references 327 """ The method will remove the dangling references
295 form the [Content_Types].xml file, since MS office doesn't like them 328 form the [Content_Types].xml file, since MS office doesn't like them
@@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
396 # this file contains the revisions 429 # this file contains the revisions
397 if self.__remove_revisions(full_path) is False: 430 if self.__remove_revisions(full_path) is False:
398 return False # pragma: no cover 431 return False # pragma: no cover
432 # remove comment references and ranges
433 if self.__remove_document_comment_meta(full_path) is False:
434 return False # pragma: no cover
399 elif full_path.endswith('/docProps/app.xml'): 435 elif full_path.endswith('/docProps/app.xml'):
400 # This file must be present and valid, 436 # This file must be present and valid,
401 # so we're removing as much as we can. 437 # so we're removing as much as we can.