summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- libmat2/office.py 36
-rw-r--r-- tests/test_libmat2.py 32
2 files changed, 67 insertions, 1 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index fa79834..3a290d8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
290 tree.write(full_path, xml_declaration=True, encoding='utf-8') 290 tree.write(full_path, xml_declaration=True, encoding='utf-8')
291 return True 291 return True
292 292
@staticmethod
def __remove_document_comment_meta(full_path: str) -> bool:
    """ Strip the comment anchors from the XML file at `full_path`.

    Every `w:commentRangeStart`, `w:commentRangeEnd` and
    `w:commentReference` element is detached from its parent and the
    file is rewritten in place. When none of them is present, the file
    is left untouched.

    Returns False if the file can't be parsed as XML, True otherwise.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    comment_tags = (
        'w:commentRangeStart',
        'w:commentRangeEnd',
        'w:commentReference',
    )

    # Collect everything before touching the tree, so that nothing is
    # removed while it is still being iterated over. This single pass
    # also tells us whether there is any work to do at all.
    elements_del = [
        element
        for tag in comment_tags
        for element in tree.iterfind('.//' + tag, namespace)
    ]
    if not elements_del:
        return True  # No comment meta tags are present

    # Elements don't keep a reference to their parent,
    # so build a child -> parent lookup table.
    parent_map = {child: parent for parent in tree.iter() for child in parent}

    for element in elements_del:
        parent_map[element].remove(element)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
325
293 def __remove_content_type_members(self, full_path: str) -> bool: 326 def __remove_content_type_members(self, full_path: str) -> bool:
294 """ The method will remove the dangling references 327 """ The method will remove the dangling references
295 form the [Content_Types].xml file, since MS office doesn't like them 328 form the [Content_Types].xml file, since MS office doesn't like them
@@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
396 # this file contains the revisions 429 # this file contains the revisions
397 if self.__remove_revisions(full_path) is False: 430 if self.__remove_revisions(full_path) is False:
398 return False # pragma: no cover 431 return False # pragma: no cover
432 # remove comment references and ranges
433 if self.__remove_document_comment_meta(full_path) is False:
434 return False # pragma: no cover
399 elif full_path.endswith('/docProps/app.xml'): 435 elif full_path.endswith('/docProps/app.xml'):
400 # This file must be present and valid, 436 # This file must be present and valid,
401 # so we're removing as much as we can. 437 # so we're removing as much as we can.
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 0435113..491f396 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -900,4 +900,34 @@ class TextDocx(unittest.TestCase):
900 self.assertIsNotNone(match) 900 self.assertIsNotNone(match)
901 901
902 os.remove('./tests/data/comment_clean.docx') 902 os.remove('./tests/data/comment_clean.docx')
903 os.remove('./tests/data/comment_clean.cleaned.docx') \ No newline at end of file 903 os.remove('./tests/data/comment_clean.cleaned.docx')
904
def test_comment_references_are_removed(self):
    """ Check that cleaning a .docx strips the comment anchors
    (range start, range end and reference) from word/document.xml.
    """
    markers = (
        b'w:commentRangeStart',
        b'w:commentRangeEnd',
        b'w:commentReference',
    )
    original = './tests/data/comment.docx'
    working_copy = './tests/data/comment_clean.docx'
    cleaned = './tests/data/comment_clean.cleaned.docx'

    # Sanity check: the fixture must contain the markers to begin with,
    # otherwise the assertions below wouldn't prove anything.
    with zipfile.ZipFile(original) as zipin:
        content = zipin.read('word/document.xml')
    for marker in markers:
        self.assertIn(marker, content)

    shutil.copy(original, working_copy)
    try:
        p = office.MSOfficeParser(working_copy)
        self.assertTrue(p.remove_all())

        with zipfile.ZipFile(cleaned) as zipin:
            content = zipin.read('word/document.xml')
        for marker in markers:
            self.assertNotIn(marker, content)
    finally:
        # Always clean up, even when an assertion above fails, so that
        # a failing run doesn't leave stray files in tests/data.
        for path in (working_copy, cleaned):
            if os.path.exists(path):
                os.remove(path)