2 files changed, 67 insertions, 1 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index fa79834..3a290d8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
+    @staticmethod
+    def __remove_document_comment_meta(full_path: str) -> bool:
+        """ Remove the comment anchors from the xml file at `full_path`.
+        Every `w:commentRangeStart`, `w:commentRangeEnd` and
+        `w:commentReference` element is detached from its parent,
+        then the file is rewritten in place.
+        Returns False if the file can't be parsed, True otherwise; the
+        file is left untouched when none of those elements is present.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+        # Collect the elements in a single search per tag, before touching
+        # the tree, so nothing is removed while it is still being searched.
+        comment_tags = ('commentRangeStart', 'commentRangeEnd', 'commentReference')
+        elements_del = [
+            element
+            for tag in comment_tags
+            for element in tree.iterfind('.//w:' + tag, namespace)
+        ]
+        if not elements_del:
+            return True  # No comment meta tags are present
+        # Elements don't know their parent: build a child -> parent lookup.
+        parent_map = {c: p for p in tree.iter() for c in p}
+        for element in elements_del:
+            parent_map[element].remove(element)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
+        return True
    def __remove_content_type_members(self, full_path: str) -> bool:
        """ The method will remove the dangling references
        form the [Content_Types].xml file, since MS office doesn't like them
@@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
            # this file contains the revisions
            if self.__remove_revisions(full_path) is False:
                return False  # pragma: no cover
+            # remove comment references and ranges
+            if self.__remove_document_comment_meta(full_path) is False:
+                return False  # pragma: no cover
        elif full_path.endswith('/docProps/app.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 0435113..491f396 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -900,4 +900,34 @@ class TextDocx(unittest.TestCase):
            self.assertIsNotNone(match)
        os.remove('./tests/data/comment_clean.docx')
-        os.remove('./tests/data/comment_clean.cleaned.docx')
-\ No newline at end of file
+        os.remove('./tests/data/comment_clean.cleaned.docx')
+    def test_comment_references_are_removed(self):
+        """ Cleaning a docx must strip the comment anchors
+        (range start, range end and reference) from word/document.xml.
+        """
+        tags = (b'w:commentRangeStart', b'w:commentRangeEnd', b'w:commentReference')
+        # ZipFile.read() returns the bytes directly, so no member
+        # handle is left open, unlike a bare zipin.open(...).read().
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            content = zipin.read('word/document.xml')
+        # The fixture must contain the anchors, or the test proves nothing.
+        for r in tags:
+            self.assertIn(r, content)
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
+            content = zipin.read('word/document.xml')
+        for r in tags:
+            self.assertNotIn(r, content)
+        os.remove('./tests/data/comment_clean.docx')
+        os.remove('./tests/data/comment_clean.cleaned.docx')

diff --git a/libmat2/office.py b/libmat2/office.py index fa79834..3a290d8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py
@@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
290	tree.write(full_path, xml_declaration=True, encoding='utf-8')	290	tree.write(full_path, xml_declaration=True, encoding='utf-8')
291	return True	291	return True
292		292
		293	@staticmethod
		294	def __remove_document_comment_meta(full_path: str) -> bool:
		295	try:
		296	tree, namespace = _parse_xml(full_path)
		297	except ET.ParseError as e: # pragma: no cover
		298	logging.error("Unable to parse %s: %s", full_path, e)
		299	return False
		300
		301	# search the docs to see if we can bail early
		302	range_start = tree.find('.//w:commentRangeStart', namespace)
		303	range_end = tree.find('.//w:commentRangeEnd', namespace)
		304	references = tree.find('.//w:commentReference', namespace)
		305	if range_start is None and range_end is None and references is None:
		306	return True # No comment meta tags are present
		307
		308	parent_map = {c:p for p in tree.iter() for c in p}
		309
		310	# iterate over the elements and add them to list
		311	elements_del = list()
		312	for element in tree.iterfind('.//w:commentRangeStart', namespace):
		313	elements_del.append(element)
		314	for element in tree.iterfind('.//w:commentRangeEnd', namespace):
		315	elements_del.append(element)
		316	for element in tree.iterfind('.//w:commentReference', namespace):
		317	elements_del.append(element)
		318
		319	# remove the elements
		320	for element in elements_del:
		321	parent_map[element].remove(element)
		322
		323	tree.write(full_path, xml_declaration=True, encoding='utf-8')
		324	return True
		325
293	def __remove_content_type_members(self, full_path: str) -> bool:	326	def __remove_content_type_members(self, full_path: str) -> bool:
294	""" The method will remove the dangling references	327	""" The method will remove the dangling references
295	form the [Content_Types].xml file, since MS office doesn't like them	328	form the [Content_Types].xml file, since MS office doesn't like them
@@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
396	# this file contains the revisions	429	# this file contains the revisions
397	if self.__remove_revisions(full_path) is False:	430	if self.__remove_revisions(full_path) is False:
398	return False # pragma: no cover	431	return False # pragma: no cover
		432	# remove comment references and ranges
		433	if self.__remove_document_comment_meta(full_path) is False:
		434	return False # pragma: no cover
399	elif full_path.endswith('/docProps/app.xml'):	435	elif full_path.endswith('/docProps/app.xml'):
400	# This file must be present and valid,	436	# This file must be present and valid,
401	# so we're removing as much as we can.	437	# so we're removing as much as we can.


diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 0435113..491f396 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py
@@ -900,4 +900,34 @@ class TextDocx(unittest.TestCase):
900	self.assertIsNotNone(match)	900	self.assertIsNotNone(match)
901		901
902	os.remove('./tests/data/comment_clean.docx')	902	os.remove('./tests/data/comment_clean.docx')
903	os.remove('./tests/data/comment_clean.cleaned.docx') \ No newline at end of file	903	os.remove('./tests/data/comment_clean.cleaned.docx')
		904
		905	def test_comment_references_are_removed(self):
		906	with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
		907	c = zipin.open('word/document.xml')
		908	content = c.read()
		909
		910	r = b'w:commentRangeStart'
		911	self.assertIn(r, content)
		912	r = b'w:commentRangeEnd'
		913	self.assertIn(r, content)
		914	r = b'w:commentReference'
		915	self.assertIn(r, content)
		916
		917	shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
		918	p = office.MSOfficeParser('./tests/data/comment_clean.docx')
		919	self.assertTrue(p.remove_all())
		920
		921	with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
		922	c = zipin.open('word/document.xml')
		923	content = c.read()
		924
		925	r = b'w:commentRangeStart'
		926	self.assertNotIn(r, content)
		927	r = b'w:commentRangeEnd'
		928	self.assertNotIn(r, content)
		929	r = b'w:commentReference'
		930	self.assertNotIn(r, content)
		931
		932	os.remove('./tests/data/comment_clean.docx')
		933	os.remove('./tests/data/comment_clean.cleaned.docx')