2 files changed, 58 insertions, 1 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index f182277..fc83a20 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -323,6 +323,38 @@ class MSOfficeParser(ZipParser):
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
+    def __remove_document_xml_rels_members(self, full_path: str) -> bool:
+        """ Remove the dangling references from the
+        word/_rels/document.xml.rels file, since MS Office doesn't like them.
+
+        A relationship is considered dangling when its target is a member of
+        the `self.filename` archive whose name matches one of
+        `self.files_to_omit` without matching any of `self.files_to_keep`.
+
+        :param full_path: path of the document.xml.rels file on disk; it is
+            rewritten in place.
+        :return: False if the file can't be parsed as XML, True otherwise.
+        """
+        # Imported locally to keep this method self-contained.
+        import posixpath
+
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+
+        if len(namespace) != 1:  # pragma: no cover
+            logging.debug("Got several namespaces for Relationships: %s", namespace.items())
+
+        # Names of the archive members that are omitted and not allowlisted.
+        with zipfile.ZipFile(self.filename) as zin:
+            removed_fnames = {
+                fname for fname in zin.namelist()
+                if any(r.search(fname) for r in self.files_to_omit)
+                and not any(r.search(fname) for r in self.files_to_keep)
+            }
+
+        root = tree.getroot()
+        for item in list(root):  # iterate over a copy, since we remove children
+            # Match on the local name, so that a file without a default
+            # namespace doesn't raise a KeyError.
+            if item.tag.rpartition('}')[2] != 'Relationship':
+                continue
+
+            target = item.attrib.get('Target')
+            if target is None or item.attrib.get('TargetMode') == 'External':
+                continue  # not pointing at a member of the archive
+
+            # Targets are relative to the word/ directory, but they may
+            # contain `..` segments or start with a `/`: normalise them to
+            # the form used by the archive's member names.
+            name = posixpath.normpath(posixpath.join('word', target)).lstrip('/')
+            if name in removed_fnames:
+                root.remove(item)
+
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
+        return True
    @staticmethod
    def __remove_document_comment_meta(full_path: str) -> bool:
        try:
@@ -445,7 +477,7 @@ class MSOfficeParser(ZipParser):
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True
-        if not full_path.endswith('.xml'):
+        if not full_path.endswith(('.xml', '.xml.rels')):
            return True
        if self.__randomize_creationId(full_path) is False:
@@ -465,6 +497,10 @@ class MSOfficeParser(ZipParser):
            # remove comment references and ranges
            if self.__remove_document_comment_meta(full_path) is False:
                return False  # pragma: no cover
+        elif full_path.endswith('/word/_rels/document.xml.rels'):
+            # similar to the above, but for the document.xml.rels file
+            if self.__remove_document_xml_rels_members(full_path) is False:  # pragma: no cover
+                return False
        elif full_path.endswith('/docProps/app.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 491f396..7855062 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -931,3 +931,24 @@ class TextDocx(unittest.TestCase):
        os.remove('./tests/data/comment_clean.docx')
        os.remove('./tests/data/comment_clean.cleaned.docx')
+    def test_clean_document_xml_rels(self):
+        """ Check that cleaning a docx file removes the reference to
+        comments.xml from its word/_rels/document.xml.rels file. """
+        rels_path = 'word/_rels/document.xml.rels'
+        comments_target = b'Target="comments.xml"'
+
+        # The reference must be present in the original file.
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            # read() opens and closes the member, unlike a bare open()
+            self.assertIn(comments_target, zipin.read(rels_path))
+
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
+        # Registered as cleanups, so that the temporary files are removed
+        # even if one of the assertions below fails.
+        self.addCleanup(os.remove, './tests/data/comment_clean.docx')
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+        self.addCleanup(os.remove, './tests/data/comment_clean.cleaned.docx')
+
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
+            self.assertNotIn(comments_target, zipin.read(rels_path))

diff --git a/libmat2/office.py b/libmat2/office.py index f182277..fc83a20 100644 --- a/libmat2/office.py +++ b/libmat2/office.py
@@ -323,6 +323,38 @@ class MSOfficeParser(ZipParser):
323	tree.write(full_path, xml_declaration=True, encoding='utf-8')	323	tree.write(full_path, xml_declaration=True, encoding='utf-8')
324	return True	324	return True
325		325
		326	def __remove_document_xml_rels_members(self, full_path: str) -> bool:
		327	""" Remove the dangling references from the word/_rels/document.xml.rels file, since MS office doesn't like them.
		328	"""
		329	try:
		330	tree, namespace = _parse_xml(full_path)
		331	except ET.ParseError as e: # pragma: no cover
		332	logging.error("Unable to parse %s: %s", full_path, e)
		333	return False
		334
		335	if len(namespace.items()) != 1: # pragma: no cover
		336	logging.debug("Got several namespaces for Types: %s", namespace.items())
		337
		338	removed_fnames = set()
		339	with zipfile.ZipFile(self.filename) as zin:
		340	for fname in [item.filename for item in zin.infolist()]:
		341	for file_to_omit in self.files_to_omit:
		342	if file_to_omit.search(fname):
		343	matches = map(lambda r: r.search(fname), self.files_to_keep)
		344	if any(matches): # the file is in the allowlist
		345	continue
		346	removed_fnames.add(fname)
		347	break
		348
		349	root = tree.getroot()
		350	for item in root.findall('{%s}Relationship' % namespace['']):
		351	name = 'word/' + item.attrib['Target'] # add the word/ prefix to the path, since all document rels are in the word/ directory
		352	if name in removed_fnames:
		353	root.remove(item)
		354
		355	tree.write(full_path, xml_declaration=True, encoding='utf-8')
		356	return True
		357
326	@staticmethod	358	@staticmethod
327	def __remove_document_comment_meta(full_path: str) -> bool:	359	def __remove_document_comment_meta(full_path: str) -> bool:
328	try:	360	try:
@@ -445,7 +477,7 @@ class MSOfficeParser(ZipParser):
445	if os.stat(full_path).st_size == 0: # Don't process empty files	477	if os.stat(full_path).st_size == 0: # Don't process empty files
446	return True	478	return True
447		479
448	if not full_path.endswith('.xml'):	480	if not full_path.endswith(('.xml', '.xml.rels')):
449	return True	481	return True
450		482
451	if self.__randomize_creationId(full_path) is False:	483	if self.__randomize_creationId(full_path) is False:
@@ -465,6 +497,10 @@ class MSOfficeParser(ZipParser):
465	# remove comment references and ranges	497	# remove comment references and ranges
466	if self.__remove_document_comment_meta(full_path) is False:	498	if self.__remove_document_comment_meta(full_path) is False:
467	return False # pragma: no cover	499	return False # pragma: no cover
		500	elif full_path.endswith('/word/_rels/document.xml.rels'):
		501	# similar to the above, but for the document.xml.rels file
		502	if self.__remove_document_xml_rels_members(full_path) is False: # pragma: no cover
		503	return False
468	elif full_path.endswith('/docProps/app.xml'):	504	elif full_path.endswith('/docProps/app.xml'):
469	# This file must be present and valid,	505	# This file must be present and valid,
470	# so we're removing as much as we can.	506	# so we're removing as much as we can.


diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 491f396..7855062 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py
@@ -931,3 +931,24 @@ class TextDocx(unittest.TestCase):
931		931
932	os.remove('./tests/data/comment_clean.docx')	932	os.remove('./tests/data/comment_clean.docx')
933	os.remove('./tests/data/comment_clean.cleaned.docx')	933	os.remove('./tests/data/comment_clean.cleaned.docx')
		934
		935	def test_clean_document_xml_rels(self):
		936	with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
		937	c = zipin.open('word/_rels/document.xml.rels')
		938	content = c.read()
		939	r = b'Target="comments.xml"'
		940	self.assertIn(r, content)
		941
		942	shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
		943	p = office.MSOfficeParser('./tests/data/comment_clean.docx')
		944	self.assertTrue(p.remove_all())
		945
		946	with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
		947	c = zipin.open('word/_rels/document.xml.rels')
		948	content = c.read()
		949	r = b'Target="comments.xml"'
		950	self.assertNotIn(r, content)
		951
		952	os.remove('./tests/data/comment_clean.docx')
		953	os.remove('./tests/data/comment_clean.cleaned.docx')
		954