Remove dangling references in MS Office's [Content_Types].xml

author: jvoisin 2018-09-30 19:52:35 +0200
committer: jvoisin 2018-09-30 19:53:18 +0200
commit: e342671eadd3f5ff922fe62cae81792d4cd65e83 (patch)
tree: cd85ad31860cdb3d4a670bd0f1c3dd48ff5129c6
parent: 212d9c472cd7017cab31449ff75461bebd7a40d4 (diff)
3 files changed, 42 insertions, 0 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index bad352b..b220092 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        return True
+    def __remove_content_type_members(self, full_path: str) -> bool:
+        """ The method will remove the dangling references
+        from the [Content_Types].xml file, since MS Office doesn't like them.
+
+        Every `Override` element whose `PartName` (minus its leading '/')
+        is the name of a member of the `self.filename` archive matching one
+        of `self.files_to_omit` is removed, and the file at `full_path`
+        is rewritten in place.
+
+        Return False if the file can't be parsed or if it doesn't declare
+        exactly one namespace, True otherwise.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:  # pragma: no cover
+            return False
+        if len(namespace.items()) != 1:
+            return False  # there should be only one namespace for Types
+        # collect the names of the archive members matched by `files_to_omit`
+        removed_fnames = set()
+        with zipfile.ZipFile(self.filename) as zin:
+            for fname in [item.filename for item in zin.infolist()]:
+                if any(map(lambda r: r.search(fname), self.files_to_omit)):
+                    removed_fnames.add(fname)
+        # drop the Override entries that reference one of those members
+        root = tree.getroot()
+        for item in root.findall('{%s}Override' % namespace['']):
+            name = item.attrib['PartName'][1:]  # remove the leading '/'
+            if name in removed_fnames:
+                root.remove(item)
+        tree.write(full_path, xml_declaration=True)
+        return True
    def _specific_cleanup(self, full_path: str) -> bool:
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True
+        if full_path.endswith('/[Content_Types].xml'):
+            # this file contains references to files that we might
+            # remove, and MS Office doesn't like dangling references
+            if self.__remove_content_type_members(full_path) is False:
+                return False
        if full_path.endswith('/word/document.xml'):
            # this file contains the revisions
            if self.__remove_revisions(full_path) is False:
diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx
new file mode 100644
index 0000000..43ac743
--- /dev/null
+++ b/tests/data/malformed_content_types.docx
Binary files differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 30039e6..5af0e81 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -80,6 +80,14 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
        os.remove('./tests/data/clean.py')
+class TestCorruptedContentTypesOffice(unittest.TestCase):
+    # `remove_all()` is expected to return a falsy value for the
+    # `malformed_content_types.docx` fixture.
+    def test_office(self):
+        # work on a copy of the fixture, removed at the end of the test
+        shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+        self.assertIsNotNone(p)
+        self.assertFalse(p.remove_all())
+        # NOTE(review): not reached if an assertion above fails, so the
+        # copy would be left behind in that case
+        os.remove('./tests/data/clean.docx')
 class TestCorruptedFiles(unittest.TestCase):
    def test_pdf(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
author	jvoisin	2018-09-30 19:52:35 +0200
committer	jvoisin	2018-09-30 19:53:18 +0200
commit	e342671eadd3f5ff922fe62cae81792d4cd65e83 (patch)
tree	cd85ad31860cdb3d4a670bd0f1c3dd48ff5129c6
parent	212d9c472cd7017cab31449ff75461bebd7a40d4 (diff)

diff --git a/libmat2/office.py b/libmat2/office.py index bad352b..b220092 100644 --- a/libmat2/office.py +++ b/libmat2/office.py
@@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
151		151
152	return True	152	return True
153		153
		154	def __remove_content_type_members(self, full_path: str) -> bool:
		155	""" The method will remove the dangling references
		156	from the [Content_Types].xml file, since MS Office doesn't like them
		157	"""
		158	try:
		159	tree, namespace = _parse_xml(full_path)
		160	except ET.ParseError: # pragma: no cover
		161	return False
		162
		163	if len(namespace.items()) != 1:
		164	return False # there should be only one namespace for Types
		165
		166	removed_fnames = set()
		167	with zipfile.ZipFile(self.filename) as zin:
		168	for fname in [item.filename for item in zin.infolist()]:
		169	if any(map(lambda r: r.search(fname), self.files_to_omit)):
		170	removed_fnames.add(fname)
		171
		172	root = tree.getroot()
		173	for item in root.findall('{%s}Override' % namespace['']):
		174	name = item.attrib['PartName'][1:] # remove the leading '/'
		175	if name in removed_fnames:
		176	root.remove(item)
		177
		178	tree.write(full_path, xml_declaration=True)
		179
		180	return True
		181
154	def _specific_cleanup(self, full_path: str) -> bool:	182	def _specific_cleanup(self, full_path: str) -> bool:
155	if os.stat(full_path).st_size == 0: # Don't process empty files	183	if os.stat(full_path).st_size == 0: # Don't process empty files
156	return True	184	return True
157		185
		186	if full_path.endswith('/[Content_Types].xml'):
		187	# this file contains references to files that we might
		188	# remove, and MS Office doesn't like dangling references
		189	if self.__remove_content_type_members(full_path) is False:
		190	return False
		191
158	if full_path.endswith('/word/document.xml'):	192	if full_path.endswith('/word/document.xml'):
159	# this file contains the revisions	193	# this file contains the revisions
160	if self.__remove_revisions(full_path) is False:	194	if self.__remove_revisions(full_path) is False:


diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx new file mode 100644 index 0000000..43ac743 --- /dev/null +++ b/tests/data/malformed_content_types.docx
Binary files differ


diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 30039e6..5af0e81 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py
@@ -80,6 +80,14 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
80	os.remove('./tests/data/clean.py')	80	os.remove('./tests/data/clean.py')
81		81
82		82
		83	class TestCorruptedContentTypesOffice(unittest.TestCase):
		84	def test_office(self):
		85	shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
		86	p = office.MSOfficeParser('./tests/data/clean.docx')
		87	self.assertIsNotNone(p)
		88	self.assertFalse(p.remove_all())
		89	os.remove('./tests/data/clean.docx')
		90
83	class TestCorruptedFiles(unittest.TestCase):	91	class TestCorruptedFiles(unittest.TestCase):
84	def test_pdf(self):	92	def test_pdf(self):
85	shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')	93	shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')