summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author jvoisin 2018-09-30 19:52:35 +0200
committer jvoisin 2018-09-30 19:53:18 +0200
commit e342671eadd3f5ff922fe62cae81792d4cd65e83 (patch)
tree cd85ad31860cdb3d4a670bd0f1c3dd48ff5129c6
parent 212d9c472cd7017cab31449ff75461bebd7a40d4 (diff)
Remove dangling references in MS Office's [Content_Types].xml
-rw-r--r-- libmat2/office.py 34
-rw-r--r-- tests/data/malformed_content_types.docx bin 0 -> 4131 bytes
-rw-r--r-- tests/test_corrupted_files.py 8
3 files changed, 42 insertions, 0 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index bad352b..b220092 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
151 151
152 return True 152 return True
153 153
def __remove_content_type_members(self, full_path: str) -> bool:
    """ Remove the dangling references from the [Content_Types].xml file,
    since MS Office doesn't like them.

    Every `Override` element whose `PartName` designates a member of the
    archive `self.filename` matching one of the `self.files_to_omit`
    patterns is dropped, then the file at `full_path` is rewritten in place.

    :param full_path: path of the extracted [Content_Types].xml file
    :return: False if the file can't be parsed, or if it doesn't declare
        exactly one namespace registered under the empty prefix;
        True otherwise
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError:  # pragma: no cover
        return False

    # There should be only one namespace for Types, and it has to be
    # stored under the empty prefix, since that's the key used below.
    if len(namespace) != 1 or '' not in namespace:
        return False

    # Names of the archive members matching a `files_to_omit` pattern
    with zipfile.ZipFile(self.filename) as zin:
        removed_fnames = {
            fname for fname in zin.namelist()
            if any(r.search(fname) for r in self.files_to_omit)
        }

    root = tree.getroot()
    # findall() returns a list, so removing children while looping is safe
    for item in root.findall('{%s}Override' % namespace['']):
        # An Override lacking a PartName can't reference anything:
        # skip it instead of crashing with a KeyError.
        name = item.attrib.get('PartName', '')[1:]  # remove the leading '/'
        if name in removed_fnames:
            root.remove(item)

    # Be explicit about the encoding: ElementTree declares us-ascii by
    # default, while OPC parts are expected to be UTF-8 or UTF-16.
    tree.write(full_path, xml_declaration=True, encoding='utf-8')

    return True
181
154 def _specific_cleanup(self, full_path: str) -> bool: 182 def _specific_cleanup(self, full_path: str) -> bool:
155 if os.stat(full_path).st_size == 0: # Don't process empty files 183 if os.stat(full_path).st_size == 0: # Don't process empty files
156 return True 184 return True
157 185
186 if full_path.endswith('/[Content_Types].xml'):
187 # this file contains references to files that we might
188 # remove, and MS Office doesn't like dangling references
189 if self.__remove_content_type_members(full_path) is False:
190 return False
191
158 if full_path.endswith('/word/document.xml'): 192 if full_path.endswith('/word/document.xml'):
159 # this file contains the revisions 193 # this file contains the revisions
160 if self.__remove_revisions(full_path) is False: 194 if self.__remove_revisions(full_path) is False:
diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx
new file mode 100644
index 0000000..43ac743
--- /dev/null
+++ b/tests/data/malformed_content_types.docx
Binary files differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 30039e6..5af0e81 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -80,6 +80,14 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
80 os.remove('./tests/data/clean.py') 80 os.remove('./tests/data/clean.py')
81 81
82 82
class TestCorruptedContentTypesOffice(unittest.TestCase):
    """ Check that cleaning the `malformed_content_types.docx` fixture
    is reported as a failure by `remove_all()`.
    """

    def test_office(self):
        target = './tests/data/clean.docx'
        shutil.copy('./tests/data/malformed_content_types.docx', target)
        # Registered right after the copy, so that the scratch file is
        # removed even when one of the assertions below fails.
        self.addCleanup(os.remove, target)

        p = office.MSOfficeParser(target)
        self.assertIsNotNone(p)
        self.assertFalse(p.remove_all())
90
83class TestCorruptedFiles(unittest.TestCase): 91class TestCorruptedFiles(unittest.TestCase):
84 def test_pdf(self): 92 def test_pdf(self):
85 shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') 93 shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')