summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- libmat2/office.py 26
-rw-r--r-- tests/data/embedded_corrupted.docx bin 0 -> 223581 bytes
-rw-r--r-- tests/data/embedded_corrupted.odt bin 0 -> 217315 bytes
-rw-r--r-- tests/test_corrupted_files.py 15
4 files changed, 29 insertions, 12 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 5165056..6087c47 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
147 """ In this function, we're changing the XML 147 """ In this function, we're changing the XML
148 document in two times, since we don't want 148 document in two times, since we don't want
149 to change the tree we're iterating on.""" 149 to change the tree we're iterating on."""
150 tree, ns = _parse_xml(full_path) 150 try:
151 tree, ns = _parse_xml(full_path)
152 except ET.ParseError:
153 return False
151 154
152 # No revisions are present 155 # No revisions are present
153 del_presence = tree.find('.//w:del', ns) 156 del_presence = tree.find('.//w:del', ns)
@@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
191 zipin = zipfile.ZipFile(self.filename) 194 zipin = zipfile.ZipFile(self.filename)
192 for item in zipin.infolist(): 195 for item in zipin.infolist():
193 if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): 196 if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
194 content = zipin.read(item).decode('utf-8')
195 try: 197 try:
198 content = zipin.read(item).decode('utf-8')
196 results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) 199 results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
197 for (key, value) in results: 200 for (key, value) in results:
198 metadata[key] = value 201 metadata[key] = value
199 except TypeError: # We didn't manage to parse the xml file 202 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
200 pass 203 metadata[item.filename] = 'harmful content'
201 if not metadata: # better safe than sorry
202 metadata[item] = 'harmful content'
203 for key, value in self._get_zipinfo_meta(item).items(): 204 for key, value in self._get_zipinfo_meta(item).items():
204 metadata[key] = value 205 metadata[key] = value
205 zipin.close() 206 zipin.close()
@@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
232 233
233 234
234 def __remove_revisions(self, full_path: str) -> bool: 235 def __remove_revisions(self, full_path: str) -> bool:
235 tree, ns = _parse_xml(full_path) 236 try:
237 tree, ns = _parse_xml(full_path)
238 except ET.ParseError:
239 return False
236 240
237 if 'office' not in ns.keys(): # no revisions in the current file 241 if 'office' not in ns.keys(): # no revisions in the current file
238 return True 242 return True
@@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
259 zipin = zipfile.ZipFile(self.filename) 263 zipin = zipfile.ZipFile(self.filename)
260 for item in zipin.infolist(): 264 for item in zipin.infolist():
261 if item.filename == 'meta.xml': 265 if item.filename == 'meta.xml':
262 content = zipin.read(item).decode('utf-8')
263 try: 266 try:
267 content = zipin.read(item).decode('utf-8')
264 results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) 268 results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
265 for (key, value) in results: 269 for (key, value) in results:
266 metadata[key] = value 270 metadata[key] = value
267 except TypeError: # We didn't manage to parse the xml file 271 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
268 pass 272 metadata[item.filename] = 'harmful content'
269 if not metadata: # better safe than sorry
270 metadata[item] = 'harmful content'
271 for key, value in self._get_zipinfo_meta(item).items(): 273 for key, value in self._get_zipinfo_meta(item).items():
272 metadata[key] = value 274 metadata[key] = value
273 zipin.close() 275 zipin.close()
diff --git a/tests/data/embedded_corrupted.docx b/tests/data/embedded_corrupted.docx
new file mode 100644
index 0000000..989bdb8
--- /dev/null
+++ b/tests/data/embedded_corrupted.docx
Binary files differ
diff --git a/tests/data/embedded_corrupted.odt b/tests/data/embedded_corrupted.odt
new file mode 100644
index 0000000..1e4a844
--- /dev/null
+++ b/tests/data/embedded_corrupted.odt
Binary files differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index a77acbc..2bb1c76 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -15,6 +15,21 @@ class TestUnsupportedFiles(unittest.TestCase):
15 self.assertEqual(parser, None) 15 self.assertEqual(parser, None)
16 os.remove('./tests/clean.py') 16 os.remove('./tests/clean.py')
17 17
18class TestCorruptedEmbedded(unittest.TestCase):
19 def test_docx(self):
20 shutil.copy('./tests/data/embedded_corrupted.docx', './tests/data/clean.docx')
21 parser, mimetype = parser_factory.get_parser('./tests/data/clean.docx')
22 self.assertFalse(parser.remove_all())
23 self.assertIsNotNone(parser.get_meta())
24 os.remove('./tests/data/clean.docx')
25
26 def test_odt(self):
27 shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt')
28 parser, mimetype = parser_factory.get_parser('./tests/data/clean.odt')
29 self.assertFalse(parser.remove_all())
30 self.assertEqual(parser.get_meta(), {'create_system': 'Weird', 'date_time': '2018-06-10 17:18:18', 'meta.xml': 'harmful content'})
31 os.remove('./tests/data/clean.odt')
32
18 33
19class TestExplicitelyUnsupportedFiles(unittest.TestCase): 34class TestExplicitelyUnsupportedFiles(unittest.TestCase):
20 def test_pdf(self): 35 def test_pdf(self):