diff options
| -rw-r--r-- | libmat2/office.py | 26 | ||||
| -rw-r--r-- | tests/data/embedded_corrupted.docx | bin | 0 -> 223581 bytes | |||
| -rw-r--r-- | tests/data/embedded_corrupted.odt | bin | 0 -> 217315 bytes | |||
| -rw-r--r-- | tests/test_corrupted_files.py | 15 |
4 files changed, 29 insertions, 12 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 5165056..6087c47 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 147 | """ In this function, we're changing the XML | 147 | """ In this function, we're changing the XML |
| 148 | document in two times, since we don't want | 148 | document in two times, since we don't want |
| 149 | to change the tree we're iterating on.""" | 149 | to change the tree we're iterating on.""" |
| 150 | tree, ns = _parse_xml(full_path) | 150 | try: |
| 151 | tree, ns = _parse_xml(full_path) | ||
| 152 | except ET.ParseError: | ||
| 153 | return False | ||
| 151 | 154 | ||
| 152 | # No revisions are present | 155 | # No revisions are present |
| 153 | del_presence = tree.find('.//w:del', ns) | 156 | del_presence = tree.find('.//w:del', ns) |
| @@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 191 | zipin = zipfile.ZipFile(self.filename) | 194 | zipin = zipfile.ZipFile(self.filename) |
| 192 | for item in zipin.infolist(): | 195 | for item in zipin.infolist(): |
| 193 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): | 196 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): |
| 194 | content = zipin.read(item).decode('utf-8') | ||
| 195 | try: | 197 | try: |
| 198 | content = zipin.read(item).decode('utf-8') | ||
| 196 | results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) | 199 | results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) |
| 197 | for (key, value) in results: | 200 | for (key, value) in results: |
| 198 | metadata[key] = value | 201 | metadata[key] = value |
| 199 | except TypeError: # We didn't manage to parse the xml file | 202 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file |
| 200 | pass | 203 | metadata[item.filename] = 'harmful content' |
| 201 | if not metadata: # better safe than sorry | ||
| 202 | metadata[item] = 'harmful content' | ||
| 203 | for key, value in self._get_zipinfo_meta(item).items(): | 204 | for key, value in self._get_zipinfo_meta(item).items(): |
| 204 | metadata[key] = value | 205 | metadata[key] = value |
| 205 | zipin.close() | 206 | zipin.close() |
| @@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 232 | 233 | ||
| 233 | 234 | ||
| 234 | def __remove_revisions(self, full_path: str) -> bool: | 235 | def __remove_revisions(self, full_path: str) -> bool: |
| 235 | tree, ns = _parse_xml(full_path) | 236 | try: |
| 237 | tree, ns = _parse_xml(full_path) | ||
| 238 | except ET.ParseError: | ||
| 239 | return False | ||
| 236 | 240 | ||
| 237 | if 'office' not in ns.keys(): # no revisions in the current file | 241 | if 'office' not in ns.keys(): # no revisions in the current file |
| 238 | return True | 242 | return True |
| @@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 259 | zipin = zipfile.ZipFile(self.filename) | 263 | zipin = zipfile.ZipFile(self.filename) |
| 260 | for item in zipin.infolist(): | 264 | for item in zipin.infolist(): |
| 261 | if item.filename == 'meta.xml': | 265 | if item.filename == 'meta.xml': |
| 262 | content = zipin.read(item).decode('utf-8') | ||
| 263 | try: | 266 | try: |
| 267 | content = zipin.read(item).decode('utf-8') | ||
| 264 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) | 268 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) |
| 265 | for (key, value) in results: | 269 | for (key, value) in results: |
| 266 | metadata[key] = value | 270 | metadata[key] = value |
| 267 | except TypeError: # We didn't manage to parse the xml file | 271 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file |
| 268 | pass | 272 | metadata[item.filename] = 'harmful content' |
| 269 | if not metadata: # better safe than sorry | ||
| 270 | metadata[item] = 'harmful content' | ||
| 271 | for key, value in self._get_zipinfo_meta(item).items(): | 273 | for key, value in self._get_zipinfo_meta(item).items(): |
| 272 | metadata[key] = value | 274 | metadata[key] = value |
| 273 | zipin.close() | 275 | zipin.close() |
diff --git a/tests/data/embedded_corrupted.docx b/tests/data/embedded_corrupted.docx new file mode 100644 index 0000000..989bdb8 --- /dev/null +++ b/tests/data/embedded_corrupted.docx | |||
| Binary files differ | |||
diff --git a/tests/data/embedded_corrupted.odt b/tests/data/embedded_corrupted.odt new file mode 100644 index 0000000..1e4a844 --- /dev/null +++ b/tests/data/embedded_corrupted.odt | |||
| Binary files differ | |||
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index a77acbc..2bb1c76 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -15,6 +15,21 @@ class TestUnsupportedFiles(unittest.TestCase): | |||
| 15 | self.assertEqual(parser, None) | 15 | self.assertEqual(parser, None) |
| 16 | os.remove('./tests/clean.py') | 16 | os.remove('./tests/clean.py') |
| 17 | 17 | ||
| 18 | class TestCorruptedEmbedded(unittest.TestCase): | ||
| 19 | def test_docx(self): | ||
| 20 | shutil.copy('./tests/data/embedded_corrupted.docx', './tests/data/clean.docx') | ||
| 21 | parser, mimetype = parser_factory.get_parser('./tests/data/clean.docx') | ||
| 22 | self.assertFalse(parser.remove_all()) | ||
| 23 | self.assertIsNotNone(parser.get_meta()) | ||
| 24 | os.remove('./tests/data/clean.docx') | ||
| 25 | |||
| 26 | def test_odt(self): | ||
| 27 | shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt') | ||
| 28 | parser, mimetype = parser_factory.get_parser('./tests/data/clean.odt') | ||
| 29 | self.assertFalse(parser.remove_all()) | ||
| 30 | self.assertEqual(parser.get_meta(), {'create_system': 'Weird', 'date_time': '2018-06-10 17:18:18', 'meta.xml': 'harmful content'}) | ||
| 31 | os.remove('./tests/data/clean.odt') | ||
| 32 | |||
| 18 | 33 | ||
| 19 | class TestExplicitelyUnsupportedFiles(unittest.TestCase): | 34 | class TestExplicitelyUnsupportedFiles(unittest.TestCase): |
| 20 | def test_pdf(self): | 35 | def test_pdf(self): |
