diff options
| -rw-r--r-- | libmat2/office.py | 28 | ||||
| -rw-r--r-- | tests/test_corrupted_files.py | 7 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 14 |
3 files changed, 19 insertions, 30 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index dfad3b3..0c9caa8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -2,7 +2,7 @@ import logging | |||
| 2 | import os | 2 | import os |
| 3 | import re | 3 | import re |
| 4 | import zipfile | 4 | import zipfile |
| 5 | from typing import Dict, Set, Pattern, Tuple, Union, Any | 5 | from typing import Dict, Set, Pattern, Tuple, Any |
| 6 | 6 | ||
| 7 | import xml.etree.ElementTree as ET # type: ignore | 7 | import xml.etree.ElementTree as ET # type: ignore |
| 8 | 8 | ||
| @@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 375 | return False | 375 | return False |
| 376 | return True | 376 | return True |
| 377 | 377 | ||
| 378 | def get_meta(self) -> Dict[str, Union[str, dict]]: | 378 | def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: |
| 379 | """ | 379 | """ |
| 380 | Yes, I know that parsing xml with regexp ain't pretty, | 380 | Yes, I know that parsing xml with regexp ain't pretty, |
| 381 | be my guest and fix it if you want. | 381 | be my guest and fix it if you want. |
| 382 | """ | 382 | """ |
| 383 | metadata = {} | 383 | if file_path != 'meta.xml': |
| 384 | zipin = zipfile.ZipFile(self.filename) | 384 | return {} |
| 385 | for item in zipin.infolist(): | 385 | with open(full_path, encoding='utf-8') as f: |
| 386 | if item.filename == 'meta.xml': | 386 | try: |
| 387 | try: | 387 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", f.read(), re.I|re.M) |
| 388 | content = zipin.read(item).decode('utf-8') | 388 | return {k:v for (k, v) in results} |
| 389 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) | 389 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file |
| 390 | for (key, value) in results: | 390 | # We didn't manage to parse the xml file |
| 391 | metadata[key] = value | 391 | return {file_path: 'harmful content', } |
| 392 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file | ||
| 393 | metadata[item.filename] = 'harmful content' | ||
| 394 | for key, value in self._get_zipinfo_meta(item).items(): | ||
| 395 | metadata[key] = value | ||
| 396 | zipin.close() | ||
| 397 | return metadata | ||
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index e7d3c2a..b2e7798 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -67,15 +67,10 @@ class TestCorruptedEmbedded(unittest.TestCase): | |||
| 67 | os.remove('./tests/data/clean.docx') | 67 | os.remove('./tests/data/clean.docx') |
| 68 | 68 | ||
| 69 | def test_odt(self): | 69 | def test_odt(self): |
| 70 | expected = { | ||
| 71 | 'create_system': 'Weird', | ||
| 72 | 'date_time': '2018-06-10 17:18:18', | ||
| 73 | 'meta.xml': 'harmful content' | ||
| 74 | } | ||
| 75 | shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt') | 70 | shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt') |
| 76 | parser, _ = parser_factory.get_parser('./tests/data/clean.odt') | 71 | parser, _ = parser_factory.get_parser('./tests/data/clean.odt') |
| 77 | self.assertFalse(parser.remove_all()) | 72 | self.assertFalse(parser.remove_all()) |
| 78 | self.assertEqual(parser.get_meta(), expected) | 73 | self.assertTrue(parser.get_meta()) |
| 79 | os.remove('./tests/data/clean.odt') | 74 | os.remove('./tests/data/clean.odt') |
| 80 | 75 | ||
| 81 | 76 | ||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index d692181..548b076 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -138,14 +138,14 @@ class TestGetMeta(unittest.TestCase): | |||
| 138 | def test_libreoffice(self): | 138 | def test_libreoffice(self): |
| 139 | p = office.LibreOfficeParser('./tests/data/dirty.odt') | 139 | p = office.LibreOfficeParser('./tests/data/dirty.odt') |
| 140 | meta = p.get_meta() | 140 | meta = p.get_meta() |
| 141 | self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') | 141 | self.assertEqual(meta['meta.xml']['meta:initial-creator'], 'jvoisin ') |
| 142 | self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') | 142 | self.assertEqual(meta['meta.xml']['meta:creation-date'], '2011-07-26T03:27:48') |
| 143 | self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') | 143 | self.assertEqual(meta['meta.xml']['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') |
| 144 | 144 | ||
| 145 | p = office.LibreOfficeParser('./tests/data/weird_producer.odt') | 145 | p = office.LibreOfficeParser('./tests/data/weird_producer.odt') |
| 146 | meta = p.get_meta() | 146 | meta = p.get_meta() |
| 147 | self.assertEqual(meta['create_system'], 'Windows') | 147 | self.assertEqual(meta['mimetype']['create_system'], 'Windows') |
| 148 | self.assertEqual(meta['comment'], b'YAY FOR COMMENTS') | 148 | self.assertEqual(meta['mimetype']['comment'], b'YAY FOR COMMENTS') |
| 149 | 149 | ||
| 150 | def test_txt(self): | 150 | def test_txt(self): |
| 151 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt') | 151 | p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt') |
| @@ -440,7 +440,7 @@ class TestCleaning(unittest.TestCase): | |||
| 440 | p = office.LibreOfficeParser('./tests/data/clean.odf') | 440 | p = office.LibreOfficeParser('./tests/data/clean.odf') |
| 441 | 441 | ||
| 442 | meta = p.get_meta() | 442 | meta = p.get_meta() |
| 443 | self.assertEqual(meta['meta:creation-date'], '2018-04-23T00:18:59.438231281') | 443 | self.assertEqual(meta['meta.xml']['meta:creation-date'], '2018-04-23T00:18:59.438231281') |
| 444 | 444 | ||
| 445 | ret = p.remove_all() | 445 | ret = p.remove_all() |
| 446 | self.assertTrue(ret) | 446 | self.assertTrue(ret) |
| @@ -458,7 +458,7 @@ class TestCleaning(unittest.TestCase): | |||
| 458 | p = office.LibreOfficeParser('./tests/data/clean.odg') | 458 | p = office.LibreOfficeParser('./tests/data/clean.odg') |
| 459 | 459 | ||
| 460 | meta = p.get_meta() | 460 | meta = p.get_meta() |
| 461 | self.assertEqual(meta['dc:date'], '2018-04-23T00:26:59.385838550') | 461 | self.assertEqual(meta['meta.xml']['dc:date'], '2018-04-23T00:26:59.385838550') |
| 462 | 462 | ||
| 463 | ret = p.remove_all() | 463 | ret = p.remove_all() |
| 464 | self.assertTrue(ret) | 464 | self.assertTrue(ret) |
