diff options
| author | jvoisin | 2018-10-25 11:29:50 +0200 |
|---|---|---|
| committer | jvoisin | 2018-10-25 11:29:50 +0200 |
| commit | 513d897ea0cf3e006a2b33a89cdbf33cae3592cd (patch) | |
| tree | 73e3ffa11477b86f9cde3f36e763f83b8cf13117 | |
| parent | 5a9dc388ade0604962cd86889dfd1658579539fa (diff) | |
Implement get_meta() for archives
Diffstat (limited to '')
| -rw-r--r-- | libmat2/archive.py | 25 | ||||
| -rw-r--r-- | libmat2/office.py | 2 | ||||
| -rw-r--r-- | tests/test_deep_cleaning.py | 1 |
3 files changed, 27 insertions, 1 deletion
diff --git a/libmat2/archive.py b/libmat2/archive.py
index f788ecc..80e0bf2 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
| @@ -67,6 +67,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 67 | 67 | ||
| 68 | return metadata | 68 | return metadata |
| 69 | 69 | ||
| 70 | def get_meta(self) -> Dict[str, Union[str, dict]]: | ||
| 71 | meta = dict() # type: Dict[str, Union[str, dict]] | ||
| 72 | |||
| 73 | with zipfile.ZipFile(self.filename) as zin: | ||
| 74 | temp_folder = tempfile.mkdtemp() | ||
| 75 | |||
| 76 | for item in zin.infolist(): | ||
| 77 | if item.filename[-1] == '/': # pragma: no cover | ||
| 78 | # `is_dir` is added in Python3.6 | ||
| 79 | continue # don't keep empty folders | ||
| 80 | |||
| 81 | zin.extract(member=item, path=temp_folder) | ||
| 82 | full_path = os.path.join(temp_folder, item.filename) | ||
| 83 | |||
| 84 | tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore | ||
| 85 | if not tmp_parser: | ||
| 86 | continue | ||
| 87 | |||
| 88 | local_meta = tmp_parser.get_meta() | ||
| 89 | if local_meta: | ||
| 90 | meta[item.filename] = local_meta | ||
| 91 | |||
| 92 | shutil.rmtree(temp_folder) | ||
| 93 | return meta | ||
| 94 | |||
| 70 | def remove_all(self) -> bool: | 95 | def remove_all(self) -> bool: |
| 71 | # pylint: disable=too-many-branches | 96 | # pylint: disable=too-many-branches |
| 72 | 97 | ||
diff --git a/libmat2/office.py b/libmat2/office.py
index c10664f..e6370e7 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
| @@ -301,7 +301,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 301 | Yes, I know that parsing xml with regexp ain't pretty, | 301 | Yes, I know that parsing xml with regexp ain't pretty, |
| 302 | be my guest and fix it if you want. | 302 | be my guest and fix it if you want. |
| 303 | """ | 303 | """ |
| 304 | metadata = {} | 304 | metadata = super().get_meta() |
| 305 | zipin = zipfile.ZipFile(self.filename) | 305 | zipin = zipfile.ZipFile(self.filename) |
| 306 | for item in zipin.infolist(): | 306 | for item in zipin.infolist(): |
| 307 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): | 307 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): |
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 03db6c5..8466127 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
| @@ -36,6 +36,7 @@ class TestZipMetadata(unittest.TestCase): | |||
| 36 | 36 | ||
| 37 | meta = p.get_meta() | 37 | meta = p.get_meta() |
| 38 | self.assertIsNotNone(meta) | 38 | self.assertIsNotNone(meta) |
| 39 | self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!') | ||
| 39 | 40 | ||
| 40 | ret = p.remove_all() | 41 | ret = p.remove_all() |
| 41 | self.assertTrue(ret) | 42 | self.assertTrue(ret) |
