diff options
| -rw-r--r-- | libmat2/archive.py | 22 | ||||
| -rw-r--r-- | libmat2/office.py | 29 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 6 |
3 files changed, 33 insertions, 24 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
index b2483fc..d155664 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
| @@ -4,7 +4,7 @@ import tempfile | |||
| 4 | import os | 4 | import os |
| 5 | import logging | 5 | import logging |
| 6 | import shutil | 6 | import shutil |
| 7 | from typing import Dict, Set, Pattern, Union | 7 | from typing import Dict, Set, Pattern, Union, Any |
| 8 | 8 | ||
| 9 | from . import abstract, UnknownMemberPolicy, parser_factory | 9 | from . import abstract, UnknownMemberPolicy, parser_factory |
| 10 | 10 | ||
| @@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 42 | # pylint: disable=unused-argument,no-self-use | 42 | # pylint: disable=unused-argument,no-self-use |
| 43 | return True # pragma: no cover | 43 | return True # pragma: no cover |
| 44 | 44 | ||
| 45 | def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: | ||
| 46 | """ This method can be used to extract specific metadata | ||
| 47 | from files present in the archive.""" | ||
| 48 | # pylint: disable=unused-argument,no-self-use | ||
| 49 | return {} # pragma: no cover | ||
| 50 | |||
| 45 | @staticmethod | 51 | @staticmethod |
| 46 | def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: | 52 | def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: |
| 47 | zipinfo.create_system = 3 # Linux | 53 | zipinfo.create_system = 3 # Linux |
| @@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 74 | temp_folder = tempfile.mkdtemp() | 80 | temp_folder = tempfile.mkdtemp() |
| 75 | 81 | ||
| 76 | for item in zin.infolist(): | 82 | for item in zin.infolist(): |
| 83 | local_meta = dict() # type: Dict[str, Union[str, Dict]] | ||
| 84 | for k, v in self._get_zipinfo_meta(item).items(): | ||
| 85 | local_meta[k] = v | ||
| 86 | |||
| 77 | if item.filename[-1] == '/': # pragma: no cover | 87 | if item.filename[-1] == '/': # pragma: no cover |
| 78 | # `is_dir` is added in Python3.6 | 88 | # `is_dir` is added in Python3.6 |
| 79 | continue # don't keep empty folders | 89 | continue # don't keep empty folders |
| @@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 81 | zin.extract(member=item, path=temp_folder) | 91 | zin.extract(member=item, path=temp_folder) |
| 82 | full_path = os.path.join(temp_folder, item.filename) | 92 | full_path = os.path.join(temp_folder, item.filename) |
| 83 | 93 | ||
| 94 | specific_meta = self._specific_get_meta(full_path, item.filename) | ||
| 95 | for (k, v) in specific_meta.items(): | ||
| 96 | local_meta[k] = v | ||
| 97 | |||
| 84 | tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore | 98 | tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore |
| 85 | if not tmp_parser: | 99 | if tmp_parser: |
| 86 | continue | 100 | for k, v in tmp_parser.get_meta().items(): |
| 101 | local_meta[k] = v | ||
| 87 | 102 | ||
| 88 | local_meta = tmp_parser.get_meta() | ||
| 89 | if local_meta: | 103 | if local_meta: |
| 90 | meta[item.filename] = local_meta | 104 | meta[item.filename] = local_meta |
| 91 | 105 | ||
diff --git a/libmat2/office.py b/libmat2/office.py
index 365c230..dfad3b3 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
| @@ -2,7 +2,7 @@ import logging | |||
| 2 | import os | 2 | import os |
| 3 | import re | 3 | import re |
| 4 | import zipfile | 4 | import zipfile |
| 5 | from typing import Dict, Set, Pattern, Tuple, Union | 5 | from typing import Dict, Set, Pattern, Tuple, Union, Any |
| 6 | 6 | ||
| 7 | import xml.etree.ElementTree as ET # type: ignore | 7 | import xml.etree.ElementTree as ET # type: ignore |
| 8 | 8 | ||
| @@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 295 | 295 | ||
| 296 | return True | 296 | return True |
| 297 | 297 | ||
| 298 | def get_meta(self) -> Dict[str, Union[str, dict]]: | 298 | def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: |
| 299 | """ | 299 | """ |
| 300 | Yes, I know that parsing xml with regexp ain't pretty, | 300 | Yes, I know that parsing xml with regexp ain't pretty, |
| 301 | be my guest and fix it if you want. | 301 | be my guest and fix it if you want. |
| 302 | """ | 302 | """ |
| 303 | metadata = super().get_meta() | 303 | if not file_path.startswith('docProps/') or not file_path.endswith('.xml'): |
| 304 | zipin = zipfile.ZipFile(self.filename) | 304 | return {} |
| 305 | for item in zipin.infolist(): | 305 | |
| 306 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): | 306 | with open(full_path, encoding='utf-8') as f: |
| 307 | try: | 307 | try: |
| 308 | content = zipin.read(item).decode('utf-8') | 308 | results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M) |
| 309 | results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) | 309 | return {k:v for (k, v) in results} |
| 310 | for (key, value) in results: | 310 | except (TypeError, UnicodeDecodeError): |
| 311 | metadata[key] = value | 311 | # We didn't manage to parse the xml file |
| 312 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file | 312 | return {file_path: 'harmful content', } |
| 313 | metadata[item.filename] = 'harmful content' | ||
| 314 | for key, value in self._get_zipinfo_meta(item).items(): | ||
| 315 | metadata[key] = value | ||
| 316 | zipin.close() | ||
| 317 | return metadata | ||
| 318 | 313 | ||
| 319 | 314 | ||
| 320 | class LibreOfficeParser(ArchiveBasedAbstractParser): | 315 | class LibreOfficeParser(ArchiveBasedAbstractParser): |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 9354286..d692181 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
| @@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase): | |||
| 131 | def test_docx(self): | 131 | def test_docx(self): |
| 132 | p = office.MSOfficeParser('./tests/data/dirty.docx') | 132 | p = office.MSOfficeParser('./tests/data/dirty.docx') |
| 133 | meta = p.get_meta() | 133 | meta = p.get_meta() |
| 134 | self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') | 134 | self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin') |
| 135 | self.assertEqual(meta['dc:creator'], 'julien voisin') | 135 | self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin') |
| 136 | self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') | 136 | self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') |
| 137 | 137 | ||
| 138 | def test_libreoffice(self): | 138 | def test_libreoffice(self): |
| 139 | p = office.LibreOfficeParser('./tests/data/dirty.odt') | 139 | p = office.LibreOfficeParser('./tests/data/dirty.odt') |
