diff options
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 29 |
1 files changed, 12 insertions, 17 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 365c230..dfad3b3 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -2,7 +2,7 @@ import logging | |||
| 2 | import os | 2 | import os |
| 3 | import re | 3 | import re |
| 4 | import zipfile | 4 | import zipfile |
| 5 | from typing import Dict, Set, Pattern, Tuple, Union | 5 | from typing import Dict, Set, Pattern, Tuple, Union, Any |
| 6 | 6 | ||
| 7 | import xml.etree.ElementTree as ET # type: ignore | 7 | import xml.etree.ElementTree as ET # type: ignore |
| 8 | 8 | ||
| @@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 295 | 295 | ||
| 296 | return True | 296 | return True |
| 297 | 297 | ||
| 298 | def get_meta(self) -> Dict[str, Union[str, dict]]: | 298 | def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: |
| 299 | """ | 299 | """ |
| 300 | Yes, I know that parsing xml with regexp ain't pretty, | 300 | Yes, I know that parsing xml with regexp ain't pretty, |
| 301 | be my guest and fix it if you want. | 301 | be my guest and fix it if you want. |
| 302 | """ | 302 | """ |
| 303 | metadata = super().get_meta() | 303 | if not file_path.startswith('docProps/') or not file_path.endswith('.xml'): |
| 304 | zipin = zipfile.ZipFile(self.filename) | 304 | return {} |
| 305 | for item in zipin.infolist(): | 305 | |
| 306 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): | 306 | with open(full_path, encoding='utf-8') as f: |
| 307 | try: | 307 | try: |
| 308 | content = zipin.read(item).decode('utf-8') | 308 | results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M) |
| 309 | results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) | 309 | return {k:v for (k, v) in results} |
| 310 | for (key, value) in results: | 310 | except (TypeError, UnicodeDecodeError): |
| 311 | metadata[key] = value | 311 | # We didn't manage to parse the xml file |
| 312 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file | 312 | return {file_path: 'harmful content', } |
| 313 | metadata[item.filename] = 'harmful content' | ||
| 314 | for key, value in self._get_zipinfo_meta(item).items(): | ||
| 315 | metadata[key] = value | ||
| 316 | zipin.close() | ||
| 317 | return metadata | ||
| 318 | 313 | ||
| 319 | 314 | ||
| 320 | class LibreOfficeParser(ArchiveBasedAbstractParser): | 315 | class LibreOfficeParser(ArchiveBasedAbstractParser): |
