diff options
| author | jvoisin | 2019-02-07 21:58:10 +0100 |
|---|---|---|
| committer | jvoisin | 2019-02-07 22:19:37 +0100 |
| commit | e1dd439fc86ba15816e2331e8bed67dd7147e368 (patch) | |
| tree | 0c8e368fcb9c409fa2182018b166ec4f18cdd98c /libmat2/office.py | |
| parent | b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (diff) | |
Use of the archive refactoring for the office documents too
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 28 |
1 files changed, 11 insertions, 17 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index dfad3b3..0c9caa8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
| @@ -2,7 +2,7 @@ import logging | |||
| 2 | import os | 2 | import os |
| 3 | import re | 3 | import re |
| 4 | import zipfile | 4 | import zipfile |
| 5 | from typing import Dict, Set, Pattern, Tuple, Union, Any | 5 | from typing import Dict, Set, Pattern, Tuple, Any |
| 6 | 6 | ||
| 7 | import xml.etree.ElementTree as ET # type: ignore | 7 | import xml.etree.ElementTree as ET # type: ignore |
| 8 | 8 | ||
| @@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 375 | return False | 375 | return False |
| 376 | return True | 376 | return True |
| 377 | 377 | ||
| 378 | def get_meta(self) -> Dict[str, Union[str, dict]]: | 378 | def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: |
| 379 | """ | 379 | """ |
| 380 | Yes, I know that parsing xml with regexp ain't pretty, | 380 | Yes, I know that parsing xml with regexp ain't pretty, |
| 381 | be my guest and fix it if you want. | 381 | be my guest and fix it if you want. |
| 382 | """ | 382 | """ |
| 383 | metadata = {} | 383 | if file_path != 'meta.xml': |
| 384 | zipin = zipfile.ZipFile(self.filename) | 384 | return {} |
| 385 | for item in zipin.infolist(): | 385 | with open(full_path, encoding='utf-8') as f: |
| 386 | if item.filename == 'meta.xml': | 386 | try: |
| 387 | try: | 387 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", f.read(), re.I|re.M) |
| 388 | content = zipin.read(item).decode('utf-8') | 388 | return {k:v for (k, v) in results} |
| 389 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) | 389 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file |
| 390 | for (key, value) in results: | 390 | # We didn't manage to parse the xml file |
| 391 | metadata[key] = value | 391 | return {file_path: 'harmful content', } |
| 392 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file | ||
| 393 | metadata[item.filename] = 'harmful content' | ||
| 394 | for key, value in self._get_zipinfo_meta(item).items(): | ||
| 395 | metadata[key] = value | ||
| 396 | zipin.close() | ||
| 397 | return metadata | ||
