diff options
| -rw-r--r-- | src/office.py | 31 |
1 files changed, 27 insertions, 4 deletions
diff --git a/src/office.py b/src/office.py index 5083308..0a34185 100644 --- a/src/office.py +++ b/src/office.py | |||
| @@ -4,6 +4,7 @@ import re | |||
| 4 | import shutil | 4 | import shutil |
| 5 | import subprocess | 5 | import subprocess |
| 6 | import tempfile | 6 | import tempfile |
| 7 | import datetime | ||
| 7 | import zipfile | 8 | import zipfile |
| 8 | 9 | ||
| 9 | from . import abstract, parser_factory | 10 | from . import abstract, parser_factory |
| @@ -16,6 +17,25 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 16 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) | 17 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) |
| 17 | return zipinfo | 18 | return zipinfo |
| 18 | 19 | ||
def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
    """Collect metadata exposed by a single zip member's header.

    Args:
        zipinfo: Header record of one archive member.

    Returns:
        A dict that may contain:
          * ``create_system`` -- ``'Windows'`` or ``'Weird'``; omitted
            when the member's ``create_system`` field is 3.
          * ``comment`` -- the member's comment, when non-empty.
          * ``date_time`` -- the member's timestamp, when it differs
            from the neutral ``(1980, 1, 1, 0, 0, 0)`` value. This is a
            ``datetime.datetime``, or the raw tuple if the stored
            fields do not form a valid calendar date.
    """
    metadata = {}
    if zipinfo.create_system == 3:
        # Value 3 is deliberately not reported as metadata.
        pass
    elif zipinfo.create_system == 2:
        # NOTE(review): the ZIP specification appears to assign 2 to
        # OpenVMS and 0 to MS-DOS/FAT -- confirm whether 'Windows' is the
        # intended label here. Kept as-is for backward compatibility.
        metadata['create_system'] = 'Windows'
    else:
        metadata['create_system'] = 'Weird'

    if zipinfo.comment:
        metadata['comment'] = zipinfo.comment

    if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
        # Stored under its own key so it no longer overwrites 'comment'.
        try:
            metadata['date_time'] = datetime.datetime(*zipinfo.date_time)
        except ValueError:
            # Zeroed or corrupt DOS timestamps (e.g. month 0) are still a
            # non-neutral value worth reporting; keep the raw tuple.
            metadata['date_time'] = zipinfo.date_time

    return metadata
| 37 | |||
| 38 | |||
| 19 | def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile): | 39 | def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile): |
| 20 | zin.extract(member=item, path=temp_folder) | 40 | zin.extract(member=item, path=temp_folder) |
| 21 | tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) | 41 | tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) |
| @@ -43,13 +63,15 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 43 | """ | 63 | """ |
| 44 | metadata = {} | 64 | metadata = {} |
| 45 | zipin = zipfile.ZipFile(self.filename) | 65 | zipin = zipfile.ZipFile(self.filename) |
| 46 | for item in zipin.namelist(): | 66 | for item in zipin.infolist(): |
| 47 | if item.startswith('docProps/') and item.endswith('.xml'): | 67 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): |
| 48 | content = zipin.read(item).decode('utf-8') | 68 | content = zipin.read(item).decode('utf-8') |
| 49 | for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I): | 69 | for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I): |
| 50 | metadata[key] = value | 70 | metadata[key] = value |
| 51 | if not metadata: # better safe than sorry | 71 | if not metadata: # better safe than sorry |
| 52 | metadata[item] = 'harmful content' | 72 | metadata[item] = 'harmful content' |
| 73 | |||
| 74 | metadata = {**metadata, **self._get_zipinfo_meta(item)} | ||
| 53 | zipin.close() | 75 | zipin.close() |
| 54 | return metadata | 76 | return metadata |
| 55 | 77 | ||
| @@ -95,13 +117,14 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 95 | """ | 117 | """ |
| 96 | metadata = {} | 118 | metadata = {} |
| 97 | zipin = zipfile.ZipFile(self.filename) | 119 | zipin = zipfile.ZipFile(self.filename) |
| 98 | for item in zipin.namelist(): | 120 | for item in zipin.infolist(): |
| 99 | if item == 'meta.xml': | 121 | if item.filename == 'meta.xml': |
| 100 | content = zipin.read(item).decode('utf-8') | 122 | content = zipin.read(item).decode('utf-8') |
| 101 | for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I): | 123 | for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I): |
| 102 | metadata[key] = value | 124 | metadata[key] = value |
| 103 | if not metadata: # better safe than sorry | 125 | if not metadata: # better safe than sorry |
| 104 | metadata[item] = 'harmful content' | 126 | metadata[item] = 'harmful content' |
| 127 | metadata = {**metadata, **self._get_zipinfo_meta(item)} | ||
| 105 | zipin.close() | 128 | zipin.close() |
| 106 | return metadata | 129 | return metadata |
| 107 | 130 | ||
