diff options
| author | jvoisin | 2018-06-04 22:54:01 +0200 |
|---|---|---|
| committer | jvoisin | 2018-06-04 23:20:30 +0200 |
| commit | 6a1b0b31f0fbfa59a78a8b9f4f07bf9ed3f91cdf (patch) | |
| tree | fdb8e31a7ad5bf6982cb8c11a2012205a0cfe14f /libmat2/office.py | |
| parent | 4ebf9754f84e28eb73a09df0f788b5be80c9c73e (diff) | |
Add more typing and use mypy in the CI
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 38 |
1 file changed, 26 insertions, 12 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 749fc7d..90f7c7a 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -4,11 +4,15 @@ import shutil | |||
| 4 | import tempfile | 4 | import tempfile |
| 5 | import datetime | 5 | import datetime |
| 6 | import zipfile | 6 | import zipfile |
| 7 | from typing import Dict, Set | ||
| 7 | 8 | ||
| 8 | from . import abstract, parser_factory | 9 | from . import abstract, parser_factory |
| 9 | 10 | ||
| 11 | assert Set # make pyflakes happy | ||
| 10 | 12 | ||
| 11 | class ArchiveBasedAbstractParser(abstract.AbstractParser): | 13 | class ArchiveBasedAbstractParser(abstract.AbstractParser): |
| 14 | whitelist = set() # type: Set[str] | ||
| 15 | |||
| 12 | def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: | 16 | def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: |
| 13 | zipinfo.compress_type = zipfile.ZIP_DEFLATED | 17 | zipinfo.compress_type = zipfile.ZIP_DEFLATED |
| 14 | zipinfo.create_system = 3 # Linux | 18 | zipinfo.create_system = 3 # Linux |
| @@ -16,7 +20,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 16 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) | 20 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) |
| 17 | return zipinfo | 21 | return zipinfo |
| 18 | 22 | ||
| 19 | def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict: | 23 | def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> Dict[str, str]: |
| 20 | metadata = {} | 24 | metadata = {} |
| 21 | if zipinfo.create_system == 3: | 25 | if zipinfo.create_system == 3: |
| 22 | #metadata['create_system'] = 'Linux' | 26 | #metadata['create_system'] = 'Linux' |
| @@ -27,25 +31,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 27 | metadata['create_system'] = 'Weird' | 31 | metadata['create_system'] = 'Weird' |
| 28 | 32 | ||
| 29 | if zipinfo.comment: | 33 | if zipinfo.comment: |
| 30 | metadata['comment'] = zipinfo.comment | 34 | metadata['comment'] = zipinfo.comment # type: ignore |
| 31 | 35 | ||
| 32 | if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): | 36 | if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): |
| 33 | metadata['date_time'] = datetime.datetime(*zipinfo.date_time) | 37 | metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time)) |
| 34 | 38 | ||
| 35 | return metadata | 39 | return metadata |
| 36 | 40 | ||
| 37 | 41 | ||
| 38 | def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, | 42 | def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, |
| 39 | zin: zipfile.ZipFile, zout: zipfile.ZipFile): | 43 | zin: zipfile.ZipFile, zout: zipfile.ZipFile): |
| 44 | output = '' | ||
| 40 | zin.extract(member=item, path=temp_folder) | 45 | zin.extract(member=item, path=temp_folder) |
| 41 | tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) | 46 | if item.filename not in self.whitelist: |
| 42 | if not tmp_parser: | 47 | full_path = os.path.join(temp_folder, item.filename) |
| 43 | print("%s's format (%s) isn't supported" % (item.filename, mtype)) | 48 | tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore |
| 44 | return | 49 | if not tmp_parser: |
| 45 | tmp_parser.remove_all() | 50 | print("%s's format (%s) isn't supported" % (item.filename, mtype)) |
| 46 | zinfo = zipfile.ZipInfo(item.filename) | 51 | return |
| 52 | tmp_parser.remove_all() | ||
| 53 | output = tmp_parser.output_filename | ||
| 54 | else: | ||
| 55 | output = os.path.join(temp_folder, item.filename) | ||
| 56 | zinfo = zipfile.ZipInfo(item.filename) # type: ignore | ||
| 47 | clean_zinfo = self._clean_zipinfo(zinfo) | 57 | clean_zinfo = self._clean_zipinfo(zinfo) |
| 48 | with open(tmp_parser.output_filename, 'rb') as f: | 58 | with open(output, 'rb') as f: |
| 49 | zout.writestr(clean_zinfo, f.read()) | 59 | zout.writestr(clean_zinfo, f.read()) |
| 50 | 60 | ||
| 51 | 61 | ||
| @@ -72,7 +82,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 72 | if not metadata: # better safe than sorry | 82 | if not metadata: # better safe than sorry |
| 73 | metadata[item] = 'harmful content' | 83 | metadata[item] = 'harmful content' |
| 74 | 84 | ||
| 75 | metadata = {**metadata, **self._get_zipinfo_meta(item)} | 85 | for key, value in self._get_zipinfo_meta(item).items(): |
| 86 | metadata[key] = value | ||
| 76 | zipin.close() | 87 | zipin.close() |
| 77 | return metadata | 88 | return metadata |
| 78 | 89 | ||
| @@ -112,6 +123,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 112 | 'application/vnd.oasis.opendocument.formula', | 123 | 'application/vnd.oasis.opendocument.formula', |
| 113 | 'application/vnd.oasis.opendocument.image', | 124 | 'application/vnd.oasis.opendocument.image', |
| 114 | } | 125 | } |
| 126 | whitelist = {'mimetype', 'manifest.rdf'} | ||
| 127 | |||
| 115 | 128 | ||
| 116 | def get_meta(self): | 129 | def get_meta(self): |
| 117 | """ | 130 | """ |
| @@ -127,7 +140,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 127 | metadata[key] = value | 140 | metadata[key] = value |
| 128 | if not metadata: # better safe than sorry | 141 | if not metadata: # better safe than sorry |
| 129 | metadata[item] = 'harmful content' | 142 | metadata[item] = 'harmful content' |
| 130 | metadata = {**metadata, **self._get_zipinfo_meta(item)} | 143 | for key, value in self._get_zipinfo_meta(item).items(): |
| 144 | metadata[key] = value | ||
| 131 | zipin.close() | 145 | zipin.close() |
| 132 | return metadata | 146 | return metadata |
| 133 | 147 | ||
