diff options
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/harmless.py | 2 | ||||
| -rw-r--r-- | libmat2/office.py | 130 |
2 files changed, 63 insertions, 69 deletions
diff --git a/libmat2/harmless.py b/libmat2/harmless.py index 54737a8..2878571 100644 --- a/libmat2/harmless.py +++ b/libmat2/harmless.py | |||
| @@ -4,7 +4,7 @@ from . import abstract | |||
| 4 | 4 | ||
| 5 | class HarmlessParser(abstract.AbstractParser): | 5 | class HarmlessParser(abstract.AbstractParser): |
| 6 | """ This is the parser for filetypes that do not contain metadata. """ | 6 | """ This is the parser for filetypes that do not contain metadata. """ |
| 7 | mimetypes = {'application/xml', 'text/plain', 'text/xml', 'application/rdf+xml'} | 7 | mimetypes = {'text/plain', } |
| 8 | 8 | ||
| 9 | def __init__(self, filename: str) -> None: | 9 | def __init__(self, filename: str) -> None: |
| 10 | super().__init__(filename) | 10 | super().__init__(filename) |
diff --git a/libmat2/office.py b/libmat2/office.py index 0791b07..fd3cdf4 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -4,17 +4,16 @@ import shutil | |||
| 4 | import tempfile | 4 | import tempfile |
| 5 | import datetime | 5 | import datetime |
| 6 | import zipfile | 6 | import zipfile |
| 7 | from typing import Dict, Set | 7 | from typing import Dict, Set, Pattern |
| 8 | 8 | ||
| 9 | from . import abstract, parser_factory | 9 | from . import abstract, parser_factory |
| 10 | 10 | ||
| 11 | assert Set # make pyflakes happy | ||
| 12 | 11 | ||
| 13 | class ArchiveBasedAbstractParser(abstract.AbstractParser): | 12 | class ArchiveBasedAbstractParser(abstract.AbstractParser): |
| 14 | whitelist = set() # type: Set[str] | 13 | files_to_keep : Set[str] = set() |
| 14 | files_to_omit : Set[Pattern] = set() | ||
| 15 | 15 | ||
| 16 | def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: | 16 | def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: |
| 17 | zipinfo.compress_type = zipfile.ZIP_DEFLATED | ||
| 18 | zipinfo.create_system = 3 # Linux | 17 | zipinfo.create_system = 3 # Linux |
| 19 | zipinfo.comment = b'' | 18 | zipinfo.comment = b'' |
| 20 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) | 19 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) |
| @@ -34,33 +33,51 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 34 | metadata['comment'] = zipinfo.comment # type: ignore | 33 | metadata['comment'] = zipinfo.comment # type: ignore |
| 35 | 34 | ||
| 36 | if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): | 35 | if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): |
| 37 | metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time)) | 36 | metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time)) |
| 38 | 37 | ||
| 39 | return metadata | 38 | return metadata |
| 40 | 39 | ||
| 41 | 40 | ||
| 42 | def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, | 41 | def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, |
| 43 | zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool: | 42 | zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool: |
| 44 | output = '' | ||
| 45 | zin.extract(member=item, path=temp_folder) | 43 | zin.extract(member=item, path=temp_folder) |
| 46 | if item.filename not in self.whitelist: | 44 | full_path = os.path.join(temp_folder, item.filename) |
| 47 | full_path = os.path.join(temp_folder, item.filename) | 45 | tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore |
| 48 | tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore | 46 | if not tmp_parser: |
| 49 | if not tmp_parser: | 47 | zout.close() |
| 50 | zout.close() | 48 | os.remove(self.output_filename) |
| 51 | os.remove(self.output_filename) | 49 | print("%s's format (%s) isn't supported" % (item.filename, mtype)) |
| 52 | print("%s's format (%s) isn't supported" % (item.filename, mtype)) | 50 | return False |
| 53 | return False | 51 | tmp_parser.remove_all() |
| 54 | tmp_parser.remove_all() | 52 | |
| 55 | output = tmp_parser.output_filename | ||
| 56 | else: | ||
| 57 | output = os.path.join(temp_folder, item.filename) | ||
| 58 | zinfo = zipfile.ZipInfo(item.filename) # type: ignore | 53 | zinfo = zipfile.ZipInfo(item.filename) # type: ignore |
| 59 | clean_zinfo = self._clean_zipinfo(zinfo) | 54 | clean_zinfo = self._clean_zipinfo(zinfo) |
| 60 | with open(output, 'rb') as f: | 55 | with open(tmp_parser.output_filename, 'rb') as f: |
| 61 | zout.writestr(clean_zinfo, f.read()) | 56 | zout.writestr(clean_zinfo, f.read()) |
| 62 | return True | 57 | return True |
| 63 | 58 | ||
| 59 | def remove_all(self) -> bool: | ||
| 60 | zin = zipfile.ZipFile(self.filename, 'r') | ||
| 61 | zout = zipfile.ZipFile(self.output_filename, 'w') | ||
| 62 | temp_folder = tempfile.mkdtemp() | ||
| 63 | |||
| 64 | for item in zin.infolist(): | ||
| 65 | if item.filename[-1] == '/': # `is_dir` is added in Python3.6 | ||
| 66 | continue # don't keep empty folders | ||
| 67 | elif item.filename in self.files_to_keep: | ||
| 68 | item = self._clean_zipinfo(item) | ||
| 69 | zout.writestr(item, zin.read(item)) | ||
| 70 | continue | ||
| 71 | elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): | ||
| 72 | continue | ||
| 73 | elif not self._clean_internal_file(item, temp_folder, zin, zout): | ||
| 74 | return False | ||
| 75 | |||
| 76 | shutil.rmtree(temp_folder) | ||
| 77 | zout.close() | ||
| 78 | zin.close() | ||
| 79 | return True | ||
| 80 | |||
| 64 | 81 | ||
| 65 | class MSOfficeParser(ArchiveBasedAbstractParser): | 82 | class MSOfficeParser(ArchiveBasedAbstractParser): |
| 66 | mimetypes = { | 83 | mimetypes = { |
| @@ -68,9 +85,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 68 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', | 85 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
| 69 | 'application/vnd.openxmlformats-officedocument.presentationml.presentation' | 86 | 'application/vnd.openxmlformats-officedocument.presentationml.presentation' |
| 70 | } | 87 | } |
| 71 | files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} | 88 | files_to_keep = { |
| 89 | '[Content_Types].xml', | ||
| 90 | '_rels/.rels', | ||
| 91 | 'word/_rels/document.xml.rels', | ||
| 92 | 'word/document.xml', | ||
| 93 | 'word/fontTable.xml', | ||
| 94 | 'word/settings.xml', | ||
| 95 | 'word/styles.xml', | ||
| 96 | } | ||
| 97 | files_to_omit = set(map(re.compile, { # type: ignore | ||
| 98 | '^docProps/', | ||
| 99 | })) | ||
| 72 | 100 | ||
| 73 | def get_meta(self): | 101 | def get_meta(self) -> Dict[str, str]: |
| 74 | """ | 102 | """ |
| 75 | Yes, I know that parsing xml with regexp ain't pretty, | 103 | Yes, I know that parsing xml with regexp ain't pretty, |
| 76 | be my guest and fix it if you want. | 104 | be my guest and fix it if you want. |
| @@ -88,38 +116,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 88 | pass | 116 | pass |
| 89 | if not metadata: # better safe than sorry | 117 | if not metadata: # better safe than sorry |
| 90 | metadata[item] = 'harmful content' | 118 | metadata[item] = 'harmful content' |
| 91 | |||
| 92 | for key, value in self._get_zipinfo_meta(item).items(): | 119 | for key, value in self._get_zipinfo_meta(item).items(): |
| 93 | metadata[key] = value | 120 | metadata[key] = value |
| 94 | zipin.close() | 121 | zipin.close() |
| 95 | return metadata | 122 | return metadata |
| 96 | 123 | ||
| 97 | 124 | ||
| 98 | def remove_all(self): | ||
| 99 | zin = zipfile.ZipFile(self.filename, 'r') | ||
| 100 | zout = zipfile.ZipFile(self.output_filename, 'w') | ||
| 101 | temp_folder = tempfile.mkdtemp() | ||
| 102 | |||
| 103 | for item in zin.infolist(): | ||
| 104 | if item.filename[-1] == '/': | ||
| 105 | continue # `is_dir` is added in Python3.6 | ||
| 106 | elif item.filename.startswith('docProps/'): | ||
| 107 | continue # don't keep metadata files | ||
| 108 | if item.filename in self.files_to_keep: | ||
| 109 | item = self._clean_zipinfo(item) | ||
| 110 | zout.writestr(item, zin.read(item)) | ||
| 111 | continue | ||
| 112 | |||
| 113 | if self._clean_internal_file(item, temp_folder, zin, zout) is False: | ||
| 114 | return False | ||
| 115 | |||
| 116 | shutil.rmtree(temp_folder) | ||
| 117 | zout.close() | ||
| 118 | zin.close() | ||
| 119 | return True | ||
| 120 | |||
| 121 | |||
| 122 | |||
| 123 | class LibreOfficeParser(ArchiveBasedAbstractParser): | 125 | class LibreOfficeParser(ArchiveBasedAbstractParser): |
| 124 | mimetypes = { | 126 | mimetypes = { |
| 125 | 'application/vnd.oasis.opendocument.text', | 127 | 'application/vnd.oasis.opendocument.text', |
| @@ -130,10 +132,20 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 130 | 'application/vnd.oasis.opendocument.formula', | 132 | 'application/vnd.oasis.opendocument.formula', |
| 131 | 'application/vnd.oasis.opendocument.image', | 133 | 'application/vnd.oasis.opendocument.image', |
| 132 | } | 134 | } |
| 133 | whitelist = {'mimetype', 'manifest.rdf'} | 135 | files_to_keep = { |
| 134 | 136 | 'META-INF/manifest.xml', | |
| 137 | 'content.xml', | ||
| 138 | 'manifest.rdf', | ||
| 139 | 'mimetype', | ||
| 140 | 'settings.xml', | ||
| 141 | 'styles.xml', | ||
| 142 | } | ||
| 143 | files_to_omit = set(map(re.compile, { # type: ignore | ||
| 144 | '^meta\.xml$', | ||
| 145 | '^Configurations2/', | ||
| 146 | })) | ||
| 135 | 147 | ||
| 136 | def get_meta(self): | 148 | def get_meta(self) -> Dict[str, str]: |
| 137 | """ | 149 | """ |
| 138 | Yes, I know that parsing xml with regexp ain't pretty, | 150 | Yes, I know that parsing xml with regexp ain't pretty, |
| 139 | be my guest and fix it if you want. | 151 | be my guest and fix it if you want. |
| @@ -156,21 +168,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 156 | zipin.close() | 168 | zipin.close() |
| 157 | return metadata | 169 | return metadata |
| 158 | 170 | ||
| 159 | def remove_all(self): | ||
| 160 | zin = zipfile.ZipFile(self.filename, 'r') | ||
| 161 | zout = zipfile.ZipFile(self.output_filename, 'w') | ||
| 162 | temp_folder = tempfile.mkdtemp() | ||
| 163 | |||
| 164 | for item in zin.infolist(): | ||
| 165 | if item.filename[-1] == '/': | ||
| 166 | continue # `is_dir` is added in Python3.6 | ||
| 167 | elif item.filename == 'meta.xml': | ||
| 168 | continue # don't keep metadata files | ||
| 169 | |||
| 170 | if self._clean_internal_file(item, temp_folder, zin, zout) is False: | ||
| 171 | return False | ||
| 172 | |||
| 173 | shutil.rmtree(temp_folder) | ||
| 174 | zout.close() | ||
| 175 | zin.close() | ||
| 176 | return True | ||
