diff options
Diffstat (limited to 'src/office.py')
| -rw-r--r-- | src/office.py | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/src/office.py b/src/office.py new file mode 100644 index 0000000..d6728c8 --- /dev/null +++ b/src/office.py | |||
| @@ -0,0 +1,52 @@ | |||
| 1 | import subprocess | ||
| 2 | import json | ||
| 3 | import zipfile | ||
| 4 | import tempfile | ||
| 5 | import shutil | ||
| 6 | import os | ||
| 7 | |||
| 8 | from . import abstract, parser_factory | ||
| 9 | |||
class OfficeParser(abstract.AbstractParser):
    """Parser for Office Open XML documents (word processing, spreadsheet
    and presentation files), which are handled here as zip archives.

    Members under ``docProps/`` are treated as metadata: they are reported
    by :meth:`get_meta` and left out of the archive that :meth:`remove_all`
    writes.
    """

    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Archive members copied to the output verbatim, without being handed to
    # a sub-parser.
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}

    def get_meta(self):
        """Return a dict flagging every ``docProps/`` member of the archive.

        Keys are the member names; every value is the fixed placeholder
        string ``'harmful content'`` (the member contents are not read).
        """
        # The context manager closes the archive even if listing it fails.
        with zipfile.ZipFile(self.filename) as zipin:
            return {
                item: 'harmful content'
                for item in zipin.namelist()
                if item.startswith('docProps/')
            }

    def remove_all(self):
        """Write a copy of the archive to ``self.output_filename`` with the
        metadata members removed.

        For each member of the input archive:

        * directories are skipped;
        * members under ``docProps/`` are dropped, unless their name ends
          with ``.rels``;
        * members listed in ``files_to_keep`` are copied unchanged;
        * anything else is extracted to a temporary directory and handed to
          the parser returned by ``parser_factory.get_parser``; the file
          named by that parser's ``output_filename`` is then stored under
          the member's original name. Members for which no parser is
          available are reported on stdout and left out of the output.

        Returns:
            True, always. Errors are propagated as exceptions.
        """
        # Context managers guarantee that both archives are closed and the
        # temporary directory is deleted even if a member fails to process.
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout, \
                tempfile.TemporaryDirectory() as temp_folder:
            for item in zin.infolist():
                if item.is_dir():
                    continue
                if (item.filename.startswith('docProps/')
                        and not item.filename.endswith('.rels')):
                    continue  # don't keep metadata files
                if item.filename in self.files_to_keep:
                    zout.writestr(item, zin.read(item))
                    continue

                # Use the path returned by extract(): zipfile sanitises
                # member names (leading slashes, '..' components), so
                # joining temp_folder with the raw name could point at a
                # different location than the file that was written.
                extracted_path = zin.extract(member=item, path=temp_folder)
                tmp_parser = parser_factory.get_parser(extracted_path)
                if tmp_parser is None:
                    print("%s isn't supported" % item.filename)
                    continue
                tmp_parser.remove_all()
                zout.write(tmp_parser.output_filename, item.filename)
        return True
