summaryrefslogtreecommitdiff
path: root/src/office.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/office.py')
-rw-r--r--src/office.py52
1 files changed, 52 insertions, 0 deletions
diff --git a/src/office.py b/src/office.py
new file mode 100644
index 0000000..d6728c8
--- /dev/null
+++ b/src/office.py
@@ -0,0 +1,52 @@
1import subprocess
2import json
3import zipfile
4import tempfile
5import shutil
6import os
7
8from . import abstract, parser_factory
9
class OfficeParser(abstract.AbstractParser):
    """Parser for Office Open XML documents, which are zip archives.

    Handles the wordprocessingml, spreadsheetml and presentationml
    mimetypes listed in ``mimetypes``. Reads ``self.filename`` and writes
    the cleaned archive to ``self.output_filename``.
    NOTE(review): both attributes are presumably set by
    ``abstract.AbstractParser`` -- confirm against the base class.
    """

    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Archive members copied verbatim into the output instead of being
    # handed to a sub-parser.
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}

    def get_meta(self):
        """Return a dict flagging every archive member under ``docProps/``.

        Keys are the member names; every value is the fixed string
        ``'harmful content'``. The dict is empty when the archive has no
        such member.
        """
        # The context manager closes the archive even if reading the
        # member list raises.
        with zipfile.ZipFile(self.filename) as zipin:
            return {
                name: 'harmful content'
                for name in zipin.namelist()
                if name.startswith('docProps/')
            }

    def remove_all(self):
        """Write a cleaned copy of the archive to ``self.output_filename``.

        For each non-directory member:

        * members under ``docProps/`` are dropped, unless their name ends
          with ``.rels``;
        * members listed in ``files_to_keep`` are copied unchanged;
        * every other member is extracted to a temporary directory and
          cleaned by the parser that ``parser_factory.get_parser`` returns
          for it; members with no parser are reported on stdout and left
          out of the output.

        Returns ``True``.
        """
        # Context managers guarantee that both archives are closed and the
        # temporary directory is deleted even when a sub-parser raises.
        # They unwind in reverse order: temp dir, output zip, input zip.
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout, \
                tempfile.TemporaryDirectory() as temp_folder:
            for item in zin.infolist():
                if item.is_dir():
                    continue
                if (item.filename.startswith('docProps/')
                        and not item.filename.endswith('.rels')):
                    continue  # don't keep metadata files
                if item.filename in self.files_to_keep:
                    zout.writestr(item, zin.read(item))
                    continue

                # Use the path returned by extract(): zipfile normalises
                # member names (leading slashes, '..' components), so
                # joining temp_folder with item.filename may not point at
                # the file that was actually written.
                extracted_path = zin.extract(member=item, path=temp_folder)
                tmp_parser = parser_factory.get_parser(extracted_path)
                if tmp_parser is None:
                    print("%s isn't supported" % item.filename)
                    continue
                tmp_parser.remove_all()
                zout.write(tmp_parser.output_filename, item.filename)
        return True