summaryrefslogtreecommitdiff
path: root/libmat2/office.py
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2/office.py')
-rw-r--r--libmat2/office.py130
1 files changed, 62 insertions, 68 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 0791b07..fd3cdf4 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -4,17 +4,16 @@ import shutil
4import tempfile 4import tempfile
5import datetime 5import datetime
6import zipfile 6import zipfile
7from typing import Dict, Set 7from typing import Dict, Set, Pattern
8 8
9from . import abstract, parser_factory 9from . import abstract, parser_factory
10 10
11assert Set # make pyflakes happy
12 11
13class ArchiveBasedAbstractParser(abstract.AbstractParser): 12class ArchiveBasedAbstractParser(abstract.AbstractParser):
14 whitelist = set() # type: Set[str] 13 files_to_keep : Set[str] = set()
14 files_to_omit : Set[Pattern] = set()
15 15
16 def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: 16 def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
17 zipinfo.compress_type = zipfile.ZIP_DEFLATED
18 zipinfo.create_system = 3 # Linux 17 zipinfo.create_system = 3 # Linux
19 zipinfo.comment = b'' 18 zipinfo.comment = b''
20 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) 19 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
@@ -34,33 +33,51 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
34 metadata['comment'] = zipinfo.comment # type: ignore 33 metadata['comment'] = zipinfo.comment # type: ignore
35 34
36 if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): 35 if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
37 metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time)) 36 metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
38 37
39 return metadata 38 return metadata
40 39
41 40
42 def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, 41 def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
43 zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool: 42 zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:
44 output = ''
45 zin.extract(member=item, path=temp_folder) 43 zin.extract(member=item, path=temp_folder)
46 if item.filename not in self.whitelist: 44 full_path = os.path.join(temp_folder, item.filename)
47 full_path = os.path.join(temp_folder, item.filename) 45 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
48 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore 46 if not tmp_parser:
49 if not tmp_parser: 47 zout.close()
50 zout.close() 48 os.remove(self.output_filename)
51 os.remove(self.output_filename) 49 print("%s's format (%s) isn't supported" % (item.filename, mtype))
52 print("%s's format (%s) isn't supported" % (item.filename, mtype)) 50 return False
53 return False 51 tmp_parser.remove_all()
54 tmp_parser.remove_all() 52
55 output = tmp_parser.output_filename
56 else:
57 output = os.path.join(temp_folder, item.filename)
58 zinfo = zipfile.ZipInfo(item.filename) # type: ignore 53 zinfo = zipfile.ZipInfo(item.filename) # type: ignore
59 clean_zinfo = self._clean_zipinfo(zinfo) 54 clean_zinfo = self._clean_zipinfo(zinfo)
60 with open(output, 'rb') as f: 55 with open(tmp_parser.output_filename, 'rb') as f:
61 zout.writestr(clean_zinfo, f.read()) 56 zout.writestr(clean_zinfo, f.read())
62 return True 57 return True
63 58
59 def remove_all(self) -> bool:
60 zin = zipfile.ZipFile(self.filename, 'r')
61 zout = zipfile.ZipFile(self.output_filename, 'w')
62 temp_folder = tempfile.mkdtemp()
63
64 for item in zin.infolist():
65 if item.filename[-1] == '/': # `is_dir` is added in Python3.6
66 continue # don't keep empty folders
67 elif item.filename in self.files_to_keep:
68 item = self._clean_zipinfo(item)
69 zout.writestr(item, zin.read(item))
70 continue
71 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
72 continue
73 elif not self._clean_internal_file(item, temp_folder, zin, zout):
74 return False
75
76 shutil.rmtree(temp_folder)
77 zout.close()
78 zin.close()
79 return True
80
64 81
65class MSOfficeParser(ArchiveBasedAbstractParser): 82class MSOfficeParser(ArchiveBasedAbstractParser):
66 mimetypes = { 83 mimetypes = {
@@ -68,9 +85,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
68 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 85 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
69 'application/vnd.openxmlformats-officedocument.presentationml.presentation' 86 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
70 } 87 }
71 files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} 88 files_to_keep = {
89 '[Content_Types].xml',
90 '_rels/.rels',
91 'word/_rels/document.xml.rels',
92 'word/document.xml',
93 'word/fontTable.xml',
94 'word/settings.xml',
95 'word/styles.xml',
96 }
97 files_to_omit = set(map(re.compile, { # type: ignore
98 '^docProps/',
99 }))
72 100
73 def get_meta(self): 101 def get_meta(self) -> Dict[str, str]:
74 """ 102 """
75 Yes, I know that parsing xml with regexp ain't pretty, 103 Yes, I know that parsing xml with regexp ain't pretty,
76 be my guest and fix it if you want. 104 be my guest and fix it if you want.
@@ -88,38 +116,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
88 pass 116 pass
89 if not metadata: # better safe than sorry 117 if not metadata: # better safe than sorry
90 metadata[item] = 'harmful content' 118 metadata[item] = 'harmful content'
91
92 for key, value in self._get_zipinfo_meta(item).items(): 119 for key, value in self._get_zipinfo_meta(item).items():
93 metadata[key] = value 120 metadata[key] = value
94 zipin.close() 121 zipin.close()
95 return metadata 122 return metadata
96 123
97 124
98 def remove_all(self):
99 zin = zipfile.ZipFile(self.filename, 'r')
100 zout = zipfile.ZipFile(self.output_filename, 'w')
101 temp_folder = tempfile.mkdtemp()
102
103 for item in zin.infolist():
104 if item.filename[-1] == '/':
105 continue # `is_dir` is added in Python3.6
106 elif item.filename.startswith('docProps/'):
107 continue # don't keep metadata files
108 if item.filename in self.files_to_keep:
109 item = self._clean_zipinfo(item)
110 zout.writestr(item, zin.read(item))
111 continue
112
113 if self._clean_internal_file(item, temp_folder, zin, zout) is False:
114 return False
115
116 shutil.rmtree(temp_folder)
117 zout.close()
118 zin.close()
119 return True
120
121
122
123class LibreOfficeParser(ArchiveBasedAbstractParser): 125class LibreOfficeParser(ArchiveBasedAbstractParser):
124 mimetypes = { 126 mimetypes = {
125 'application/vnd.oasis.opendocument.text', 127 'application/vnd.oasis.opendocument.text',
@@ -130,10 +132,20 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
130 'application/vnd.oasis.opendocument.formula', 132 'application/vnd.oasis.opendocument.formula',
131 'application/vnd.oasis.opendocument.image', 133 'application/vnd.oasis.opendocument.image',
132 } 134 }
133 whitelist = {'mimetype', 'manifest.rdf'} 135 files_to_keep = {
134 136 'META-INF/manifest.xml',
137 'content.xml',
138 'manifest.rdf',
139 'mimetype',
140 'settings.xml',
141 'styles.xml',
142 }
143 files_to_omit = set(map(re.compile, { # type: ignore
144 '^meta\.xml$',
145 '^Configurations2/',
146 }))
135 147
136 def get_meta(self): 148 def get_meta(self) -> Dict[str, str]:
137 """ 149 """
138 Yes, I know that parsing xml with regexp ain't pretty, 150 Yes, I know that parsing xml with regexp ain't pretty,
139 be my guest and fix it if you want. 151 be my guest and fix it if you want.
@@ -156,21 +168,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
156 zipin.close() 168 zipin.close()
157 return metadata 169 return metadata
158 170
159 def remove_all(self):
160 zin = zipfile.ZipFile(self.filename, 'r')
161 zout = zipfile.ZipFile(self.output_filename, 'w')
162 temp_folder = tempfile.mkdtemp()
163
164 for item in zin.infolist():
165 if item.filename[-1] == '/':
166 continue # `is_dir` is added in Python3.6
167 elif item.filename == 'meta.xml':
168 continue # don't keep metadata files
169
170 if self._clean_internal_file(item, temp_folder, zin, zout) is False:
171 return False
172
173 shutil.rmtree(temp_folder)
174 zout.close()
175 zin.close()
176 return True