diff options
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/office.py | 26 |
1 files changed, 14 insertions, 12 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 5165056..6087c47 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 147 | """ In this function, we're changing the XML | 147 | """ In this function, we're changing the XML |
| 148 | document in two times, since we don't want | 148 | document in two times, since we don't want |
| 149 | to change the tree we're iterating on.""" | 149 | to change the tree we're iterating on.""" |
| 150 | tree, ns = _parse_xml(full_path) | 150 | try: |
| 151 | tree, ns = _parse_xml(full_path) | ||
| 152 | except ET.ParseError: | ||
| 153 | return False | ||
| 151 | 154 | ||
| 152 | # No revisions are present | 155 | # No revisions are present |
| 153 | del_presence = tree.find('.//w:del', ns) | 156 | del_presence = tree.find('.//w:del', ns) |
| @@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 191 | zipin = zipfile.ZipFile(self.filename) | 194 | zipin = zipfile.ZipFile(self.filename) |
| 192 | for item in zipin.infolist(): | 195 | for item in zipin.infolist(): |
| 193 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): | 196 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): |
| 194 | content = zipin.read(item).decode('utf-8') | ||
| 195 | try: | 197 | try: |
| 198 | content = zipin.read(item).decode('utf-8') | ||
| 196 | results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) | 199 | results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) |
| 197 | for (key, value) in results: | 200 | for (key, value) in results: |
| 198 | metadata[key] = value | 201 | metadata[key] = value |
| 199 | except TypeError: # We didn't manage to parse the xml file | 202 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file |
| 200 | pass | 203 | metadata[item.filename] = 'harmful content' |
| 201 | if not metadata: # better safe than sorry | ||
| 202 | metadata[item] = 'harmful content' | ||
| 203 | for key, value in self._get_zipinfo_meta(item).items(): | 204 | for key, value in self._get_zipinfo_meta(item).items(): |
| 204 | metadata[key] = value | 205 | metadata[key] = value |
| 205 | zipin.close() | 206 | zipin.close() |
| @@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 232 | 233 | ||
| 233 | 234 | ||
| 234 | def __remove_revisions(self, full_path: str) -> bool: | 235 | def __remove_revisions(self, full_path: str) -> bool: |
| 235 | tree, ns = _parse_xml(full_path) | 236 | try: |
| 237 | tree, ns = _parse_xml(full_path) | ||
| 238 | except ET.ParseError: | ||
| 239 | return False | ||
| 236 | 240 | ||
| 237 | if 'office' not in ns.keys(): # no revisions in the current file | 241 | if 'office' not in ns.keys(): # no revisions in the current file |
| 238 | return True | 242 | return True |
| @@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 259 | zipin = zipfile.ZipFile(self.filename) | 263 | zipin = zipfile.ZipFile(self.filename) |
| 260 | for item in zipin.infolist(): | 264 | for item in zipin.infolist(): |
| 261 | if item.filename == 'meta.xml': | 265 | if item.filename == 'meta.xml': |
| 262 | content = zipin.read(item).decode('utf-8') | ||
| 263 | try: | 266 | try: |
| 267 | content = zipin.read(item).decode('utf-8') | ||
| 264 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) | 268 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) |
| 265 | for (key, value) in results: | 269 | for (key, value) in results: |
| 266 | metadata[key] = value | 270 | metadata[key] = value |
| 267 | except TypeError: # We didn't manage to parse the xml file | 271 | except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file |
| 268 | pass | 272 | metadata[item.filename] = 'harmful content' |
| 269 | if not metadata: # better safe than sorry | ||
| 270 | metadata[item] = 'harmful content' | ||
| 271 | for key, value in self._get_zipinfo_meta(item).items(): | 273 | for key, value in self._get_zipinfo_meta(item).items(): |
| 272 | metadata[key] = value | 274 | metadata[key] = value |
| 273 | zipin.close() | 275 | zipin.close() |
