From ad3e7ccee8b1c18c982c39248625d5230cd3d283 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 8 Jul 2018 21:35:45 +0200 Subject: Bump coverage for office files and fix some related crashes --- libmat2/office.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'libmat2') diff --git a/libmat2/office.py b/libmat2/office.py index 5165056..6087c47 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser): """ In this function, we're changing the XML document in two times, since we don't want to change the tree we're iterating on.""" - tree, ns = _parse_xml(full_path) + try: + tree, ns = _parse_xml(full_path) + except ET.ParseError: + return False # No revisions are present del_presence = tree.find('.//w:del', ns) @@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser): zipin = zipfile.ZipFile(self.filename) for item in zipin.infolist(): if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): - content = zipin.read(item).decode('utf-8') try: + content = zipin.read(item).decode('utf-8') results = re.findall(r"<(.+)>(.+)", content, re.I|re.M) for (key, value) in results: metadata[key] = value - except TypeError: # We didn't manage to parse the xml file - pass - if not metadata: # better safe than sorry - metadata[item] = 'harmful content' + except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file + metadata[item.filename] = 'harmful content' for key, value in self._get_zipinfo_meta(item).items(): metadata[key] = value zipin.close() @@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): def __remove_revisions(self, full_path: str) -> bool: - tree, ns = _parse_xml(full_path) + try: + tree, ns = _parse_xml(full_path) + except ET.ParseError: + return False if 'office' not in ns.keys(): # no revisions in the current file return True @@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): zipin = zipfile.ZipFile(self.filename) for item in zipin.infolist(): if item.filename == 'meta.xml': - content = zipin.read(item).decode('utf-8') try: + content = zipin.read(item).decode('utf-8') results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)", content, re.I|re.M) for (key, value) in results: metadata[key] = value - except TypeError: # We didn't manage to parse the xml file - pass - if not metadata: # better safe than sorry - metadata[item] = 'harmful content' + except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file + metadata[item.filename] = 'harmful content' for key, value in self._get_zipinfo_meta(item).items(): metadata[key] = value zipin.close() -- cgit v1.3