From d7a03d907baac73dd69c0ce77f3610e7d4bad5f4 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 8 Mar 2020 12:17:56 +0100 Subject: Vastly improve ppt compatibility --- libmat2/office.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 5 deletions(-) (limited to 'libmat2/office.py') diff --git a/libmat2/office.py b/libmat2/office.py index 7f70b72..d122fc8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -1,3 +1,4 @@ +import random import uuid import logging import os @@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser): def __init__(self, filename): super().__init__(filename) + # MSOffice documents are using various counters for cross-references, + # we collect them all, to make sure that they're effectively counters, + # and not unique id used for fingerprinting. + self.__counters = { + 'cNvPr': set(), + 'rid': set(), + } + self.files_to_keep = set(map(re.compile, { # type: ignore r'^\[Content_Types\]\.xml$', r'^_rels/\.rels$', @@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser): r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', r'^(?:word|ppt)/tableStyles\.xml$', + r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$', + r'^ppt/slides/slide[0-9]*\.xml$', # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx r'^(?:word|ppt)/stylesWithEffects\.xml$', + r'^ppt/presentation\.xml$', + # TODO: check if p:bgRef can be randomized + r'^ppt/slideMasters/slideMaster[0-9]+\.xml', + r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels', })) self.files_to_omit = set(map(re.compile, { # type: ignore r'^customXml/', @@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser): r'^(?:word|ppt)/theme', r'^(?:word|ppt)/people\.xml$', r'^(?:word|ppt)/numbering\.xml$', + r'^(?:word|ppt)/tags/', # View properties like view mode, last viewed slide etc r'^(?:word|ppt)/viewProps\.xml$', # Additional presentation-wide properties like printing properties, @@ -146,7 +162,7 @@ class 
MSOfficeParser(ZipParser): """ try: tree, namespace = _parse_xml(full_path) - except ET.ParseError as e: + except ET.ParseError as e: # pragma: no cover logging.error("Unable to parse %s: %s", full_path, e) return False @@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser): def __remove_revisions(full_path: str) -> bool: try: tree, namespace = _parse_xml(full_path) - except ET.ParseError as e: + except ET.ParseError as e: # pragma: no cover logging.error("Unable to parse %s: %s", full_path, e) return False @@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser): tree.write(full_path, xml_declaration=True) return True + def _final_checks(self) -> bool: + for k, v in self.__counters.items(): + if v and len(v) != max(v): + # TODO: make this an error and return False + # once the ability to correct the counters is implemented + logging.warning("%s contains invalid %s: %s", self.filename, k, v) + return True + return True + + def __collect_counters(self, full_path: str): + with open(full_path, encoding='utf-8') as f: + content = f.read() + # "relationship Id" + for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content): + self.__counters['rid'].add(int(i)) + # "connector for Non-visual property" + for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content): + self.__counters['cNvPr'].add(int(i)) + + @staticmethod + def __randomize_creationId(full_path: str) -> bool: + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError as e: # pragma: no cover + logging.error("Unable to parse %s: %s", full_path, e) + return False + + if 'p14' not in namespace.keys(): + return True # pragma: no cover + + for item in tree.iterfind('.//p14:creationId', namespace): + item.set('val', '%s' % random.randint(0, 2**32)) + tree.write(full_path, xml_declaration=True) + return True + + @staticmethod + def __randomize_sldMasterId(full_path: str) -> bool: + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError as e: # pragma: no cover + logging.error("Unable to parse %s: %s", full_path, e) + return False + + if 'p' not in namespace.keys(): + return True # pragma: no cover + + for item in
tree.iterfind('.//p:sldMasterId', namespace): + item.set('id', '%s' % random.randint(0, 2**32)) + tree.write(full_path, xml_declaration=True) + return True + def _specific_cleanup(self, full_path: str) -> bool: - # pylint: disable=too-many-return-statements + # pylint: disable=too-many-return-statements,too-many-branches if os.stat(full_path).st_size == 0: # Don't process empty files return True if not full_path.endswith('.xml'): return True + if self.__randomize_creationId(full_path) is False: + return False + + self.__collect_counters(full_path) + if full_path.endswith('/[Content_Types].xml'): # this file contains references to files that we might # remove, and MS Office doesn't like dangling references @@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser): elif full_path.endswith('/word/document.xml'): # this file contains the revisions if self.__remove_revisions(full_path) is False: - return False + return False # pragma: no cover elif full_path.endswith('/docProps/app.xml'): # This file must be present and valid, # so we're removing as much as we can. @@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser): f.write(b'') uid = str(uuid.uuid4()).encode('utf-8') f.write(b'' % uid) + elif full_path.endswith('ppt/presentation.xml'): + if self.__randomize_sldMasterId(full_path) is False: + return False # pragma: no cover if self.__remove_rsid(full_path) is False: - return False + return False # pragma: no cover if self.__remove_nsid(full_path) is False: return False # pragma: no cover -- cgit v1.3