From d7a03d907baac73dd69c0ce77f3610e7d4bad5f4 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Sun, 8 Mar 2020 12:17:56 +0100
Subject: Vastly improve ppt compatibility

---
 libmat2/office.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 81 insertions(+), 5 deletions(-)

(limited to 'libmat2/office.py')

diff --git a/libmat2/office.py b/libmat2/office.py
index 7f70b72..d122fc8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,3 +1,4 @@
+import random
 import uuid
 import logging
 import os
@@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser):
     def __init__(self, filename):
         super().__init__(filename)
 
+        # MSOffice documents use various counters for cross-references.
+        # We collect them all to make sure that they really are counters,
+        # and not unique ids used for fingerprinting.
+        self.__counters = {
+            'cNvPr': set(),
+            'rid': set(),
+            }
+
         self.files_to_keep = set(map(re.compile, {  # type: ignore
             r'^\[Content_Types\]\.xml$',
             r'^_rels/\.rels$',
@@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser):
             r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
             r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
             r'^(?:word|ppt)/tableStyles\.xml$',
+            r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
+            r'^ppt/slides/slide[0-9]*\.xml$',
             # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
             r'^(?:word|ppt)/stylesWithEffects\.xml$',
+            r'^ppt/presentation\.xml$',
+            # TODO: check if p:bgRef can be randomized
+            r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
+            r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
         }))
         self.files_to_omit = set(map(re.compile, {  # type: ignore
             r'^customXml/',
@@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser):
             r'^(?:word|ppt)/theme',
             r'^(?:word|ppt)/people\.xml$',
             r'^(?:word|ppt)/numbering\.xml$',
+            r'^(?:word|ppt)/tags/',
             # View properties like view mode, last viewed slide etc
             r'^(?:word|ppt)/viewProps\.xml$',
             # Additional presentation-wide properties like printing properties,
@@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser):
         """
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError as e:
+        except ET.ParseError as e:  # pragma: no cover
             logging.error("Unable to parse %s: %s", full_path, e)
             return False
 
@@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser):
     def __remove_revisions(full_path: str) -> bool:
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError as e:
+        except ET.ParseError as e:  # pragma: no cover
             logging.error("Unable to parse %s: %s", full_path, e)
             return False
 
@@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser):
         tree.write(full_path, xml_declaration=True)
         return True
 
+    def _final_checks(self) -> bool:
+        """Warn about each counter set that doesn't look like 1..n; always True."""
+        for name, values in self.__counters.items():
+            if values and len(values) != max(values):
+                # TODO: make this an error and return False
+                # once the ability to correct the counters is implemented
+                logging.warning("%s contains invalid %s: %s", self.filename, name, values)
+        return True
+
+    def __collect_counters(self, full_path: str):
+        """Add the rId and p:cNvPr id numbers found in full_path to the counters."""
+        with open(full_path, encoding='utf-8') as xml_file:
+            xml = xml_file.read()
+        # "relationship Id"
+        rids = re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', xml)
+        self.__counters['rid'].update(map(int, rids))
+        # "connector for Non-visual property"
+        shape_ids = re.findall(r'<p:cNvPr id="([0-9]+)"', xml)
+        self.__counters['cNvPr'].update(map(int, shape_ids))
+
+    @staticmethod
+    def __randomize_creationId(full_path: str) -> bool:
+        """Set every p14:creationId to a random uint32; False on parse error."""
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+        if 'p14' not in namespace.keys():
+            return True  # pragma: no cover
+        # randint() includes both bounds, and 2**32 itself is not a uint32.
+        for item in tree.iterfind('.//p14:creationId', namespace):
+            item.set('val', str(random.randint(0, 2**32 - 1)))
+        tree.write(full_path, xml_declaration=True)
+        return True
+
+    @staticmethod
+    def __randomize_sldMasterId(full_path: str) -> bool:
+        """Give every p:sldMasterId a random valid id; False on parse error."""
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+        if 'p' not in namespace.keys():
+            return True  # pragma: no cover
+        # ST_SlideMasterId is an unsignedInt >= 2**31; randint() is inclusive.
+        for item in tree.iterfind('.//p:sldMasterId', namespace):
+            item.set('id', str(random.randint(2**31, 2**32 - 1)))
+        tree.write(full_path, xml_declaration=True)
+        return True
+
     def _specific_cleanup(self, full_path: str) -> bool:
-        # pylint: disable=too-many-return-statements
+        # pylint: disable=too-many-return-statements,too-many-branches
         if os.stat(full_path).st_size == 0:  # Don't process empty files
             return True
 
         if not full_path.endswith('.xml'):
             return True
 
+        if self.__randomize_creationId(full_path) is False:
+            return False
+
+        self.__collect_counters(full_path)
+
         if full_path.endswith('/[Content_Types].xml'):
             # this file contains references to files that we might
             # remove, and MS Office doesn't like dangling references
@@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser):
         elif full_path.endswith('/word/document.xml'):
             # this file contains the revisions
             if self.__remove_revisions(full_path) is False:
-                return False
+                return False  # pragma: no cover
         elif full_path.endswith('/docProps/app.xml'):
             # This file must be present and valid,
             # so we're removing as much as we can.
@@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser):
                 f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
                 uid = str(uuid.uuid4()).encode('utf-8')
                 f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
+        elif full_path.endswith('ppt/presentation.xml'):
+            if self.__randomize_sldMasterId(full_path) is False:
+                return False  # pragma: no cover
 
         if self.__remove_rsid(full_path) is False:
-            return False
+            return False  # pragma: no cover
 
         if self.__remove_nsid(full_path) is False:
             return False  # pragma: no cover
-- 
cgit v1.3