Vastly improve ppt compatibility

author: jvoisin 2020-03-08 12:17:56 +0100
committer: jvoisin 2020-03-08 14:06:27 +0100
commit: d7a03d907baac73dd69c0ce77f3610e7d4bad5f4 (patch)
tree: 1f4ac10ff3355a162047c08c3ec79ed04f1ecbc2 /libmat2/office.py
parent: a23dc001cd74866204c868ef1dbf7ef7ca99b5aa (diff)
1 files changed, 81 insertions, 5 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 7f70b72..d122fc8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,3 +1,4 @@
+import random
 import uuid
 import logging
 import os
@@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser):
    def __init__(self, filename):
        super().__init__(filename)
+        # MSOffice documents are using various counters for cross-references,
+        # we collect them all, to make sure that they're effectively counters,
+        # and not unique id used for fingerprinting.
+        self.__counters = {
+            'cNvPr': set(),
+            'rid': set(),
+            }
        self.files_to_keep = set(map(re.compile, {  # type: ignore
            r'^\[Content_Types\]\.xml$',
            r'^_rels/\.rels$',
@@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser):
            r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
            r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
            r'^(?:word|ppt)/tableStyles\.xml$',
+            r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
+            r'^ppt/slides/slide[0-9]*\.xml$',
            # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
            r'^(?:word|ppt)/stylesWithEffects\.xml$',
+            r'^ppt/presentation\.xml$',
+            # TODO: check if p:bgRef can be randomized
+            r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
+            r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
        }))
        self.files_to_omit = set(map(re.compile, {  # type: ignore
            r'^customXml/',
@@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser):
            r'^(?:word|ppt)/theme',
            r'^(?:word|ppt)/people\.xml$',
            r'^(?:word|ppt)/numbering\.xml$',
+            r'^(?:word|ppt)/tags/',
            # View properties like view mode, last viewed slide etc
            r'^(?:word|ppt)/viewProps\.xml$',
            # Additional presentation-wide properties like printing properties,
@@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser):
        """
        try:
            tree, namespace = _parse_xml(full_path)
-        except ET.ParseError as e:
+        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False
@@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser):
    def __remove_revisions(full_path: str) -> bool:
        try:
            tree, namespace = _parse_xml(full_path)
-        except ET.ParseError as e:
+        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False
@@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser):
        tree.write(full_path, xml_declaration=True)
        return True
+    def _final_checks(self) -> bool:
+        for k, v in self.__counters.items():
+            if v and len(v) != max(v):
+                # TODO: make this an error and return False
+                # once the ability to correct the counters is implemented
+                logging.warning("%s contains invalid %s: %s", self.filename, k, v)
+                return True
+        return True
+    def __collect_counters(self, full_path: str):
+        with open(full_path, encoding='utf-8') as f:
+            content = f.read()
+            # "relationship Id"
+            for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
+                self.__counters['rid'].add(int(i))
+            # "connector for Non-visual property"
+            for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
+                self.__counters['cNvPr'].add(int(i))
+    @staticmethod
+    def __randomize_creationId(full_path: str) -> bool:
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+        if 'p14' not in namespace.keys():
+            return True  # pragma: no cover
+        for item in tree.iterfind('.//p14:creationId', namespace):
+            item.set('val', '%s' % random.randint(0, 2**32))
+        tree.write(full_path, xml_declaration=True)
+        return True
+    @staticmethod
+    def __randomize_sldMasterId(full_path: str) -> bool:
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+        if 'p' not in namespace.keys():
+            return True  # pragma: no cover
+        for item in tree.iterfind('.//p:sldMasterId', namespace):
+            item.set('id', '%s' % random.randint(0, 2**32))
+        tree.write(full_path, xml_declaration=True)
+        return True
    def _specific_cleanup(self, full_path: str) -> bool:
-        # pylint: disable=too-many-return-statements
+        # pylint: disable=too-many-return-statements,too-many-branches
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True
        if not full_path.endswith('.xml'):
            return True
+        if self.__randomize_creationId(full_path) is False:
+            return False
+        self.__collect_counters(full_path)
        if full_path.endswith('/[Content_Types].xml'):
            # this file contains references to files that we might
            # remove, and MS Office doesn't like dangling references
@@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser):
        elif full_path.endswith('/word/document.xml'):
            # this file contains the revisions
            if self.__remove_revisions(full_path) is False:
-                return False
+                return False  # pragma: no cover
        elif full_path.endswith('/docProps/app.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
@@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser):
                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
                uid = str(uuid.uuid4()).encode('utf-8')
                f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
+        elif full_path.endswith('ppt/presentation.xml'):
+            if self.__randomize_sldMasterId(full_path) is False:
+                return False  # pragma: no cover
        if self.__remove_rsid(full_path) is False:
-            return False
+            return False  # pragma: no cover
        if self.__remove_nsid(full_path) is False:
            return False  # pragma: no cover
author	jvoisin	2020-03-08 12:17:56 +0100
committer	jvoisin	2020-03-08 14:06:27 +0100
commit	d7a03d907baac73dd69c0ce77f3610e7d4bad5f4 (patch)
tree	1f4ac10ff3355a162047c08c3ec79ed04f1ecbc2 /libmat2/office.py
parent	a23dc001cd74866204c868ef1dbf7ef7ca99b5aa (diff)

diff --git a/libmat2/office.py b/libmat2/office.py
index 7f70b72..d122fc8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,3 +1,4 @@
		1	import random
1	import uuid	2	import uuid
2	import logging	3	import logging
3	import os	4	import os
@@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser):
75	def __init__(self, filename):	76	def __init__(self, filename):
76	super().__init__(filename)	77	super().__init__(filename)
77		78
		79	# MSOffice documents are using various counters for cross-references,
		80	# we collect them all, to make sure that they're effectively counters,
		81	# and not unique id used for fingerprinting.
		82	self.__counters = {
		83	'cNvPr': set(),
		84	'rid': set(),
		85	}
		86
78	self.files_to_keep = set(map(re.compile, { # type: ignore	87	self.files_to_keep = set(map(re.compile, { # type: ignore
79	r'^\[Content_Types\]\.xml$',	88	r'^\[Content_Types\]\.xml$',
80	r'^_rels/\.rels$',	89	r'^_rels/\.rels$',
@@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser):
84	r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',	93	r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
85	r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',	94	r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
86	r'^(?:word|ppt)/tableStyles\.xml$',	95	r'^(?:word|ppt)/tableStyles\.xml$',
		96	r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
		97	r'^ppt/slides/slide[0-9]*\.xml$',
87	# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx	98	# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
88	r'^(?:word|ppt)/stylesWithEffects\.xml$',	99	r'^(?:word|ppt)/stylesWithEffects\.xml$',
		100	r'^ppt/presentation\.xml$',
		101	# TODO: check if p:bgRef can be randomized
		102	r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
		103	r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
89	}))	104	}))
90	self.files_to_omit = set(map(re.compile, { # type: ignore	105	self.files_to_omit = set(map(re.compile, { # type: ignore
91	r'^customXml/',	106	r'^customXml/',
@@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser):
95	r'^(?:word|ppt)/theme',	110	r'^(?:word|ppt)/theme',
96	r'^(?:word|ppt)/people\.xml$',	111	r'^(?:word|ppt)/people\.xml$',
97	r'^(?:word|ppt)/numbering\.xml$',	112	r'^(?:word|ppt)/numbering\.xml$',
		113	r'^(?:word|ppt)/tags/',
98	# View properties like view mode, last viewed slide etc	114	# View properties like view mode, last viewed slide etc
99	r'^(?:word|ppt)/viewProps\.xml$',	115	r'^(?:word|ppt)/viewProps\.xml$',
100	# Additional presentation-wide properties like printing properties,	116	# Additional presentation-wide properties like printing properties,
@@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser):
146	"""	162	"""
147	try:	163	try:
148	tree, namespace = _parse_xml(full_path)	164	tree, namespace = _parse_xml(full_path)
149	except ET.ParseError as e:	165	except ET.ParseError as e: # pragma: no cover
150	logging.error("Unable to parse %s: %s", full_path, e)	166	logging.error("Unable to parse %s: %s", full_path, e)
151	return False	167	return False
152		168
@@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser):
206	def __remove_revisions(full_path: str) -> bool:	222	def __remove_revisions(full_path: str) -> bool:
207	try:	223	try:
208	tree, namespace = _parse_xml(full_path)	224	tree, namespace = _parse_xml(full_path)
209	except ET.ParseError as e:	225	except ET.ParseError as e: # pragma: no cover
210	logging.error("Unable to parse %s: %s", full_path, e)	226	logging.error("Unable to parse %s: %s", full_path, e)
211	return False	227	return False
212		228
@@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser):
272	tree.write(full_path, xml_declaration=True)	288	tree.write(full_path, xml_declaration=True)
273	return True	289	return True
274		290
		291	def _final_checks(self) -> bool:
		292	for k, v in self.__counters.items():
		293	if v and len(v) != max(v):
		294	# TODO: make this an error and return False
		295	# once the ability to correct the counters is implemented
		296	logging.warning("%s contains invalid %s: %s", self.filename, k, v)
		297	return True
		298	return True
		299
		300	def __collect_counters(self, full_path: str):
		301	with open(full_path, encoding='utf-8') as f:
		302	content = f.read()
		303	# "relationship Id"
		304	for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
		305	self.__counters['rid'].add(int(i))
		306	# "connector for Non-visual property"
		307	for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
		308	self.__counters['cNvPr'].add(int(i))
		309
		310
		311	@staticmethod
		312	def __randomize_creationId(full_path: str) -> bool:
		313	try:
		314	tree, namespace = _parse_xml(full_path)
		315	except ET.ParseError as e: # pragma: no cover
		316	logging.error("Unable to parse %s: %s", full_path, e)
		317	return False
		318
		319	if 'p14' not in namespace.keys():
		320	return True # pragma: no cover
		321
		322	for item in tree.iterfind('.//p14:creationId', namespace):
		323	item.set('val', '%s' % random.randint(0, 2**32))
		324	tree.write(full_path, xml_declaration=True)
		325	return True
		326
		327	@staticmethod
		328	def __randomize_sldMasterId(full_path: str) -> bool:
		329	try:
		330	tree, namespace = _parse_xml(full_path)
		331	except ET.ParseError as e: # pragma: no cover
		332	logging.error("Unable to parse %s: %s", full_path, e)
		333	return False
		334
		335	if 'p' not in namespace.keys():
		336	return True # pragma: no cover
		337
		338	for item in tree.iterfind('.//p:sldMasterId', namespace):
		339	item.set('id', '%s' % random.randint(0, 2**32))
		340	tree.write(full_path, xml_declaration=True)
		341	return True
		342
275	def _specific_cleanup(self, full_path: str) -> bool:	343	def _specific_cleanup(self, full_path: str) -> bool:
276	# pylint: disable=too-many-return-statements	344	# pylint: disable=too-many-return-statements,too-many-branches
277	if os.stat(full_path).st_size == 0: # Don't process empty files	345	if os.stat(full_path).st_size == 0: # Don't process empty files
278	return True	346	return True
279		347
280	if not full_path.endswith('.xml'):	348	if not full_path.endswith('.xml'):
281	return True	349	return True
282		350
		351	if self.__randomize_creationId(full_path) is False:
		352	return False
		353
		354	self.__collect_counters(full_path)
		355
283	if full_path.endswith('/[Content_Types].xml'):	356	if full_path.endswith('/[Content_Types].xml'):
284	# this file contains references to files that we might	357	# this file contains references to files that we might
285	# remove, and MS Office doesn't like dangling references	358	# remove, and MS Office doesn't like dangling references
@@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser):
288	elif full_path.endswith('/word/document.xml'):	361	elif full_path.endswith('/word/document.xml'):
289	# this file contains the revisions	362	# this file contains the revisions
290	if self.__remove_revisions(full_path) is False:	363	if self.__remove_revisions(full_path) is False:
291	return False	364	return False # pragma: no cover
292	elif full_path.endswith('/docProps/app.xml'):	365	elif full_path.endswith('/docProps/app.xml'):
293	# This file must be present and valid,	366	# This file must be present and valid,
294	# so we're removing as much as we can.	367	# so we're removing as much as we can.
@@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser):
310	f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')	383	f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
311	uid = str(uuid.uuid4()).encode('utf-8')	384	uid = str(uuid.uuid4()).encode('utf-8')
312	f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)	385	f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
		386	elif full_path.endswith('ppt/presentation.xml'):
		387	if self.__randomize_sldMasterId(full_path) is False:
		388	return False # pragma: no cover
313		389
314	if self.__remove_rsid(full_path) is False:	390	if self.__remove_rsid(full_path) is False:
315	return False	391	return False # pragma: no cover
316		392
317	if self.__remove_nsid(full_path) is False:	393	if self.__remove_nsid(full_path) is False:
318	return False # pragma: no cover	394	return False # pragma: no cover