Files processed via MAT2 are now accepted without warnings by MS Office

author: jvoisin 2018-10-01 12:25:37 -0700
committer: jvoisin 2018-10-01 12:25:37 -0700
commit: 652b8e519fbd11da051f40ecde5b814e1e8fc013 (patch)
tree: d510e6543d61806facf99a70174c98145823f323 /libmat2/office.py
parent: c14be47f95e3a0fe6ecddfc236329bc3e76f5eec (diff)
1 file changed, 50 insertions, 13 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index a8a2c94..4348d9b 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -36,9 +36,8 @@ def _sort_xml_attributes(full_path: str) -> bool:
    since they are all using different orders.
    """
    tree = ET.parse(full_path)
-    root = tree.getroot()
-    for c in root:
+    for c in tree.getroot():
        c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
    tree.write(full_path, xml_declaration=True)
@@ -59,6 +58,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        'word/fontTable.xml',
        'word/settings.xml',
        'word/styles.xml',
+        'docProps/app.xml',
+        'docProps/core.xml',
        # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
        'word/stylesWithEffects.xml',
@@ -66,7 +67,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
    files_to_omit = set(map(re.compile, {  # type: ignore
        'word/webSettings.xml',
        'word/theme',
-        '^docProps/',
    }))
    @staticmethod
@@ -95,7 +95,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        elements_to_remove = list()
        for item in tree.iterfind('.//', namespace):
-            if '}rsid' in item.tag.strip().lower():  # resi as tag
+            if '}rsid' in item.tag.strip().lower():  # rsid as tag
                elements_to_remove.append(item)
                continue
            for key in list(item.attrib.keys()):  # rsid as attribute
@@ -106,7 +106,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
            parent_map[element].remove(element)
        tree.write(full_path, xml_declaration=True)
        return True
    @staticmethod
@@ -148,7 +147,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
            parent_map[element].remove(element)
        tree.write(full_path, xml_declaration=True)
        return True
    def __remove_content_type_members(self, full_path: str) -> bool:
@@ -176,27 +174,67 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
                root.remove(item)
        tree.write(full_path, xml_declaration=True)
        return True
    def _specific_cleanup(self, full_path: str) -> bool:
+        # pylint: disable=too-many-return-statements
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True
+        if not full_path.endswith('.xml'):
+            return True
        if full_path.endswith('/[Content_Types].xml'):
            # this file contains references to files that we might
            # remove, and MS Office doesn't like dangling references
            if self.__remove_content_type_members(full_path) is False:
                return False
+        elif full_path.endswith('/word/document.xml'):
-        if full_path.endswith('/word/document.xml'):
            # this file contains the revisions
            if self.__remove_revisions(full_path) is False:
                return False
+        elif full_path.endswith('/docProps/app.xml'):
+            # This file must be present and valid,
+            # so we're removing as much as we can.
+            with open(full_path, 'wb') as f:
+                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
+                f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">')
+                f.write(b'</Properties>')
+        elif full_path.endswith('/docProps/core.xml'):
+            # This file must be present and valid,
+            # so we're removing as much as we can.
+            with open(full_path, 'wb') as f:
+                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
+                f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
+                f.write(b'</cp:coreProperties>')
-        if full_path.endswith('.xml'):
-            if self.__remove_rsid(full_path) is False:
+        if self.__remove_rsid(full_path) is False:
-                return False
+            return False
+        try:
+            _sort_xml_attributes(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+        # This is awful, I'm sorry.
+        #
+        # Microsoft Office isn't happy when the `mc:Ignorable`
+        # attribute lists namespaces that aren't present in the xml file,
+        # so instead of trying to remove this specific attribute with etree,
+        # we're stripping it with a regexp.
+        #
+        # Since we're the ones producing this file, via the call to
+        # _sort_xml_attributes, there won't be any "funny tricks".
+        # Worst case, the tag isn't present, and everything is fine.
+        #
+        # see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
+        with open(full_path, 'rb') as f:
+            text = f.read()
+            out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
+        with open(full_path, 'wb') as f:
+            f.write(out)
        return True
@@ -262,7 +300,6 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
                text.remove(changes)
        tree.write(full_path, xml_declaration=True)
        return True
    def _specific_cleanup(self, full_path: str) -> bool:
author	jvoisin	2018-10-01 12:25:37 -0700
committer	jvoisin	2018-10-01 12:25:37 -0700
commit	652b8e519fbd11da051f40ecde5b814e1e8fc013 (patch)
tree	d510e6543d61806facf99a70174c98145823f323 /libmat2/office.py
parent	c14be47f95e3a0fe6ecddfc236329bc3e76f5eec (diff)

diff --git a/libmat2/office.py b/libmat2/office.py index a8a2c94..4348d9b 100644 --- a/libmat2/office.py +++ b/libmat2/office.py
@@ -36,9 +36,8 @@ def _sort_xml_attributes(full_path: str) -> bool:
36	since they are all using different orders.	36	since they are all using different orders.
37	"""	37	"""
38	tree = ET.parse(full_path)	38	tree = ET.parse(full_path)
39	root = tree.getroot()
40		39
41	for c in root:	40	for c in tree.getroot():
42	c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))	41	c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
43		42
44	tree.write(full_path, xml_declaration=True)	43	tree.write(full_path, xml_declaration=True)
@@ -59,6 +58,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
59	'word/fontTable.xml',	58	'word/fontTable.xml',
60	'word/settings.xml',	59	'word/settings.xml',
61	'word/styles.xml',	60	'word/styles.xml',
		61	'docProps/app.xml',
		62	'docProps/core.xml',
62		63
63	# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx	64	# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
64	'word/stylesWithEffects.xml',	65	'word/stylesWithEffects.xml',
@@ -66,7 +67,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
66	files_to_omit = set(map(re.compile, { # type: ignore	67	files_to_omit = set(map(re.compile, { # type: ignore
67	'word/webSettings.xml',	68	'word/webSettings.xml',
68	'word/theme',	69	'word/theme',
69	'^docProps/',
70	}))	70	}))
71		71
72	@staticmethod	72	@staticmethod
@@ -95,7 +95,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
95		95
96	elements_to_remove = list()	96	elements_to_remove = list()
97	for item in tree.iterfind('.//', namespace):	97	for item in tree.iterfind('.//', namespace):
98	if '}rsid' in item.tag.strip().lower(): # resi as tag	98	if '}rsid' in item.tag.strip().lower(): # rsid as tag
99	elements_to_remove.append(item)	99	elements_to_remove.append(item)
100	continue	100	continue
101	for key in list(item.attrib.keys()): # rsid as attribute	101	for key in list(item.attrib.keys()): # rsid as attribute
@@ -106,7 +106,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
106	parent_map[element].remove(element)	106	parent_map[element].remove(element)
107		107
108	tree.write(full_path, xml_declaration=True)	108	tree.write(full_path, xml_declaration=True)
109
110	return True	109	return True
111		110
112	@staticmethod	111	@staticmethod
@@ -148,7 +147,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
148	parent_map[element].remove(element)	147	parent_map[element].remove(element)
149		148
150	tree.write(full_path, xml_declaration=True)	149	tree.write(full_path, xml_declaration=True)
151
152	return True	150	return True
153		151
154	def __remove_content_type_members(self, full_path: str) -> bool:	152	def __remove_content_type_members(self, full_path: str) -> bool:
@@ -176,27 +174,67 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
176	root.remove(item)	174	root.remove(item)
177		175
178	tree.write(full_path, xml_declaration=True)	176	tree.write(full_path, xml_declaration=True)
179
180	return True	177	return True
181		178
182	def _specific_cleanup(self, full_path: str) -> bool:	179	def _specific_cleanup(self, full_path: str) -> bool:
		180	# pylint: disable=too-many-return-statements
183	if os.stat(full_path).st_size == 0: # Don't process empty files	181	if os.stat(full_path).st_size == 0: # Don't process empty files
184	return True	182	return True
185		183
		184	if not full_path.endswith('.xml'):
		185	return True
		186
186	if full_path.endswith('/[Content_Types].xml'):	187	if full_path.endswith('/[Content_Types].xml'):
187	# this file contains references to files that we might	188	# this file contains references to files that we might
188	# remove, and MS Office doesn't like dangling references	189	# remove, and MS Office doesn't like dangling references
189	if self.__remove_content_type_members(full_path) is False:	190	if self.__remove_content_type_members(full_path) is False:
190	return False	191	return False
191		192	elif full_path.endswith('/word/document.xml'):
192	if full_path.endswith('/word/document.xml'):
193	# this file contains the revisions	193	# this file contains the revisions
194	if self.__remove_revisions(full_path) is False:	194	if self.__remove_revisions(full_path) is False:
195	return False	195	return False
		196	elif full_path.endswith('/docProps/app.xml'):
		197	# This file must be present and valid,
		198	# so we're removing as much as we can.
		199	with open(full_path, 'wb') as f:
		200	f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
		201	f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">')
		202	f.write(b'</Properties>')
		203	elif full_path.endswith('/docProps/core.xml'):
		204	# This file must be present and valid,
		205	# so we're removing as much as we can.
		206	with open(full_path, 'wb') as f:
		207	f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
		208	f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
		209	f.write(b'</cp:coreProperties>')
196		210
197	if full_path.endswith('.xml'):	211
198	if self.__remove_rsid(full_path) is False:	212	if self.__remove_rsid(full_path) is False:
199	return False	213	return False
		214
		215	try:
		216	_sort_xml_attributes(full_path)
		217	except ET.ParseError as e: # pragma: no cover
		218	logging.error("Unable to parse %s: %s", full_path, e)
		219	return False
		220
		221	# This is awful, I'm sorry.
		222	#
		223	# Microsoft Office isn't happy when the `mc:Ignorable`
		224	# attribute lists namespaces that aren't present in the xml file,
		225	# so instead of trying to remove this specific attribute with etree,
		226	# we're stripping it with a regexp.
		227	#
		228	# Since we're the ones producing this file, via the call to
		229	# _sort_xml_attributes, there won't be any "funny tricks".
		230	# Worst case, the tag isn't present, and everything is fine.
		231	#
		232	# see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
		233	with open(full_path, 'rb') as f:
		234	text = f.read()
		235	out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
		236	with open(full_path, 'wb') as f:
		237	f.write(out)
200		238
201	return True	239	return True
202		240
@@ -262,7 +300,6 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
262	text.remove(changes)	300	text.remove(changes)
263		301
264	tree.write(full_path, xml_declaration=True)	302	tree.write(full_path, xml_declaration=True)
265
266	return True	303	return True
267		304
268	def _specific_cleanup(self, full_path: str) -> bool:	305	def _specific_cleanup(self, full_path: str) -> bool: