3 files changed, 87 insertions, 5 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 5c2c996..07bbbb9 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET  # type: ignore
 from .archive import ArchiveBasedAbstractParser
+# pylint: disable=line-too-long
 # Make pyflakes happy
 assert Set
 assert Pattern
@@ -15,14 +17,12 @@ assert Pattern
 def _parse_xml(full_path: str):
    """ This function parses XML, with namespace support. """
-    cpt = 0
    namespace_map = dict()
    for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
        # The ns[0-9]+ namespaces are reserved for interal usage, so
        # we have to use an other nomenclature.
-        if re.match('^ns[0-9]+$', key):
+        if re.match('^ns[0-9]+$', key, re.I):  #pragma: no cover
-            key = 'mat%d' % cpt
+            key = 'mat' + key[2:]
-            cpt += 1
        namespace_map[key] = value
        ET.register_namespace(key, value)
@@ -59,12 +59,57 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        'word/fontTable.xml',
        'word/settings.xml',
        'word/styles.xml',
+        # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
+        'word/stylesWithEffects.xml',
    }
    files_to_omit = set(map(re.compile, {  # type: ignore
+        'word/webSettings.xml',
+        'word/theme',
        '^docProps/',
    }))
    @staticmethod
+    def __remove_rsid(full_path: str) -> bool:
        """ The method will remove "revision session ID".  We match on '}rsid'
+        instead of proper parsing, since rsid can have multiple forms, like
+        `rsidRDefault`, `rsidR`, `rsids`, …
        We're removing rsid tags in two passes, because we can't modify
+        the xml while we're iterating on it.
+        For more details, see
+        - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
+        - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:
+            return False
+        # rsid, tags or attributes, are always under the `w` namespace
+        if 'w' not in namespace.keys():
+            return True
+        parent_map = {c:p for p in tree.iter() for c in p}
+        elements_to_remove = list()
+        for item in tree.iterfind('.//', namespace):
+            if '}rsid' in item.tag.strip().lower():  # rsid as tag
+                elements_to_remove.append(item)
+                continue
+            for key in list(item.attrib.keys()):  # rsid as attribute
+                if '}rsid' in key.lower():
+                    del item.attrib[key]
+        for element in elements_to_remove:
+            parent_map[element].remove(element)
+        tree.write(full_path, xml_declaration=True)
+        return True
+    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ In this function, we're changing the XML document in several
        different times, since we don't want to change the tree we're currently
@@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        if full_path.endswith('/word/document.xml'):
            # this file contains the revisions
-            return self.__remove_revisions(full_path)
+            if self.__remove_revisions(full_path) is False:
+                return False
+        if full_path.endswith('.xml'):
+            if self.__remove_rsid(full_path) is False:
+                return False
        return True
    def get_meta(self) -> Dict[str, str]:
diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx
new file mode 100644
index 0000000..b40a341
--- /dev/null
+++ b/tests/data/office_revision_session_ids.docx
Binary files differ
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 3d1c8e1..82579a3 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase):
        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')
+class TestRsidRemoval(unittest.TestCase):
+    def test_office(self):
+        shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+        how_many_rsid = False
+        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                how_many_rsid += num
+        self.assertEqual(how_many_rsid, 11)
+        ret = p.remove_all()
+        self.assertTrue(ret)
+        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                self.assertEqual(num, 0)
+        os.remove('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.cleaned.docx')

diff --git a/libmat2/office.py b/libmat2/office.py index 5c2c996..07bbbb9 100644 --- a/libmat2/office.py +++ b/libmat2/office.py
@@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET # type: ignore
8		8
9	from .archive import ArchiveBasedAbstractParser	9	from .archive import ArchiveBasedAbstractParser
10		10
		11	# pylint: disable=line-too-long
		12
11	# Make pyflakes happy	13	# Make pyflakes happy
12	assert Set	14	assert Set
13	assert Pattern	15	assert Pattern
@@ -15,14 +17,12 @@ assert Pattern
15	def _parse_xml(full_path: str):	17	def _parse_xml(full_path: str):
16	""" This function parses XML, with namespace support. """	18	""" This function parses XML, with namespace support. """
17		19
18	cpt = 0
19	namespace_map = dict()	20	namespace_map = dict()
20	for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):	21	for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
21	# The ns[0-9]+ namespaces are reserved for interal usage, so	22	# The ns[0-9]+ namespaces are reserved for interal usage, so
22	# we have to use an other nomenclature.	23	# we have to use an other nomenclature.
23	if re.match('^ns[0-9]+$', key):	24	if re.match('^ns[0-9]+$', key, re.I): #pragma: no cover
24	key = 'mat%d' % cpt	25	key = 'mat' + key[2:]
25	cpt += 1
26		26
27	namespace_map[key] = value	27	namespace_map[key] = value
28	ET.register_namespace(key, value)	28	ET.register_namespace(key, value)
@@ -59,12 +59,57 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
59	'word/fontTable.xml',	59	'word/fontTable.xml',
60	'word/settings.xml',	60	'word/settings.xml',
61	'word/styles.xml',	61	'word/styles.xml',
		62
		63	# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
		64	'word/stylesWithEffects.xml',
62	}	65	}
63	files_to_omit = set(map(re.compile, { # type: ignore	66	files_to_omit = set(map(re.compile, { # type: ignore
		67	'word/webSettings.xml',
		68	'word/theme',
64	'^docProps/',	69	'^docProps/',
65	}))	70	}))
66		71
67	@staticmethod	72	@staticmethod
		73	def __remove_rsid(full_path: str) -> bool:
		74	""" The method will remove "revision session ID". We match on '}rsid'
		75	instead of proper parsing, since rsid can have multiple forms, like
		76	`rsidRDefault`, `rsidR`, `rsids`, …
		77
		78	We're removing rsid tags in two passes, because we can't modify
		79	the xml while we're iterating on it.
		80
		81	For more details, see
		82	- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
		83	- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
		84	"""
		85	try:
		86	tree, namespace = _parse_xml(full_path)
		87	except ET.ParseError:
		88	return False
		89
		90	# rsid, tags or attributes, are always under the `w` namespace
		91	if 'w' not in namespace.keys():
		92	return True
		93
		94	parent_map = {c:p for p in tree.iter() for c in p}
		95
		96	elements_to_remove = list()
		97	for item in tree.iterfind('.//', namespace):
		98	if '}rsid' in item.tag.strip().lower(): # rsid as tag
		99	elements_to_remove.append(item)
		100	continue
		101	for key in list(item.attrib.keys()): # rsid as attribute
		102	if '}rsid' in key.lower():
		103	del item.attrib[key]
		104
		105	for element in elements_to_remove:
		106	parent_map[element].remove(element)
		107
		108	tree.write(full_path, xml_declaration=True)
		109
		110	return True
		111
		112	@staticmethod
68	def __remove_revisions(full_path: str) -> bool:	113	def __remove_revisions(full_path: str) -> bool:
69	""" In this function, we're changing the XML document in several	114	""" In this function, we're changing the XML document in several
70	different times, since we don't want to change the tree we're currently	115	different times, since we don't want to change the tree we're currently
@@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
112		157
113	if full_path.endswith('/word/document.xml'):	158	if full_path.endswith('/word/document.xml'):
114	# this file contains the revisions	159	# this file contains the revisions
115	return self.__remove_revisions(full_path)	160	if self.__remove_revisions(full_path) is False:
		161	return False
		162
		163	if full_path.endswith('.xml'):
		164	if self.__remove_rsid(full_path) is False:
		165	return False
		166
116	return True	167	return True
117		168
118	def get_meta(self) -> Dict[str, str]:	169	def get_meta(self) -> Dict[str, str]:


diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx new file mode 100644 index 0000000..b40a341 --- /dev/null +++ b/tests/data/office_revision_session_ids.docx
Binary files differ


diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py index 3d1c8e1..82579a3 100644 --- a/tests/test_deep_cleaning.py +++ b/tests/test_deep_cleaning.py
@@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase):
105		105
106	os.remove('./tests/data/clean.odt')	106	os.remove('./tests/data/clean.odt')
107	os.remove('./tests/data/clean.cleaned.odt')	107	os.remove('./tests/data/clean.cleaned.odt')
		108
		109	class TestRsidRemoval(unittest.TestCase):
		110	def test_office(self):
		111	shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
		112	p = office.MSOfficeParser('./tests/data/clean.docx')
		113
		114	meta = p.get_meta()
		115	self.assertIsNotNone(meta)
		116
		117	how_many_rsid = False
		118	with zipfile.ZipFile('./tests/data/clean.docx') as zin:
		119	for item in zin.infolist():
		120	if not item.filename.endswith('.xml'):
		121	continue
		122	num = zin.read(item).decode('utf-8').lower().count('w:rsid')
		123	how_many_rsid += num
		124	self.assertEqual(how_many_rsid, 11)
		125
		126	ret = p.remove_all()
		127	self.assertTrue(ret)
		128
		129	with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
		130	for item in zin.infolist():
		131	if not item.filename.endswith('.xml'):
		132	continue
		133	num = zin.read(item).decode('utf-8').lower().count('w:rsid')
		134	self.assertEqual(num, 0)
		135
		136	os.remove('./tests/data/clean.docx')
		137	os.remove('./tests/data/clean.cleaned.docx')
		138