From 174d4a0ac09c2e9d4a9aa3677a442c05459b8309 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Thu, 20 Sep 2018 22:37:53 +0200
Subject: Implement rsid stripping for office files

MS Office XML rsid is a "unique identifier used to track the editing session
when the physical character representing this section mark was last formatted."

See the following links for details:
- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/.
---
 tests/data/office_revision_session_ids.docx | Bin 0 -> 12163 bytes
 tests/test_deep_cleaning.py                 |  31 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 tests/data/office_revision_session_ids.docx

(limited to 'tests')

diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx
new file mode 100644
index 0000000..b40a341
Binary files /dev/null and b/tests/data/office_revision_session_ids.docx differ
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 3d1c8e1..82579a3 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase):
 
         os.remove('./tests/data/clean.odt')
         os.remove('./tests/data/clean.cleaned.odt')
+
+class TestRsidRemoval(unittest.TestCase):
+    def test_office(self):
+        shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        how_many_rsid = False
+        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                how_many_rsid += num
+        self.assertEqual(how_many_rsid, 11)
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                self.assertEqual(num, 0)
+
+        os.remove('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.cleaned.docx')
+
-- 
cgit v1.3