From 174d4a0ac09c2e9d4a9aa3677a442c05459b8309 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Thu, 20 Sep 2018 22:37:53 +0200
Subject: Implement rsid stripping for office files

MS Office XML rsid is a "unique identifier used to track the editing session
when the physical character representing this section mark was last formatted."

See the following links for details:
- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/.
---
Review note: the test hunk below was amended after the original commit
(how_many_rsid now starts at 0 instead of False; a docstring and comments were
added). The commit id and the blob ids on the "index" line still refer to the
original commit.

 tests/data/office_revision_session_ids.docx | Bin 0 -> 12163 bytes
 tests/test_deep_cleaning.py                 |  35 +++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 tests/data/office_revision_session_ids.docx

(limited to 'tests')

diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx
new file mode 100644
index 0000000..b40a341
Binary files /dev/null and b/tests/data/office_revision_session_ids.docx differ
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 3d1c8e1..82579a3 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -105,3 +105,38 @@ class TestZipOrder(unittest.TestCase):
 
         os.remove('./tests/data/clean.odt')
         os.remove('./tests/data/clean.cleaned.odt')
+
+class TestRsidRemoval(unittest.TestCase):
+    """Check that no 'w:rsid' is left in the XML members once remove_all() has run."""
+    def test_office(self):
+        # Operate on a copy of the fixture rather than on the fixture itself.
+        shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        # Before cleaning: sum the case-insensitive 'w:rsid' hits over every .xml member.
+        how_many_rsid = 0
+        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                how_many_rsid += num
+        self.assertEqual(how_many_rsid, 11)
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        # After remove_all(): each .xml member of clean.cleaned.docx must have zero hits.
+        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                self.assertEqual(num, 0)
+
+        os.remove('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.cleaned.docx')
+
-- 
cgit v1.3