summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libmat2/office.py61
-rw-r--r--tests/data/office_revision_session_ids.docxbin0 -> 12163 bytes
-rw-r--r--tests/test_deep_cleaning.py31
3 files changed, 87 insertions, 5 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 5c2c996..07bbbb9 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET # type: ignore
8 8
9from .archive import ArchiveBasedAbstractParser 9from .archive import ArchiveBasedAbstractParser
10 10
11# pylint: disable=line-too-long
12
11# Make pyflakes happy 13# Make pyflakes happy
12assert Set 14assert Set
13assert Pattern 15assert Pattern
@@ -15,14 +17,12 @@ assert Pattern
15def _parse_xml(full_path: str): 17def _parse_xml(full_path: str):
16 """ This function parses XML, with namespace support. """ 18 """ This function parses XML, with namespace support. """
17 19
18 cpt = 0
19 namespace_map = dict() 20 namespace_map = dict()
20 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): 21 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
21 # The ns[0-9]+ namespaces are reserved for internal usage, so 22 # The ns[0-9]+ namespaces are reserved for internal usage, so
22 # we have to use another nomenclature. 23 # we have to use another nomenclature.
23 if re.match('^ns[0-9]+$', key): 24 if re.match('^ns[0-9]+$', key, re.I): #pragma: no cover
24 key = 'mat%d' % cpt 25 key = 'mat' + key[2:]
25 cpt += 1
26 26
27 namespace_map[key] = value 27 namespace_map[key] = value
28 ET.register_namespace(key, value) 28 ET.register_namespace(key, value)
@@ -59,12 +59,57 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
59 'word/fontTable.xml', 59 'word/fontTable.xml',
60 'word/settings.xml', 60 'word/settings.xml',
61 'word/styles.xml', 61 'word/styles.xml',
62
63 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
64 'word/stylesWithEffects.xml',
62 } 65 }
63 files_to_omit = set(map(re.compile, { # type: ignore 66 files_to_omit = set(map(re.compile, { # type: ignore
67 'word/webSettings.xml',
68 'word/theme',
64 '^docProps/', 69 '^docProps/',
65 })) 70 }))
66 71
67 @staticmethod 72 @staticmethod
73 def __remove_rsid(full_path: str) -> bool:
74 """ This method removes "revision session IDs". We're matching on '}rsid'
75 instead of proper parsing, since rsid can have multiple forms, like
76 `rsidRDefault`, `rsidR`, `rsids`, …
77
78 We're removing rsid tags in two passes, because we can't modify
79 the xml while we're iterating on it.
80
81 For more details, see
82 - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
83 - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
84 """
85 try:
86 tree, namespace = _parse_xml(full_path)
87 except ET.ParseError:
88 return False
89
90 # rsid, tags or attributes, are always under the `w` namespace
91 if 'w' not in namespace.keys():
92 return True
93
94 parent_map = {c:p for p in tree.iter() for c in p}
95
96 elements_to_remove = list()
97 for item in tree.iterfind('.//', namespace):
98 if '}rsid' in item.tag.strip().lower(): # rsid as tag
99 elements_to_remove.append(item)
100 continue
101 for key in list(item.attrib.keys()): # rsid as attribute
102 if '}rsid' in key.lower():
103 del item.attrib[key]
104
105 for element in elements_to_remove:
106 parent_map[element].remove(element)
107
108 tree.write(full_path, xml_declaration=True)
109
110 return True
111
112 @staticmethod
68 def __remove_revisions(full_path: str) -> bool: 113 def __remove_revisions(full_path: str) -> bool:
69 """ In this function, we're changing the XML document in several 114 """ In this function, we're changing the XML document in several
70 different times, since we don't want to change the tree we're currently 115 different times, since we don't want to change the tree we're currently
@@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
112 157
113 if full_path.endswith('/word/document.xml'): 158 if full_path.endswith('/word/document.xml'):
114 # this file contains the revisions 159 # this file contains the revisions
115 return self.__remove_revisions(full_path) 160 if self.__remove_revisions(full_path) is False:
161 return False
162
163 if full_path.endswith('.xml'):
164 if self.__remove_rsid(full_path) is False:
165 return False
166
116 return True 167 return True
117 168
118 def get_meta(self) -> Dict[str, str]: 169 def get_meta(self) -> Dict[str, str]:
diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx
new file mode 100644
index 0000000..b40a341
--- /dev/null
+++ b/tests/data/office_revision_session_ids.docx
Binary files differ
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 3d1c8e1..82579a3 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase):
105 105
106 os.remove('./tests/data/clean.odt') 106 os.remove('./tests/data/clean.odt')
107 os.remove('./tests/data/clean.cleaned.odt') 107 os.remove('./tests/data/clean.cleaned.odt')
108
109class TestRsidRemoval(unittest.TestCase):
110 def test_office(self):
111 shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
112 p = office.MSOfficeParser('./tests/data/clean.docx')
113
114 meta = p.get_meta()
115 self.assertIsNotNone(meta)
116
117 how_many_rsid = False
118 with zipfile.ZipFile('./tests/data/clean.docx') as zin:
119 for item in zin.infolist():
120 if not item.filename.endswith('.xml'):
121 continue
122 num = zin.read(item).decode('utf-8').lower().count('w:rsid')
123 how_many_rsid += num
124 self.assertEqual(how_many_rsid, 11)
125
126 ret = p.remove_all()
127 self.assertTrue(ret)
128
129 with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
130 for item in zin.infolist():
131 if not item.filename.endswith('.xml'):
132 continue
133 num = zin.read(item).decode('utf-8').lower().count('w:rsid')
134 self.assertEqual(num, 0)
135
136 os.remove('./tests/data/clean.docx')
137 os.remove('./tests/data/clean.cleaned.docx')
138