summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- libmat2/office.py 50
1 file changed, 44 insertions, 6 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index b769991..c9bed7a 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -62,9 +62,6 @@ class MSOfficeParser(ZipParser):
62 62
63 # Do we want to keep the following ones? 63 # Do we want to keep the following ones?
64 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', 64 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
65
66 # See https://0xacab.org/jvoisin/mat2/issues/71
67 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
68 } 65 }
69 66
70 67
@@ -88,6 +85,7 @@ class MSOfficeParser(ZipParser):
88 r'^word/printerSettings/', 85 r'^word/printerSettings/',
89 r'^word/theme', 86 r'^word/theme',
90 r'^word/people\.xml$', 87 r'^word/people\.xml$',
88 r'^word/numbering\.xml$',
91 89
92 # we have an allowlist in self.files_to_keep, 90 # we have an allowlist in self.files_to_keep,
93 # so we can trash everything else 91 # so we can trash everything else
@@ -124,7 +122,7 @@ class MSOfficeParser(ZipParser):
124 122
125 @staticmethod 123 @staticmethod
126 def __remove_rsid(full_path: str) -> bool: 124 def __remove_rsid(full_path: str) -> bool:
127 """ The method will remove "revision session ID". We're '}rsid' 125 """ The method will remove "revision session ID". We're using '}rsid'
128 instead of proper parsing, since rsid can have multiple forms, like 126 instead of proper parsing, since rsid can have multiple forms, like
129 `rsidRDefault`, `rsidR`, `rsids`, … 127 `rsidRDefault`, `rsidR`, `rsids`, …
130 128
@@ -137,7 +135,8 @@ class MSOfficeParser(ZipParser):
137 """ 135 """
138 try: 136 try:
139 tree, namespace = _parse_xml(full_path) 137 tree, namespace = _parse_xml(full_path)
140 except ET.ParseError: 138 except ET.ParseError as e:
139 logging.error("Unable to parse %s: %s", full_path, e)
141 return False 140 return False
142 141
143 # rsid, tags or attributes, are always under the `w` namespace 142 # rsid, tags or attributes, are always under the `w` namespace
@@ -162,6 +161,41 @@ class MSOfficeParser(ZipParser):
162 return True 161 return True
163 162
164 @staticmethod 163 @staticmethod
164 def __remove_nsid(full_path: str) -> bool:
165 """
166 NSID are random identifiers that can be used
167 to ease the merging of some components of a document.
168 They can also be used for fingerprinting.
169
170 See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1
171
172 In this function, we're changing the XML document in several
173 different times, since we don't want to change the tree we're currently
174 iterating on.
175 """
176 try:
177 tree, namespace = _parse_xml(full_path)
178 except ET.ParseError as e: # pragma: no cover
179 logging.error("Unable to parse %s: %s", full_path, e)
180 return False
181
182 # The NSID tag is always under the `w` namespace
183 if 'w' not in namespace.keys():
184 return True
185
186 parent_map = {c:p for p in tree.iter() for c in p}
187
188 elements_to_remove = list()
189 for element in tree.iterfind('.//w:nsid', namespace):
190 elements_to_remove.append(element)
191 for element in elements_to_remove:
192 parent_map[element].remove(element)
193
194 tree.write(full_path, xml_declaration=True)
195 return True
196
197
198 @staticmethod
165 def __remove_revisions(full_path: str) -> bool: 199 def __remove_revisions(full_path: str) -> bool:
166 """ In this function, we're changing the XML document in several 200 """ In this function, we're changing the XML document in several
167 different times, since we don't want to change the tree we're currently 201 different times, since we don't want to change the tree we're currently
@@ -208,7 +242,8 @@ class MSOfficeParser(ZipParser):
208 """ 242 """
209 try: 243 try:
210 tree, namespace = _parse_xml(full_path) 244 tree, namespace = _parse_xml(full_path)
211 except ET.ParseError: # pragma: no cover 245 except ET.ParseError as e: # pragma: no cover
246 logging.error("Unable to parse %s: %s", full_path, e)
212 return False 247 return False
213 248
214 if len(namespace.items()) != 1: 249 if len(namespace.items()) != 1:
@@ -269,6 +304,9 @@ class MSOfficeParser(ZipParser):
269 if self.__remove_rsid(full_path) is False: 304 if self.__remove_rsid(full_path) is False:
270 return False 305 return False
271 306
307 if self.__remove_nsid(full_path) is False:
308 return False # pragma: no cover
309
272 try: 310 try:
273 _sort_xml_attributes(full_path) 311 _sort_xml_attributes(full_path)
274 except ET.ParseError as e: # pragma: no cover 312 except ET.ParseError as e: # pragma: no cover