summary | refs | log | tree | commit | diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author jvoisin 2024-04-05 18:33:30 +0200
committer jvoisin 2024-04-05 18:33:30 +0200
commit 09672a2dccb2fea0035278c7014f319b85e89c31 (patch)
tree 2f530cf359d3c99807c5ac6c03fc52b2b93445d6 /libmat2/office.py
parent 61f39c4bd0b51be6371fb2973c14054a2772352e (diff)
parent f2c898c92d0422ddc76fa977d60f7345b06a5ad6 (diff)
Merge branch 'alexmarchant-utf-8-encode-all'
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 49
1 files changed, 41 insertions, 8 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 66f462b..f182277 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
38 for c in tree.getroot(): 38 for c in tree.getroot():
39 c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) 39 c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
40 40
41 tree.write(full_path, xml_declaration=True) 41 tree.write(full_path, xml_declaration=True, encoding='utf-8')
42 return True 42 return True
43 43
44 44
@@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser):
220 for element in elements_to_remove: 220 for element in elements_to_remove:
221 parent_map[element].remove(element) 221 parent_map[element].remove(element)
222 222
223 tree.write(full_path, xml_declaration=True) 223 tree.write(full_path, xml_declaration=True, encoding='utf-8')
224 return True 224 return True
225 225
226 @staticmethod 226 @staticmethod
@@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser):
250 for element in elements_to_remove: 250 for element in elements_to_remove:
251 parent_map[element].remove(element) 251 parent_map[element].remove(element)
252 252
253 tree.write(full_path, xml_declaration=True) 253 tree.write(full_path, xml_declaration=True, encoding='utf-8')
254 return True 254 return True
255 255
256 @staticmethod 256 @staticmethod
@@ -287,7 +287,40 @@ class MSOfficeParser(ZipParser):
287 parent_map[element].insert(position, children) 287 parent_map[element].insert(position, children)
288 parent_map[element].remove(element) 288 parent_map[element].remove(element)
289 289
290 tree.write(full_path, xml_declaration=True) 290 tree.write(full_path, xml_declaration=True, encoding='utf-8')
291 return True
292
293 @staticmethod
294 def __remove_document_comment_meta(full_path: str) -> bool:
295 try:
296 tree, namespace = _parse_xml(full_path)
297 except ET.ParseError as e: # pragma: no cover
298 logging.error("Unable to parse %s: %s", full_path, e)
299 return False
300
301 # search the docs to see if we can bail early
302 range_start = tree.find('.//w:commentRangeStart', namespace)
303 range_end = tree.find('.//w:commentRangeEnd', namespace)
304 references = tree.find('.//w:commentReference', namespace)
305 if range_start is None and range_end is None and references is None:
306 return True # No comment meta tags are present
307
308 parent_map = {c:p for p in tree.iter() for c in p}
309
310 # iterate over the elements and add them to list
311 elements_del = list()
312 for element in tree.iterfind('.//w:commentRangeStart', namespace):
313 elements_del.append(element)
314 for element in tree.iterfind('.//w:commentRangeEnd', namespace):
315 elements_del.append(element)
316 for element in tree.iterfind('.//w:commentReference', namespace):
317 elements_del.append(element)
318
319 # remove the elements
320 for element in elements_del:
321 parent_map[element].remove(element)
322
323 tree.write(full_path, xml_declaration=True, encoding='utf-8')
291 return True 324 return True
292 325
293 @staticmethod 326 @staticmethod
@@ -353,7 +386,7 @@ class MSOfficeParser(ZipParser):
353 if name in removed_fnames: 386 if name in removed_fnames:
354 root.remove(item) 387 root.remove(item)
355 388
356 tree.write(full_path, xml_declaration=True) 389 tree.write(full_path, xml_declaration=True, encoding='utf-8')
357 return True 390 return True
358 391
359 def _final_checks(self) -> bool: 392 def _final_checks(self) -> bool:
@@ -388,7 +421,7 @@ class MSOfficeParser(ZipParser):
388 421
389 for item in tree.iterfind('.//p14:creationId', namespace): 422 for item in tree.iterfind('.//p14:creationId', namespace):
390 item.set('val', '%s' % random.randint(0, 2**32)) 423 item.set('val', '%s' % random.randint(0, 2**32))
391 tree.write(full_path, xml_declaration=True) 424 tree.write(full_path, xml_declaration=True, encoding='utf-8')
392 return True 425 return True
393 426
394 @staticmethod 427 @staticmethod
@@ -404,7 +437,7 @@ class MSOfficeParser(ZipParser):
404 437
405 for item in tree.iterfind('.//p:sldMasterId', namespace): 438 for item in tree.iterfind('.//p:sldMasterId', namespace):
406 item.set('id', '%s' % random.randint(0, 2**32)) 439 item.set('id', '%s' % random.randint(0, 2**32))
407 tree.write(full_path, xml_declaration=True) 440 tree.write(full_path, xml_declaration=True, encoding='utf-8')
408 return True 441 return True
409 442
410 def _specific_cleanup(self, full_path: str) -> bool: 443 def _specific_cleanup(self, full_path: str) -> bool:
@@ -550,7 +583,7 @@ class LibreOfficeParser(ZipParser):
550 for changes in text.iterfind('.//text:tracked-changes', namespace): 583 for changes in text.iterfind('.//text:tracked-changes', namespace):
551 text.remove(changes) 584 text.remove(changes)
552 585
553 tree.write(full_path, xml_declaration=True) 586 tree.write(full_path, xml_declaration=True, encoding='utf-8')
554 return True 587 return True
555 588
556 def _specific_cleanup(self, full_path: str) -> bool: 589 def _specific_cleanup(self, full_path: str) -> bool: