diff options
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/office.py | 49 |
1 files changed, 41 insertions, 8 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 66f462b..f182277 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool: | |||
| 38 | for c in tree.getroot(): | 38 | for c in tree.getroot(): |
| 39 | c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) | 39 | c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) |
| 40 | 40 | ||
| 41 | tree.write(full_path, xml_declaration=True) | 41 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 42 | return True | 42 | return True |
| 43 | 43 | ||
| 44 | 44 | ||
| @@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser): | |||
| 220 | for element in elements_to_remove: | 220 | for element in elements_to_remove: |
| 221 | parent_map[element].remove(element) | 221 | parent_map[element].remove(element) |
| 222 | 222 | ||
| 223 | tree.write(full_path, xml_declaration=True) | 223 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 224 | return True | 224 | return True |
| 225 | 225 | ||
| 226 | @staticmethod | 226 | @staticmethod |
| @@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser): | |||
| 250 | for element in elements_to_remove: | 250 | for element in elements_to_remove: |
| 251 | parent_map[element].remove(element) | 251 | parent_map[element].remove(element) |
| 252 | 252 | ||
| 253 | tree.write(full_path, xml_declaration=True) | 253 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 254 | return True | 254 | return True |
| 255 | 255 | ||
| 256 | @staticmethod | 256 | @staticmethod |
| @@ -287,7 +287,40 @@ class MSOfficeParser(ZipParser): | |||
| 287 | parent_map[element].insert(position, children) | 287 | parent_map[element].insert(position, children) |
| 288 | parent_map[element].remove(element) | 288 | parent_map[element].remove(element) |
| 289 | 289 | ||
| 290 | tree.write(full_path, xml_declaration=True) | 290 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 291 | return True | ||
| 292 | |||
| 293 | @staticmethod | ||
| 294 | def __remove_document_comment_meta(full_path: str) -> bool: | ||
| 295 | try: | ||
| 296 | tree, namespace = _parse_xml(full_path) | ||
| 297 | except ET.ParseError as e: # pragma: no cover | ||
| 298 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 299 | return False | ||
| 300 | |||
| 301 | # search the docs to see if we can bail early | ||
| 302 | range_start = tree.find('.//w:commentRangeStart', namespace) | ||
| 303 | range_end = tree.find('.//w:commentRangeEnd', namespace) | ||
| 304 | references = tree.find('.//w:commentReference', namespace) | ||
| 305 | if range_start is None and range_end is None and references is None: | ||
| 306 | return True # No comment meta tags are present | ||
| 307 | |||
| 308 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 309 | |||
| 310 | # iterate over the elements and add them to list | ||
| 311 | elements_del = list() | ||
| 312 | for element in tree.iterfind('.//w:commentRangeStart', namespace): | ||
| 313 | elements_del.append(element) | ||
| 314 | for element in tree.iterfind('.//w:commentRangeEnd', namespace): | ||
| 315 | elements_del.append(element) | ||
| 316 | for element in tree.iterfind('.//w:commentReference', namespace): | ||
| 317 | elements_del.append(element) | ||
| 318 | |||
| 319 | # remove the elements | ||
| 320 | for element in elements_del: | ||
| 321 | parent_map[element].remove(element) | ||
| 322 | |||
| 323 | tree.write(full_path, xml_declaration=True, encoding='utf-8') | ||
| 291 | return True | 324 | return True |
| 292 | 325 | ||
| 293 | @staticmethod | 326 | @staticmethod |
| @@ -353,7 +386,7 @@ class MSOfficeParser(ZipParser): | |||
| 353 | if name in removed_fnames: | 386 | if name in removed_fnames: |
| 354 | root.remove(item) | 387 | root.remove(item) |
| 355 | 388 | ||
| 356 | tree.write(full_path, xml_declaration=True) | 389 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 357 | return True | 390 | return True |
| 358 | 391 | ||
| 359 | def _final_checks(self) -> bool: | 392 | def _final_checks(self) -> bool: |
| @@ -388,7 +421,7 @@ class MSOfficeParser(ZipParser): | |||
| 388 | 421 | ||
| 389 | for item in tree.iterfind('.//p14:creationId', namespace): | 422 | for item in tree.iterfind('.//p14:creationId', namespace): |
| 390 | item.set('val', '%s' % random.randint(0, 2**32)) | 423 | item.set('val', '%s' % random.randint(0, 2**32)) |
| 391 | tree.write(full_path, xml_declaration=True) | 424 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 392 | return True | 425 | return True |
| 393 | 426 | ||
| 394 | @staticmethod | 427 | @staticmethod |
| @@ -404,7 +437,7 @@ class MSOfficeParser(ZipParser): | |||
| 404 | 437 | ||
| 405 | for item in tree.iterfind('.//p:sldMasterId', namespace): | 438 | for item in tree.iterfind('.//p:sldMasterId', namespace): |
| 406 | item.set('id', '%s' % random.randint(0, 2**32)) | 439 | item.set('id', '%s' % random.randint(0, 2**32)) |
| 407 | tree.write(full_path, xml_declaration=True) | 440 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 408 | return True | 441 | return True |
| 409 | 442 | ||
| 410 | def _specific_cleanup(self, full_path: str) -> bool: | 443 | def _specific_cleanup(self, full_path: str) -> bool: |
| @@ -550,7 +583,7 @@ class LibreOfficeParser(ZipParser): | |||
| 550 | for changes in text.iterfind('.//text:tracked-changes', namespace): | 583 | for changes in text.iterfind('.//text:tracked-changes', namespace): |
| 551 | text.remove(changes) | 584 | text.remove(changes) |
| 552 | 585 | ||
| 553 | tree.write(full_path, xml_declaration=True) | 586 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 554 | return True | 587 | return True |
| 555 | 588 | ||
| 556 | def _specific_cleanup(self, full_path: str) -> bool: | 589 | def _specific_cleanup(self, full_path: str) -> bool: |
