diff options
| author | Alex Marchant | 2024-04-03 15:20:00 -0400 |
|---|---|---|
| committer | Alex Marchant | 2024-04-03 15:20:00 -0400 |
| commit | 61f39c4bd0b51be6371fb2973c14054a2772352e (patch) | |
| tree | 3f5a8c11eb8c37fbc229a213a6e89ade3b5272b0 /libmat2 | |
| parent | 1b9ce34e2c3da718e79137e2c2210ccdcd299486 (diff) | |
Strip comment references from document.xml
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/office.py | 36 |
1 files changed, 36 insertions, 0 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 6f69e4a..66f462b 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser): | |||
| 290 | tree.write(full_path, xml_declaration=True) | 290 | tree.write(full_path, xml_declaration=True) |
| 291 | return True | 291 | return True |
| 292 | 292 | ||
@staticmethod
def __remove_document_comment_meta(full_path: str) -> bool:
    """ Strip the comment markers out of the XML file at `full_path`.

    Every `w:commentRangeStart`, `w:commentRangeEnd` and
    `w:commentReference` element is detached from its parent, and the
    file is then written back in place. If none of those elements is
    present, the file is left untouched.

    Returns False if the file can't be parsed, True otherwise.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    # Gather every marker up front, grouped by tag in this order, so that
    # the tree isn't modified while it is still being searched.
    comment_tags = (
        'w:commentRangeStart',
        'w:commentRangeEnd',
        'w:commentReference',
    )
    doomed = [
        node
        for tag in comment_tags
        for node in tree.iterfind('.//' + tag, namespace)
    ]
    if not doomed:
        return True  # No comment meta tags are present

    # Elements don't know their parent, so build a child -> parent lookup
    # to be able to detach each marker.
    parents = {child: parent for parent in tree.iter() for child in parent}
    for node in doomed:
        parents[node].remove(node)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
| 325 | |||
| 293 | def __remove_content_type_members(self, full_path: str) -> bool: | 326 | def __remove_content_type_members(self, full_path: str) -> bool: |
| 294 | """ The method will remove the dangling references | 327 | """ The method will remove the dangling references |
| 295 | form the [Content_Types].xml file, since MS office doesn't like them | 328 | form the [Content_Types].xml file, since MS office doesn't like them |
| @@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser): | |||
| 396 | # this file contains the revisions | 429 | # this file contains the revisions |
| 397 | if self.__remove_revisions(full_path) is False: | 430 | if self.__remove_revisions(full_path) is False: |
| 398 | return False # pragma: no cover | 431 | return False # pragma: no cover |
| 432 | # remove comment references and ranges | ||
| 433 | if self.__remove_document_comment_meta(full_path) is False: | ||
| 434 | return False # pragma: no cover | ||
| 399 | elif full_path.endswith('/docProps/app.xml'): | 435 | elif full_path.endswith('/docProps/app.xml'): |
| 400 | # This file must be present and valid, | 436 | # This file must be present and valid, |
| 401 | # so we're removing as much as we can. | 437 | # so we're removing as much as we can. |
