diff options
| -rw-r--r-- | libmat2/office.py | 36 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 32 |
2 files changed, 67 insertions, 1 deletion
diff --git a/libmat2/office.py b/libmat2/office.py
index fa79834..3a290d8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
| @@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser): | |||
| 290 | tree.write(full_path, xml_declaration=True, encoding='utf-8') | 290 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 291 | return True | 291 | return True |
| 292 | 292 | ||
| 293 | @staticmethod | ||
| 294 | def __remove_document_comment_meta(full_path: str) -> bool: | ||
| 295 | try: | ||
| 296 | tree, namespace = _parse_xml(full_path) | ||
| 297 | except ET.ParseError as e: # pragma: no cover | ||
| 298 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 299 | return False | ||
| 300 | |||
| 301 | # search the docs to see if we can bail early | ||
| 302 | range_start = tree.find('.//w:commentRangeStart', namespace) | ||
| 303 | range_end = tree.find('.//w:commentRangeEnd', namespace) | ||
| 304 | references = tree.find('.//w:commentReference', namespace) | ||
| 305 | if range_start is None and range_end is None and references is None: | ||
| 306 | return True # No comment meta tags are present | ||
| 307 | |||
| 308 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 309 | |||
| 310 | # iterate over the elements and add them to list | ||
| 311 | elements_del = list() | ||
| 312 | for element in tree.iterfind('.//w:commentRangeStart', namespace): | ||
| 313 | elements_del.append(element) | ||
| 314 | for element in tree.iterfind('.//w:commentRangeEnd', namespace): | ||
| 315 | elements_del.append(element) | ||
| 316 | for element in tree.iterfind('.//w:commentReference', namespace): | ||
| 317 | elements_del.append(element) | ||
| 318 | |||
| 319 | # remove the elements | ||
| 320 | for element in elements_del: | ||
| 321 | parent_map[element].remove(element) | ||
| 322 | |||
| 323 | tree.write(full_path, xml_declaration=True, encoding='utf-8') | ||
| 324 | return True | ||
| 325 | |||
| 293 | def __remove_content_type_members(self, full_path: str) -> bool: | 326 | def __remove_content_type_members(self, full_path: str) -> bool: |
| 294 | """ The method will remove the dangling references | 327 | """ The method will remove the dangling references |
| 295 | form the [Content_Types].xml file, since MS office doesn't like them | 328 | form the [Content_Types].xml file, since MS office doesn't like them |
| @@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser): | |||
| 396 | # this file contains the revisions | 429 | # this file contains the revisions |
| 397 | if self.__remove_revisions(full_path) is False: | 430 | if self.__remove_revisions(full_path) is False: |
| 398 | return False # pragma: no cover | 431 | return False # pragma: no cover |
| 432 | # remove comment references and ranges | ||
| 433 | if self.__remove_document_comment_meta(full_path) is False: | ||
| 434 | return False # pragma: no cover | ||
| 399 | elif full_path.endswith('/docProps/app.xml'): | 435 | elif full_path.endswith('/docProps/app.xml'): |
| 400 | # This file must be present and valid, | 436 | # This file must be present and valid, |
| 401 | # so we're removing as much as we can. | 437 | # so we're removing as much as we can. |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 0435113..491f396 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
| @@ -900,4 +900,34 @@ class TextDocx(unittest.TestCase): | |||
| 900 | self.assertIsNotNone(match) | 900 | self.assertIsNotNone(match) |
| 901 | 901 | ||
| 902 | os.remove('./tests/data/comment_clean.docx') | 902 | os.remove('./tests/data/comment_clean.docx') |
| 903 | os.remove('./tests/data/comment_clean.cleaned.docx') \ No newline at end of file | 903 | os.remove('./tests/data/comment_clean.cleaned.docx') |
| 904 | |||
| 905 | def test_comment_references_are_removed(self): | ||
| 906 | with zipfile.ZipFile('./tests/data/comment.docx') as zipin: | ||
| 907 | c = zipin.open('word/document.xml') | ||
| 908 | content = c.read() | ||
| 909 | |||
| 910 | r = b'w:commentRangeStart' | ||
| 911 | self.assertIn(r, content) | ||
| 912 | r = b'w:commentRangeEnd' | ||
| 913 | self.assertIn(r, content) | ||
| 914 | r = b'w:commentReference' | ||
| 915 | self.assertIn(r, content) | ||
| 916 | |||
| 917 | shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx') | ||
| 918 | p = office.MSOfficeParser('./tests/data/comment_clean.docx') | ||
| 919 | self.assertTrue(p.remove_all()) | ||
| 920 | |||
| 921 | with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin: | ||
| 922 | c = zipin.open('word/document.xml') | ||
| 923 | content = c.read() | ||
| 924 | |||
| 925 | r = b'w:commentRangeStart' | ||
| 926 | self.assertNotIn(r, content) | ||
| 927 | r = b'w:commentRangeEnd' | ||
| 928 | self.assertNotIn(r, content) | ||
| 929 | r = b'w:commentReference' | ||
| 930 | self.assertNotIn(r, content) | ||
| 931 | |||
| 932 | os.remove('./tests/data/comment_clean.docx') | ||
| 933 | os.remove('./tests/data/comment_clean.cleaned.docx') | ||
