diff options
| -rw-r--r-- | libmat2/office.py | 38 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 21 |
2 files changed, 58 insertions, 1 deletion
diff --git a/libmat2/office.py b/libmat2/office.py index f182277..fc83a20 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -323,6 +323,38 @@ class MSOfficeParser(ZipParser): | |||
| 323 | tree.write(full_path, xml_declaration=True, encoding='utf-8') | 323 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 324 | return True | 324 | return True |
| 325 | 325 | ||
| 326 | def __remove_document_xml_rels_members(self, full_path: str) -> bool: | ||
| 327 | """ Remove the dangling references from the word/_rels/document.xml.rels file, since MS office doesn't like them. | ||
| 328 | """ | ||
| 329 | try: | ||
| 330 | tree, namespace = _parse_xml(full_path) | ||
| 331 | except ET.ParseError as e: # pragma: no cover | ||
| 332 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 333 | return False | ||
| 334 | |||
| 335 | if len(namespace.items()) != 1: # pragma: no cover | ||
| 336 | logging.debug("Got several namespaces for Types: %s", namespace.items()) | ||
| 337 | |||
| 338 | removed_fnames = set() | ||
| 339 | with zipfile.ZipFile(self.filename) as zin: | ||
| 340 | for fname in [item.filename for item in zin.infolist()]: | ||
| 341 | for file_to_omit in self.files_to_omit: | ||
| 342 | if file_to_omit.search(fname): | ||
| 343 | matches = map(lambda r: r.search(fname), self.files_to_keep) | ||
| 344 | if any(matches): # the file is in the allowlist | ||
| 345 | continue | ||
| 346 | removed_fnames.add(fname) | ||
| 347 | break | ||
| 348 | |||
| 349 | root = tree.getroot() | ||
| 350 | for item in root.findall('{%s}Relationship' % namespace['']): | ||
| 351 | name = 'word/' + item.attrib['Target'] # add the word/ prefix to the path, since all document rels are in the word/ directory | ||
| 352 | if name in removed_fnames: | ||
| 353 | root.remove(item) | ||
| 354 | |||
| 355 | tree.write(full_path, xml_declaration=True, encoding='utf-8') | ||
| 356 | return True | ||
| 357 | |||
| 326 | @staticmethod | 358 | @staticmethod |
| 327 | def __remove_document_comment_meta(full_path: str) -> bool: | 359 | def __remove_document_comment_meta(full_path: str) -> bool: |
| 328 | try: | 360 | try: |
| @@ -445,7 +477,7 @@ class MSOfficeParser(ZipParser): | |||
| 445 | if os.stat(full_path).st_size == 0: # Don't process empty files | 477 | if os.stat(full_path).st_size == 0: # Don't process empty files |
| 446 | return True | 478 | return True |
| 447 | 479 | ||
| 448 | if not full_path.endswith('.xml'): | 480 | if not full_path.endswith(('.xml', '.xml.rels')): |
| 449 | return True | 481 | return True |
| 450 | 482 | ||
| 451 | if self.__randomize_creationId(full_path) is False: | 483 | if self.__randomize_creationId(full_path) is False: |
| @@ -465,6 +497,10 @@ class MSOfficeParser(ZipParser): | |||
| 465 | # remove comment references and ranges | 497 | # remove comment references and ranges |
| 466 | if self.__remove_document_comment_meta(full_path) is False: | 498 | if self.__remove_document_comment_meta(full_path) is False: |
| 467 | return False # pragma: no cover | 499 | return False # pragma: no cover |
| 500 | elif full_path.endswith('/word/_rels/document.xml.rels'): | ||
| 501 | # similar to the above, but for the document.xml.rels file | ||
| 502 | if self.__remove_document_xml_rels_members(full_path) is False: # pragma: no cover | ||
| 503 | return False | ||
| 468 | elif full_path.endswith('/docProps/app.xml'): | 504 | elif full_path.endswith('/docProps/app.xml'): |
| 469 | # This file must be present and valid, | 505 | # This file must be present and valid, |
| 470 | # so we're removing as much as we can. | 506 | # so we're removing as much as we can. |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 491f396..7855062 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -931,3 +931,24 @@ class TextDocx(unittest.TestCase): | |||
| 931 | 931 | ||
| 932 | os.remove('./tests/data/comment_clean.docx') | 932 | os.remove('./tests/data/comment_clean.docx') |
| 933 | os.remove('./tests/data/comment_clean.cleaned.docx') | 933 | os.remove('./tests/data/comment_clean.cleaned.docx') |
| 934 | |||
| 935 | def test_clean_document_xml_rels(self): | ||
| 936 | with zipfile.ZipFile('./tests/data/comment.docx') as zipin: | ||
| 937 | c = zipin.open('word/_rels/document.xml.rels') | ||
| 938 | content = c.read() | ||
| 939 | r = b'Target="comments.xml"' | ||
| 940 | self.assertIn(r, content) | ||
| 941 | |||
| 942 | shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx') | ||
| 943 | p = office.MSOfficeParser('./tests/data/comment_clean.docx') | ||
| 944 | self.assertTrue(p.remove_all()) | ||
| 945 | |||
| 946 | with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin: | ||
| 947 | c = zipin.open('word/_rels/document.xml.rels') | ||
| 948 | content = c.read() | ||
| 949 | r = b'Target="comments.xml"' | ||
| 950 | self.assertNotIn(r, content) | ||
| 951 | |||
| 952 | os.remove('./tests/data/comment_clean.docx') | ||
| 953 | os.remove('./tests/data/comment_clean.cleaned.docx') | ||
| 954 | |||
