diff options
| author | jvoisin | 2024-04-05 18:33:30 +0200 |
|---|---|---|
| committer | jvoisin | 2024-04-05 18:33:30 +0200 |
| commit | 09672a2dccb2fea0035278c7014f319b85e89c31 (patch) | |
| tree | 2f530cf359d3c99807c5ac6c03fc52b2b93445d6 | |
| parent | 61f39c4bd0b51be6371fb2973c14054a2772352e (diff) | |
| parent | f2c898c92d0422ddc76fa977d60f7345b06a5ad6 (diff) | |
Merge branch 'alexmarchant-utf-8-encode-all'
| -rw-r--r-- | libmat2/office.py | 49 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 28 |
2 files changed, 68 insertions, 9 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 66f462b..f182277 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
| @@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool: | |||
| 38 | for c in tree.getroot(): | 38 | for c in tree.getroot(): |
| 39 | c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) | 39 | c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) |
| 40 | 40 | ||
| 41 | tree.write(full_path, xml_declaration=True) | 41 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 42 | return True | 42 | return True |
| 43 | 43 | ||
| 44 | 44 | ||
| @@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser): | |||
| 220 | for element in elements_to_remove: | 220 | for element in elements_to_remove: |
| 221 | parent_map[element].remove(element) | 221 | parent_map[element].remove(element) |
| 222 | 222 | ||
| 223 | tree.write(full_path, xml_declaration=True) | 223 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 224 | return True | 224 | return True |
| 225 | 225 | ||
| 226 | @staticmethod | 226 | @staticmethod |
| @@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser): | |||
| 250 | for element in elements_to_remove: | 250 | for element in elements_to_remove: |
| 251 | parent_map[element].remove(element) | 251 | parent_map[element].remove(element) |
| 252 | 252 | ||
| 253 | tree.write(full_path, xml_declaration=True) | 253 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 254 | return True | 254 | return True |
| 255 | 255 | ||
| 256 | @staticmethod | 256 | @staticmethod |
| @@ -287,7 +287,40 @@ class MSOfficeParser(ZipParser): | |||
| 287 | parent_map[element].insert(position, children) | 287 | parent_map[element].insert(position, children) |
| 288 | parent_map[element].remove(element) | 288 | parent_map[element].remove(element) |
| 289 | 289 | ||
| 290 | tree.write(full_path, xml_declaration=True) | 290 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 291 | return True | ||
| 292 | |||
| 293 | @staticmethod | ||
| 294 | def __remove_document_comment_meta(full_path: str) -> bool: | ||
| 295 | try: | ||
| 296 | tree, namespace = _parse_xml(full_path) | ||
| 297 | except ET.ParseError as e: # pragma: no cover | ||
| 298 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 299 | return False | ||
| 300 | |||
| 301 | # search the docs to see if we can bail early | ||
| 302 | range_start = tree.find('.//w:commentRangeStart', namespace) | ||
| 303 | range_end = tree.find('.//w:commentRangeEnd', namespace) | ||
| 304 | references = tree.find('.//w:commentReference', namespace) | ||
| 305 | if range_start is None and range_end is None and references is None: | ||
| 306 | return True # No comment meta tags are present | ||
| 307 | |||
| 308 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 309 | |||
| 310 | # iterate over the elements and add them to list | ||
| 311 | elements_del = list() | ||
| 312 | for element in tree.iterfind('.//w:commentRangeStart', namespace): | ||
| 313 | elements_del.append(element) | ||
| 314 | for element in tree.iterfind('.//w:commentRangeEnd', namespace): | ||
| 315 | elements_del.append(element) | ||
| 316 | for element in tree.iterfind('.//w:commentReference', namespace): | ||
| 317 | elements_del.append(element) | ||
| 318 | |||
| 319 | # remove the elements | ||
| 320 | for element in elements_del: | ||
| 321 | parent_map[element].remove(element) | ||
| 322 | |||
| 323 | tree.write(full_path, xml_declaration=True, encoding='utf-8') | ||
| 291 | return True | 324 | return True |
| 292 | 325 | ||
| 293 | @staticmethod | 326 | @staticmethod |
| @@ -353,7 +386,7 @@ class MSOfficeParser(ZipParser): | |||
| 353 | if name in removed_fnames: | 386 | if name in removed_fnames: |
| 354 | root.remove(item) | 387 | root.remove(item) |
| 355 | 388 | ||
| 356 | tree.write(full_path, xml_declaration=True) | 389 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 357 | return True | 390 | return True |
| 358 | 391 | ||
| 359 | def _final_checks(self) -> bool: | 392 | def _final_checks(self) -> bool: |
| @@ -388,7 +421,7 @@ class MSOfficeParser(ZipParser): | |||
| 388 | 421 | ||
| 389 | for item in tree.iterfind('.//p14:creationId', namespace): | 422 | for item in tree.iterfind('.//p14:creationId', namespace): |
| 390 | item.set('val', '%s' % random.randint(0, 2**32)) | 423 | item.set('val', '%s' % random.randint(0, 2**32)) |
| 391 | tree.write(full_path, xml_declaration=True) | 424 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 392 | return True | 425 | return True |
| 393 | 426 | ||
| 394 | @staticmethod | 427 | @staticmethod |
| @@ -404,7 +437,7 @@ class MSOfficeParser(ZipParser): | |||
| 404 | 437 | ||
| 405 | for item in tree.iterfind('.//p:sldMasterId', namespace): | 438 | for item in tree.iterfind('.//p:sldMasterId', namespace): |
| 406 | item.set('id', '%s' % random.randint(0, 2**32)) | 439 | item.set('id', '%s' % random.randint(0, 2**32)) |
| 407 | tree.write(full_path, xml_declaration=True) | 440 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 408 | return True | 441 | return True |
| 409 | 442 | ||
| 410 | def _specific_cleanup(self, full_path: str) -> bool: | 443 | def _specific_cleanup(self, full_path: str) -> bool: |
| @@ -550,7 +583,7 @@ class LibreOfficeParser(ZipParser): | |||
| 550 | for changes in text.iterfind('.//text:tracked-changes', namespace): | 583 | for changes in text.iterfind('.//text:tracked-changes', namespace): |
| 551 | text.remove(changes) | 584 | text.remove(changes) |
| 552 | 585 | ||
| 553 | tree.write(full_path, xml_declaration=True) | 586 | tree.write(full_path, xml_declaration=True, encoding='utf-8') |
| 554 | return True | 587 | return True |
| 555 | 588 | ||
| 556 | def _specific_cleanup(self, full_path: str) -> bool: | 589 | def _specific_cleanup(self, full_path: str) -> bool: |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index d199f54..491f396 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
| @@ -876,6 +876,32 @@ class TextDocx(unittest.TestCase): | |||
| 876 | os.remove('./tests/data/comment_clean.docx') | 876 | os.remove('./tests/data/comment_clean.docx') |
| 877 | os.remove('./tests/data/comment_clean.cleaned.docx') | 877 | os.remove('./tests/data/comment_clean.cleaned.docx') |
| 878 | 878 | ||
| 879 | def test_xml_is_utf8(self): | ||
| 880 | with zipfile.ZipFile('./tests/data/comment.docx') as zipin: | ||
| 881 | c = zipin.open('word/document.xml') | ||
| 882 | content = c.read() | ||
| 883 | |||
| 884 | # ensure encoding is utf-8 | ||
| 885 | r = b'encoding=(\'|\")UTF-8(\'|\")' | ||
| 886 | match = re.search(r, content, re.IGNORECASE) | ||
| 887 | self.assertIsNotNone(match) | ||
| 888 | |||
| 889 | shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx') | ||
| 890 | p = office.MSOfficeParser('./tests/data/comment_clean.docx') | ||
| 891 | self.assertTrue(p.remove_all()) | ||
| 892 | |||
| 893 | with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin: | ||
| 894 | c = zipin.open('word/document.xml') | ||
| 895 | content = c.read() | ||
| 896 | |||
| 897 | # ensure encoding is still utf-8 | ||
| 898 | r = b'encoding=(\'|\")UTF-8(\'|\")' | ||
| 899 | match = re.search(r, content, re.IGNORECASE) | ||
| 900 | self.assertIsNotNone(match) | ||
| 901 | |||
| 902 | os.remove('./tests/data/comment_clean.docx') | ||
| 903 | os.remove('./tests/data/comment_clean.cleaned.docx') | ||
| 904 | |||
| 879 | def test_comment_references_are_removed(self): | 905 | def test_comment_references_are_removed(self): |
| 880 | with zipfile.ZipFile('./tests/data/comment.docx') as zipin: | 906 | with zipfile.ZipFile('./tests/data/comment.docx') as zipin: |
| 881 | c = zipin.open('word/document.xml') | 907 | c = zipin.open('word/document.xml') |
| @@ -904,4 +930,4 @@ class TextDocx(unittest.TestCase): | |||
| 904 | self.assertNotIn(r, content) | 930 | self.assertNotIn(r, content) |
| 905 | 931 | ||
| 906 | os.remove('./tests/data/comment_clean.docx') | 932 | os.remove('./tests/data/comment_clean.docx') |
| 907 | os.remove('./tests/data/comment_clean.cleaned.docx') \ No newline at end of file | 933 | os.remove('./tests/data/comment_clean.cleaned.docx') |
