Merge branch 'alexmarchant-utf-8-encode-all'

author: jvoisin 2024-04-05 18:33:30 +0200
committer: jvoisin 2024-04-05 18:33:30 +0200
commit: 09672a2dccb2fea0035278c7014f319b85e89c31 (patch)
tree: 2f530cf359d3c99807c5ac6c03fc52b2b93445d6
parent: 61f39c4bd0b51be6371fb2973c14054a2772352e (diff)
parent: f2c898c92d0422ddc76fa977d60f7345b06a5ad6 (diff)
2 files changed, 68 insertions, 9 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 66f462b..f182277 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
    for c in tree.getroot():
        c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
-    tree.write(full_path, xml_declaration=True)
+    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
@@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser):
        for element in elements_to_remove:
            parent_map[element].remove(element)
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
    @staticmethod
@@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser):
        for element in elements_to_remove:
            parent_map[element].remove(element)
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
    @staticmethod
@@ -287,7 +287,40 @@ class MSOfficeParser(ZipParser):
            parent_map[element].insert(position, children)
            parent_map[element].remove(element)
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
+        return True
+    @staticmethod
+    def __remove_document_comment_meta(full_path: str) -> bool:  # strip w:commentRangeStart / w:commentRangeEnd / w:commentReference elements from the XML file at full_path; True on success, False if it cannot be parsed
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False  # unparsable XML: report failure without writing anything back
+        # probe for at least one comment tag of each kind so we can bail out before rewriting the file
+        range_start = tree.find('.//w:commentRangeStart', namespace)
+        range_end = tree.find('.//w:commentRangeEnd', namespace)
+        references = tree.find('.//w:commentReference', namespace)
+        if range_start is None and range_end is None and references is None:
+            return True  # no comment meta tags are present, so the file is left as it is
+        parent_map = {c:p for p in tree.iter() for c in p}  # child -> parent lookup, needed because removal goes through the parent element
+        # collect every matching element first; the actual removal happens in a separate pass below
+        elements_del = list()
+        for element in tree.iterfind('.//w:commentRangeStart', namespace):
+            elements_del.append(element)
+        for element in tree.iterfind('.//w:commentRangeEnd', namespace):
+            elements_del.append(element)
+        for element in tree.iterfind('.//w:commentReference', namespace):
+            elements_del.append(element)
+        # detach each collected element from its parent
+        for element in elements_del:
+            parent_map[element].remove(element)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')  # overwrite the file in place, with an XML declaration and UTF-8 encoding
        return True
    @staticmethod
@@ -353,7 +386,7 @@ class MSOfficeParser(ZipParser):
            if name in removed_fnames:
                root.remove(item)
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
    def _final_checks(self) -> bool:
@@ -388,7 +421,7 @@ class MSOfficeParser(ZipParser):
        for item in tree.iterfind('.//p14:creationId', namespace):
            item.set('val', '%s' % random.randint(0, 2**32))
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
    @staticmethod
@@ -404,7 +437,7 @@ class MSOfficeParser(ZipParser):
        for item in tree.iterfind('.//p:sldMasterId', namespace):
            item.set('id', '%s' % random.randint(0, 2**32))
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
    def _specific_cleanup(self, full_path: str) -> bool:
@@ -550,7 +583,7 @@ class LibreOfficeParser(ZipParser):
            for changes in text.iterfind('.//text:tracked-changes', namespace):
                text.remove(changes)
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
    def _specific_cleanup(self, full_path: str) -> bool:
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index d199f54..491f396 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -876,6 +876,32 @@ class TextDocx(unittest.TestCase):
        os.remove('./tests/data/comment_clean.docx')
        os.remove('./tests/data/comment_clean.cleaned.docx')
+    def test_xml_is_utf8(self):  # word/document.xml must declare UTF-8 both in the fixture and in the cleaned output
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            c = zipin.open('word/document.xml')  # NOTE(review): this handle is never explicitly closed
+            content = c.read()
+            # precondition: the fixture's XML declaration names UTF-8 (either quote style, case-insensitive)
+            r = b'encoding=(\'|\")UTF-8(\'|\")'
+            match = re.search(r, content, re.IGNORECASE)
+            self.assertIsNotNone(match)
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')  # run the parser on a copy rather than on the fixture itself
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:  # presumably the output written by remove_all() -- confirm against the parser
+            c = zipin.open('word/document.xml')  # NOTE(review): this handle is never explicitly closed
+            content = c.read()
+            # the declared encoding must still be UTF-8 after cleaning
+            r = b'encoding=(\'|\")UTF-8(\'|\")'
+            match = re.search(r, content, re.IGNORECASE)
+            self.assertIsNotNone(match)
+        os.remove('./tests/data/comment_clean.docx')  # delete the working copy and the cleaned output used by this test
+        os.remove('./tests/data/comment_clean.cleaned.docx')
    def test_comment_references_are_removed(self):
        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
            c = zipin.open('word/document.xml')
@@ -904,4 +930,4 @@ class TextDocx(unittest.TestCase):
            self.assertNotIn(r, content)
        os.remove('./tests/data/comment_clean.docx')
-        os.remove('./tests/data/comment_clean.cleaned.docx')
-\ No newline at end of file
+        os.remove('./tests/data/comment_clean.cleaned.docx')
author	jvoisin	2024-04-05 18:33:30 +0200
committer	jvoisin	2024-04-05 18:33:30 +0200
commit	09672a2dccb2fea0035278c7014f319b85e89c31 (patch)
tree	2f530cf359d3c99807c5ac6c03fc52b2b93445d6
parent	61f39c4bd0b51be6371fb2973c14054a2772352e (diff)
parent	f2c898c92d0422ddc76fa977d60f7345b06a5ad6 (diff)