From f931a0eceed3a89ef7c94a8a7b2bbed208bf0295 Mon Sep 17 00:00:00 2001
From: Alex Marchant
Date: Wed, 3 Apr 2024 15:27:48 -0400
Subject: Make utf-8 explicit in all tree.write calls

---
 tests/test_libmat2.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'tests')

diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 32ae543..0435113 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -873,5 +873,31 @@ class TextDocx(unittest.TestCase):
             # Check if 'word/comments.xml' exists in the zip
             self.assertNotIn('word/comments.xml', zipin.namelist())
 
+        os.remove('./tests/data/comment_clean.docx')
+        os.remove('./tests/data/comment_clean.cleaned.docx')
+
+    def test_xml_is_utf8(self):
+        """word/document.xml must declare UTF-8 before and after cleaning."""
+        # Matches the encoding attribute with either quote style; the
+        # searches below are case-insensitive, so 'utf-8' is accepted too.
+        r = b'encoding=(\'|\")UTF-8(\'|\")'
+
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            # read() returns the bytes directly, so no member handle stays open
+            content = zipin.read('word/document.xml')
+
+        # ensure encoding is utf-8
+        self.assertIsNotNone(re.search(r, content, re.IGNORECASE))
+
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
+            content = zipin.read('word/document.xml')
+
+        # ensure encoding is still utf-8
+        self.assertIsNotNone(re.search(r, content, re.IGNORECASE))
+
         os.remove('./tests/data/comment_clean.docx')
         os.remove('./tests/data/comment_clean.cleaned.docx')
\ No newline at end of file
-- 
cgit v1.3