From f931a0eceed3a89ef7c94a8a7b2bbed208bf0295 Mon Sep 17 00:00:00 2001
From: Alex Marchant
Date: Wed, 3 Apr 2024 15:27:48 -0400
Subject: Make utf-8 explicit in all tree.write calls

---
 tests/test_libmat2.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'tests')

diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 32ae543..0435113 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -873,5 +873,31 @@ class TextDocx(unittest.TestCase):
             # Check if 'word/comments.xml' exists in the zip
             self.assertNotIn('word/comments.xml', zipin.namelist())
 
+        os.remove('./tests/data/comment_clean.docx')
+        os.remove('./tests/data/comment_clean.cleaned.docx')
+
+    def test_xml_is_utf8(self):
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            c = zipin.open('word/document.xml')
+            content = c.read()
+
+            # ensure encoding is utf-8
+            r = b'encoding=(\'|\")UTF-8(\'|\")'
+            match = re.search(r, content, re.IGNORECASE)
+            self.assertIsNotNone(match)
+
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
+            c = zipin.open('word/document.xml')
+            content = c.read()
+
+            # ensure encoding is still utf-8
+            r = b'encoding=(\'|\")UTF-8(\'|\")'
+            match = re.search(r, content, re.IGNORECASE)
+            self.assertIsNotNone(match)
+
         os.remove('./tests/data/comment_clean.docx')
         os.remove('./tests/data/comment_clean.cleaned.docx')
\ No newline at end of file
-- 
cgit v1.3