summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libmat2/archive.py4
-rw-r--r--libmat2/office.py98
-rw-r--r--tests/data/broken_xml_content_types.docxbin0 -> 4145 bytes
-rw-r--r--tests/data/malformed_content_types.docxbin4131 -> 4135 bytes
-rw-r--r--tests/data/no_content_types.docxbin0 -> 3651 bytes
-rw-r--r--tests/test_corrupted_files.py16
6 files changed, 90 insertions, 28 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
index d812531..b29d690 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
17 """ Office files (.docx, .odt, …) are zipped files. """ 17 """ Office files (.docx, .odt, …) are zipped files. """
18 # Those are the files that have a format that _isn't_ 18 # Those are the files that have a format that _isn't_
19 # supported by MAT2, but that we want to keep anyway. 19 # supported by MAT2, but that we want to keep anyway.
20 files_to_keep = set() # type: Set[str] 20 files_to_keep = set() # type: Set[Pattern]
21 21
22 # Those are the files that we _do not_ want to keep, 22 # Those are the files that we _do not_ want to keep,
23 # no matter if they are supported or not. 23 # no matter if they are supported or not.
@@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
89 abort = True 89 abort = True
90 continue 90 continue
91 91
92 if item.filename in self.files_to_keep: 92 if any(map(lambda r: r.search(item.filename), self.files_to_keep)):
93 # those files aren't supported, but we want to add them anyway 93 # those files aren't supported, but we want to add them anyway
94 pass 94 pass
95 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): 95 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
diff --git a/libmat2/office.py b/libmat2/office.py
index 91bf2a6..3abf108 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -50,25 +50,75 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
50 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 50 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
51 'application/vnd.openxmlformats-officedocument.presentationml.presentation' 51 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
52 } 52 }
53 files_to_keep = { 53 content_types_to_keep = {
54 '[Content_Types].xml', 54 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml
55 '_rels/.rels', 55 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml
56 'word/_rels/document.xml.rels', 56 'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml
57 'word/document.xml', 57 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml
58 'word/fontTable.xml', 58 'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', # /word/fontTable.xml
59 'word/settings.xml', 59 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml
60 'word/styles.xml', 60 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml
61 'docProps/app.xml', 61 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml
62 'docProps/core.xml', 62 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml
63 63
64 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx 64 # Do we want to keep the following ones?
65 'word/stylesWithEffects.xml', 65 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
66
67 # See https://0xacab.org/jvoisin/mat2/issues/71
68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
66 } 69 }
70 files_to_keep = set(map(re.compile, { # type: ignore
71 r'^\[Content_Types\]\.xml$',
72 r'^_rels/\.rels$',
73 r'^word/_rels/document\.xml\.rels$',
74 r'^word/_rels/footer[0-9]*\.xml\.rels$',
75 r'^word/_rels/header[0-9]*\.xml\.rels$',
76
77 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
78 r'^word/stylesWithEffects\.xml$',
79 }))
67 files_to_omit = set(map(re.compile, { # type: ignore 80 files_to_omit = set(map(re.compile, { # type: ignore
68 'word/webSettings.xml', 81 r'^customXml/',
69 'word/theme', 82 r'webSettings\.xml$',
83 r'^docProps/custom\.xml$',
84 r'^word/printerSettings/',
85 r'^word/theme',
86
87 # we have a whitelist in self.files_to_keep,
88 # so we can trash everything else
89 r'^word/_rels/',
70 })) 90 }))
71 91
92 def __init__(self, filename):
93 super().__init__(filename)
94 if self.__fill_files_to_keep_via_content_types() is False:
95 raise ValueError
96
97 def __fill_files_to_keep_via_content_types(self) -> bool:
98 """ There is a super-handy `[Content_Types].xml` file
99 in MS Office archives, describing what each other file contains.
100 The self.content_types_to_keep member contains a type whitelist,
101 so we're using it to fill the self.files_to_keep one.
102 """
103 with zipfile.ZipFile(self.filename) as zin:
104 if '[Content_Types].xml' not in zin.namelist():
105 return False
106 xml_data = zin.read('[Content_Types].xml')
107
108 self.content_types = dict() # type: Dict[str, str]
109 try:
110 tree = ET.fromstring(xml_data)
111 except ET.ParseError:
112 return False
113 for c in tree:
114 if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
115 continue
116 elif c.attrib['ContentType'] in self.content_types_to_keep:
117 fname = c.attrib['PartName'][1:] # remove leading `/`
118 re_fname = re.compile('^' + re.escape(fname) + '$')
119 self.files_to_keep.add(re_fname) # type: ignore
120 return True
121
72 @staticmethod 122 @staticmethod
73 def __remove_rsid(full_path: str) -> bool: 123 def __remove_rsid(full_path: str) -> bool:
74 """ The method will remove "revision session ID". We're '}rsid' 124 """ The method will remove "revision session ID". We're '}rsid'
@@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
270 'application/vnd.oasis.opendocument.formula', 320 'application/vnd.oasis.opendocument.formula',
271 'application/vnd.oasis.opendocument.image', 321 'application/vnd.oasis.opendocument.image',
272 } 322 }
273 files_to_keep = { 323 files_to_keep = set(map(re.compile, { # type: ignore
274 'META-INF/manifest.xml', 324 r'^META-INF/manifest\.xml$',
275 'content.xml', 325 r'^content\.xml$',
276 'manifest.rdf', 326 r'^manifest\.rdf$',
277 'mimetype', 327 r'^mimetype$',
278 'settings.xml', 328 r'^settings\.xml$',
279 'styles.xml', 329 r'^styles\.xml$',
280 } 330 }))
281 files_to_omit = set(map(re.compile, { # type: ignore 331 files_to_omit = set(map(re.compile, { # type: ignore
282 r'^meta\.xml$', 332 r'^meta\.xml$',
283 '^Configurations2/', 333 r'^Configurations2/',
284 '^Thumbnails/', 334 r'^Thumbnails/',
285 })) 335 }))
286 336
287 @staticmethod 337 @staticmethod
diff --git a/tests/data/broken_xml_content_types.docx b/tests/data/broken_xml_content_types.docx
new file mode 100644
index 0000000..41e0e49
--- /dev/null
+++ b/tests/data/broken_xml_content_types.docx
Binary files differ
diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx
index 43ac743..cc5caf3 100644
--- a/tests/data/malformed_content_types.docx
+++ b/tests/data/malformed_content_types.docx
Binary files differ
diff --git a/tests/data/no_content_types.docx b/tests/data/no_content_types.docx
new file mode 100644
index 0000000..d0e0330
--- /dev/null
+++ b/tests/data/no_content_types.docx
Binary files differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 4ac2678..8d7c252 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
86 os.remove('./tests/data/clean.py') 86 os.remove('./tests/data/clean.py')
87 87
88 88
89class TestCorruptedContentTypesOffice(unittest.TestCase): 89class TestWrongContentTypesFileOffice(unittest.TestCase):
90 def test_office(self): 90 def test_office_incomplete(self):
91 shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') 91 shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
92 p = office.MSOfficeParser('./tests/data/clean.docx') 92 p = office.MSOfficeParser('./tests/data/clean.docx')
93 self.assertIsNotNone(p) 93 self.assertIsNotNone(p)
94 self.assertFalse(p.remove_all()) 94 self.assertFalse(p.remove_all())
95 os.remove('./tests/data/clean.docx') 95 os.remove('./tests/data/clean.docx')
96 96
97 def test_office_broken(self):
98 shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx')
99 with self.assertRaises(ValueError):
100 office.MSOfficeParser('./tests/data/clean.docx')
101 os.remove('./tests/data/clean.docx')
102
103 def test_office_absent(self):
104 shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx')
105 with self.assertRaises(ValueError):
106 office.MSOfficeParser('./tests/data/clean.docx')
107 os.remove('./tests/data/clean.docx')
108
97class TestCorruptedFiles(unittest.TestCase): 109class TestCorruptedFiles(unittest.TestCase):
98 def test_pdf(self): 110 def test_pdf(self):
99 shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') 111 shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')