diff options
| -rw-r--r-- | libmat2/archive.py | 4 | ||||
| -rw-r--r-- | libmat2/office.py | 98 | ||||
| -rw-r--r-- | tests/data/broken_xml_content_types.docx | bin | 0 -> 4145 bytes | |||
| -rw-r--r-- | tests/data/malformed_content_types.docx | bin | 4131 -> 4135 bytes | |||
| -rw-r--r-- | tests/data/no_content_types.docx | bin | 0 -> 3651 bytes | |||
| -rw-r--r-- | tests/test_corrupted_files.py | 16 |
6 files changed, 90 insertions, 28 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py index d812531..b29d690 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py | |||
| @@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 17 | """ Office files (.docx, .odt, …) are zipped files. """ | 17 | """ Office files (.docx, .odt, …) are zipped files. """ |
| 18 | # Those are the files that have a format that _isn't_ | 18 | # Those are the files that have a format that _isn't_ |
| 19 | # supported by MAT2, but that we want to keep anyway. | 19 | # supported by MAT2, but that we want to keep anyway. |
| 20 | files_to_keep = set() # type: Set[str] | 20 | files_to_keep = set() # type: Set[Pattern] |
| 21 | 21 | ||
| 22 | # Those are the files that we _do not_ want to keep, | 22 | # Those are the files that we _do not_ want to keep, |
| 23 | # no matter if they are supported or not. | 23 | # no matter if they are supported or not. |
| @@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 89 | abort = True | 89 | abort = True |
| 90 | continue | 90 | continue |
| 91 | 91 | ||
| 92 | if item.filename in self.files_to_keep: | 92 | if any(map(lambda r: r.search(item.filename), self.files_to_keep)): |
| 93 | # those files aren't supported, but we want to add them anyway | 93 | # those files aren't supported, but we want to add them anyway |
| 94 | pass | 94 | pass |
| 95 | elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): | 95 | elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): |
diff --git a/libmat2/office.py b/libmat2/office.py index 91bf2a6..3abf108 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -50,25 +50,75 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 50 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', | 50 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
| 51 | 'application/vnd.openxmlformats-officedocument.presentationml.presentation' | 51 | 'application/vnd.openxmlformats-officedocument.presentationml.presentation' |
| 52 | } | 52 | } |
| 53 | files_to_keep = { | 53 | content_types_to_keep = { |
| 54 | '[Content_Types].xml', | 54 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml |
| 55 | '_rels/.rels', | 55 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml |
| 56 | 'word/_rels/document.xml.rels', | 56 | 'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml |
| 57 | 'word/document.xml', | 57 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml |
| 58 | 'word/fontTable.xml', | 58 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', # /word/fontTable.xml |
| 59 | 'word/settings.xml', | 59 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml |
| 60 | 'word/styles.xml', | 60 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml |
| 61 | 'docProps/app.xml', | 61 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml |
| 62 | 'docProps/core.xml', | 62 | 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml |
| 63 | 63 | ||
| 64 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | 64 | # Do we want to keep the following ones? |
| 65 | 'word/stylesWithEffects.xml', | 65 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', |
| 66 | |||
| 67 | # See https://0xacab.org/jvoisin/mat2/issues/71 | ||
| 68 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml | ||
| 66 | } | 69 | } |
| 70 | files_to_keep = set(map(re.compile, { # type: ignore | ||
| 71 | r'^\[Content_Types\]\.xml$', | ||
| 72 | r'^_rels/\.rels$', | ||
| 73 | r'^word/_rels/document\.xml\.rels$', | ||
| 74 | r'^word/_rels/footer[0-9]*\.xml\.rels$', | ||
| 75 | r'^word/_rels/header[0-9]*\.xml\.rels$', | ||
| 76 | |||
| 77 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | ||
| 78 | r'^word/stylesWithEffects\.xml$', | ||
| 79 | })) | ||
| 67 | files_to_omit = set(map(re.compile, { # type: ignore | 80 | files_to_omit = set(map(re.compile, { # type: ignore |
| 68 | 'word/webSettings.xml', | 81 | r'^customXml/', |
| 69 | 'word/theme', | 82 | r'webSettings\.xml$', |
| 83 | r'^docProps/custom\.xml$', | ||
| 84 | r'^word/printerSettings/', | ||
| 85 | r'^word/theme', | ||
| 86 | |||
| 87 | # we have a whitelist in self.files_to_keep, | ||
| 88 | # so we can trash everything else | ||
| 89 | r'^word/_rels/', | ||
| 70 | })) | 90 | })) |
| 71 | 91 | ||
| 92 | def __init__(self, filename): | ||
| 93 | super().__init__(filename) | ||
| 94 | if self.__fill_files_to_keep_via_content_types() is False: | ||
| 95 | raise ValueError | ||
| 96 | |||
| 97 | def __fill_files_to_keep_via_content_types(self) -> bool: | ||
| 98 | """ There is a super-handy `[Content_Types].xml` file | ||
| 99 | in MS Office archives, describing what each other file contains. | ||
| 100 | The self.content_types_to_keep member contains a type whitelist, | ||
| 101 | so we're using it to fill the self.files_to_keep one. | ||
| 102 | """ | ||
| 103 | with zipfile.ZipFile(self.filename) as zin: | ||
| 104 | if '[Content_Types].xml' not in zin.namelist(): | ||
| 105 | return False | ||
| 106 | xml_data = zin.read('[Content_Types].xml') | ||
| 107 | |||
| 108 | self.content_types = dict() # type: Dict[str, str] | ||
| 109 | try: | ||
| 110 | tree = ET.fromstring(xml_data) | ||
| 111 | except ET.ParseError: | ||
| 112 | return False | ||
| 113 | for c in tree: | ||
| 114 | if 'PartName' not in c.attrib or 'ContentType' not in c.attrib: | ||
| 115 | continue | ||
| 116 | elif c.attrib['ContentType'] in self.content_types_to_keep: | ||
| 117 | fname = c.attrib['PartName'][1:] # remove leading `/` | ||
| 118 | re_fname = re.compile('^' + re.escape(fname) + '$') | ||
| 119 | self.files_to_keep.add(re_fname) # type: ignore | ||
| 120 | return True | ||
| 121 | |||
| 72 | @staticmethod | 122 | @staticmethod |
| 73 | def __remove_rsid(full_path: str) -> bool: | 123 | def __remove_rsid(full_path: str) -> bool: |
| 74 | """ The method will remove "revision session ID". We're '}rsid' | 124 | """ The method will remove "revision session ID". We're '}rsid' |
| @@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 270 | 'application/vnd.oasis.opendocument.formula', | 320 | 'application/vnd.oasis.opendocument.formula', |
| 271 | 'application/vnd.oasis.opendocument.image', | 321 | 'application/vnd.oasis.opendocument.image', |
| 272 | } | 322 | } |
| 273 | files_to_keep = { | 323 | files_to_keep = set(map(re.compile, { # type: ignore |
| 274 | 'META-INF/manifest.xml', | 324 | r'^META-INF/manifest\.xml$', |
| 275 | 'content.xml', | 325 | r'^content\.xml$', |
| 276 | 'manifest.rdf', | 326 | r'^manifest\.rdf$', |
| 277 | 'mimetype', | 327 | r'^mimetype$', |
| 278 | 'settings.xml', | 328 | r'^settings\.xml$', |
| 279 | 'styles.xml', | 329 | r'^styles\.xml$', |
| 280 | } | 330 | })) |
| 281 | files_to_omit = set(map(re.compile, { # type: ignore | 331 | files_to_omit = set(map(re.compile, { # type: ignore |
| 282 | r'^meta\.xml$', | 332 | r'^meta\.xml$', |
| 283 | '^Configurations2/', | 333 | r'^Configurations2/', |
| 284 | '^Thumbnails/', | 334 | r'^Thumbnails/', |
| 285 | })) | 335 | })) |
| 286 | 336 | ||
| 287 | @staticmethod | 337 | @staticmethod |
diff --git a/tests/data/broken_xml_content_types.docx b/tests/data/broken_xml_content_types.docx new file mode 100644 index 0000000..41e0e49 --- /dev/null +++ b/tests/data/broken_xml_content_types.docx | |||
| Binary files differ | |||
diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx index 43ac743..cc5caf3 100644 --- a/tests/data/malformed_content_types.docx +++ b/tests/data/malformed_content_types.docx | |||
| Binary files differ | |||
diff --git a/tests/data/no_content_types.docx b/tests/data/no_content_types.docx new file mode 100644 index 0000000..d0e0330 --- /dev/null +++ b/tests/data/no_content_types.docx | |||
| Binary files differ | |||
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 4ac2678..8d7c252 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase): | |||
| 86 | os.remove('./tests/data/clean.py') | 86 | os.remove('./tests/data/clean.py') |
| 87 | 87 | ||
| 88 | 88 | ||
| 89 | class TestCorruptedContentTypesOffice(unittest.TestCase): | 89 | class TestWrongContentTypesFileOffice(unittest.TestCase): |
| 90 | def test_office(self): | 90 | def test_office_incomplete(self): |
| 91 | shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') | 91 | shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') |
| 92 | p = office.MSOfficeParser('./tests/data/clean.docx') | 92 | p = office.MSOfficeParser('./tests/data/clean.docx') |
| 93 | self.assertIsNotNone(p) | 93 | self.assertIsNotNone(p) |
| 94 | self.assertFalse(p.remove_all()) | 94 | self.assertFalse(p.remove_all()) |
| 95 | os.remove('./tests/data/clean.docx') | 95 | os.remove('./tests/data/clean.docx') |
| 96 | 96 | ||
| 97 | def test_office_broken(self): | ||
| 98 | shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx') | ||
| 99 | with self.assertRaises(ValueError): | ||
| 100 | office.MSOfficeParser('./tests/data/clean.docx') | ||
| 101 | os.remove('./tests/data/clean.docx') | ||
| 102 | |||
| 103 | def test_office_absent(self): | ||
| 104 | shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx') | ||
| 105 | with self.assertRaises(ValueError): | ||
| 106 | office.MSOfficeParser('./tests/data/clean.docx') | ||
| 107 | os.remove('./tests/data/clean.docx') | ||
| 108 | |||
| 97 | class TestCorruptedFiles(unittest.TestCase): | 109 | class TestCorruptedFiles(unittest.TestCase): |
| 98 | def test_pdf(self): | 110 | def test_pdf(self): |
| 99 | shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') | 111 | shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') |
