diff options
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 98 |
1 files changed, 74 insertions, 24 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 91bf2a6..3abf108 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -50,25 +50,75 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 50 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', | 50 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
| 51 | 'application/vnd.openxmlformats-officedocument.presentationml.presentation' | 51 | 'application/vnd.openxmlformats-officedocument.presentationml.presentation' |
| 52 | } | 52 | } |
| 53 | files_to_keep = { | 53 | content_types_to_keep = { |
| 54 | '[Content_Types].xml', | 54 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml |
| 55 | '_rels/.rels', | 55 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml |
| 56 | 'word/_rels/document.xml.rels', | 56 | 'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml |
| 57 | 'word/document.xml', | 57 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml |
| 58 | 'word/fontTable.xml', | 58 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', # /word/fontTable.xml |
| 59 | 'word/settings.xml', | 59 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml |
| 60 | 'word/styles.xml', | 60 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml |
| 61 | 'docProps/app.xml', | 61 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml |
| 62 | 'docProps/core.xml', | 62 | 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml |
| 63 | 63 | ||
| 64 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | 64 | # Do we want to keep the following ones? |
| 65 | 'word/stylesWithEffects.xml', | 65 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', |
| 66 | |||
| 67 | # See https://0xacab.org/jvoisin/mat2/issues/71 | ||
| 68 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml | ||
| 66 | } | 69 | } |
| 70 | files_to_keep = set(map(re.compile, { # type: ignore | ||
| 71 | r'^\[Content_Types\]\.xml$', | ||
| 72 | r'^_rels/\.rels$', | ||
| 73 | r'^word/_rels/document\.xml\.rels$', | ||
| 74 | r'^word/_rels/footer[0-9]*\.xml\.rels$', | ||
| 75 | r'^word/_rels/header[0-9]*\.xml\.rels$', | ||
| 76 | |||
| 77 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | ||
| 78 | r'^word/stylesWithEffects\.xml$', | ||
| 79 | })) | ||
| 67 | files_to_omit = set(map(re.compile, { # type: ignore | 80 | files_to_omit = set(map(re.compile, { # type: ignore |
| 68 | 'word/webSettings.xml', | 81 | r'^customXml/', |
| 69 | 'word/theme', | 82 | r'webSettings\.xml$', |
| 83 | r'^docProps/custom\.xml$', | ||
| 84 | r'^word/printerSettings/', | ||
| 85 | r'^word/theme', | ||
| 86 | |||
| 87 | # we have a whitelist in self.files_to_keep, | ||
| 88 | # so we can trash everything else | ||
| 89 | r'^word/_rels/', | ||
| 70 | })) | 90 | })) |
| 71 | 91 | ||
| 92 | def __init__(self, filename): | ||
| 93 | super().__init__(filename) | ||
| 94 | if self.__fill_files_to_keep_via_content_types() is False: | ||
| 95 | raise ValueError | ||
| 96 | |||
| 97 | def __fill_files_to_keep_via_content_types(self) -> bool: | ||
| 98 | """ There is a super-handy `[Content_Types].xml` file | ||
| 99 | in MS Office archives, describing what each other file contains. | ||
| 100 | The self.content_types_to_keep member contains a type whitelist, | ||
| 101 | so we're using it to fill the self.files_to_keep one. | ||
| 102 | """ | ||
| 103 | with zipfile.ZipFile(self.filename) as zin: | ||
| 104 | if '[Content_Types].xml' not in zin.namelist(): | ||
| 105 | return False | ||
| 106 | xml_data = zin.read('[Content_Types].xml') | ||
| 107 | |||
| 108 | self.content_types = dict() # type: Dict[str, str] | ||
| 109 | try: | ||
| 110 | tree = ET.fromstring(xml_data) | ||
| 111 | except ET.ParseError: | ||
| 112 | return False | ||
| 113 | for c in tree: | ||
| 114 | if 'PartName' not in c.attrib or 'ContentType' not in c.attrib: | ||
| 115 | continue | ||
| 116 | elif c.attrib['ContentType'] in self.content_types_to_keep: | ||
| 117 | fname = c.attrib['PartName'][1:] # remove leading `/` | ||
| 118 | re_fname = re.compile('^' + re.escape(fname) + '$') | ||
| 119 | self.files_to_keep.add(re_fname) # type: ignore | ||
| 120 | return True | ||
| 121 | |||
| 72 | @staticmethod | 122 | @staticmethod |
| 73 | def __remove_rsid(full_path: str) -> bool: | 123 | def __remove_rsid(full_path: str) -> bool: |
| 74 | """ The method will remove "revision session ID". We're '}rsid' | 124 | """ The method will remove "revision session ID". We're '}rsid' |
| @@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 270 | 'application/vnd.oasis.opendocument.formula', | 320 | 'application/vnd.oasis.opendocument.formula', |
| 271 | 'application/vnd.oasis.opendocument.image', | 321 | 'application/vnd.oasis.opendocument.image', |
| 272 | } | 322 | } |
| 273 | files_to_keep = { | 323 | files_to_keep = set(map(re.compile, { # type: ignore |
| 274 | 'META-INF/manifest.xml', | 324 | r'^META-INF/manifest\.xml$', |
| 275 | 'content.xml', | 325 | r'^content\.xml$', |
| 276 | 'manifest.rdf', | 326 | r'^manifest\.rdf$', |
| 277 | 'mimetype', | 327 | r'^mimetype$', |
| 278 | 'settings.xml', | 328 | r'^settings\.xml$', |
| 279 | 'styles.xml', | 329 | r'^styles\.xml$', |
| 280 | } | 330 | })) |
| 281 | files_to_omit = set(map(re.compile, { # type: ignore | 331 | files_to_omit = set(map(re.compile, { # type: ignore |
| 282 | r'^meta\.xml$', | 332 | r'^meta\.xml$', |
| 283 | '^Configurations2/', | 333 | r'^Configurations2/', |
| 284 | '^Thumbnails/', | 334 | r'^Thumbnails/', |
| 285 | })) | 335 | })) |
| 286 | 336 | ||
| 287 | @staticmethod | 337 | @staticmethod |
