diff options
| author | jvoisin | 2018-09-30 19:52:35 +0200 |
|---|---|---|
| committer | jvoisin | 2018-09-30 19:53:18 +0200 |
| commit | e342671eadd3f5ff922fe62cae81792d4cd65e83 (patch) | |
| tree | cd85ad31860cdb3d4a670bd0f1c3dd48ff5129c6 /libmat2 | |
| parent | 212d9c472cd7017cab31449ff75461bebd7a40d4 (diff) | |
Remove dangling references in MS Office's [Content_Types].xml
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/office.py | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index bad352b..b220092 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 151 | 151 | ||
| 152 | return True | 152 | return True |
| 153 | 153 | ||
| 154 | def __remove_content_type_members(self, full_path: str) -> bool: | ||
| 155 | """ The method will remove the dangling references | ||
| 156 | from the [Content_Types].xml file, since MS Office doesn't like them | ||
| 157 | """ | ||
| 158 | try: | ||
| 159 | tree, namespace = _parse_xml(full_path) | ||
| 160 | except ET.ParseError: # pragma: no cover | ||
| 161 | return False | ||
| 162 | |||
| 163 | if len(namespace.items()) != 1: | ||
| 164 | return False # there should be only one namespace for Types | ||
| 165 | |||
| 166 | removed_fnames = set() | ||
| 167 | with zipfile.ZipFile(self.filename) as zin: | ||
| 168 | for fname in [item.filename for item in zin.infolist()]: | ||
| 169 | if any(map(lambda r: r.search(fname), self.files_to_omit)): | ||
| 170 | removed_fnames.add(fname) | ||
| 171 | |||
| 172 | root = tree.getroot() | ||
| 173 | for item in root.findall('{%s}Override' % namespace['']): | ||
| 174 | name = item.attrib['PartName'][1:] # remove the leading '/' | ||
| 175 | if name in removed_fnames: | ||
| 176 | root.remove(item) | ||
| 177 | |||
| 178 | tree.write(full_path, xml_declaration=True) | ||
| 179 | |||
| 180 | return True | ||
| 181 | |||
| 154 | def _specific_cleanup(self, full_path: str) -> bool: | 182 | def _specific_cleanup(self, full_path: str) -> bool: |
| 155 | if os.stat(full_path).st_size == 0: # Don't process empty files | 183 | if os.stat(full_path).st_size == 0: # Don't process empty files |
| 156 | return True | 184 | return True |
| 157 | 185 | ||
| 186 | if full_path.endswith('/[Content_Types].xml'): | ||
| 187 | # this file contains references to files that we might | ||
| 188 | # remove, and MS Office doesn't like dangling references | ||
| 189 | if self.__remove_content_type_members(full_path) is False: | ||
| 190 | return False | ||
| 191 | |||
| 158 | if full_path.endswith('/word/document.xml'): | 192 | if full_path.endswith('/word/document.xml'): |
| 159 | # this file contains the revisions | 193 | # this file contains the revisions |
| 160 | if self.__remove_revisions(full_path) is False: | 194 | if self.__remove_revisions(full_path) is False: |
