diff options
| author | jvoisin | 2018-10-01 12:25:37 -0700 |
|---|---|---|
| committer | jvoisin | 2018-10-01 12:25:37 -0700 |
| commit | 652b8e519fbd11da051f40ecde5b814e1e8fc013 (patch) | |
| tree | d510e6543d61806facf99a70174c98145823f323 /libmat2/office.py | |
| parent | c14be47f95e3a0fe6ecddfc236329bc3e76f5eec (diff) | |
Files processed via MAT2 are now accepted without warnings by MS Office
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 63 |
1 files changed, 50 insertions, 13 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index a8a2c94..4348d9b 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -36,9 +36,8 @@ def _sort_xml_attributes(full_path: str) -> bool: | |||
| 36 | since they are all using different orders. | 36 | since they are all using different orders. |
| 37 | """ | 37 | """ |
| 38 | tree = ET.parse(full_path) | 38 | tree = ET.parse(full_path) |
| 39 | root = tree.getroot() | ||
| 40 | 39 | ||
| 41 | for c in root: | 40 | for c in tree.getroot(): |
| 42 | c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) | 41 | c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) |
| 43 | 42 | ||
| 44 | tree.write(full_path, xml_declaration=True) | 43 | tree.write(full_path, xml_declaration=True) |
| @@ -59,6 +58,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 59 | 'word/fontTable.xml', | 58 | 'word/fontTable.xml', |
| 60 | 'word/settings.xml', | 59 | 'word/settings.xml', |
| 61 | 'word/styles.xml', | 60 | 'word/styles.xml', |
| 61 | 'docProps/app.xml', | ||
| 62 | 'docProps/core.xml', | ||
| 62 | 63 | ||
| 63 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | 64 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx |
| 64 | 'word/stylesWithEffects.xml', | 65 | 'word/stylesWithEffects.xml', |
| @@ -66,7 +67,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 66 | files_to_omit = set(map(re.compile, { # type: ignore | 67 | files_to_omit = set(map(re.compile, { # type: ignore |
| 67 | 'word/webSettings.xml', | 68 | 'word/webSettings.xml', |
| 68 | 'word/theme', | 69 | 'word/theme', |
| 69 | '^docProps/', | ||
| 70 | })) | 70 | })) |
| 71 | 71 | ||
| 72 | @staticmethod | 72 | @staticmethod |
| @@ -95,7 +95,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 95 | 95 | ||
| 96 | elements_to_remove = list() | 96 | elements_to_remove = list() |
| 97 | for item in tree.iterfind('.//', namespace): | 97 | for item in tree.iterfind('.//', namespace): |
| 98 | if '}rsid' in item.tag.strip().lower(): # resi as tag | 98 | if '}rsid' in item.tag.strip().lower(): # rsid as tag |
| 99 | elements_to_remove.append(item) | 99 | elements_to_remove.append(item) |
| 100 | continue | 100 | continue |
| 101 | for key in list(item.attrib.keys()): # rsid as attribute | 101 | for key in list(item.attrib.keys()): # rsid as attribute |
| @@ -106,7 +106,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 106 | parent_map[element].remove(element) | 106 | parent_map[element].remove(element) |
| 107 | 107 | ||
| 108 | tree.write(full_path, xml_declaration=True) | 108 | tree.write(full_path, xml_declaration=True) |
| 109 | |||
| 110 | return True | 109 | return True |
| 111 | 110 | ||
| 112 | @staticmethod | 111 | @staticmethod |
| @@ -148,7 +147,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 148 | parent_map[element].remove(element) | 147 | parent_map[element].remove(element) |
| 149 | 148 | ||
| 150 | tree.write(full_path, xml_declaration=True) | 149 | tree.write(full_path, xml_declaration=True) |
| 151 | |||
| 152 | return True | 150 | return True |
| 153 | 151 | ||
| 154 | def __remove_content_type_members(self, full_path: str) -> bool: | 152 | def __remove_content_type_members(self, full_path: str) -> bool: |
| @@ -176,27 +174,67 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 176 | root.remove(item) | 174 | root.remove(item) |
| 177 | 175 | ||
| 178 | tree.write(full_path, xml_declaration=True) | 176 | tree.write(full_path, xml_declaration=True) |
| 179 | |||
| 180 | return True | 177 | return True |
| 181 | 178 | ||
| 182 | def _specific_cleanup(self, full_path: str) -> bool: | 179 | def _specific_cleanup(self, full_path: str) -> bool: |
| 180 | # pylint: disable=too-many-return-statements | ||
| 183 | if os.stat(full_path).st_size == 0: # Don't process empty files | 181 | if os.stat(full_path).st_size == 0: # Don't process empty files |
| 184 | return True | 182 | return True |
| 185 | 183 | ||
| 184 | if not full_path.endswith('.xml'): | ||
| 185 | return True | ||
| 186 | |||
| 186 | if full_path.endswith('/[Content_Types].xml'): | 187 | if full_path.endswith('/[Content_Types].xml'): |
| 187 | # this file contains references to files that we might | 188 | # this file contains references to files that we might |
| 188 | # remove, and MS Office doesn't like dangling references | 189 | # remove, and MS Office doesn't like dangling references |
| 189 | if self.__remove_content_type_members(full_path) is False: | 190 | if self.__remove_content_type_members(full_path) is False: |
| 190 | return False | 191 | return False |
| 191 | 192 | elif full_path.endswith('/word/document.xml'): | |
| 192 | if full_path.endswith('/word/document.xml'): | ||
| 193 | # this file contains the revisions | 193 | # this file contains the revisions |
| 194 | if self.__remove_revisions(full_path) is False: | 194 | if self.__remove_revisions(full_path) is False: |
| 195 | return False | 195 | return False |
| 196 | elif full_path.endswith('/docProps/app.xml'): | ||
| 197 | # This file must be present and valid, | ||
| 198 | # so we're removing as much as we can. | ||
| 199 | with open(full_path, 'wb') as f: | ||
| 200 | f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>') | ||
| 201 | f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">') | ||
| 202 | f.write(b'</Properties>') | ||
| 203 | elif full_path.endswith('/docProps/core.xml'): | ||
| 204 | # This file must be present and valid, | ||
| 205 | # so we're removing as much as we can. | ||
| 206 | with open(full_path, 'wb') as f: | ||
| 207 | f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>') | ||
| 208 | f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">') | ||
| 209 | f.write(b'</cp:coreProperties>') | ||
| 196 | 210 | ||
| 197 | if full_path.endswith('.xml'): | 211 | |
| 198 | if self.__remove_rsid(full_path) is False: | 212 | if self.__remove_rsid(full_path) is False: |
| 199 | return False | 213 | return False |
| 214 | |||
| 215 | try: | ||
| 216 | _sort_xml_attributes(full_path) | ||
| 217 | except ET.ParseError as e: # pragma: no cover | ||
| 218 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 219 | return False | ||
| 220 | |||
| 221 | # This is awful, I'm sorry. | ||
| 222 | # | ||
| 223 | # Microsoft Office isn't happy when the `mc:Ignorable` attribute | ||
| 224 | # lists namespaces that aren't present in the xml file, | ||
| 225 | # so instead of trying to remove this specific attribute with etree, | ||
| 226 | # we're removing it with a regexp. | ||
| 227 | # | ||
| 228 | # Since we're the ones producing this file, via the call to | ||
| 229 | # _sort_xml_attributes, there won't be any "funny tricks". | ||
| 230 | # Worst case, the tag isn't present, and everything is fine. | ||
| 231 | # | ||
| 232 | # see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute | ||
| 233 | with open(full_path, 'rb') as f: | ||
| 234 | text = f.read() | ||
| 235 | out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1) | ||
| 236 | with open(full_path, 'wb') as f: | ||
| 237 | f.write(out) | ||
| 200 | 238 | ||
| 201 | return True | 239 | return True |
| 202 | 240 | ||
| @@ -262,7 +300,6 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 262 | text.remove(changes) | 300 | text.remove(changes) |
| 263 | 301 | ||
| 264 | tree.write(full_path, xml_declaration=True) | 302 | tree.write(full_path, xml_declaration=True) |
| 265 | |||
| 266 | return True | 303 | return True |
| 267 | 304 | ||
| 268 | def _specific_cleanup(self, full_path: str) -> bool: | 305 | def _specific_cleanup(self, full_path: str) -> bool: |
