diff options
| -rw-r--r-- | libmat2/office.py | 50 |
1 file changed, 44 insertions, 6 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index b769991..c9bed7a 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
| @@ -62,9 +62,6 @@ class MSOfficeParser(ZipParser): | |||
| 62 | 62 | ||
| 63 | # Do we want to keep the following ones? | 63 | # Do we want to keep the following ones? |
| 64 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', | 64 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', |
| 65 | |||
| 66 | # See https://0xacab.org/jvoisin/mat2/issues/71 | ||
| 67 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml | ||
| 68 | } | 65 | } |
| 69 | 66 | ||
| 70 | 67 | ||
| @@ -88,6 +85,7 @@ class MSOfficeParser(ZipParser): | |||
| 88 | r'^word/printerSettings/', | 85 | r'^word/printerSettings/', |
| 89 | r'^word/theme', | 86 | r'^word/theme', |
| 90 | r'^word/people\.xml$', | 87 | r'^word/people\.xml$', |
| 88 | r'^word/numbering\.xml$', | ||
| 91 | 89 | ||
| 92 | # we have an allowlist in self.files_to_keep, | 90 | # we have an allowlist in self.files_to_keep, |
| 93 | # so we can trash everything else | 91 | # so we can trash everything else |
| @@ -124,7 +122,7 @@ class MSOfficeParser(ZipParser): | |||
| 124 | 122 | ||
| 125 | @staticmethod | 123 | @staticmethod |
| 126 | def __remove_rsid(full_path: str) -> bool: | 124 | def __remove_rsid(full_path: str) -> bool: |
| 127 | """ The method will remove "revision session ID". We're '}rsid' | 125 | """ The method will remove "revision session ID". We're using '}rsid' |
| 128 | instead of proper parsing, since rsid can have multiple forms, like | 126 | instead of proper parsing, since rsid can have multiple forms, like |
| 129 | `rsidRDefault`, `rsidR`, `rsids`, … | 127 | `rsidRDefault`, `rsidR`, `rsids`, … |
| 130 | 128 | ||
| @@ -137,7 +135,8 @@ class MSOfficeParser(ZipParser): | |||
| 137 | """ | 135 | """ |
| 138 | try: | 136 | try: |
| 139 | tree, namespace = _parse_xml(full_path) | 137 | tree, namespace = _parse_xml(full_path) |
| 140 | except ET.ParseError: | 138 | except ET.ParseError as e: |
| 139 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 141 | return False | 140 | return False |
| 142 | 141 | ||
| 143 | # rsid, tags or attributes, are always under the `w` namespace | 142 | # rsid, tags or attributes, are always under the `w` namespace |
| @@ -162,6 +161,41 @@ class MSOfficeParser(ZipParser): | |||
| 162 | return True | 161 | return True |
| 163 | 162 | ||
| 164 | @staticmethod | 163 | @staticmethod |
| 164 | def __remove_nsid(full_path: str) -> bool: | ||
| 165 | """ | ||
| 166 | NSID are random identifiers that can be used | ||
| 167 | to ease the merging of some components of a document. | ||
| 168 | They can also be used for fingerprinting. | ||
| 169 | |||
| 170 | See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1 | ||
| 171 | |||
| 172 | In this function, we're changing the XML document in several | ||
| 173 | different times, since we don't want to change the tree we're currently | ||
| 174 | iterating on. | ||
| 175 | """ | ||
| 176 | try: | ||
| 177 | tree, namespace = _parse_xml(full_path) | ||
| 178 | except ET.ParseError as e: # pragma: no cover | ||
| 179 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 180 | return False | ||
| 181 | |||
| 182 | # The NSID tag is always under the `w` namespace | ||
| 183 | if 'w' not in namespace.keys(): | ||
| 184 | return True | ||
| 185 | |||
| 186 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 187 | |||
| 188 | elements_to_remove = list() | ||
| 189 | for element in tree.iterfind('.//w:nsid', namespace): | ||
| 190 | elements_to_remove.append(element) | ||
| 191 | for element in elements_to_remove: | ||
| 192 | parent_map[element].remove(element) | ||
| 193 | |||
| 194 | tree.write(full_path, xml_declaration=True) | ||
| 195 | return True | ||
| 196 | |||
| 197 | |||
| 198 | @staticmethod | ||
| 165 | def __remove_revisions(full_path: str) -> bool: | 199 | def __remove_revisions(full_path: str) -> bool: |
| 166 | """ In this function, we're changing the XML document in several | 200 | """ In this function, we're changing the XML document in several |
| 167 | different times, since we don't want to change the tree we're currently | 201 | different times, since we don't want to change the tree we're currently |
| @@ -208,7 +242,8 @@ class MSOfficeParser(ZipParser): | |||
| 208 | """ | 242 | """ |
| 209 | try: | 243 | try: |
| 210 | tree, namespace = _parse_xml(full_path) | 244 | tree, namespace = _parse_xml(full_path) |
| 211 | except ET.ParseError: # pragma: no cover | 245 | except ET.ParseError as e: # pragma: no cover |
| 246 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 212 | return False | 247 | return False |
| 213 | 248 | ||
| 214 | if len(namespace.items()) != 1: | 249 | if len(namespace.items()) != 1: |
| @@ -269,6 +304,9 @@ class MSOfficeParser(ZipParser): | |||
| 269 | if self.__remove_rsid(full_path) is False: | 304 | if self.__remove_rsid(full_path) is False: |
| 270 | return False | 305 | return False |
| 271 | 306 | ||
| 307 | if self.__remove_nsid(full_path) is False: | ||
| 308 | return False # pragma: no cover | ||
| 309 | |||
| 272 | try: | 310 | try: |
| 273 | _sort_xml_attributes(full_path) | 311 | _sort_xml_attributes(full_path) |
| 274 | except ET.ParseError as e: # pragma: no cover | 312 | except ET.ParseError as e: # pragma: no cover |
