diff options
| author | jvoisin | 2018-09-09 18:57:08 +0200 |
|---|---|---|
| committer | jvoisin | 2018-09-24 17:45:09 +0200 |
| commit | fbcf68c280643bce8f6451cc84db2910755df5a8 (patch) | |
| tree | dd74a4af93d396eaa7f7f7d7fbb2186896ea39af /libmat2 | |
| parent | 9826de3526ed3b955911bd7cefb3a9a8e9114f37 (diff) | |
Lexicographical sort on xml attributes for office files
In XML, the order of the attributes shouldn't be meaningful;
however, MS Office sorts attributes for a given XML tag
differently than LibreOffice.
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/office.py | 49 |
1 files changed, 44 insertions, 5 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 50b776e..5c2c996 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | import logging | ||
| 1 | import os | 2 | import os |
| 2 | import re | 3 | import re |
| 3 | import zipfile | 4 | import zipfile |
| @@ -12,16 +13,38 @@ assert Set | |||
| 12 | assert Pattern | 13 | assert Pattern |
| 13 | 14 | ||
| 14 | def _parse_xml(full_path: str): | 15 | def _parse_xml(full_path: str): |
| 15 | """ This function parse XML, with namespace support. """ | 16 | """ This function parses XML, with namespace support. """ |
| 16 | 17 | ||
| 18 | cpt = 0 | ||
| 17 | namespace_map = dict() | 19 | namespace_map = dict() |
| 18 | for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): | 20 | for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): |
| 21 | # The ns[0-9]+ namespaces are reserved for internal usage, so | ||
| 22 | # we have to use another nomenclature. | ||
| 23 | if re.match('^ns[0-9]+$', key): | ||
| 24 | key = 'mat%d' % cpt | ||
| 25 | cpt += 1 | ||
| 26 | |||
| 19 | namespace_map[key] = value | 27 | namespace_map[key] = value |
| 20 | ET.register_namespace(key, value) | 28 | ET.register_namespace(key, value) |
| 21 | 29 | ||
| 22 | return ET.parse(full_path), namespace_map | 30 | return ET.parse(full_path), namespace_map |
| 23 | 31 | ||
| 24 | 32 | ||
| 33 | def _sort_xml_attributes(full_path: str) -> bool: | ||
| 34 | """ Sort xml attributes lexicographically, | ||
| 35 | because it's possible to fingerprint producers (MS Office, Libreoffice, …) | ||
| 36 | since they are all using different orders. | ||
| 37 | """ | ||
| 38 | tree = ET.parse(full_path) | ||
| 39 | root = tree.getroot() | ||
| 40 | |||
| 41 | for c in root: | ||
| 42 | c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) | ||
| 43 | |||
| 44 | tree.write(full_path, xml_declaration=True) | ||
| 45 | return True | ||
| 46 | |||
| 47 | |||
| 25 | class MSOfficeParser(ArchiveBasedAbstractParser): | 48 | class MSOfficeParser(ArchiveBasedAbstractParser): |
| 26 | mimetypes = { | 49 | mimetypes = { |
| 27 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', | 50 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
| @@ -49,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 49 | """ | 72 | """ |
| 50 | try: | 73 | try: |
| 51 | tree, namespace = _parse_xml(full_path) | 74 | tree, namespace = _parse_xml(full_path) |
| 52 | except ET.ParseError: | 75 | except ET.ParseError as e: |
| 76 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 53 | return False | 77 | return False |
| 54 | 78 | ||
| 55 | # Revisions are either deletions (`w:del`) or | 79 | # Revisions are either deletions (`w:del`) or |
| @@ -83,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 83 | return True | 107 | return True |
| 84 | 108 | ||
| 85 | def _specific_cleanup(self, full_path: str) -> bool: | 109 | def _specific_cleanup(self, full_path: str) -> bool: |
| 110 | if os.stat(full_path).st_size == 0: # Don't process empty files | ||
| 111 | return True | ||
| 112 | |||
| 86 | if full_path.endswith('/word/document.xml'): | 113 | if full_path.endswith('/word/document.xml'): |
| 87 | # this file contains the revisions | 114 | # this file contains the revisions |
| 88 | return self.__remove_revisions(full_path) | 115 | return self.__remove_revisions(full_path) |
| @@ -139,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 139 | def __remove_revisions(full_path: str) -> bool: | 166 | def __remove_revisions(full_path: str) -> bool: |
| 140 | try: | 167 | try: |
| 141 | tree, namespace = _parse_xml(full_path) | 168 | tree, namespace = _parse_xml(full_path) |
| 142 | except ET.ParseError: | 169 | except ET.ParseError as e: |
| 170 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 143 | return False | 171 | return False |
| 144 | 172 | ||
| 145 | if 'office' not in namespace.keys(): # no revisions in the current file | 173 | if 'office' not in namespace.keys(): # no revisions in the current file |
| @@ -154,8 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 154 | return True | 182 | return True |
| 155 | 183 | ||
| 156 | def _specific_cleanup(self, full_path: str) -> bool: | 184 | def _specific_cleanup(self, full_path: str) -> bool: |
| 157 | if os.path.basename(full_path) == 'content.xml': | 185 | if os.stat(full_path).st_size == 0: # Don't process empty files |
| 158 | return self.__remove_revisions(full_path) | 186 | return True |
| 187 | |||
| 188 | if os.path.basename(full_path).endswith('.xml'): | ||
| 189 | if os.path.basename(full_path) == 'content.xml': | ||
| 190 | if self.__remove_revisions(full_path) is False: | ||
| 191 | return False | ||
| 192 | |||
| 193 | try: | ||
| 194 | _sort_xml_attributes(full_path) | ||
| 195 | except ET.ParseError as e: | ||
| 196 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 197 | return False | ||
| 159 | return True | 198 | return True |
| 160 | 199 | ||
| 161 | def get_meta(self) -> Dict[str, str]: | 200 | def get_meta(self) -> Dict[str, str]: |
