diff options
| -rw-r--r-- | libmat2/office.py | 79 | ||||
| -rw-r--r-- | tests/data/revision.docx | bin | 0 -> 4701 bytes | |||
| -rw-r--r-- | tests/test_libmat2.py | 21 |
3 files changed, 85 insertions, 15 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 5381eb9..acd8ca2 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -14,6 +14,24 @@ from . import abstract, parser_factory | |||
| 14 | assert Set | 14 | assert Set |
| 15 | assert Pattern | 15 | assert Pattern |
| 16 | 16 | ||
| 17 | def _parse_xml(full_path: str): | ||
| 18 | """ This function parses XML with namespace support. """ | ||
| 19 | def parse_map(f): # etree support for ns is a bit rough | ||
| 20 | ns_map = dict() | ||
| 21 | for event, (k, v) in ET.iterparse(f, ("start-ns", )): | ||
| 22 | if event == "start-ns": | ||
| 23 | ns_map[k] = v | ||
| 24 | return ns_map | ||
| 25 | |||
| 26 | ns = parse_map(full_path) | ||
| 27 | |||
| 28 | # Register the namespaces | ||
| 29 | for k,v in ns.items(): | ||
| 30 | ET.register_namespace(k, v) | ||
| 31 | |||
| 32 | return ET.parse(full_path), ns | ||
| 33 | |||
| 34 | |||
| 17 | class ArchiveBasedAbstractParser(abstract.AbstractParser): | 35 | class ArchiveBasedAbstractParser(abstract.AbstractParser): |
| 18 | # Those are the files that have a format that _isn't_ | 36 | # Those are the files that have a format that _isn't_ |
| 19 | # supported by MAT2, but that we want to keep anyway. | 37 | # supported by MAT2, but that we want to keep anyway. |
| @@ -72,7 +90,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 72 | zin.extract(member=item, path=temp_folder) | 90 | zin.extract(member=item, path=temp_folder) |
| 73 | full_path = os.path.join(temp_folder, item.filename) | 91 | full_path = os.path.join(temp_folder, item.filename) |
| 74 | 92 | ||
| 75 | self._specific_cleanup(full_path) | 93 | if self._specific_cleanup(full_path) is False: |
| 94 | shutil.rmtree(temp_folder) | ||
| 95 | os.remove(self.output_filename) | ||
| 96 | print("Something went wrong during deep cleaning of %s" % item.filename) | ||
| 97 | return False | ||
| 76 | 98 | ||
| 77 | if item.filename in self.files_to_keep: | 99 | if item.filename in self.files_to_keep: |
| 78 | # those files aren't supported, but we want to add them anyway | 100 | # those files aren't supported, but we want to add them anyway |
| @@ -118,6 +140,45 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 118 | '^docProps/', | 140 | '^docProps/', |
| 119 | })) | 141 | })) |
| 120 | 142 | ||
| 143 | def __remove_revisions(self, full_path:str) -> bool: | ||
| 144 | """ In this function, we're changing the XML | ||
| 145 | document in two passes, since we don't want | ||
| 146 | to change the tree we're iterating on.""" | ||
| 147 | tree, ns = _parse_xml(full_path) | ||
| 148 | |||
| 149 | # No revisions are present | ||
| 150 | if tree.find('.//w:del', ns) is None: | ||
| 151 | return True | ||
| 152 | elif tree.find('.//w:ins', ns) is None: | ||
| 153 | return True | ||
| 154 | |||
| 155 | parent_map = {c:p for p in tree.iter( ) for c in p} | ||
| 156 | |||
| 157 | elements = list([element for element in tree.iterfind('.//w:del', ns)]) | ||
| 158 | for element in elements: | ||
| 159 | parent_map[element].remove(element) | ||
| 160 | |||
| 161 | elements = list() | ||
| 162 | for element in tree.iterfind('.//w:ins', ns): | ||
| 163 | for position, item in enumerate(tree.iter()): | ||
| 164 | if item == element: | ||
| 165 | for children in element.iterfind('./*'): | ||
| 166 | elements.append((element, position, children)) | ||
| 167 | break | ||
| 168 | |||
| 169 | for (element, position, children) in elements: | ||
| 170 | parent_map[element].insert(position, children) | ||
| 171 | parent_map[element].remove(element) | ||
| 172 | |||
| 173 | tree.write(full_path, xml_declaration=True) | ||
| 174 | |||
| 175 | return True | ||
| 176 | |||
| 177 | def _specific_cleanup(self, full_path:str) -> bool: | ||
| 178 | if full_path.endswith('/word/document.xml'): | ||
| 179 | return self.__remove_revisions(full_path) | ||
| 180 | return True | ||
| 181 | |||
| 121 | def get_meta(self) -> Dict[str, str]: | 182 | def get_meta(self) -> Dict[str, str]: |
| 122 | """ | 183 | """ |
| 123 | Yes, I know that parsing xml with regexp ain't pretty, | 184 | Yes, I know that parsing xml with regexp ain't pretty, |
| @@ -168,27 +229,16 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 168 | 229 | ||
| 169 | 230 | ||
| 170 | def __remove_revisions(self, full_path:str) -> bool: | 231 | def __remove_revisions(self, full_path:str) -> bool: |
| 171 | def parse_map(f): # etree support for ns is a bit rough | 232 | tree, ns = _parse_xml(full_path) |
| 172 | ns_map = dict() | ||
| 173 | for event, (k, v) in ET.iterparse(f, ("start-ns", )): | ||
| 174 | if event == "start-ns": | ||
| 175 | ns_map[k] = v | ||
| 176 | return ns_map | ||
| 177 | 233 | ||
| 178 | ns = parse_map(full_path) | ||
| 179 | if 'office' not in ns.keys(): # no revisions in the current file | 234 | if 'office' not in ns.keys(): # no revisions in the current file |
| 180 | return True | 235 | return True |
| 181 | 236 | ||
| 182 | # Register the namespaces | ||
| 183 | for k,v in ns.items(): | ||
| 184 | ET.register_namespace(k, v) | ||
| 185 | |||
| 186 | tree = ET.parse(full_path) | ||
| 187 | for text in tree.getroot().iterfind('.//office:text', ns): | 237 | for text in tree.getroot().iterfind('.//office:text', ns): |
| 188 | for changes in text.iterfind('.//text:tracked-changes', ns): | 238 | for changes in text.iterfind('.//text:tracked-changes', ns): |
| 189 | text.remove(changes) | 239 | text.remove(changes) |
| 190 | 240 | ||
| 191 | tree.write(full_path, xml_declaration = True) | 241 | tree.write(full_path, xml_declaration=True) |
| 192 | 242 | ||
| 193 | return True | 243 | return True |
| 194 | 244 | ||
| @@ -219,4 +269,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 219 | metadata[key] = value | 269 | metadata[key] = value |
| 220 | zipin.close() | 270 | zipin.close() |
| 221 | return metadata | 271 | return metadata |
| 222 | |||
diff --git a/tests/data/revision.docx b/tests/data/revision.docx new file mode 100644 index 0000000..8a2d814 --- /dev/null +++ b/tests/data/revision.docx | |||
| Binary files differ | |||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 1573790..4df6385 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -121,6 +121,7 @@ class TestRemovingThumbnails(unittest.TestCase): | |||
| 121 | zipin.close() | 121 | zipin.close() |
| 122 | 122 | ||
| 123 | os.remove('./tests/data/clean.cleaned.odt') | 123 | os.remove('./tests/data/clean.cleaned.odt') |
| 124 | os.remove('./tests/data/clean.odt') | ||
| 124 | 125 | ||
| 125 | 126 | ||
| 126 | class TestRevisionsCleaning(unittest.TestCase): | 127 | class TestRevisionsCleaning(unittest.TestCase): |
| @@ -142,6 +143,26 @@ class TestRevisionsCleaning(unittest.TestCase): | |||
| 142 | os.remove('./tests/data/clean.odt') | 143 | os.remove('./tests/data/clean.odt') |
| 143 | os.remove('./tests/data/clean.cleaned.odt') | 144 | os.remove('./tests/data/clean.cleaned.odt') |
| 144 | 145 | ||
| 146 | def test_msoffice(self): | ||
| 147 | with zipfile.ZipFile('./tests/data/revision.docx') as zipin: | ||
| 148 | c = zipin.open('word/document.xml') | ||
| 149 | content = c.read() | ||
| 150 | r = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">' | ||
| 151 | self.assertIn(r, content) | ||
| 152 | |||
| 153 | shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx') | ||
| 154 | p = office.MSOfficeParser('./tests/data/revision_clean.docx') | ||
| 155 | self.assertTrue(p.remove_all()) | ||
| 156 | |||
| 157 | with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin: | ||
| 158 | c = zipin.open('word/document.xml') | ||
| 159 | content = c.read() | ||
| 160 | r = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">' | ||
| 161 | self.assertNotIn(r, content) | ||
| 162 | |||
| 163 | os.remove('./tests/data/revision_clean.docx') | ||
| 164 | os.remove('./tests/data/revision_clean.cleaned.docx') | ||
| 165 | |||
| 145 | 166 | ||
| 146 | class TestDeepCleaning(unittest.TestCase): | 167 | class TestDeepCleaning(unittest.TestCase): |
| 147 | def __check_deep_meta(self, p): | 168 | def __check_deep_meta(self, p): |
