summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- libmat2/office.py | 79
-rw-r--r-- tests/data/revision.docx | bin 0 -> 4701 bytes
-rw-r--r-- tests/test_libmat2.py | 21
3 files changed, 85 insertions, 15 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 5381eb9..acd8ca2 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -14,6 +14,24 @@ from . import abstract, parser_factory
14assert Set 14assert Set
15assert Pattern 15assert Pattern
16 16
def _parse_xml(full_path: str):
    """ Parse an XML file, with namespace support.

    etree's support for namespaces is a bit rough: the prefixes declared
    in the document have to be collected by hand, and are then registered
    globally via `ET.register_namespace`.

    :param full_path: path of the XML file to parse.
    :return: a `(tree, ns)` tuple, where `tree` is the parsed
        `ElementTree`, and `ns` maps every declared prefix to its URI.
    """
    ns = dict()
    # Only `start-ns` events are requested, so there is no need
    # to check the kind of event that we're getting.
    for _, (prefix, uri) in ET.iterparse(full_path, ("start-ns", )):
        ns[prefix] = uri

    # Register the namespaces
    for prefix, uri in ns.items():
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Prefixes like `ns0` are reserved by etree, and can't be
            # registered. They're still kept in the returned map, so lookups
            # using them keep working; only the global registration is
            # skipped for them.
            pass

    return ET.parse(full_path), ns
33
34
17class ArchiveBasedAbstractParser(abstract.AbstractParser): 35class ArchiveBasedAbstractParser(abstract.AbstractParser):
18 # Those are the files that have a format that _isn't_ 36 # Those are the files that have a format that _isn't_
19 # supported by MAT2, but that we want to keep anyway. 37 # supported by MAT2, but that we want to keep anyway.
@@ -72,7 +90,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
72 zin.extract(member=item, path=temp_folder) 90 zin.extract(member=item, path=temp_folder)
73 full_path = os.path.join(temp_folder, item.filename) 91 full_path = os.path.join(temp_folder, item.filename)
74 92
75 self._specific_cleanup(full_path) 93 if self._specific_cleanup(full_path) is False:
94 shutil.rmtree(temp_folder)
95 os.remove(self.output_filename)
96 print("Something went wrong during deep cleaning of %s" % item.filename)
97 return False
76 98
77 if item.filename in self.files_to_keep: 99 if item.filename in self.files_to_keep:
78 # those files aren't supported, but we want to add them anyway 100 # those files aren't supported, but we want to add them anyway
@@ -118,6 +140,45 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
118 '^docProps/', 140 '^docProps/',
119 })) 141 }))
120 142
def __remove_revisions(self, full_path: str) -> bool:
    """ Strip the tracked changes from an XML document, in place.

    Deleted content (`w:del`) is dropped entirely, while inserted
    content (`w:ins`) is kept: only the wrapper element is removed, and
    its children take its place inside its parent.

    The elements to process are always collected before modifying the
    tree, since we don't want to change the tree we're iterating on.

    :param full_path: path of the XML file to clean.
    :return: always `True`.
    """
    tree, ns = _parse_xml(full_path)

    # Looking for a prefix that isn't in the map makes etree raise.
    if 'w' not in ns:
        return True

    deletions = tree.findall('.//w:del', ns)
    if not deletions and tree.find('.//w:ins', ns) is None:
        return True  # No revisions are present

    parent_map = {c: p for p in tree.iter() for c in p}

    for element in deletions:
        parent_map[element].remove(element)

    # Insertions that were located inside a deletion are gone by now,
    # this is why they're only collected at this point.
    insertions = tree.findall('.//w:ins', ns)

    # `findall` yields elements in document order, so walking the list
    # backward handles nested insertions before their ancestors. This way,
    # the parent recorded in `parent_map` is still the actual one for every
    # wrapper that is yet to be processed.
    for element in reversed(insertions):
        parent = parent_map[element]
        # Index of the wrapper amongst its siblings, so that its children
        # end up exactly where it used to be.
        position = list(parent).index(element)
        parent.remove(element)
        for offset, child in enumerate(list(element)):
            parent.insert(position + offset, child)

    tree.write(full_path, xml_declaration=True)

    return True
176
def _specific_cleanup(self, full_path: str) -> bool:
    """ Apply the format-specific cleaning to a single extracted file.

    Only `word/document.xml` gets processed, to remove its revisions;
    every other file is left untouched.

    :param full_path: path of the extracted file.
    :return: `True` for files that don't need any processing, the result
        of the revisions removal otherwise.
    """
    if not full_path.endswith('/word/document.xml'):
        return True
    return self.__remove_revisions(full_path)
181
121 def get_meta(self) -> Dict[str, str]: 182 def get_meta(self) -> Dict[str, str]:
122 """ 183 """
123 Yes, I know that parsing xml with regexp ain't pretty, 184 Yes, I know that parsing xml with regexp ain't pretty,
@@ -168,27 +229,16 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
168 229
169 230
def __remove_revisions(self, full_path: str) -> bool:
    """ Remove the `text:tracked-changes` elements found under the
    `office:text` ones of an XML document, in place.

    :param full_path: path of the XML file to clean.
    :return: always `True`.
    """
    tree, ns = _parse_xml(full_path)

    # Without those namespaces, there are no revisions in the current file.
    # Moreover, looking for a prefix that isn't in the map makes etree raise.
    if 'office' not in ns or 'text' not in ns:
        return True

    root = tree.getroot()
    parent_map = {c: p for p in root.iter() for c in p}

    # The matches are gathered with `findall` instead of `iterfind`,
    # since we don't want to change the tree we're iterating on.
    for text in root.findall('.//office:text', ns):
        for changes in text.findall('.//text:tracked-changes', ns):
            # `.//` also matches indirect descendants, while `remove` only
            # accepts direct children: go through the actual parent.
            parent_map[changes].remove(changes)

    tree.write(full_path, xml_declaration=True)

    return True
194 244
@@ -219,4 +269,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
219 metadata[key] = value 269 metadata[key] = value
220 zipin.close() 270 zipin.close()
221 return metadata 271 return metadata
222
diff --git a/tests/data/revision.docx b/tests/data/revision.docx
new file mode 100644
index 0000000..8a2d814
--- /dev/null
+++ b/tests/data/revision.docx
Binary files differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 1573790..4df6385 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -121,6 +121,7 @@ class TestRemovingThumbnails(unittest.TestCase):
121 zipin.close() 121 zipin.close()
122 122
123 os.remove('./tests/data/clean.cleaned.odt') 123 os.remove('./tests/data/clean.cleaned.odt')
124 os.remove('./tests/data/clean.odt')
124 125
125 126
126class TestRevisionsCleaning(unittest.TestCase): 127class TestRevisionsCleaning(unittest.TestCase):
@@ -142,6 +143,26 @@ class TestRevisionsCleaning(unittest.TestCase):
142 os.remove('./tests/data/clean.odt') 143 os.remove('./tests/data/clean.odt')
143 os.remove('./tests/data/clean.cleaned.odt') 144 os.remove('./tests/data/clean.cleaned.odt')
144 145
def test_msoffice(self):
    """ Check that the tracked insertion present in `revision.docx`
    is gone from the cleaned copy of the document. """
    revision = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">'

    # `ZipFile.read` is used instead of `ZipFile.open`, to avoid
    # leaving a file-like object that is never closed behind.
    with zipfile.ZipFile('./tests/data/revision.docx') as zipin:
        self.assertIn(revision, zipin.read('word/document.xml'))

    # Work on a copy, to keep the reference file untouched.
    shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx')
    p = office.MSOfficeParser('./tests/data/revision_clean.docx')
    self.assertTrue(p.remove_all())

    with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin:
        self.assertNotIn(revision, zipin.read('word/document.xml'))

    os.remove('./tests/data/revision_clean.docx')
    os.remove('./tests/data/revision_clean.cleaned.docx')
165
145 166
146class TestDeepCleaning(unittest.TestCase): 167class TestDeepCleaning(unittest.TestCase):
147 def __check_deep_meta(self, p): 168 def __check_deep_meta(self, p):