summary refs log tree commit diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 24
1 files changed, 14 insertions, 10 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index c6c4688..62d0395 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -33,6 +33,7 @@ def _parse_xml(full_path: str):
33 33
34 34
35class ArchiveBasedAbstractParser(abstract.AbstractParser): 35class ArchiveBasedAbstractParser(abstract.AbstractParser):
36 """ Office files (.docx, .odt, …) are zipped files. """
36 # Those are the files that have a format that _isn't_ 37 # Those are the files that have a format that _isn't_
37 # supported by MAT2, but that we want to keep anyway. 38 # supported by MAT2, but that we want to keep anyway.
38 files_to_keep = set() # type: Set[str] 39 files_to_keep = set() # type: Set[str]
@@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
58 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: 59 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
59 zipinfo.create_system = 3 # Linux 60 zipinfo.create_system = 3 # Linux
60 zipinfo.comment = b'' 61 zipinfo.comment = b''
61 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) 62 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
62 return zipinfo 63 return zipinfo
63 64
64 @staticmethod 65 @staticmethod
65 def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: 66 def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
66 metadata = {} 67 metadata = {}
67 if zipinfo.create_system == 3: 68 if zipinfo.create_system == 3: # this is Linux
68 #metadata['create_system'] = 'Linux'
69 pass 69 pass
70 elif zipinfo.create_system == 2: 70 elif zipinfo.create_system == 2:
71 metadata['create_system'] = 'Windows' 71 metadata['create_system'] = 'Windows'
@@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
145 145
146 @staticmethod 146 @staticmethod
147 def __remove_revisions(full_path: str) -> bool: 147 def __remove_revisions(full_path: str) -> bool:
148 """ In this function, we're changing the XML 148 """ In this function, we're changing the XML document in several
149 document in two times, since we don't want 149 different times, since we don't want to change the tree we're currently
150 to change the tree we're iterating on.""" 150 iterating on.
151 """
151 try: 152 try:
152 tree, namespace = _parse_xml(full_path) 153 tree, namespace = _parse_xml(full_path)
153 except ET.ParseError: 154 except ET.ParseError:
154 return False 155 return False
155 156
156 # No revisions are present 157 # Revisions are either deletions (`w:del`) or
158 # insertions (`w:ins`)
157 del_presence = tree.find('.//w:del', namespace) 159 del_presence = tree.find('.//w:del', namespace)
158 ins_presence = tree.find('.//w:ins', namespace) 160 ins_presence = tree.find('.//w:ins', namespace)
159 if del_presence is None and ins_presence is None: 161 if del_presence is None and ins_presence is None:
160 return True 162 return True # No revisions are present
161 163
162 parent_map = {c:p for p in tree.iter() for c in p} 164 parent_map = {c:p for p in tree.iter() for c in p}
163 165
164 elements = list([element for element in tree.iterfind('.//w:del', namespace)]) 166 elements = list()
167 for element in tree.iterfind('.//w:del', namespace):
168 elements.append(element)
165 for element in elements: 169 for element in elements:
166 parent_map[element].remove(element) 170 parent_map[element].remove(element)
167 171
@@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
172 for children in element.iterfind('./*'): 176 for children in element.iterfind('./*'):
173 elements.append((element, position, children)) 177 elements.append((element, position, children))
174 break 178 break
175
176 for (element, position, children) in elements: 179 for (element, position, children) in elements:
177 parent_map[element].insert(position, children) 180 parent_map[element].insert(position, children)
178 parent_map[element].remove(element) 181 parent_map[element].remove(element)
@@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
183 186
184 def _specific_cleanup(self, full_path: str) -> bool: 187 def _specific_cleanup(self, full_path: str) -> bool:
185 if full_path.endswith('/word/document.xml'): 188 if full_path.endswith('/word/document.xml'):
189 # this file contains the revisions
186 return self.__remove_revisions(full_path) 190 return self.__remove_revisions(full_path)
187 return True 191 return True
188 192