summaryrefslogtreecommitdiff
path: root/libmat2/office.py
diff options
context:
space:
mode:
authorjvoisin2018-06-27 23:10:53 +0200
committerjvoisin2018-07-01 21:09:20 +0200
commit02f7605ac124ed42bd9f3f156ee40877fe3c6b42 (patch)
treef2dbd634d91c6c808a1269695c2abd465e3d0333 /libmat2/office.py
parent80fc4ffb40ea425e14697082e4b7e6a7cf0b5583 (diff)
MAT2 is now cleaning revisions from odt files!
Diffstat (limited to 'libmat2/office.py')
-rw-r--r--libmat2/office.py76
1 files changed, 62 insertions, 14 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 34ae7a2..5381eb9 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -4,8 +4,10 @@ import shutil
4import tempfile 4import tempfile
5import datetime 5import datetime
6import zipfile 6import zipfile
7import xml.etree.ElementTree as ET
7from typing import Dict, Set, Pattern 8from typing import Dict, Set, Pattern
8 9
10
9from . import abstract, parser_factory 11from . import abstract, parser_factory
10 12
11# Make pyflakes happy 13# Make pyflakes happy
@@ -13,7 +15,12 @@ assert Set
13assert Pattern 15assert Pattern
14 16
15class ArchiveBasedAbstractParser(abstract.AbstractParser): 17class ArchiveBasedAbstractParser(abstract.AbstractParser):
18 # Those are the files that have a format that _isn't_
19 # supported by MAT2, but that we want to keep anyway.
16 files_to_keep = set() # type: Set[str] 20 files_to_keep = set() # type: Set[str]
21
22 # Those are the files that we _do not_ want to keep,
23 # no matter if they are supported or not.
17 files_to_omit = set() # type: Set[Pattern] 24 files_to_omit = set() # type: Set[Pattern]
18 25
19 def __init__(self, filename): 26 def __init__(self, filename):
@@ -23,6 +30,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
23 except zipfile.BadZipFile: 30 except zipfile.BadZipFile:
24 raise ValueError 31 raise ValueError
25 32
def _specific_cleanup(self, full_path: str) -> bool:
    """ Hook called with the path of each file extracted from the
    archive, so that subclasses can apply a specific treatment to it.

    The default implementation doesn't touch the file, and always
    returns True."""
    return True
37
26 def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: 38 def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
27 zipinfo.create_system = 3 # Linux 39 zipinfo.create_system = 3 # Linux
28 zipinfo.comment = b'' 40 zipinfo.comment = b''
@@ -56,26 +68,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
56 for item in zin.infolist(): 68 for item in zin.infolist():
57 if item.filename[-1] == '/': # `is_dir` is added in Python3.6 69 if item.filename[-1] == '/': # `is_dir` is added in Python3.6
58 continue # don't keep empty folders 70 continue # don't keep empty folders
59 elif item.filename in self.files_to_keep:
60 item = self._clean_zipinfo(item)
61 zout.writestr(item, zin.read(item))
62 continue
63 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
64 continue
65 71
66 zin.extract(member=item, path=temp_folder) 72 zin.extract(member=item, path=temp_folder)
67 full_path = os.path.join(temp_folder, item.filename) 73 full_path = os.path.join(temp_folder, item.filename)
68 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore 74
69 if not tmp_parser: 75 self._specific_cleanup(full_path)
70 shutil.rmtree(temp_folder) 76
71 os.remove(self.output_filename) 77 if item.filename in self.files_to_keep:
72 print("%s's format (%s) isn't supported" % (item.filename, mtype)) 78 # those files aren't supported, but we want to add them anyway
73 return False 79 pass
74 tmp_parser.remove_all() 80 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
81 continue
82 else:
83 # supported files that we want to clean then add
84 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
85 if not tmp_parser:
86 shutil.rmtree(temp_folder)
87 os.remove(self.output_filename)
88 print("%s's format (%s) isn't supported" % (item.filename, mtype))
89 return False
90 tmp_parser.remove_all()
91 os.rename(tmp_parser.output_filename, full_path)
75 92
76 zinfo = zipfile.ZipInfo(item.filename) # type: ignore 93 zinfo = zipfile.ZipInfo(item.filename) # type: ignore
77 clean_zinfo = self._clean_zipinfo(zinfo) 94 clean_zinfo = self._clean_zipinfo(zinfo)
78 with open(tmp_parser.output_filename, 'rb') as f: 95 with open(full_path, 'rb') as f:
79 zout.writestr(clean_zinfo, f.read()) 96 zout.writestr(clean_zinfo, f.read())
80 97
81 shutil.rmtree(temp_folder) 98 shutil.rmtree(temp_folder)
@@ -149,6 +166,37 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
149 '^Thumbnails/', 166 '^Thumbnails/',
150 })) 167 }))
151 168
169
def __remove_revisions(self, full_path: str) -> bool:
    """ Remove every `text:tracked-changes` element located under an
    `office:text` element of the xml file at `full_path`, then
    rewrite that file in place.

    Returns True, whether or not something was removed. Parsing
    errors raised by ElementTree are propagated to the caller."""
    # etree support for ns is a bit rough: collect the
    # prefix -> uri declarations of the document ourselves.
    ns = {prefix: uri
          for _, (prefix, uri) in ET.iterparse(full_path, ("start-ns", ))}

    # The lookups below rely on the `office` and `text` prefixes: if
    # one of them isn't declared, there is nothing to match, and the
    # path lookup would raise on the unknown prefix.
    if 'office' not in ns or 'text' not in ns:
        return True

    # Register the namespaces, so that the original prefixes are
    # used when the tree is written back.
    for prefix, uri in ns.items():
        ET.register_namespace(prefix, uri)

    tree = ET.parse(full_path)
    for text in tree.getroot().findall('.//office:text', ns):
        # `Element.remove` only accepts direct children, so visit
        # every potential parent. The list of parents is built
        # upfront because the tree is modified during the walk.
        for parent in list(text.iter()):
            for changes in parent.findall('text:tracked-changes', ns):
                parent.remove(changes)

    # utf-8 instead of the us-ascii default of `write`, so that
    # non-ascii text isn't turned into character references.
    tree.write(full_path, xml_declaration=True, encoding='utf-8')

    return True
194
def _specific_cleanup(self, full_path: str) -> bool:
    """ Strip the revisions from files named `content.xml`, and
    leave every other file as it is.

    Returns the result of the revisions removal for `content.xml`,
    True for any other file."""
    if os.path.basename(full_path) != 'content.xml':
        return True
    return self.__remove_revisions(full_path)
199
152 def get_meta(self) -> Dict[str, str]: 200 def get_meta(self) -> Dict[str, str]:
153 """ 201 """
154 Yes, I know that parsing xml with regexp ain't pretty, 202 Yes, I know that parsing xml with regexp ain't pretty,