diff options
| author | jvoisin | 2018-06-27 23:10:53 +0200 |
|---|---|---|
| committer | jvoisin | 2018-07-01 21:09:20 +0200 |
| commit | 02f7605ac124ed42bd9f3f156ee40877fe3c6b42 (patch) | |
| tree | f2dbd634d91c6c808a1269695c2abd465e3d0333 /libmat2/office.py | |
| parent | 80fc4ffb40ea425e14697082e4b7e6a7cf0b5583 (diff) | |
MAT2 is now cleaning revisions from odt files!
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 76 |
1 files changed, 62 insertions, 14 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 34ae7a2..5381eb9 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
| @@ -4,8 +4,10 @@ import shutil | |||
| 4 | import tempfile | 4 | import tempfile |
| 5 | import datetime | 5 | import datetime |
| 6 | import zipfile | 6 | import zipfile |
| 7 | import xml.etree.ElementTree as ET | ||
| 7 | from typing import Dict, Set, Pattern | 8 | from typing import Dict, Set, Pattern |
| 8 | 9 | ||
| 10 | |||
| 9 | from . import abstract, parser_factory | 11 | from . import abstract, parser_factory |
| 10 | 12 | ||
| 11 | # Make pyflakes happy | 13 | # Make pyflakes happy |
| @@ -13,7 +15,12 @@ assert Set | |||
| 13 | assert Pattern | 15 | assert Pattern |
| 14 | 16 | ||
| 15 | class ArchiveBasedAbstractParser(abstract.AbstractParser): | 17 | class ArchiveBasedAbstractParser(abstract.AbstractParser): |
| 18 | # Those are the files that have a format that _isn't_ | ||
| 19 | # supported by MAT2, but that we want to keep anyway. | ||
| 16 | files_to_keep = set() # type: Set[str] | 20 | files_to_keep = set() # type: Set[str] |
| 21 | |||
| 22 | # Those are the files that we _do not_ want to keep, | ||
| 23 | # no matter if they are supported or not. | ||
| 17 | files_to_omit = set() # type: Set[Pattern] | 24 | files_to_omit = set() # type: Set[Pattern] |
| 18 | 25 | ||
| 19 | def __init__(self, filename): | 26 | def __init__(self, filename): |
| @@ -23,6 +30,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 23 | except zipfile.BadZipFile: | 30 | except zipfile.BadZipFile: |
| 24 | raise ValueError | 31 | raise ValueError |
| 25 | 32 | ||
| 33 | def _specific_cleanup(self, full_path:str) -> bool: | ||
| 34 | """ This method can be used to apply specific treatment | ||
| 35 | to files present in the archive.""" | ||
| 36 | return True | ||
| 37 | |||
| 26 | def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: | 38 | def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: |
| 27 | zipinfo.create_system = 3 # Linux | 39 | zipinfo.create_system = 3 # Linux |
| 28 | zipinfo.comment = b'' | 40 | zipinfo.comment = b'' |
| @@ -56,26 +68,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 56 | for item in zin.infolist(): | 68 | for item in zin.infolist(): |
| 57 | if item.filename[-1] == '/': # `is_dir` is added in Python3.6 | 69 | if item.filename[-1] == '/': # `is_dir` is added in Python3.6 |
| 58 | continue # don't keep empty folders | 70 | continue # don't keep empty folders |
| 59 | elif item.filename in self.files_to_keep: | ||
| 60 | item = self._clean_zipinfo(item) | ||
| 61 | zout.writestr(item, zin.read(item)) | ||
| 62 | continue | ||
| 63 | elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): | ||
| 64 | continue | ||
| 65 | 71 | ||
| 66 | zin.extract(member=item, path=temp_folder) | 72 | zin.extract(member=item, path=temp_folder) |
| 67 | full_path = os.path.join(temp_folder, item.filename) | 73 | full_path = os.path.join(temp_folder, item.filename) |
| 68 | tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore | 74 | |
| 69 | if not tmp_parser: | 75 | self._specific_cleanup(full_path) |
| 70 | shutil.rmtree(temp_folder) | 76 | |
| 71 | os.remove(self.output_filename) | 77 | if item.filename in self.files_to_keep: |
| 72 | print("%s's format (%s) isn't supported" % (item.filename, mtype)) | 78 | # those files aren't supported, but we want to add them anyway |
| 73 | return False | 79 | pass |
| 74 | tmp_parser.remove_all() | 80 | elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): |
| 81 | continue | ||
| 82 | else: | ||
| 83 | # supported files that we want to clean then add | ||
| 84 | tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore | ||
| 85 | if not tmp_parser: | ||
| 86 | shutil.rmtree(temp_folder) | ||
| 87 | os.remove(self.output_filename) | ||
| 88 | print("%s's format (%s) isn't supported" % (item.filename, mtype)) | ||
| 89 | return False | ||
| 90 | tmp_parser.remove_all() | ||
| 91 | os.rename(tmp_parser.output_filename, full_path) | ||
| 75 | 92 | ||
| 76 | zinfo = zipfile.ZipInfo(item.filename) # type: ignore | 93 | zinfo = zipfile.ZipInfo(item.filename) # type: ignore |
| 77 | clean_zinfo = self._clean_zipinfo(zinfo) | 94 | clean_zinfo = self._clean_zipinfo(zinfo) |
| 78 | with open(tmp_parser.output_filename, 'rb') as f: | 95 | with open(full_path, 'rb') as f: |
| 79 | zout.writestr(clean_zinfo, f.read()) | 96 | zout.writestr(clean_zinfo, f.read()) |
| 80 | 97 | ||
| 81 | shutil.rmtree(temp_folder) | 98 | shutil.rmtree(temp_folder) |
| @@ -149,6 +166,37 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 149 | '^Thumbnails/', | 166 | '^Thumbnails/', |
| 150 | })) | 167 | })) |
| 151 | 168 | ||
| 169 | |||
| 170 | def __remove_revisions(self, full_path:str) -> bool: | ||
| 171 | def parse_map(f): # etree support for ns is a bit rough | ||
| 172 | ns_map = dict() | ||
| 173 | for event, (k, v) in ET.iterparse(f, ("start-ns", )): | ||
| 174 | if event == "start-ns": | ||
| 175 | ns_map[k] = v | ||
| 176 | return ns_map | ||
| 177 | |||
| 178 | ns = parse_map(full_path) | ||
| 179 | if 'office' not in ns.keys(): # no revisions in the current file | ||
| 180 | return True | ||
| 181 | |||
| 182 | # Register the namespaces | ||
| 183 | for k,v in ns.items(): | ||
| 184 | ET.register_namespace(k, v) | ||
| 185 | |||
| 186 | tree = ET.parse(full_path) | ||
| 187 | for text in tree.getroot().iterfind('.//office:text', ns): | ||
| 188 | for changes in text.iterfind('.//text:tracked-changes', ns): | ||
| 189 | text.remove(changes) | ||
| 190 | |||
| 191 | tree.write(full_path, xml_declaration = True) | ||
| 192 | |||
| 193 | return True | ||
| 194 | |||
| 195 | def _specific_cleanup(self, full_path:str) -> bool: | ||
| 196 | if os.path.basename(full_path) == 'content.xml': | ||
| 197 | return self.__remove_revisions(full_path) | ||
| 198 | return True | ||
| 199 | |||
| 152 | def get_meta(self) -> Dict[str, str]: | 200 | def get_meta(self) -> Dict[str, str]: |
| 153 | """ | 201 | """ |
| 154 | Yes, I know that parsing xml with regexp ain't pretty, | 202 | Yes, I know that parsing xml with regexp ain't pretty, |
