diff options
| author | jvoisin | 2018-10-03 15:22:36 +0200 |
|---|---|---|
| committer | jvoisin | 2018-10-03 15:22:36 +0200 |
| commit | 1b356b8c6ff8154f64e2019721897e0a7e909a54 (patch) | |
| tree | e155d29fda0ce10ea201c5bcdccf6bcc67f01955 | |
| parent | c67bbafb2c60782096af4f6225d94e18225d2ecf (diff) | |
Improve mat2's cli reliability
- Replace some class members with instance members
- Stop running the cleaning process in parallel, for now
| -rw-r--r-- | libmat2/archive.py | 23 | ||||
| -rw-r--r-- | libmat2/office.py | 74 | ||||
| -rwxr-xr-x | mat2 | 13 |
3 files changed, 58 insertions, 52 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py index b29d690..016142d 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py | |||
| @@ -15,20 +15,21 @@ assert Pattern | |||
| 15 | 15 | ||
| 16 | class ArchiveBasedAbstractParser(abstract.AbstractParser): | 16 | class ArchiveBasedAbstractParser(abstract.AbstractParser): |
| 17 | """ Office files (.docx, .odt, …) are zipped files. """ | 17 | """ Office files (.docx, .odt, …) are zipped files. """ |
| 18 | # Those are the files that have a format that _isn't_ | 18 | def __init__(self, filename): |
| 19 | # supported by MAT2, but that we want to keep anyway. | 19 | super().__init__(filename) |
| 20 | files_to_keep = set() # type: Set[Pattern] | ||
| 21 | 20 | ||
| 22 | # Those are the files that we _do not_ want to keep, | 21 | # Those are the files that have a format that _isn't_ |
| 23 | # no matter if they are supported or not. | 22 | # supported by MAT2, but that we want to keep anyway. |
| 24 | files_to_omit = set() # type: Set[Pattern] | 23 | self.files_to_keep = set() # type: Set[Pattern] |
| 25 | 24 | ||
| 26 | # what should the parser do if it encounters an unknown file in | 25 | # Those are the files that we _do not_ want to keep, |
| 27 | # the archive? | 26 | # no matter if they are supported or not. |
| 28 | unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy | 27 | self.files_to_omit = set() # type: Set[Pattern] |
| 28 | |||
| 29 | # what should the parser do if it encounters an unknown file in | ||
| 30 | # the archive? | ||
| 31 | self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy | ||
| 29 | 32 | ||
| 30 | def __init__(self, filename): | ||
| 31 | super().__init__(filename) | ||
| 32 | try: # better fail here than later | 33 | try: # better fail here than later |
| 33 | zipfile.ZipFile(self.filename) | 34 | zipfile.ZipFile(self.filename) |
| 34 | except zipfile.BadZipFile: | 35 | except zipfile.BadZipFile: |
diff --git a/libmat2/office.py b/libmat2/office.py index 3abf108..997a247 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 67 | # See https://0xacab.org/jvoisin/mat2/issues/71 | 67 | # See https://0xacab.org/jvoisin/mat2/issues/71 |
| 68 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml | 68 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml |
| 69 | } | 69 | } |
| 70 | files_to_keep = set(map(re.compile, { # type: ignore | ||
| 71 | r'^\[Content_Types\]\.xml$', | ||
| 72 | r'^_rels/\.rels$', | ||
| 73 | r'^word/_rels/document\.xml\.rels$', | ||
| 74 | r'^word/_rels/footer[0-9]*\.xml\.rels$', | ||
| 75 | r'^word/_rels/header[0-9]*\.xml\.rels$', | ||
| 76 | 70 | ||
| 77 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | ||
| 78 | r'^word/stylesWithEffects\.xml$', | ||
| 79 | })) | ||
| 80 | files_to_omit = set(map(re.compile, { # type: ignore | ||
| 81 | r'^customXml/', | ||
| 82 | r'webSettings\.xml$', | ||
| 83 | r'^docProps/custom\.xml$', | ||
| 84 | r'^word/printerSettings/', | ||
| 85 | r'^word/theme', | ||
| 86 | |||
| 87 | # we have a whitelist in self.files_to_keep, | ||
| 88 | # so we can trash everything else | ||
| 89 | r'^word/_rels/', | ||
| 90 | })) | ||
| 91 | 71 | ||
| 92 | def __init__(self, filename): | 72 | def __init__(self, filename): |
| 93 | super().__init__(filename) | 73 | super().__init__(filename) |
| 74 | |||
| 75 | self.files_to_keep = set(map(re.compile, { # type: ignore | ||
| 76 | r'^\[Content_Types\]\.xml$', | ||
| 77 | r'^_rels/\.rels$', | ||
| 78 | r'^word/_rels/document\.xml\.rels$', | ||
| 79 | r'^word/_rels/footer[0-9]*\.xml\.rels$', | ||
| 80 | r'^word/_rels/header[0-9]*\.xml\.rels$', | ||
| 81 | |||
| 82 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | ||
| 83 | r'^word/stylesWithEffects\.xml$', | ||
| 84 | })) | ||
| 85 | self.files_to_omit = set(map(re.compile, { # type: ignore | ||
| 86 | r'^customXml/', | ||
| 87 | r'webSettings\.xml$', | ||
| 88 | r'^docProps/custom\.xml$', | ||
| 89 | r'^word/printerSettings/', | ||
| 90 | r'^word/theme', | ||
| 91 | |||
| 92 | # we have a whitelist in self.files_to_keep, | ||
| 93 | # so we can trash everything else | ||
| 94 | r'^word/_rels/', | ||
| 95 | })) | ||
| 96 | |||
| 94 | if self.__fill_files_to_keep_via_content_types() is False: | 97 | if self.__fill_files_to_keep_via_content_types() is False: |
| 95 | raise ValueError | 98 | raise ValueError |
| 96 | 99 | ||
| @@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 320 | 'application/vnd.oasis.opendocument.formula', | 323 | 'application/vnd.oasis.opendocument.formula', |
| 321 | 'application/vnd.oasis.opendocument.image', | 324 | 'application/vnd.oasis.opendocument.image', |
| 322 | } | 325 | } |
| 323 | files_to_keep = set(map(re.compile, { # type: ignore | 326 | |
| 324 | r'^META-INF/manifest\.xml$', | 327 | |
| 325 | r'^content\.xml$', | 328 | def __init__(self, filename): |
| 326 | r'^manifest\.rdf$', | 329 | super().__init__(filename) |
| 327 | r'^mimetype$', | 330 | |
| 328 | r'^settings\.xml$', | 331 | self.files_to_keep = set(map(re.compile, { # type: ignore |
| 329 | r'^styles\.xml$', | 332 | r'^META-INF/manifest\.xml$', |
| 330 | })) | 333 | r'^content\.xml$', |
| 331 | files_to_omit = set(map(re.compile, { # type: ignore | 334 | r'^manifest\.rdf$', |
| 332 | r'^meta\.xml$', | 335 | r'^mimetype$', |
| 333 | r'^Configurations2/', | 336 | r'^settings\.xml$', |
| 334 | r'^Thumbnails/', | 337 | r'^styles\.xml$', |
| 335 | })) | 338 | })) |
| 339 | self.files_to_omit = set(map(re.compile, { # type: ignore | ||
| 340 | r'^meta\.xml$', | ||
| 341 | r'^Configurations2/', | ||
| 342 | r'^Thumbnails/', | ||
| 343 | })) | ||
| 336 | 344 | ||
| 337 | @staticmethod | 345 | @staticmethod |
| 338 | def __remove_revisions(full_path: str) -> bool: | 346 | def __remove_revisions(full_path: str) -> bool: |
diff --git a/mat2 b/mat2 | |||
| @@ -3,10 +3,8 @@ | |||
| 3 | import os | 3 | import os |
| 4 | from typing import Tuple | 4 | from typing import Tuple |
| 5 | import sys | 5 | import sys |
| 6 | import itertools | ||
| 7 | import mimetypes | 6 | import mimetypes |
| 8 | import argparse | 7 | import argparse |
| 9 | import multiprocessing | ||
| 10 | import logging | 8 | import logging |
| 11 | 9 | ||
| 12 | try: | 10 | try: |
| @@ -142,13 +140,12 @@ def main(): | |||
| 142 | if unknown_member_policy == UnknownMemberPolicy.KEEP: | 140 | if unknown_member_policy == UnknownMemberPolicy.KEEP: |
| 143 | logging.warning('Keeping unknown member files may leak metadata in the resulting file!') | 141 | logging.warning('Keeping unknown member files may leak metadata in the resulting file!') |
| 144 | 142 | ||
| 145 | rep_mode = itertools.repeat(args.lightweight is True) | 143 | success = True |
| 146 | rep_policy = itertools.repeat(unknown_member_policy) | 144 | for f in __get_files_recursively(args.files): |
| 147 | l = zip(__get_files_recursively(args.files), rep_mode, rep_policy) | 145 | if clean_meta([f, args.lightweight, unknown_member_policy]) is False: |
| 146 | success = False | ||
| 147 | return success | ||
| 148 | 148 | ||
| 149 | p = multiprocessing.Pool() | ||
| 150 | ret = list(p.imap_unordered(clean_meta, list(l))) | ||
| 151 | return 0 if all(ret) else -1 | ||
| 152 | 149 | ||
| 153 | if __name__ == '__main__': | 150 | if __name__ == '__main__': |
| 154 | sys.exit(main()) | 151 | sys.exit(main()) |
