diff options
| author | jvoisin | 2018-10-03 15:22:36 +0200 |
|---|---|---|
| committer | jvoisin | 2018-10-03 15:22:36 +0200 |
| commit | 1b356b8c6ff8154f64e2019721897e0a7e909a54 (patch) | |
| tree | e155d29fda0ce10ea201c5bcdccf6bcc67f01955 /libmat2/office.py | |
| parent | c67bbafb2c60782096af4f6225d94e18225d2ecf (diff) | |
Improve mat2's cli reliability
- Replace some class members with instance members
- Stop threading the cleaning process for now
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 74 |
1 file changed, 41 insertions, 33 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 3abf108..997a247 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 67 | # See https://0xacab.org/jvoisin/mat2/issues/71 | 67 | # See https://0xacab.org/jvoisin/mat2/issues/71 |
| 68 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml | 68 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml |
| 69 | } | 69 | } |
| 70 | files_to_keep = set(map(re.compile, { # type: ignore | ||
| 71 | r'^\[Content_Types\]\.xml$', | ||
| 72 | r'^_rels/\.rels$', | ||
| 73 | r'^word/_rels/document\.xml\.rels$', | ||
| 74 | r'^word/_rels/footer[0-9]*\.xml\.rels$', | ||
| 75 | r'^word/_rels/header[0-9]*\.xml\.rels$', | ||
| 76 | 70 | ||
| 77 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | ||
| 78 | r'^word/stylesWithEffects\.xml$', | ||
| 79 | })) | ||
| 80 | files_to_omit = set(map(re.compile, { # type: ignore | ||
| 81 | r'^customXml/', | ||
| 82 | r'webSettings\.xml$', | ||
| 83 | r'^docProps/custom\.xml$', | ||
| 84 | r'^word/printerSettings/', | ||
| 85 | r'^word/theme', | ||
| 86 | |||
| 87 | # we have a whitelist in self.files_to_keep, | ||
| 88 | # so we can trash everything else | ||
| 89 | r'^word/_rels/', | ||
| 90 | })) | ||
| 91 | 71 | ||
| 92 | def __init__(self, filename): | 72 | def __init__(self, filename): |
| 93 | super().__init__(filename) | 73 | super().__init__(filename) |
| 74 | |||
| 75 | self.files_to_keep = set(map(re.compile, { # type: ignore | ||
| 76 | r'^\[Content_Types\]\.xml$', | ||
| 77 | r'^_rels/\.rels$', | ||
| 78 | r'^word/_rels/document\.xml\.rels$', | ||
| 79 | r'^word/_rels/footer[0-9]*\.xml\.rels$', | ||
| 80 | r'^word/_rels/header[0-9]*\.xml\.rels$', | ||
| 81 | |||
| 82 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | ||
| 83 | r'^word/stylesWithEffects\.xml$', | ||
| 84 | })) | ||
| 85 | self.files_to_omit = set(map(re.compile, { # type: ignore | ||
| 86 | r'^customXml/', | ||
| 87 | r'webSettings\.xml$', | ||
| 88 | r'^docProps/custom\.xml$', | ||
| 89 | r'^word/printerSettings/', | ||
| 90 | r'^word/theme', | ||
| 91 | |||
| 92 | # we have a whitelist in self.files_to_keep, | ||
| 93 | # so we can trash everything else | ||
| 94 | r'^word/_rels/', | ||
| 95 | })) | ||
| 96 | |||
| 94 | if self.__fill_files_to_keep_via_content_types() is False: | 97 | if self.__fill_files_to_keep_via_content_types() is False: |
| 95 | raise ValueError | 98 | raise ValueError |
| 96 | 99 | ||
| @@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 320 | 'application/vnd.oasis.opendocument.formula', | 323 | 'application/vnd.oasis.opendocument.formula', |
| 321 | 'application/vnd.oasis.opendocument.image', | 324 | 'application/vnd.oasis.opendocument.image', |
| 322 | } | 325 | } |
| 323 | files_to_keep = set(map(re.compile, { # type: ignore | 326 | |
| 324 | r'^META-INF/manifest\.xml$', | 327 | |
| 325 | r'^content\.xml$', | 328 | def __init__(self, filename): |
| 326 | r'^manifest\.rdf$', | 329 | super().__init__(filename) |
| 327 | r'^mimetype$', | 330 | |
| 328 | r'^settings\.xml$', | 331 | self.files_to_keep = set(map(re.compile, { # type: ignore |
| 329 | r'^styles\.xml$', | 332 | r'^META-INF/manifest\.xml$', |
| 330 | })) | 333 | r'^content\.xml$', |
| 331 | files_to_omit = set(map(re.compile, { # type: ignore | 334 | r'^manifest\.rdf$', |
| 332 | r'^meta\.xml$', | 335 | r'^mimetype$', |
| 333 | r'^Configurations2/', | 336 | r'^settings\.xml$', |
| 334 | r'^Thumbnails/', | 337 | r'^styles\.xml$', |
| 335 | })) | 338 | })) |
| 339 | self.files_to_omit = set(map(re.compile, { # type: ignore | ||
| 340 | r'^meta\.xml$', | ||
| 341 | r'^Configurations2/', | ||
| 342 | r'^Thumbnails/', | ||
| 343 | })) | ||
| 336 | 344 | ||
| 337 | @staticmethod | 345 | @staticmethod |
| 338 | def __remove_revisions(full_path: str) -> bool: | 346 | def __remove_revisions(full_path: str) -> bool: |
