summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: jvoisin, 2018-10-03 15:22:36 +0200
committer: jvoisin, 2018-10-03 15:22:36 +0200
commit: 1b356b8c6ff8154f64e2019721897e0a7e909a54 (patch)
tree: e155d29fda0ce10ea201c5bcdccf6bcc67f01955
parent: c67bbafb2c60782096af4f6225d94e18225d2ecf (diff)
Improve mat2's cli reliability
- Replace some class members by instance members
- Don't thread the cleaning process anymore for now
-rw-r--r-- libmat2/archive.py 23
-rw-r--r-- libmat2/office.py 74
-rwxr-xr-x mat2 13
3 files changed, 58 insertions, 52 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
index b29d690..016142d 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -15,20 +15,21 @@ assert Pattern
15 15
16class ArchiveBasedAbstractParser(abstract.AbstractParser): 16class ArchiveBasedAbstractParser(abstract.AbstractParser):
17 """ Office files (.docx, .odt, …) are zipped files. """ 17 """ Office files (.docx, .odt, …) are zipped files. """
18 # Those are the files that have a format that _isn't_ 18 def __init__(self, filename):
19 # supported by MAT2, but that we want to keep anyway. 19 super().__init__(filename)
20 files_to_keep = set() # type: Set[Pattern]
21 20
22 # Those are the files that we _do not_ want to keep, 21 # Those are the files that have a format that _isn't_
23 # no matter if they are supported or not. 22 # supported by MAT2, but that we want to keep anyway.
24 files_to_omit = set() # type: Set[Pattern] 23 self.files_to_keep = set() # type: Set[Pattern]
25 24
26 # what should the parser do if it encounters an unknown file in 25 # Those are the files that we _do not_ want to keep,
27 # the archive? 26 # no matter if they are supported or not.
28 unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy 27 self.files_to_omit = set() # type: Set[Pattern]
28
29 # what should the parser do if it encounters an unknown file in
30 # the archive?
31 self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
29 32
30 def __init__(self, filename):
31 super().__init__(filename)
32 try: # better fail here than later 33 try: # better fail here than later
33 zipfile.ZipFile(self.filename) 34 zipfile.ZipFile(self.filename)
34 except zipfile.BadZipFile: 35 except zipfile.BadZipFile:
diff --git a/libmat2/office.py b/libmat2/office.py
index 3abf108..997a247 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
67 # See https://0xacab.org/jvoisin/mat2/issues/71 67 # See https://0xacab.org/jvoisin/mat2/issues/71
68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml 68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
69 } 69 }
70 files_to_keep = set(map(re.compile, { # type: ignore
71 r'^\[Content_Types\]\.xml$',
72 r'^_rels/\.rels$',
73 r'^word/_rels/document\.xml\.rels$',
74 r'^word/_rels/footer[0-9]*\.xml\.rels$',
75 r'^word/_rels/header[0-9]*\.xml\.rels$',
76 70
77 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
78 r'^word/stylesWithEffects\.xml$',
79 }))
80 files_to_omit = set(map(re.compile, { # type: ignore
81 r'^customXml/',
82 r'webSettings\.xml$',
83 r'^docProps/custom\.xml$',
84 r'^word/printerSettings/',
85 r'^word/theme',
86
87 # we have a whitelist in self.files_to_keep,
88 # so we can trash everything else
89 r'^word/_rels/',
90 }))
91 71
92 def __init__(self, filename): 72 def __init__(self, filename):
93 super().__init__(filename) 73 super().__init__(filename)
74
75 self.files_to_keep = set(map(re.compile, { # type: ignore
76 r'^\[Content_Types\]\.xml$',
77 r'^_rels/\.rels$',
78 r'^word/_rels/document\.xml\.rels$',
79 r'^word/_rels/footer[0-9]*\.xml\.rels$',
80 r'^word/_rels/header[0-9]*\.xml\.rels$',
81
82 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
83 r'^word/stylesWithEffects\.xml$',
84 }))
85 self.files_to_omit = set(map(re.compile, { # type: ignore
86 r'^customXml/',
87 r'webSettings\.xml$',
88 r'^docProps/custom\.xml$',
89 r'^word/printerSettings/',
90 r'^word/theme',
91
92 # we have a whitelist in self.files_to_keep,
93 # so we can trash everything else
94 r'^word/_rels/',
95 }))
96
94 if self.__fill_files_to_keep_via_content_types() is False: 97 if self.__fill_files_to_keep_via_content_types() is False:
95 raise ValueError 98 raise ValueError
96 99
@@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
320 'application/vnd.oasis.opendocument.formula', 323 'application/vnd.oasis.opendocument.formula',
321 'application/vnd.oasis.opendocument.image', 324 'application/vnd.oasis.opendocument.image',
322 } 325 }
323 files_to_keep = set(map(re.compile, { # type: ignore 326
324 r'^META-INF/manifest\.xml$', 327
325 r'^content\.xml$', 328 def __init__(self, filename):
326 r'^manifest\.rdf$', 329 super().__init__(filename)
327 r'^mimetype$', 330
328 r'^settings\.xml$', 331 self.files_to_keep = set(map(re.compile, { # type: ignore
329 r'^styles\.xml$', 332 r'^META-INF/manifest\.xml$',
330 })) 333 r'^content\.xml$',
331 files_to_omit = set(map(re.compile, { # type: ignore 334 r'^manifest\.rdf$',
332 r'^meta\.xml$', 335 r'^mimetype$',
333 r'^Configurations2/', 336 r'^settings\.xml$',
334 r'^Thumbnails/', 337 r'^styles\.xml$',
335 })) 338 }))
339 self.files_to_omit = set(map(re.compile, { # type: ignore
340 r'^meta\.xml$',
341 r'^Configurations2/',
342 r'^Thumbnails/',
343 }))
336 344
337 @staticmethod 345 @staticmethod
338 def __remove_revisions(full_path: str) -> bool: 346 def __remove_revisions(full_path: str) -> bool:
diff --git a/mat2 b/mat2
index 0b8ea98..5afd804 100755
--- a/mat2
+++ b/mat2
@@ -3,10 +3,8 @@
3import os 3import os
4from typing import Tuple 4from typing import Tuple
5import sys 5import sys
6import itertools
7import mimetypes 6import mimetypes
8import argparse 7import argparse
9import multiprocessing
10import logging 8import logging
11 9
12try: 10try:
@@ -142,13 +140,12 @@ def main():
142 if unknown_member_policy == UnknownMemberPolicy.KEEP: 140 if unknown_member_policy == UnknownMemberPolicy.KEEP:
143 logging.warning('Keeping unknown member files may leak metadata in the resulting file!') 141 logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
144 142
145 rep_mode = itertools.repeat(args.lightweight is True) 143 success = True
146 rep_policy = itertools.repeat(unknown_member_policy) 144 for f in __get_files_recursively(args.files):
147 l = zip(__get_files_recursively(args.files), rep_mode, rep_policy) 145 if clean_meta([f, args.lightweight, unknown_member_policy]) is False:
146 success = False
147 return success
148 148
149 p = multiprocessing.Pool()
150 ret = list(p.imap_unordered(clean_meta, list(l)))
151 return 0 if all(ret) else -1
152 149
153if __name__ == '__main__': 150if __name__ == '__main__':
154 sys.exit(main()) 151 sys.exit(main())