summary | refs | log | tree | commit | diff
path: root/libmat2
diff options
context:
space:
mode:
author jvoisin 2018-10-03 15:22:36 +0200
committer jvoisin 2018-10-03 15:22:36 +0200
commit 1b356b8c6ff8154f64e2019721897e0a7e909a54 (patch)
tree e155d29fda0ce10ea201c5bcdccf6bcc67f01955 /libmat2
parent c67bbafb2c60782096af4f6225d94e18225d2ecf (diff)
Improve mat2's cli reliability
- Replace some class members by instance members
- Don't thread the cleaning process anymore for now
Diffstat (limited to 'libmat2')
-rw-r--r-- libmat2/archive.py 23
-rw-r--r-- libmat2/office.py 74
2 files changed, 53 insertions, 44 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
index b29d690..016142d 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -15,20 +15,21 @@ assert Pattern
15 15
16class ArchiveBasedAbstractParser(abstract.AbstractParser): 16class ArchiveBasedAbstractParser(abstract.AbstractParser):
17 """ Office files (.docx, .odt, …) are zipped files. """ 17 """ Office files (.docx, .odt, …) are zipped files. """
18 # Those are the files that have a format that _isn't_ 18 def __init__(self, filename):
19 # supported by MAT2, but that we want to keep anyway. 19 super().__init__(filename)
20 files_to_keep = set() # type: Set[Pattern]
21 20
22 # Those are the files that we _do not_ want to keep, 21 # Those are the files that have a format that _isn't_
23 # no matter if they are supported or not. 22 # supported by MAT2, but that we want to keep anyway.
24 files_to_omit = set() # type: Set[Pattern] 23 self.files_to_keep = set() # type: Set[Pattern]
25 24
26 # what should the parser do if it encounters an unknown file in 25 # Those are the files that we _do not_ want to keep,
27 # the archive? 26 # no matter if they are supported or not.
28 unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy 27 self.files_to_omit = set() # type: Set[Pattern]
28
29 # what should the parser do if it encounters an unknown file in
30 # the archive?
31 self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
29 32
30 def __init__(self, filename):
31 super().__init__(filename)
32 try: # better fail here than later 33 try: # better fail here than later
33 zipfile.ZipFile(self.filename) 34 zipfile.ZipFile(self.filename)
34 except zipfile.BadZipFile: 35 except zipfile.BadZipFile:
diff --git a/libmat2/office.py b/libmat2/office.py
index 3abf108..997a247 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
67 # See https://0xacab.org/jvoisin/mat2/issues/71 67 # See https://0xacab.org/jvoisin/mat2/issues/71
68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml 68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
69 } 69 }
70 files_to_keep = set(map(re.compile, { # type: ignore
71 r'^\[Content_Types\]\.xml$',
72 r'^_rels/\.rels$',
73 r'^word/_rels/document\.xml\.rels$',
74 r'^word/_rels/footer[0-9]*\.xml\.rels$',
75 r'^word/_rels/header[0-9]*\.xml\.rels$',
76 70
77 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
78 r'^word/stylesWithEffects\.xml$',
79 }))
80 files_to_omit = set(map(re.compile, { # type: ignore
81 r'^customXml/',
82 r'webSettings\.xml$',
83 r'^docProps/custom\.xml$',
84 r'^word/printerSettings/',
85 r'^word/theme',
86
87 # we have a whitelist in self.files_to_keep,
88 # so we can trash everything else
89 r'^word/_rels/',
90 }))
91 71
92 def __init__(self, filename): 72 def __init__(self, filename):
93 super().__init__(filename) 73 super().__init__(filename)
74
75 self.files_to_keep = set(map(re.compile, { # type: ignore
76 r'^\[Content_Types\]\.xml$',
77 r'^_rels/\.rels$',
78 r'^word/_rels/document\.xml\.rels$',
79 r'^word/_rels/footer[0-9]*\.xml\.rels$',
80 r'^word/_rels/header[0-9]*\.xml\.rels$',
81
82 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
83 r'^word/stylesWithEffects\.xml$',
84 }))
85 self.files_to_omit = set(map(re.compile, { # type: ignore
86 r'^customXml/',
87 r'webSettings\.xml$',
88 r'^docProps/custom\.xml$',
89 r'^word/printerSettings/',
90 r'^word/theme',
91
92 # we have a whitelist in self.files_to_keep,
93 # so we can trash everything else
94 r'^word/_rels/',
95 }))
96
94 if self.__fill_files_to_keep_via_content_types() is False: 97 if self.__fill_files_to_keep_via_content_types() is False:
95 raise ValueError 98 raise ValueError
96 99
@@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
320 'application/vnd.oasis.opendocument.formula', 323 'application/vnd.oasis.opendocument.formula',
321 'application/vnd.oasis.opendocument.image', 324 'application/vnd.oasis.opendocument.image',
322 } 325 }
323 files_to_keep = set(map(re.compile, { # type: ignore 326
324 r'^META-INF/manifest\.xml$', 327
325 r'^content\.xml$', 328 def __init__(self, filename):
326 r'^manifest\.rdf$', 329 super().__init__(filename)
327 r'^mimetype$', 330
328 r'^settings\.xml$', 331 self.files_to_keep = set(map(re.compile, { # type: ignore
329 r'^styles\.xml$', 332 r'^META-INF/manifest\.xml$',
330 })) 333 r'^content\.xml$',
331 files_to_omit = set(map(re.compile, { # type: ignore 334 r'^manifest\.rdf$',
332 r'^meta\.xml$', 335 r'^mimetype$',
333 r'^Configurations2/', 336 r'^settings\.xml$',
334 r'^Thumbnails/', 337 r'^styles\.xml$',
335 })) 338 }))
339 self.files_to_omit = set(map(re.compile, { # type: ignore
340 r'^meta\.xml$',
341 r'^Configurations2/',
342 r'^Thumbnails/',
343 }))
336 344
337 @staticmethod 345 @staticmethod
338 def __remove_revisions(full_path: str) -> bool: 346 def __remove_revisions(full_path: str) -> bool: