summary | refs | log | tree | commit | diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author jvoisin 2018-10-03 15:22:36 +0200
committer jvoisin 2018-10-03 15:22:36 +0200
commit 1b356b8c6ff8154f64e2019721897e0a7e909a54 (patch)
tree e155d29fda0ce10ea201c5bcdccf6bcc67f01955 /libmat2/office.py
parent c67bbafb2c60782096af4f6225d94e18225d2ecf (diff)
Improve mat2's cli reliability
- Replace some class members by instance members
- Don't thread the cleaning process anymore for now
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 74
1 files changed, 41 insertions, 33 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 3abf108..997a247 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
67 # See https://0xacab.org/jvoisin/mat2/issues/71 67 # See https://0xacab.org/jvoisin/mat2/issues/71
68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml 68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
69 } 69 }
70 files_to_keep = set(map(re.compile, { # type: ignore
71 r'^\[Content_Types\]\.xml$',
72 r'^_rels/\.rels$',
73 r'^word/_rels/document\.xml\.rels$',
74 r'^word/_rels/footer[0-9]*\.xml\.rels$',
75 r'^word/_rels/header[0-9]*\.xml\.rels$',
76 70
77 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
78 r'^word/stylesWithEffects\.xml$',
79 }))
80 files_to_omit = set(map(re.compile, { # type: ignore
81 r'^customXml/',
82 r'webSettings\.xml$',
83 r'^docProps/custom\.xml$',
84 r'^word/printerSettings/',
85 r'^word/theme',
86
87 # we have a whitelist in self.files_to_keep,
88 # so we can trash everything else
89 r'^word/_rels/',
90 }))
91 71
92 def __init__(self, filename): 72 def __init__(self, filename):
93 super().__init__(filename) 73 super().__init__(filename)
74
75 self.files_to_keep = set(map(re.compile, { # type: ignore
76 r'^\[Content_Types\]\.xml$',
77 r'^_rels/\.rels$',
78 r'^word/_rels/document\.xml\.rels$',
79 r'^word/_rels/footer[0-9]*\.xml\.rels$',
80 r'^word/_rels/header[0-9]*\.xml\.rels$',
81
82 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
83 r'^word/stylesWithEffects\.xml$',
84 }))
85 self.files_to_omit = set(map(re.compile, { # type: ignore
86 r'^customXml/',
87 r'webSettings\.xml$',
88 r'^docProps/custom\.xml$',
89 r'^word/printerSettings/',
90 r'^word/theme',
91
92 # we have a whitelist in self.files_to_keep,
93 # so we can trash everything else
94 r'^word/_rels/',
95 }))
96
94 if self.__fill_files_to_keep_via_content_types() is False: 97 if self.__fill_files_to_keep_via_content_types() is False:
95 raise ValueError 98 raise ValueError
96 99
@@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
320 'application/vnd.oasis.opendocument.formula', 323 'application/vnd.oasis.opendocument.formula',
321 'application/vnd.oasis.opendocument.image', 324 'application/vnd.oasis.opendocument.image',
322 } 325 }
323 files_to_keep = set(map(re.compile, { # type: ignore 326
324 r'^META-INF/manifest\.xml$', 327
325 r'^content\.xml$', 328 def __init__(self, filename):
326 r'^manifest\.rdf$', 329 super().__init__(filename)
327 r'^mimetype$', 330
328 r'^settings\.xml$', 331 self.files_to_keep = set(map(re.compile, { # type: ignore
329 r'^styles\.xml$', 332 r'^META-INF/manifest\.xml$',
330 })) 333 r'^content\.xml$',
331 files_to_omit = set(map(re.compile, { # type: ignore 334 r'^manifest\.rdf$',
332 r'^meta\.xml$', 335 r'^mimetype$',
333 r'^Configurations2/', 336 r'^settings\.xml$',
334 r'^Thumbnails/', 337 r'^styles\.xml$',
335 })) 338 }))
339 self.files_to_omit = set(map(re.compile, { # type: ignore
340 r'^meta\.xml$',
341 r'^Configurations2/',
342 r'^Thumbnails/',
343 }))
336 344
337 @staticmethod 345 @staticmethod
338 def __remove_revisions(full_path: str) -> bool: 346 def __remove_revisions(full_path: str) -> bool: