summaryrefslogtreecommitdiff
path: root/libmat2/office.py
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2/office.py')
-rw-r--r--libmat2/office.py98
1 files changed, 74 insertions, 24 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 91bf2a6..3abf108 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -50,25 +50,75 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
50 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 50 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
51 'application/vnd.openxmlformats-officedocument.presentationml.presentation' 51 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
52 } 52 }
53 files_to_keep = { 53 content_types_to_keep = {
54 '[Content_Types].xml', 54 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml
55 '_rels/.rels', 55 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml
56 'word/_rels/document.xml.rels', 56 'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml
57 'word/document.xml', 57 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml
58 'word/fontTable.xml', 58 'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', # /word/fontTable.xml
59 'word/settings.xml', 59 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml
60 'word/styles.xml', 60 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml
61 'docProps/app.xml', 61 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml
62 'docProps/core.xml', 62 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml
63 63
64 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx 64 # Do we want to keep the following ones?
65 'word/stylesWithEffects.xml', 65 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
66
67 # See https://0xacab.org/jvoisin/mat2/issues/71
68 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
66 } 69 }
70 files_to_keep = set(map(re.compile, { # type: ignore
71 r'^\[Content_Types\]\.xml$',
72 r'^_rels/\.rels$',
73 r'^word/_rels/document\.xml\.rels$',
74 r'^word/_rels/footer[0-9]*\.xml\.rels$',
75 r'^word/_rels/header[0-9]*\.xml\.rels$',
76
77 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
78 r'^word/stylesWithEffects\.xml$',
79 }))
67 files_to_omit = set(map(re.compile, { # type: ignore 80 files_to_omit = set(map(re.compile, { # type: ignore
68 'word/webSettings.xml', 81 r'^customXml/',
69 'word/theme', 82 r'webSettings\.xml$',
83 r'^docProps/custom\.xml$',
84 r'^word/printerSettings/',
85 r'^word/theme',
86
87 # we have a whitelist in self.files_to_keep,
88 # so we can trash everything else
89 r'^word/_rels/',
70 })) 90 }))
71 91
92 def __init__(self, filename):
93 super().__init__(filename)
94 if self.__fill_files_to_keep_via_content_types() is False:
95 raise ValueError
96
97 def __fill_files_to_keep_via_content_types(self) -> bool:
98 """ There is a super-handy `[Content_Types].xml` file
99 in MS Office archives, describing what each other file contains.
100 The self.content_types_to_keep member contains a type whitelist,
101 so we're using it to fill the self.files_to_keep one.
102 """
103 with zipfile.ZipFile(self.filename) as zin:
104 if '[Content_Types].xml' not in zin.namelist():
105 return False
106 xml_data = zin.read('[Content_Types].xml')
107
108 self.content_types = dict() # type: Dict[str, str]
109 try:
110 tree = ET.fromstring(xml_data)
111 except ET.ParseError:
112 return False
113 for c in tree:
114 if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
115 continue
116 elif c.attrib['ContentType'] in self.content_types_to_keep:
117 fname = c.attrib['PartName'][1:] # remove leading `/`
118 re_fname = re.compile('^' + re.escape(fname) + '$')
119 self.files_to_keep.add(re_fname) # type: ignore
120 return True
121
72 @staticmethod 122 @staticmethod
73 def __remove_rsid(full_path: str) -> bool: 123 def __remove_rsid(full_path: str) -> bool:
74 """ The method will remove "revision session ID". We're '}rsid' 124 """ The method will remove "revision session ID". We're '}rsid'
@@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
270 'application/vnd.oasis.opendocument.formula', 320 'application/vnd.oasis.opendocument.formula',
271 'application/vnd.oasis.opendocument.image', 321 'application/vnd.oasis.opendocument.image',
272 } 322 }
273 files_to_keep = { 323 files_to_keep = set(map(re.compile, { # type: ignore
274 'META-INF/manifest.xml', 324 r'^META-INF/manifest\.xml$',
275 'content.xml', 325 r'^content\.xml$',
276 'manifest.rdf', 326 r'^manifest\.rdf$',
277 'mimetype', 327 r'^mimetype$',
278 'settings.xml', 328 r'^settings\.xml$',
279 'styles.xml', 329 r'^styles\.xml$',
280 } 330 }))
281 files_to_omit = set(map(re.compile, { # type: ignore 331 files_to_omit = set(map(re.compile, { # type: ignore
282 r'^meta\.xml$', 332 r'^meta\.xml$',
283 '^Configurations2/', 333 r'^Configurations2/',
284 '^Thumbnails/', 334 r'^Thumbnails/',
285 })) 335 }))
286 336
287 @staticmethod 337 @staticmethod