libmat2/office.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

import logging
import os
import re
import zipfile
from typing import Dict, Set, Pattern

import xml.etree.ElementTree as ET  # type: ignore

from .archive import ArchiveBasedAbstractParser

# pylint: disable=line-too-long

# Make pyflakes happy
assert Set
assert Pattern

def _parse_xml(full_path: str):
    """ This function parses XML, with namespace support. """

    namespace_map = dict()
    for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
        # The ns[0-9]+ namespaces are reserved for internal usage, so
        # we have to use an other nomenclature.
        if re.match('^ns[0-9]+$', key, re.I):  # pragma: no cover
            key = 'mat' + key[2:]

        namespace_map[key] = value
        ET.register_namespace(key, value)

    return ET.parse(full_path), namespace_map


def _sort_xml_attributes(full_path: str) -> bool:
    """ Sort xml attributes lexicographically,
    because it's possible to fingerprint producers (MS Office, Libreoffice, …)
    since they are all using different orders.
    """
    tree = ET.parse(full_path)

    for c in tree.getroot():
        c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))

    tree.write(full_path, xml_declaration=True)
    return True


class MSOfficeParser(ArchiveBasedAbstractParser):
    """ Parser for the Office Open XML formats listed in `mimetypes`.

    The constructor fills `files_to_keep` and `files_to_omit` with compiled
    regexps matching archive member names, and `_specific_cleanup` scrubs
    a single extracted member, given its path.
    """
    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Members declared with one of those content types in
    # `[Content_Types].xml` are added to `files_to_keep`.
    content_types_to_keep = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',  # /word/endnotes.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml',  # /word/footnotes.xml
        'application/vnd.openxmlformats-officedocument.extended-properties+xml',  # /docProps/app.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',  # /word/document.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml',  # /word/fontTable.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',  # /word/footer.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml',  # /word/header.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml',  # /word/styles.xml
        'application/vnd.openxmlformats-package.core-properties+xml',  # /docProps/core.xml

        # Do we want to keep the following ones?
        'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',

        # See https://0xacab.org/jvoisin/mat2/issues/71
        'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml',  # /word/numbering.xml
    }

    def __init__(self, filename):
        """ Set up the whitelist and blacklist of archive members.

        Raises ValueError if `[Content_Types].xml` is missing from the
        archive, or if it can't be parsed.
        """
        super().__init__(filename)

        self.files_to_keep = set(map(re.compile, {  # type: ignore
            r'^\[Content_Types\]\.xml$',
            r'^_rels/\.rels$',
            r'^word/_rels/document\.xml\.rels$',
            r'^word/_rels/footer[0-9]*\.xml\.rels$',
            r'^word/_rels/header[0-9]*\.xml\.rels$',

            # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
            r'^word/stylesWithEffects\.xml$',
        }))
        self.files_to_omit = set(map(re.compile, {  # type: ignore
            r'^customXml/',
            r'webSettings\.xml$',
            r'^docProps/custom\.xml$',
            r'^word/printerSettings/',
            r'^word/theme',

            # we have a whitelist in self.files_to_keep,
            # so we can trash everything else
            r'^word/_rels/',
        }))

        if self.__fill_files_to_keep_via_content_types() is False:
            raise ValueError("No usable [Content_Types].xml in %s" % self.filename)

    def __fill_files_to_keep_via_content_types(self) -> bool:
        """ There is a super-handy `[Content_Types].xml` file
        in MS Office archives, describing what each other file contains.
        The self.content_types_to_keep member contains a type whitelist,
        so we're using it to fill the self.files_to_keep one.

        Returns False if `[Content_Types].xml` is missing or can't be
        parsed, True otherwise.
        """
        with zipfile.ZipFile(self.filename) as zin:
            if '[Content_Types].xml' not in zin.namelist():
                return False
            xml_data = zin.read('[Content_Types].xml')

        self.content_types = dict()  # type: Dict[str, str]
        try:
            tree = ET.fromstring(xml_data)
        except ET.ParseError:
            return False

        for c in tree:
            # Entries lacking one of those attributes are ignored
            if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
                continue
            if c.attrib['ContentType'] in self.content_types_to_keep:
                fname = c.attrib['PartName'][1:]  # remove leading `/`
                re_fname = re.compile('^' + re.escape(fname) + '$')
                self.files_to_keep.add(re_fname)  # type: ignore
        return True

    @staticmethod
    def __remove_rsid(full_path: str) -> bool:
        """ The method will remove "revision session ID".  We're looking
        for '}rsid' instead of proper parsing, since rsid can have multiple
        forms, like `rsidRDefault`, `rsidR`, `rsids`, …

        We're removing rsid tags in two passes, because we can't modify
        the xml while we're iterating on it.

        Returns False if the file can't be parsed, True otherwise.

        For more details, see
        - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
        - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError:
            return False

        # rsid, tags or attributes, are always under the `w` namespace
        if 'w' not in namespace:
            return True

        parent_map = {c: p for p in tree.iter() for c in p}

        elements_to_remove = list()
        for item in tree.iterfind('.//', namespace):
            if '}rsid' in item.tag.strip().lower():  # rsid as tag
                elements_to_remove.append(item)
                continue
            # Iterating on a copy of the keys, since we're deleting some
            for key in list(item.attrib.keys()):  # rsid as attribute
                if '}rsid' in key.lower():
                    del item.attrib[key]

        for element in elements_to_remove:
            parent_map[element].remove(element)

        tree.write(full_path, xml_declaration=True)
        return True

    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ Remove the tracked revisions: deletions (`w:del`) are dropped
        along with their content, while insertions (`w:ins`) are replaced,
        in place, by their own children.

        In this function, we're changing the XML document in several
        different times, since we don't want to change the tree we're currently
        iterating on.

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # Revisions live under the `w` namespace: if it isn't declared,
        # there is nothing to remove, and searching for a `w:`-prefixed
        # tag would raise a SyntaxError.
        if 'w' not in namespace:
            return True

        # Revisions are either deletions (`w:del`) or
        # insertions (`w:ins`)
        deletions = tree.findall('.//w:del', namespace)
        if not deletions and tree.find('.//w:ins', namespace) is None:
            return True  # No revisions are present

        parent_map = {c: p for p in tree.iter() for c in p}

        for element in deletions:
            parent_map[element].remove(element)

        # Insertions are looked for only now, to skip the ones that were
        # inside a deletion. They are processed in reverse document order,
        # so that a nested `w:ins` is unwrapped before its ancestors, and
        # each wrapper is removed exactly once, whatever its number of
        # children is.
        insertions = tree.findall('.//w:ins', namespace)
        for element in reversed(insertions):
            parent = parent_map[element]
            position = list(parent).index(element)
            # Replace the `w:ins` wrapper with its children, at the very
            # same position inside of its parent.
            parent[position:position + 1] = list(element)

        tree.write(full_path, xml_declaration=True)
        return True

    def __remove_content_type_members(self, full_path: str) -> bool:
        """ The method will remove the dangling references
        from the [Content_Types].xml file, since MS office doesn't like them.

        A member is considered as removed when its name matches one of
        the `files_to_omit` regexps, and none of the `files_to_keep` ones.

        Returns False if the file can't be parsed or doesn't declare
        exactly one namespace, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError:  # pragma: no cover
            return False

        if len(namespace) != 1:
            return False  # there should be only one namespace for Types

        # This is usually the default namespace, but looking it up by
        # value avoids a KeyError should it be declared with a prefix.
        types_namespace = next(iter(namespace.values()))

        removed_fnames = set()
        with zipfile.ZipFile(self.filename) as zin:
            for fname in zin.namelist():
                if not any(r.search(fname) for r in self.files_to_omit):
                    continue
                if any(r.search(fname) for r in self.files_to_keep):
                    continue  # the file is whitelisted
                removed_fnames.add(fname)

        root = tree.getroot()
        for item in root.findall('{%s}Override' % types_namespace):
            # remove the leading '/'; an Override without PartName is kept
            name = item.attrib.get('PartName', '')[1:]
            if name in removed_fnames:
                root.remove(item)

        tree.write(full_path, xml_declaration=True)
        return True

    def _specific_cleanup(self, full_path: str) -> bool:
        """ Clean the file located at `full_path`.

        Empty files and files without the `.xml` extension are left
        untouched. Returns False if a cleaning step failed, True otherwise.
        """
        # pylint: disable=too-many-return-statements
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True

        if not full_path.endswith('.xml'):
            return True

        if full_path.endswith('/[Content_Types].xml'):
            # this file contains references to files that we might
            # remove, and MS Office doesn't like dangling references
            if self.__remove_content_type_members(full_path) is False:
                return False
        elif full_path.endswith('/word/document.xml'):
            # this file contains the revisions
            if self.__remove_revisions(full_path) is False:
                return False
        elif full_path.endswith('/docProps/app.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
            with open(full_path, 'wb') as f:
                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
                f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">')
                f.write(b'</Properties>')
        elif full_path.endswith('/docProps/core.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
            with open(full_path, 'wb') as f:
                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
                f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
                f.write(b'</cp:coreProperties>')

        if self.__remove_rsid(full_path) is False:
            return False

        try:
            _sort_xml_attributes(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # This is awful, I'm sorry.
        #
        # Microsoft Office isn't happy when we have the `mc:Ignorable`
        # tag containing namespaces that aren't present in the xml file,
        # so instead of trying to remove this specific tag with etree,
        # we're removing it, with a regexp.
        #
        # Since we're the ones producing this file, via the call to
        # _sort_xml_attributes, there won't be any "funny tricks".
        # Worst case, the tag isn't present, and everything is fine.
        #
        # see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
        with open(full_path, 'rb') as f:
            text = f.read()
        # `count` is passed by keyword, since passing it as a positional
        # argument is deprecated in recent versions of Python.
        out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, count=1)
        with open(full_path, 'wb') as f:
            f.write(out)

        return True

    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.

        Returns the metadata found in the `docProps/*.xml` members,
        along with the ones provided by `_get_zipinfo_meta` for every
        member of the archive.
        """
        metadata = {}  # type: Dict[str, str]
        # The context manager closes the archive even if something raises
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
                    try:
                        content = zipin.read(item).decode('utf-8')
                        results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
                        for (key, value) in results:
                            metadata[key] = value
                    except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
                        metadata[item.filename] = 'harmful content'
                metadata.update(self._get_zipinfo_meta(item))
        return metadata


class LibreOfficeParser(ArchiveBasedAbstractParser):
    """ Parser for the OpenDocument formats listed in `mimetypes`.

    The constructor fills `files_to_keep` and `files_to_omit` with compiled
    regexps matching archive member names, and `_specific_cleanup` scrubs
    a single extracted member, given its path.
    """
    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart',
        'application/vnd.oasis.opendocument.formula',
        'application/vnd.oasis.opendocument.image',
    }

    def __init__(self, filename):
        """ Set up the whitelist and blacklist of archive members. """
        super().__init__(filename)

        self.files_to_keep = set(map(re.compile, {  # type: ignore
            r'^META-INF/manifest\.xml$',
            r'^content\.xml$',
            r'^manifest\.rdf$',
            r'^mimetype$',
            r'^settings\.xml$',
            r'^styles\.xml$',
        }))
        self.files_to_omit = set(map(re.compile, {  # type: ignore
            r'^meta\.xml$',
            r'^Configurations2/',
            r'^Thumbnails/',
        }))

    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ Remove every `text:tracked-changes` element found under the
        `office:text` ones, and rewrite the file in place.

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # Both prefixes are needed to look for revisions: searching with
        # an undeclared one would raise a SyntaxError.
        if 'office' not in namespace or 'text' not in namespace:
            return True  # no revisions in the current file

        # `findall` returns lists, so that we're not modifying the tree
        # while iterating on it.
        for text in tree.getroot().findall('.//office:text', namespace):
            # The tracked changes aren't necessarily direct children of
            # `office:text`, so each one is removed from its real parent.
            parent_map = {c: p for p in text.iter() for c in p}
            for changes in text.findall('.//text:tracked-changes', namespace):
                parent_map[changes].remove(changes)

        tree.write(full_path, xml_declaration=True)
        return True

    def _specific_cleanup(self, full_path: str) -> bool:
        """ Clean the file located at `full_path`.

        Empty files and files without the `.xml` extension are left
        untouched. Returns False if the file can't be parsed, True otherwise.
        """
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True

        basename = os.path.basename(full_path)
        if not basename.endswith('.xml'):
            return True

        if basename == 'content.xml':
            if self.__remove_revisions(full_path) is False:
                return False

        try:
            _sort_xml_attributes(full_path)
        except ET.ParseError as e:
            logging.error("Unable to parse %s: %s", full_path, e)
            return False
        return True

    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.

        Returns the metadata found in the `meta.xml` member, along with
        the ones provided by `_get_zipinfo_meta` for every member of
        the archive.
        """
        metadata = {}  # type: Dict[str, str]
        # The context manager closes the archive even if something raises
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename == 'meta.xml':
                    try:
                        content = zipin.read(item).decode('utf-8')
                        results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
                        for (key, value) in results:
                            metadata[key] = value
                    except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
                        metadata[item.filename] = 'harmful content'
                metadata.update(self._get_zipinfo_meta(item))
        return metadata