summaryrefslogtreecommitdiff
path: root/MAT/parser.py
blob: d6b7faf2c697a92cce8b35ea686407bfb31bb524 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
'''
    Parent class of all parsers
'''

import hachoir_core
import hachoir_editor

import os

import mat

# File extensions grouped under the name NOMETA; presumably formats treated
# as carrying no metadata -- the constant is not used in this module, so
# verify against its callers.
NOMETA = (
    '.bmp',   # image
    '.rdf',   # text
    '.txt',   # plain text
    '.xml',   # formatted text (XML)
    '.rels',  # openXML formatted text
)

# Sentinel that _should_remove() may return instead of True/False: it marks a
# field that must be walked recursively (compared with "is FIELD" below).
FIELD = object()


class GenericParser(object):
    '''
        Parent class of all parsers

        Subclasses must implement _should_remove(). The methods below
        treat its return value as follows:
          - True:  the field is harmful
          - FIELD: the field is a container whose own fields are
                   inspected recursively
          - any false value: the field is kept
    '''
    def __init__(self, filename, parser, mime, backup, **kwargs):
        '''
            filename: path of the file to process
            parser: object handed to hachoir_editor.createEditor()
            mime: stored unchanged in self.mime
            backup: if true, do_backup() leaves the original file alone
            kwargs: accepted for the subclasses, ignored here
        '''
        self.filename = ''
        self.parser = parser
        self.mime = mime
        self.backup = backup
        self.editor = hachoir_editor.createEditor(parser)
        self.realname = filename
        try:
            self.filename = hachoir_core.cmd_line.unicodeFilename(filename)
        except TypeError:  # get rid of "decoding Unicode is not supported"
            self.filename = filename
        # the cleaned copy is written next to the original:
        # "name.ext" gives "name.cleaned.ext"
        basename, ext = os.path.splitext(filename)
        self.output = basename + '.cleaned' + ext
        self.basename = os.path.basename(filename)  # only filename

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadata
            Return True if no field (at any depth) has to be removed
        '''
        # _is_clean() already returns True when nothing is flagged, so
        # no separate scan of the top-level fields is needed first.
        return self._is_clean(self.editor)

    def _is_clean(self, fieldset):
        '''
            Recursive way to check a tree of fields
            Return False as soon as a harmful field is found
        '''
        for field in fieldset:
            remove = self._should_remove(field)
            if remove is True:
                return False
            if remove is FIELD:
                if not self._is_clean(field):
                    return False
        return True

    def remove_all(self):
        '''
            Remove all compromising fields, write the result into
            self.output, then call do_backup()
            Return the state given by _remove_all(): True if every
            removal succeeded, False otherwise
        '''
        state = self._remove_all(self.editor)
        # NOTE: the output is written (and do_backup() is run) even if the
        # state is False; callers have to check the returned value.
        hachoir_core.field.writeIntoFile(self.editor, self.output)
        self.do_backup()
        return state

    def _remove_all(self, fieldset):
        '''
            Recursive way to handle tree metadata
            Return True if every harmful field of this fieldset and of
            its nested fieldsets was removed, False if something failed
        '''
        state = True
        try:
            for field in fieldset:
                remove = self._should_remove(field)
                if remove is True:
                    self._remove(fieldset, field.name)
                if remove is FIELD:
                    # a failure in a nested fieldset must be reported too,
                    # but the remaining fields are still processed
                    if not self._remove_all(field):
                        state = False
        except Exception:
            # best effort: report the failure instead of crashing, but
            # let KeyboardInterrupt/SystemExit go through
            return False
        return state

    def _remove(self, fieldset, field):
        '''
            Delete the field named "field" from the given fieldset
        '''
        del fieldset[field]

    def get_meta(self):
        '''
            Return a dict with all the meta of the file
            Keys are field names, values are the field values (or
            'harmful content' if the value can not be read)
        '''
        metadata = {}
        self._get_meta(self.editor, metadata)
        return metadata

    def _get_meta(self, fieldset, metadata):
        '''
            Recursive way to handle tree metadata
            Fill the "metadata" dict with every flagged field found in
            the fieldset and in its nested fieldsets
        '''
        for field in fieldset:
            remove = self._should_remove(field)
            if remove:
                try:
                    metadata[field.name] = field.value
                except Exception:
                    # the value can not be read: still list the field
                    metadata[field.name] = 'harmful content'
            if remove is FIELD:
                # keep filling the same dict; entries are keyed by the bare
                # field name, so fields sharing a name overwrite each other
                self._get_meta(field, metadata)

    def _should_remove(self, key):
        '''
            return True if the field is compromising,
            FIELD if it must be inspected recursively
            abstract method
        '''
        raise NotImplementedError

    def do_backup(self):
        '''
            If no backup was asked, remove the original file with
            mat.secure_remove() and rename the cleaned output to the
            original name
            If a backup was asked, nothing is done: the original file
            is kept and the cleaned copy stays in self.output
        '''
        if not self.backup:
            mat.secure_remove(self.filename)
            os.rename(self.output, self.filename)