1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
'''
Parent class of all parsers
'''
import hachoir_core
import hachoir_editor
import os
import mat
# Extensions of file formats that cannot carry harmful metadata.
NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels')
# bmp  : image
# rdf  : text
# txt  : plain text
# xml  : formatted text
# rels : openxml formatted text
# Sentinel returned by _should_remove(): marks a field that is itself a
# fieldset and must be walked recursively instead of removed outright.
FIELD = object()
class GenericParser(object):
    '''
    Parent class of all parsers.

    Subclasses must implement _should_remove(field); it returns True when
    the field is harmful, FIELD when the field is a sub-fieldset that must
    be inspected recursively, and a falsy value otherwise.
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        '''
        :param filename: path of the file to process
        :param parser: hachoir parser instance for the file
        :param mime: mimetype of the file
        :param backup: if False, the original file is securely removed and
            replaced by the cleaned copy (see do_backup())
        :param add2archive: accepted for interface compatibility with
            subclasses; not used here
        '''
        self.filename = ''
        self.parser = parser
        self.mime = mime
        self.backup = backup
        self.editor = hachoir_editor.createEditor(parser)
        self.realname = filename
        try:
            self.filename = hachoir_core.cmd_line.unicodeFilename(filename)
        except TypeError:  # get rid of "decoding Unicode is not supported"
            self.filename = filename
        basename, ext = os.path.splitext(filename)
        # cleaned copy is written next to the original: name.cleaned.ext
        self.output = basename + '.cleaned' + ext
        self.basename = os.path.basename(filename)  # only filename

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadata.

        Returns True when no field (recursively) is flagged by
        _should_remove(), False otherwise.
        '''
        # Fast path: if no top-level field is flagged at all, the file is
        # clean without any recursive walk.
        for field in self.editor:
            if self._should_remove(field):
                return self._is_clean(self.editor)
        return True

    def _is_clean(self, fieldset):
        # Recursively verify that no field of fieldset is harmful.
        for field in fieldset:
            remove = self._should_remove(field)
            if remove is True:
                return False
            if remove is FIELD:
                if not self._is_clean(field):
                    return False
        return True

    def remove_all(self):
        '''
        Remove all the fields that are compromising,
        then write the cleaned file and handle the backup policy.

        Returns True on success, False if any removal failed.
        '''
        state = self._remove_all(self.editor)
        hachoir_core.field.writeIntoFile(self.editor, self.output)
        self.do_backup()
        return state

    def _remove_all(self, fieldset):
        # Walk fieldset, deleting harmful fields and recursing into
        # sub-fieldsets. Returns False if anything went wrong.
        try:
            for field in fieldset:
                remove = self._should_remove(field)
                if remove is True:
                    self._remove(fieldset, field.name)
                if remove is FIELD:
                    self._remove_all(field)
            return True
        except Exception:  # hachoir may raise on malformed/odd fields
            return False

    def remove_all_strict(self):
        '''
        If the remove_all() is not efficient enough,
        this method is implemented :
        It is efficient, but destructive.
        In a perfect world, with nice fileformat,
        this method would not exist.
        '''
        self.remove_all()

    def _remove(self, fieldset, field):
        '''
        Delete the field named `field` from `fieldset`.
        '''
        del fieldset[field]

    def get_meta(self):
        '''
        Return a dict with all the meta of the file.
        '''
        metadata = {}
        self._get_meta(self.editor, metadata)
        return metadata

    def _get_meta(self, fieldset, metadata):
        # Collect {name: value} for every harmful field, recursing into
        # sub-fieldsets flagged with FIELD.
        for field in fieldset:
            remove = self._should_remove(field)
            if remove is True:
                try:
                    metadata[field.name] = field.value
                except Exception:  # value may be unreadable/unparseable
                    metadata[field.name] = 'harmful content'
            if remove is FIELD:
                # BUGFIX: the recursive call previously dropped the
                # `metadata` accumulator, raising TypeError at runtime.
                self._get_meta(field, metadata)

    def _should_remove(self, key):
        '''
        Return True if the field is compromising,
        FIELD if it must be inspected recursively.
        Abstract method.
        '''
        raise NotImplementedError

    def do_backup(self):
        '''
        If no backup was requested, securely remove the original file
        and move the cleaned copy in its place.
        (Despite the name, no copy is made here when backup is True:
        the original is simply left untouched next to the cleaned file.)
        '''
        if self.backup is False:
            mat.secure_remove(self.filename)
            os.rename(self.output, self.filename)
|