summaryrefslogtreecommitdiff
path: root/libmat/parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'libmat/parser.py')
-rw-r--r--libmat/parser.py135
1 file changed, 135 insertions, 0 deletions
diff --git a/libmat/parser.py b/libmat/parser.py
new file mode 100644
index 0000000..1765da8
--- /dev/null
+++ b/libmat/parser.py
@@ -0,0 +1,135 @@
''' Parent class of all parsers
'''
3
4import os
5import shutil
6import tempfile
7
8import hachoir_core
9import hachoir_editor
10
11import mat
12
# Extensions of file formats that cannot carry metadata:
# stripping them is always a no-op.
NOMETA = frozenset({
    '.bmp',   # "raw" image
    '.rdf',   # text
    '.txt',   # plain text
    '.xml',   # formatted text (XML)
    '.rels',  # openXML formatted text
})

# Sentinel value: _should_remove() returns it when harmful content may be
# nested *inside* a field instead of the field itself being harmful.
FIELD = object()
22
23
class GenericParser(object):
    ''' Parent class of all parsers.

    Drives the common lifecycle: open the file with a hachoir editor,
    walk its field tree, report or strip the fields flagged by the
    subclass' _should_remove(), and write the cleaned result back.
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        ''' :param filename: path of the file to process
            :param parser: hachoir parser opened on the file
            :param mime: mimetype of the file
            :param backup: if True, keep a ".bak" copy when cleaning
            :param is_writable: whether the file may be modified
        '''
        self.parser = parser
        self.mime = mime
        self.backup = backup
        self.is_writable = is_writable
        self.editor = hachoir_editor.createEditor(parser)
        try:
            self.filename = hachoir_core.cmd_line.unicodeFilename(filename)
        except TypeError:  # "decoding Unicode is not supported": already unicode
            self.filename = filename
        self.basename = os.path.basename(filename)
        # Temporary destination for the cleaned file. mkstemp() returns an
        # open OS-level fd: close it immediately (it was previously discarded,
        # leaking one file descriptor per parser instance); hachoir writes
        # to the file by name.
        fd, output = tempfile.mkstemp()
        os.close(fd)
        self.output = hachoir_core.cmd_line.unicodeFilename(output)

    def __del__(self):
        ''' Remove the tempfile if it was not moved into place.

        NOTE(review): relies on __del__ actually running; at interpreter
        shutdown the `os`/`mat` module globals may already be torn down.
        '''
        if os.path.exists(self.output):
            mat.secure_remove(self.output)

    def is_clean(self):
        ''' Return True if the file is free of harmful metadata.

        Delegates to the recursive check over the whole editor tree
        (the previous pre-scan loop was redundant: _is_clean() already
        returns True when no field is flagged).
        '''
        return self._is_clean(self.editor)

    def _is_clean(self, fieldset):
        ''' Recursively check every field of `fieldset` for harmful content.
        '''
        for field in fieldset:
            remove = self._should_remove(field)
            if remove is True:  # the field itself is harmful
                return False
            if remove is FIELD:  # harmful content may be nested deeper
                if not self._is_clean(field):
                    return False
        return True

    def remove_all(self):
        ''' Remove all compromising fields and write the cleaned file out.

        :return: True on success, False if the stripping failed
        '''
        state = self._remove_all(self.editor)
        hachoir_core.field.writeIntoFile(self.editor, self.output)
        self.do_backup()
        return state

    def _remove_all(self, fieldset):
        ''' Recursively strip harmful fields from `fieldset`.

        Best-effort by design: any failure while walking/deleting makes
        the whole pass report False instead of raising.
        '''
        try:
            for field in fieldset:
                remove = self._should_remove(field)
                if remove is True:
                    # NOTE(review): deleting while iterating relies on the
                    # hachoir editor tolerating in-place removal — confirm.
                    self._remove(fieldset, field.name)
                if remove is FIELD:
                    self._remove_all(field)
            return True
        except Exception:  # was a bare `except:`; keep the best-effort contract
            return False

    def _remove(self, fieldset, field):
        ''' Delete the field named `field` from `fieldset`.
        '''
        del fieldset[field]

    def get_meta(self):
        ''' Return a dict mapping field names to the (harmful) metadata
        found in the file.
        '''
        metadata = {}
        self._get_meta(self.editor, metadata)
        return metadata

    def _get_meta(self, fieldset, metadata):
        ''' Recursively collect harmful fields of `fieldset` into `metadata`.
        '''
        for field in fieldset:
            remove = self._should_remove(field)
            if remove:  # True or the FIELD sentinel
                try:
                    metadata[field.name] = field.value
                except Exception:  # reading .value may fail inside hachoir
                    metadata[field.name] = 'harmful content'
            if remove is FIELD:
                # BUG FIX: the recursion used to pass None instead of the
                # accumulator, crashing with a TypeError as soon as a nested
                # fieldset contained a removable field.
                self._get_meta(field, metadata)

    def _should_remove(self, key):
        ''' Return True if the field is compromising, the FIELD sentinel if
        harmful content may be nested inside it, False otherwise.

        Abstract method: every concrete parser must implement it.
        '''
        raise NotImplementedError

    def create_backup_copy(self):
        ''' Create a ".bak" copy of the file (copy2 preserves mtime/perms).
        '''
        shutil.copy2(self.filename, self.filename + '.bak')

    def do_backup(self):
        ''' Keep a backup of the original if asked, then move the cleaned
        tempfile into place.

        The process of double-renaming is not very elegant,
        but it greatly simplifies new strippers' implementation.
        '''
        if self.backup:
            shutil.move(self.filename, self.filename + '.bak')
        else:
            mat.secure_remove(self.filename)
        shutil.move(self.output, self.filename)