summaryrefslogtreecommitdiff
path: root/libmat
diff options
context:
space:
mode:
authorjvoisin2014-06-08 13:39:18 +0200
committerjvoisin2014-06-08 13:39:18 +0200
commitaf36529554c39a2eefcc2c8723715e2d25b401b8 (patch)
treef54b964520bab44d1dfac725086211eaf22d3763 /libmat
parentef5a32cfd3c0555ffe5ddf413eeaae61622ebb4b (diff)
Rename the MAT folder to libmat.
This commit fixes some issues for dumb operating systems that don't handle capitalization.
Diffstat (limited to 'libmat')
-rw-r--r--libmat/__init__.py1
-rw-r--r--libmat/archive.py335
-rw-r--r--libmat/audio.py53
-rw-r--r--libmat/bencode/__init__.py1
-rw-r--r--libmat/bencode/bencode.py143
-rw-r--r--libmat/exceptions.py14
-rw-r--r--libmat/exiftool.py78
-rw-r--r--libmat/hachoir_editor/__init__.py8
-rw-r--r--libmat/hachoir_editor/field.py69
-rw-r--r--libmat/hachoir_editor/fieldset.py352
-rw-r--r--libmat/hachoir_editor/typed_field.py253
-rw-r--r--libmat/images.py52
-rw-r--r--libmat/mat.py186
-rw-r--r--libmat/misc.py76
-rw-r--r--libmat/mutagenstripper.py33
-rw-r--r--libmat/office.py191
-rw-r--r--libmat/parser.py135
-rw-r--r--libmat/strippers.py70
18 files changed, 2050 insertions, 0 deletions
diff --git a/libmat/__init__.py b/libmat/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/libmat/__init__.py
@@ -0,0 +1 @@
diff --git a/libmat/archive.py b/libmat/archive.py
new file mode 100644
index 0000000..d483dcc
--- /dev/null
+++ b/libmat/archive.py
@@ -0,0 +1,335 @@
1''' Take care of archives formats
2'''
3
4import datetime
5import logging
6import os
7import shutil
8import stat
9import tarfile
10import tempfile
11import zipfile
12
13import mat
14import parser
15
# Zip files do not support dates older than 01/01/1980
ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)
# Offset in seconds between the Unix epoch and the zip epoch; used with
# os.utime() to reset member mtimes before re-adding them to an archive.
# NOTE(review): the Unix epoch is written as 1970-01-01 01:00:00 here —
# presumably a UTC+1/localtime adjustment; confirm against os.utime usage.
ZIP_EPOCH_SECONDS = (datetime.datetime(1980, 1, 1, 0, 0, 0)
                     - datetime.datetime(1970, 1, 1, 1, 0, 0)).total_seconds()
20
21
class GenericArchiveStripper(parser.GenericParser):
    ''' Represent a generic archive.

    Members are extracted into a private temporary directory, which is
    securely wiped when the stripper is garbage-collected.
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(GenericArchiveStripper, self).__init__(filename,
            parser, mime, backup, is_writable, **kwargs)
        self.compression = ''  # tarfile mode suffix (':gz', ':bz2', ...); set by subclasses
        # Whether unsupported members should still be added to the cleaned archive
        self.add2archive = kwargs['add2archive']
        # Private workspace for extracted members
        self.tempdir = tempfile.mkdtemp()

    def __del__(self):
        ''' Remove the files inside the temp dir,
        then remove the temp dir
        '''
        for root, dirs, files in os.walk(self.tempdir):
            for item in files:
                path_file = os.path.join(root, item)
                # Extracted members may hold sensitive data: shred them
                # instead of a plain unlink
                mat.secure_remove(path_file)
        shutil.rmtree(self.tempdir)

    def is_clean(self, list_unsupported=False):
        ''' Virtual method to check for harmful metadata
        '''
        raise NotImplementedError

    def list_unsupported(self):
        ''' Get a list of every non-supported file present in the archive
        '''
        return self.is_clean(list_unsupported=True)

    def remove_all(self):
        ''' Virtual method to remove all metadata
        '''
        raise NotImplementedError
56
57
class ZipStripper(GenericArchiveStripper):
    ''' Represent a zip file
    '''
    def __is_zipfile_clean(self, fileinfo):
        ''' Check if a ZipInfo object is clean of metadata added
        by zip itself, independently of the corresponding file metadata
        '''
        if fileinfo.comment != '':
            return False
        elif fileinfo.date_time != ZIP_EPOCH:
            # Any timestamp other than the zip epoch leaks the member's mtime
            return False
        elif fileinfo.create_system != 3:  # 3 is UNIX
            return False
        return True

    def is_clean(self, list_unsupported=False):
        ''' Check if the given file is clean from harmful metadata.
        When list_unsupported is True, the method returns a list
        of all non-supported/archives files contained in the
        archive instead of a boolean.
        '''
        ret_list = []
        zipin = zipfile.ZipFile(self.filename, 'r')
        if zipin.comment != '' and not list_unsupported:
            logging.debug('%s has a comment' % self.filename)
            return False
        for item in zipin.infolist():
            # Extract every member so its own (per-format) metadata can be checked
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)
            if not self.__is_zipfile_clean(item) and not list_unsupported:
                logging.debug('%s from %s has compromising zipinfo' %
                    (item.filename, self.filename))
                return False
            if os.path.isfile(path):
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    if not cfile.is_clean():
                        logging.debug('%s from %s has metadata' % (item.filename, self.filename))
                        if not list_unsupported:
                            return False
                else:
                    logging.info('%s\'s fileformat is not supported or harmless.'
                        % item.filename)
                    basename, ext = os.path.splitext(path)
                    # 'mimetype' and '.rels' are presumably harmless container
                    # members (ODF/OOXML) — kept out of the unsupported list
                    if os.path.basename(item.filename) not in ('mimetype', '.rels'):
                        if ext not in parser.NOMETA:
                            if not list_unsupported:
                                return False
                            ret_list.append(item.filename)
        zipin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return all the metadata of a zip archive'''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        if zipin.comment != '':
            metadata['comment'] = zipin.comment
        for item in zipin.infolist():
            zipinfo_meta = self.__get_zipinfo_meta(item)
            if zipinfo_meta != {}:  # zipinfo metadata
                metadata[item.filename + "'s zipinfo"] = str(zipinfo_meta)
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(path):
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    cfile_meta = cfile.get_meta()
                    if cfile_meta != {}:
                        metadata[item.filename] = str(cfile_meta)
                else:
                    logging.info('%s\'s fileformat is not supported or harmless'
                        % item.filename)
        zipin.close()
        return metadata

    def __get_zipinfo_meta(self, zipinfo):
        ''' Return all the metadata of a ZipInfo
        '''
        metadata = {}
        if zipinfo.comment != '':
            metadata['comment'] = zipinfo.comment
        if zipinfo.date_time != ZIP_EPOCH:
            metadata['modified'] = zipinfo.date_time
        if zipinfo.create_system != 3:  # 3 is UNIX
            metadata['system'] = "windows" if zipinfo.create_system == 2 else "unknown"
        return metadata

    def remove_all(self, whitelist=[], beginning_blacklist=[], ending_blacklist=[]):
        ''' Remove all metadata from a zip archive, even those
        added by Python's zipfile itself. It will not add
        files starting with "beginning_blacklist", or ending with
        "ending_blacklist". This method also adds files present in
        whitelist to the archive.

        NOTE(review): the list defaults are mutable but never mutated
        here, so they are safe; None defaults would be more idiomatic.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.filename)

            beginning = any((True for f in beginning_blacklist if item.filename.startswith(f)))
            ending = any((True for f in ending_blacklist if item.filename.endswith(f)))

            if os.path.isfile(path) and not beginning and not ending:
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    # Handle read-only files inside archive
                    old_stat = os.stat(path).st_mode
                    os.chmod(path, old_stat|stat.S_IWUSR)
                    cfile.remove_all()
                    os.chmod(path, old_stat)
                    logging.debug('Processing %s from %s' % (item.filename, self.filename))
                elif item.filename not in whitelist:
                    logging.info('%s\'s format is not supported or harmless' % item.filename)
                    basename, ext = os.path.splitext(path)
                    # Skip unsupported members unless explicitly allowed
                    if not (self.add2archive or ext in parser.NOMETA):
                        continue
                # Reset the on-disk mtime to the zip epoch so the new
                # ZipInfo carries no real timestamp
                os.utime(path, (ZIP_EPOCH_SECONDS, ZIP_EPOCH_SECONDS))
                zipout.write(path, item.filename)
        zipin.close()
        zipout.close()

        logging.info('%s processed' % self.filename)
        self.do_backup()
        return True
186
187
class TarStripper(GenericArchiveStripper):
    ''' Represent a tarfile archive
    '''
    def _remove(self, current_file):
        ''' Remove the meta added by tarfile itself to the file.
        Used as the filter= callback of TarFile.add().
        '''
        current_file.mtime = 0
        current_file.uid = 0
        current_file.gid = 0
        current_file.uname = ''
        current_file.gname = ''
        return current_file

    def remove_all(self, whitelist=[]):
        ''' Remove all harmful metadata from the tarfile.
        The method will also add every file matching
        whitelist to the produced archive.

        NOTE(review): the whitelist default is mutable but never
        mutated here, so it is safe.
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression, encoding='utf-8')
        tarout = tarfile.open(self.output, 'w' + self.compression, encoding='utf-8')
        for item in tarin.getmembers():
            tarin.extract(item, self.tempdir)
            if item.isfile():
                path = os.path.join(self.tempdir, item.name)
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    # Handle read-only files inside archive
                    old_stat = os.stat(path).st_mode
                    os.chmod(path, old_stat|stat.S_IWUSR)
                    cfile.remove_all()
                    os.chmod(path, old_stat)
                elif self.add2archive or os.path.splitext(item.name)[1] in parser.NOMETA:
                    logging.debug('%s\' format is either not supported or harmless' % item.name)
                elif item.name in whitelist:
                    logging.debug('%s is not supported, but MAT was told to add it anyway.'
                        % item.name)
                else:  # Don't add the file to the archive
                    logging.debug('%s will not be added' % item.name)
                    continue
                # _remove() scrubs the new member's tarinfo on the way in
                tarout.add(path, item.name, filter=self._remove)
        tarin.close()
        tarout.close()
        self.do_backup()
        return True

    def is_file_clean(self, current_file):
        ''' Check metadata added by tarfile on a single TarInfo
        '''
        if current_file.mtime != 0:
            return False
        elif current_file.uid != 0:
            return False
        elif current_file.gid != 0:
            return False
        elif current_file.uname != '':
            return False
        elif current_file.gname != '':
            return False
        return True

    def is_clean(self, list_unsupported=False):
        ''' Check if the file is clean from harmful metadata.
        When list_unsupported is True, the method returns a list
        of all non-supported/archives files contained in the
        archive instead of a boolean.
        '''
        ret_list = []
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        for item in tarin.getmembers():
            if not self.is_file_clean(item) and not list_unsupported:
                logging.debug('%s from %s has compromising tarinfo' %
                    (item.name, self.filename))
                return False
            tarin.extract(item, self.tempdir)
            path = os.path.join(self.tempdir, item.name)
            if item.isfile():
                cfile = mat.create_class_file(path, False, add2archive=self.add2archive)
                if cfile is not None:
                    if not cfile.is_clean():
                        # NOTE(review): .decode("utf8") is inconsistent with the
                        # plain item.name used in every other log line here
                        logging.debug('%s from %s has metadata' %
                            (item.name.decode("utf8"), self.filename))
                        if not list_unsupported:
                            return False
                    # Nested archives are treated like unsupported files
                    elif isinstance(cfile, GenericArchiveStripper):
                        ret_list.append(item.name)
                else:
                    logging.error('%s\'s format is not supported or harmless' % item.name)
                    if os.path.splitext(path)[1] not in parser.NOMETA:
                        if not list_unsupported:
                            return False
                        ret_list.append(item.name)
        tarin.close()
        if list_unsupported:
            return ret_list
        return True

    def get_meta(self):
        ''' Return a dict with all the meta of the tarfile
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        metadata = {}
        for item in tarin.getmembers():
            current_meta = {}
            if item.isfile():
                tarin.extract(item, self.tempdir)
                path = os.path.join(self.tempdir, item.name)
                class_file = mat.create_class_file(path, False, add2archive=self.add2archive)
                if class_file is not None:
                    meta = class_file.get_meta()
                    if meta:
                        current_meta['file'] = str(meta)
                else:
                    logging.error('%s\'s format is not supported or harmless' % item.name)

            # NOTE(review): a member is only reported when its *tarinfo* is
            # dirty — per-file metadata gathered above is silently dropped
            # for members with a clean tarinfo; looks unintended, confirm.
            if not self.is_file_clean(item):  # if there is meta
                current_meta['mtime'] = item.mtime
                current_meta['uid'] = item.uid
                current_meta['gid'] = item.gid
                current_meta['uname'] = item.uname
                current_meta['gname'] = item.gname
                metadata[item.name] = str(current_meta)
        tarin.close()
        return metadata
312
313
class TerminalZipStripper(ZipStripper):
    ''' Represent a terminal level archive.
    This type of archive can not contain nested archives.
    It is used for formats like docx, which are basically
    zipped xml.
    '''
320
321
class GzipStripper(TarStripper):
    ''' Handle gzip-compressed tar archives (.tar.gz).
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        # Initialise as a plain tar stripper, then switch tarfile to
        # gzip-compressed mode.
        TarStripper.__init__(self, filename, parser, mime, backup, is_writable, **kwargs)
        self.compression = ':gz'
328
329
class Bzip2Stripper(TarStripper):
    ''' Handle bzip2-compressed tar archives (.tar.bz2).
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        # Initialise as a plain tar stripper, then switch tarfile to
        # bzip2-compressed mode.
        TarStripper.__init__(self, filename, parser, mime, backup, is_writable, **kwargs)
        self.compression = ':bz2'
diff --git a/libmat/audio.py b/libmat/audio.py
new file mode 100644
index 0000000..dae9d75
--- /dev/null
+++ b/libmat/audio.py
@@ -0,0 +1,53 @@
1''' Care about audio fileformat
2'''
3
4try:
5 from mutagen.flac import FLAC
6 from mutagen.oggvorbis import OggVorbis
7except ImportError:
8 pass
9
10import parser
11import mutagenstripper
12
13
class MpegAudioStripper(parser.GenericParser):
    ''' Represent mpeg audio file (mp3, ...)
    '''
    def _should_remove(self, field):
        # Only the id3 tag containers are considered harmful fields.
        return field.name == "id3v1" or field.name == "id3v2"
19
20
class OggStripper(mutagenstripper.MutagenStripper):
    ''' Represent an ogg vorbis file.
    '''
    def _create_mfile(self):
        # Let mutagen parse the vorbis stream.
        self.mfile = OggVorbis(self.filename)
26
27
class FlacStripper(mutagenstripper.MutagenStripper):
    ''' Represent a Flac audio file
    '''
    def _create_mfile(self):
        # Let mutagen parse the flac stream
        self.mfile = FLAC(self.filename)

    def remove_all(self):
        ''' Remove the "metadata" block from the file,
        including any embedded cover-art pictures
        '''
        super(FlacStripper, self).remove_all()
        self.mfile.clear_pictures()
        self.mfile.save()
        return True

    def is_clean(self):
        ''' Check if the "metadata" block is present in the file.
        Clean only if the parent check passes and no pictures are embedded.
        '''
        return super(FlacStripper, self).is_clean() and not self.mfile.pictures

    def get_meta(self):
        ''' Return the content of the metadata block if present
        '''
        metadata = super(FlacStripper, self).get_meta()
        if self.mfile.pictures:
            # Only flag the presence of cover art; the binary data itself
            # is not dumped
            metadata['picture:'] = 'yes'
        return metadata
diff --git a/libmat/bencode/__init__.py b/libmat/bencode/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/libmat/bencode/__init__.py
@@ -0,0 +1 @@
diff --git a/libmat/bencode/bencode.py b/libmat/bencode/bencode.py
new file mode 100644
index 0000000..a0cc99a
--- /dev/null
+++ b/libmat/bencode/bencode.py
@@ -0,0 +1,143 @@
1# Copyright 2007 by Petru Paler
2# Copyright 2011 by Julien (jvoisin) Voisin
3#
4# Permission is hereby granted, free of charge, to any person obtaining a copy
5# of this software and associated documentation files (the "Software"), to deal
6# in the Software without restriction, including without limitation the rights
7# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8# copies of the Software, and to permit persons to whom the Software is
9# furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice shall be included in
12# all copies or substantial portions of the Software.
13#
14# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19# FROM,
20# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21# THE SOFTWARE.
22#
23
24'''
25 A quick (and also nice) lib to bencode/bdecode torrent files
26'''
27
28
class BTFailure(Exception):
    '''Raised when a string cannot be bdecoded.'''
    pass
32
33
class Bencached(object):
    '''Wrapper marking a string as already bencoded.'''
    __slots__ = ['bencoded']

    def __init__(self, string):
        # Store the pre-encoded payload untouched.
        self.bencoded = string
40
41
def decode_int(x, f):
    '''Decode a bencoded integer ("i<digits>e") starting at offset f.

    Returns (value, offset just past the trailing "e").
    Raises ValueError for "-0" and for leading zeros (e.g. "i03e").
    '''
    f += 1
    newf = x.index('e', f)
    # Bug fix: the original compared the single character x[f:f+1] to the
    # two-character string '-0', which could never match, so "i-0e" was
    # silently accepted as 0.
    if x[f:f + 2] == '-0':
        raise ValueError
    elif x[f] == '0' and newf != f + 1:
        raise ValueError
    return int(x[f:newf]), newf + 1
51
52
def decode_string(x, f):
    '''Decode a bencoded string ("<length>:<bytes>") starting at offset f.

    Returns (string, offset just past the last byte).
    '''
    colon = x.index(':', f)
    if x[f] == '0' and colon != f + 1:
        # A leading zero is only valid for the empty string ("0:")
        raise ValueError
    length = int(x[f:colon])
    start = colon + 1
    return x[start:start + length], start + length
61
62
def decode_list(x, f):
    '''Decode a bencoded list ("l...e") starting at offset f.'''
    items = []
    f += 1
    # Decode elements one after another until the closing 'e'.
    while x[f] != 'e':
        value, f = DECODE_FUNC[x[f]](x, f)
        items.append(value)
    return items, f + 1
71
72
def decode_dict(x, f):
    '''Decode a bencoded dict ("d...e") starting at offset f.'''
    mapping = {}
    f += 1
    # Keys are always bencoded strings; values can be any bencoded type.
    while x[f] != 'e':
        key, f = decode_string(x, f)
        mapping[key], f = DECODE_FUNC[x[f]](x, f)
    return mapping, f + 1
81
82
def encode_bool(x, r):
    '''bencode the boolean *x* as the integer 1 or 0 into the list *r*.'''
    # Bug fix: the original tested *r* (the output buffer) instead of *x*,
    # so the encoded value depended on whether the buffer was non-empty,
    # not on the boolean being encoded.
    encode_int(1 if x else 0, r)


def encode_int(x, r):
    '''bencode an integer/float *x* into the output list *r*.'''
    r.extend(('i', str(x), 'e'))
91
92
def encode_list(x, r):
    '''bencode a list/tuple *x* into the output list *r*.'''
    r.append('l')
    # Idiom fix: a plain loop instead of the original list comprehension,
    # which was abused for its side effects and built a throwaway list.
    for item in x:
        ENCODE_FUNC[type(item)](item, r)
    r.append('e')
98
99
def encode_dict(x, result):
    '''bencode a dict *x* into the output list *result*, keys in sorted order.'''
    result.append('d')
    # bencoded dicts must list their keys sorted.
    for key, value in sorted(x.items()):
        result.extend((str(len(key)), ':', key))
        ENCODE_FUNC[type(value)](value, result)
    result.append('e')
109
110
# Dispatch table: first character of a bencoded token -> decoder.
# Bug fix: string length prefixes may start with any digit, so cover
# '0' through '9' — the original range(9) left '9' out, making inputs
# like '9:...' fail to decode.
DECODE_FUNC = {str(x): decode_string for x in range(10)}
DECODE_FUNC['l'] = decode_list
DECODE_FUNC['d'] = decode_dict
DECODE_FUNC['i'] = decode_int


# Dispatch table: python type -> encoder.
# (The duplicated 'ENCODE_FUNC[int] = encode_int' line — an int/long
# leftover from the 2to3 conversion — has been dropped.)
ENCODE_FUNC = {}
ENCODE_FUNC[Bencached] = lambda x, r: r.append(x.bencoded)
ENCODE_FUNC[int] = encode_int
ENCODE_FUNC[bytes] = lambda x, r: r.extend((str(len(x)), ':', x))
ENCODE_FUNC[list] = encode_list
ENCODE_FUNC[tuple] = encode_list
ENCODE_FUNC[dict] = encode_dict
ENCODE_FUNC[bool] = encode_bool
126
127
def bencode(string):
    '''bencode $string'''
    chunks = []
    # Dispatch on the value's type; encoders append fragments to chunks.
    ENCODE_FUNC[type(string)](string, chunks)
    return ''.join(chunks)
133
134
def bdecode(string):
    '''decode $string'''
    try:
        # Dispatch on the first character; decoders return (value, consumed).
        result, consumed = DECODE_FUNC[string[0]](string, 0)
    except (IndexError, KeyError, ValueError):
        raise BTFailure('Not a valid bencoded string')
    # Trailing garbage after a valid prefix is an error too.
    if consumed != len(string):
        raise BTFailure('Invalid bencoded value (data after valid prefix)')
    return result
diff --git a/libmat/exceptions.py b/libmat/exceptions.py
new file mode 100644
index 0000000..47da15c
--- /dev/null
+++ b/libmat/exceptions.py
@@ -0,0 +1,14 @@
1''' Base exceptions for MAT
2'''
3
4
class UnableToRemoveFile(Exception):
    '''Raised when a file could not be removed from disk.
    '''
    pass
9
class UnableToWriteFile(Exception):
    '''Raised when a file could not be made writable (chmod +w).
    '''
    pass
diff --git a/libmat/exiftool.py b/libmat/exiftool.py
new file mode 100644
index 0000000..9e38f04
--- /dev/null
+++ b/libmat/exiftool.py
@@ -0,0 +1,78 @@
1''' Care about images with help of the amazing (perl) library Exiftool.
2'''
3
4import parser
5import subprocess
6
7
class ExiftoolStripper(parser.GenericParser):
    ''' A generic stripper class using exiftool as backend
    '''

    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(ExiftoolStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        # Fields considered harmless: they describe the file itself,
        # not its author or history.
        self.allowed = set(['ExifTool Version Number', 'File Name', 'Directory',
            'File Size', 'File Modification Date/Time', 'File Access Date/Time', 'File Permissions',
            'File Type', 'MIME Type', 'Image Width', 'Image Height',
            'Image Size', 'File Inode Change Date/Time'])
        self._set_allowed()

    def _set_allowed(self):
        ''' Virtual method. Set the allowed/harmless list of metadata
        '''
        raise NotImplementedError

    def remove_all(self):
        ''' Remove all metadata with help of exiftool.
        Returns True on success, False on any failure.
        '''
        try:
            if self.backup:
                self.create_backup_copy()
            # Bug fix: '/dev/null' must be opened for *writing* since it
            # receives exiftool's stdout; the old read-mode handle was
            # also never closed.
            with open('/dev/null', 'w') as devnull:
                # Note: '-All=' must be followed by a known exiftool option.
                subprocess.call(['exiftool', '-m', '-all=',
                    '-adobe=', '-overwrite_original', self.filename],
                    stdout=devnull)
            return True
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
            return False

    def is_clean(self):
        ''' Check if the file is clean with the help of exiftool
        '''
        return not self.get_meta()

    def get_meta(self):
        ''' Return every harmful meta with help of exiftool.
        Exiftool output looks like this:
            field name : value
            field name : value
        '''
        output = subprocess.Popen(['exiftool', self.filename],
            stdout=subprocess.PIPE).communicate()[0]
        meta = {}
        for i in output.split('\n')[:-1]:  # chop last char ('\n')
            # Bug fix: split on the *first* colon only — values such as
            # dates ('2014:06:08 ...') contain colons themselves and were
            # previously truncated by split(':')[1].
            key, _, value = i.partition(':')
            key = key.strip()
            if key not in self.allowed:
                meta[key] = value.strip()
        return meta
58
59
class JpegStripper(ExiftoolStripper):
    ''' Strip jpeg files using the exiftool backend.
    '''
    def _set_allowed(self):
        # Structural jpeg properties carrying no private information.
        harmless = ['JFIF Version', 'Resolution Unit',
                    'X Resolution', 'Y Resolution', 'Encoding Process',
                    'Bits Per Sample', 'Color Components', 'Y Cb Cr Sub Sampling']
        self.allowed.update(harmless)
68
69
class PngStripper(ExiftoolStripper):
    ''' Strip png files using the exiftool backend.
    '''
    def _set_allowed(self):
        # Structural png properties carrying no private information.
        harmless = ['Bit Depth', 'Color Type',
                    'Compression', 'Filter', 'Interlace', 'Pixels Per Unit X',
                    'Pixels Per Unit Y', 'Pixel Units', 'Significant Bits',
                    'Background Color', 'SRGB Rendering']
        self.allowed.update(harmless)
diff --git a/libmat/hachoir_editor/__init__.py b/libmat/hachoir_editor/__init__.py
new file mode 100644
index 0000000..1835676
--- /dev/null
+++ b/libmat/hachoir_editor/__init__.py
@@ -0,0 +1,8 @@
1from field import (
2 EditorError, FakeField)
3from typed_field import (
4 EditableField, EditableBits, EditableBytes,
5 EditableInteger, EditableString,
6 createEditableField)
7from fieldset import EditableFieldSet, NewFieldSet, createEditor
8
diff --git a/libmat/hachoir_editor/field.py b/libmat/hachoir_editor/field.py
new file mode 100644
index 0000000..6b1efe3
--- /dev/null
+++ b/libmat/hachoir_editor/field.py
@@ -0,0 +1,69 @@
1from hachoir_core.error import HachoirError
2from hachoir_core.field import joinPath, MissingField
3
class EditorError(HachoirError):
    """Error raised by the hachoir editor layer (subclasses HachoirError)."""
    pass
6
class FakeField(object):
    """
    This class has an API similar to the Field API, but objects don't
    contain any value: all values are _computed_ by parent methods.

    Example: FakeField(editor, "abc").size calls editor._getFieldSize("abc").
    """
    is_field_set = False

    def __init__(self, parent, name):
        self._parent = parent
        self._name = name

    def _getPath(self):
        # Path is derived from the parent's path plus this field's name
        return joinPath(self._parent.path, self._name)
    path = property(_getPath)

    def _getName(self):
        return self._name
    name = property(_getName)

    def _getAddress(self):
        # Delegated to the parent: addresses shift when siblings are edited
        return self._parent._getFieldAddress(self._name)
    address = property(_getAddress)

    def _getSize(self):
        # Size of the original field in the input field set
        return self._parent.input[self._name].size
    size = property(_getSize)

    def _getValue(self):
        return self._parent.input[self._name].value
    value = property(_getValue)

    def createDisplay(self):
        # TODO: Returns new value if field is altered
        return self._parent.input[self._name].display
    display = property(createDisplay)

    def _getParent(self):
        return self._parent
    parent = property(_getParent)

    def hasValue(self):
        return self._parent.input[self._name].hasValue()

    def __getitem__(self, key):
        # TODO: Implement this function!
        raise MissingField(self, key)

    def _isAltered(self):
        # A fake field only mirrors the input: it is never altered itself
        return False
    is_altered = property(_isAltered)

    def writeInto(self, output):
        # Copy the field's original bits/bytes from the input stream
        # into *output*, bit by bit when not byte-aligned.
        size = self.size
        addr = self._parent._getFieldInputAddress(self._name)
        input = self._parent.input
        stream = input.stream
        if size % 8:
            output.copyBitsFrom(stream, addr, size, input.endian)
        else:
            output.copyBytesFrom(stream, addr, size//8)
69
diff --git a/libmat/hachoir_editor/fieldset.py b/libmat/hachoir_editor/fieldset.py
new file mode 100644
index 0000000..b7c9b07
--- /dev/null
+++ b/libmat/hachoir_editor/fieldset.py
@@ -0,0 +1,352 @@
1from hachoir_core.dict import UniqKeyError
2from hachoir_core.field import MissingField, Float32, Float64, FakeArray
3from hachoir_core.compatibility import any
4from hachoir_core.i18n import _
5from typed_field import createEditableField
6from field import EditorError
7from collections import deque # Python 2.4
8import weakref # Python 2.1
9import struct
10
class EditableFieldSet(object):
    """Editable wrapper around a (read-only) hachoir input field set.

    Edits are kept as overlays over the original input: a cache of
    editable fields (self._fields), a set of deleted names
    (self._deleted) and a mapping of inserted fields (self._inserted).
    Reads fall through to the input until something is altered.
    """
    MAX_SIZE = (1 << 40)  # Arbitrary limit to catch errors
    is_field_set = True

    def __init__(self, parent, fieldset):
        self._parent = parent
        self.input = fieldset  # original FieldSet
        self._fields = {}  # cache of editable fields
        self._deleted = set()  # Names of deleted fields
        self._inserted = {}  # Inserted field (name => list of field,
                             # where name is the name after)

    def array(self, key):
        # FIXME: Use cache?
        return FakeArray(self, key)

    def _getParent(self):
        return self._parent
    parent = property(_getParent)

    def _isAltered(self):
        # Altered as soon as anything was inserted, deleted, or any
        # cached editable field was itself modified.
        if self._inserted:
            return True
        if self._deleted:
            return True
        return any(field.is_altered for field in self._fields.itervalues())
    is_altered = property(_isAltered)

    def reset(self):
        """
        Reset the field set and the input field set.
        """
        # Bug fix: iterate over a snapshot of the keys. The original
        # deleted entries while iterating .iteritems() on the live dict,
        # which raises RuntimeError.
        for key in list(self._fields.keys()):
            if not self._fields[key].is_altered:
                del self._fields[key]
        self.input.reset()

    def __len__(self):
        # Original length, minus deletions, plus every insertion.
        return len(self.input) \
            - len(self._deleted) \
            + sum( len(new) for new in self._inserted.itervalues() )

    def __iter__(self):
        # Yield fields in file order, weaving inserted fields in before
        # the field they were registered against (None == at the end).
        for field in self.input:
            name = field.name
            if name in self._inserted:
                for newfield in self._inserted[name]:
                    yield weakref.proxy(newfield)
            if name not in self._deleted:
                yield self[name]
        if None in self._inserted:
            for newfield in self._inserted[None]:
                yield weakref.proxy(newfield)

    def insertBefore(self, name, *new_fields):
        self._insert(name, new_fields, False)

    def insertAfter(self, name, *new_fields):
        self._insert(name, new_fields, True)

    def insert(self, *new_fields):
        # Append at the very end of the field set
        self._insert(None, new_fields, True)

    def _insert(self, key, new_fields, next):
        """
        key is the name of the field before which new_fields
        will be inserted. If next is True, the fields will be inserted
        _after_ this field.
        """
        # Set unique field name
        for field in new_fields:
            if field._name.endswith("[]"):
                self.input.setUniqueFieldName(field)

        # Check that there is no duplicate in inserted fields
        new_names = list(field.name for field in new_fields)
        names_set = set(new_names)
        if len(names_set) != len(new_fields):
            duplicates = (name for name in names_set if 1 < new_names.count(name))
            raise UniqKeyError(_("Duplicates in inserted fields: %s") % ", ".join(duplicates))

        # Check that field names are not in input
        if self.input: # Write special version for NewFieldSet?
            for name in new_names:
                if name in self.input and name not in self._deleted:
                    raise UniqKeyError(_("Field name '%s' already exists") % name)

        # Check that field names are not in inserted fields
        for fields in self._inserted.itervalues():
            for field in fields:
                if field.name in new_names:
                    raise UniqKeyError(_("Field name '%s' already exists") % field.name)

        # Input have already inserted field?
        if key in self._inserted:
            if next:
                # NOTE(review): extend(reversed(...)) looks suspicious for
                # the 'after' case — confirm the intended ordering.
                self._inserted[key].extend( reversed(new_fields) )
            else:
                self._inserted[key].extendleft( reversed(new_fields) )
            return

        # Would we like to insert among already-inserted fields?
        if key:
            for fields in self._inserted.itervalues():
                names = [item.name for item in fields]
                try:
                    pos = names.index(key)
                except ValueError:
                    continue
                if 0 <= pos:
                    if next:
                        pos += 1
                    # Rotate the deque so the insertion point is at the
                    # left edge, splice, then rotate back.
                    fields.rotate(-pos)
                    fields.extendleft( reversed(new_fields) )
                    fields.rotate(pos)
                    return

        # Get next field. Use None if we are at the end.
        if next:
            index = self.input[key].index + 1
            try:
                key = self.input[index].name
            except IndexError:
                key = None

        # Check that field names are not in input
        if key not in self.input:
            raise MissingField(self, key)

        # Insert in original input
        self._inserted[key] = deque(new_fields)

    def _getDescription(self):
        return self.input.description
    description = property(_getDescription)

    def _getStream(self):
        # FIXME: This property is maybe a bad idea since address may be differents
        return self.input.stream
    stream = property(_getStream)

    def _getName(self):
        return self.input.name
    name = property(_getName)

    def _getEndian(self):
        return self.input.endian
    endian = property(_getEndian)

    def _getAddress(self):
        if self._parent:
            return self._parent._getFieldAddress(self.name)
        else:
            return 0
    address = property(_getAddress)

    def _getAbsoluteAddress(self):
        # Walk up the parent chain, accumulating relative addresses
        address = self.address
        current = self._parent
        while current:
            address += current.address
            current = current._parent
        return address
    absolute_address = property(_getAbsoluteAddress)

    def hasValue(self):
        return False
#        return self._parent.input[self.name].hasValue()

    def _getSize(self):
        # An altered set must be re-measured field by field
        if self.is_altered:
            return sum(field.size for field in self)
        else:
            return self.input.size
    size = property(_getSize)

    def _getPath(self):
        return self.input.path
    path = property(_getPath)

    def _getOriginalField(self, name):
        assert name in self.input
        return self.input[name]

    def _getFieldInputAddress(self, name):
        """
        Absolute address of a field from the input field set.
        """
        assert name in self.input
        return self.input[name].absolute_address

    def _getFieldAddress(self, name):
        """
        Compute relative address of a field. The operation takes care of
        deleted and resized fields.
        """
        #assert name not in self._deleted
        addr = 0
        for field in self:
            if field.name == name:
                return addr
            addr += field.size
        raise MissingField(self, name)

    def _getItemByPath(self, path):
        if not path[0]:
            path = path[1:]
        field = self
        for name in path:
            field = field[name]
        return field

    def __contains__(self, name):
        try:
            field = self[name]
            return (field is not None)
        except MissingField:
            return False

    def __getitem__(self, key):
        """
        Create a weak reference to an editable field (EditableField) for the
        field with specified name. If the field is removed later, using the
        editable field will raise a weakref.ReferenceError exception.

        May raise a MissingField error if the field doesn't exist in original
        field set or it has been deleted.
        """
        if "/" in key:
            return self._getItemByPath(key.split("/"))
        if isinstance(key, (int, long)):
            raise EditorError("Integer index are not supported")

        if (key in self._deleted) or (key not in self.input):
            raise MissingField(self, key)
        if key not in self._fields:
            field = self.input[key]
            if field.is_field_set:
                self._fields[key] = createEditableFieldSet(self, field)
            else:
                self._fields[key] = createEditableField(self, field)
        return weakref.proxy(self._fields[key])

    def __delitem__(self, name):
        """
        Remove a field from the field set. May raise an MissingField exception
        if the field has already been deleted.
        """
        # "a/b/c" recurses into nested field sets
        parts = name.partition('/')
        if parts[2]:
            fieldset = self[parts[0]]
            del fieldset[parts[2]]
            return
        if name in self._deleted:
            raise MissingField(self, name)
        self._deleted.add(name)
        if name in self._fields:
            del self._fields[name]

    def writeInto(self, output):
        """
        Write the content of this field set into the output stream
        (OutputStream).
        """
        if not self.is_altered:
            # Not altered: just copy bits/bytes
            input = self.input
            if input.size % 8:
                output.copyBitsFrom(input.stream,
                    input.absolute_address, input.size, input.endian)
            else:
                output.copyBytesFrom(input.stream,
                    input.absolute_address, input.size//8)
        else:
            # Altered: call writeInto() method of each field
            realaddr = 0
            for field in self:
                field.writeInto(output)
                realaddr += field.size

    def _getValue(self):
        raise EditorError('Field set "%s" has no value' % self.path)
    def _setValue(self, value):
        raise EditorError('Field set "%s" value is read only' % self.path)
    value = property(_getValue, _setValue, "Value of field")
296
class EditableFloat(EditableFieldSet):
    """A float field (Float32/Float64) whose value can be overwritten."""
    _value = None

    def _isAltered(self):
        # Altered exactly when a replacement value has been set.
        return self._value is not None
    is_altered = property(_isAltered)

    def writeInto(self, output):
        if self._value is None:
            # Untouched: fall back to copying the original bytes.
            EditableFieldSet.writeInto(self, output)
        else:
            self._write(output)

    def _write(self, output):
        # Serialize the replacement value with the input's struct format.
        raw = struct.pack(self.input.struct_format, self._value)
        output.writeBytes(raw)

    def _setValue(self, value):
        self.parent._is_altered = True
        self._value = value
    value = property(EditableFieldSet._getValue, _setValue)
319
def createEditableFieldSet(parent, field):
    """Return the editable wrapper matching *field*'s exact class:
    EditableFloat for Float32/Float64, EditableFieldSet otherwise.
    """
    # FIXME: Support Float80
    if field.__class__ in (Float32, Float64):
        return EditableFloat(parent, field)
    return EditableFieldSet(parent, field)
327
class NewFieldSet(EditableFieldSet):
    """Field set created from scratch: it has no input field set,
    only inserted fields.
    """
    def __init__(self, parent, name):
        EditableFieldSet.__init__(self, parent, None)
        self._name = name
        self._endian = parent.endian

    def __iter__(self):
        # Bug fix: the original raised StopIteration() directly from
        # __iter__, which is *not* caught by for-loops (only by next())
        # and would propagate as an error for an empty field set.
        # Return an empty iterator instead.
        if None in self._inserted:
            return iter(self._inserted[None])
        return iter(())

    def _getName(self):
        return self._name
    name = property(_getName)

    def _getEndian(self):
        return self._endian
    endian = property(_getEndian)

    # A brand new field set is always considered altered.
    is_altered = property(lambda self: True)
349
def createEditor(fieldset):
    """Create the top-level editor for *fieldset*: a parentless EditableFieldSet."""
    return EditableFieldSet(None, fieldset)
352
diff --git a/libmat/hachoir_editor/typed_field.py b/libmat/hachoir_editor/typed_field.py
new file mode 100644
index 0000000..0f0427b
--- /dev/null
+++ b/libmat/hachoir_editor/typed_field.py
@@ -0,0 +1,253 @@
1from hachoir_core.field import (
2 RawBits, Bit, Bits, PaddingBits,
3 RawBytes, Bytes, PaddingBytes,
4 GenericString, Character,
5 isInteger, isString)
6from field import FakeField
7
class EditableField(FakeField):
    """
    Pure virtual class used to write editable field class.

    Subclasses must implement _computeSize() and _write(); they may
    override _getValue()/_setValue().  While no value has been assigned
    through the .value property, every accessor falls back to the
    original parsed field via FakeField.
    """

    # Flipped to True the first time a value is assigned through .value
    _is_altered = False

    def __init__(self, parent, name, value=None):
        FakeField.__init__(self, parent, name)
        self._value = value

    def _isAltered(self):
        return self._is_altered
    is_altered = property(_isAltered)

    def hasValue(self):
        return True

    def _computeSize(self):
        # Size (in bits) of the replacement value; subclass responsibility.
        raise NotImplementedError()

    def _getValue(self):
        return self._value

    def _setValue(self, value):
        self._value = value

    def _propGetValue(self):
        # Prefer the replacement value; fall back to the original field.
        if self._value is not None:
            return self._getValue()
        else:
            return FakeField._getValue(self)

    def _propSetValue(self, value):
        self._setValue(value)
        self._is_altered = True
    value = property(_propGetValue, _propSetValue)

    def _getSize(self):
        # Same override-or-fallback rule as for the value.
        if self._value is not None:
            return self._computeSize()
        else:
            return FakeField._getSize(self)
    size = property(_getSize)

    def _write(self, output):
        # Serialize the replacement value; subclass responsibility.
        raise NotImplementedError()

    def writeInto(self, output):
        # Write the replacement if any, else copy the original bytes.
        if self._is_altered:
            self._write(output)
        else:
            return FakeField.writeInto(self, output)
57
class EditableFixedField(EditableField):
    """
    Editable field whose size is constant, known at construction time.
    """

    def __init__(self, parent, name, value=None, size=None):
        EditableField.__init__(self, parent, name, value)
        if size is None:
            # Inherit the size of the original field we are shadowing.
            size = self._parent._getOriginalField(self._name).size
        self._size = size

    # The size never changes, even when a new value is assigned.
    size = property(lambda self: self._size)
73
class EditableBits(EditableFixedField):
    """ Editable unsigned bit field of fixed width.

    Constructor: EditableBits(parent, name[, size, value]) — the size
    and value arguments must be supplied together.
    """
    def __init__(self, parent, name, *args):
        if args:
            if len(args) != 2:
                raise TypeError(
                    "Wrong argument count, EditableBits constructor prototype is: "
                    "(parent, name, [size, value])")
            size = args[0]
            value = args[1]
            # NOTE: 'long' is Python 2 only
            assert isinstance(value, (int, long))
        else:
            size = None
            value = None
        EditableFixedField.__init__(self, parent, name, value, size)
        if args:
            # Validate the supplied value and mark the field dirty.
            self._setValue(args[1])
            self._is_altered = True

    def _setValue(self, value):
        # The value must fit unsigned in the declared bit width.
        if not(0 <= value < (1 << self._size)):
            raise ValueError("Invalid value, must be in range %s..%s"
                % (0, (1 << self._size) - 1))
        self._value = value

    def _write(self, output):
        output.writeBits(self._size, self._value, self._parent.endian)
100
class EditableBytes(EditableField):
    """ Editable raw byte string of variable length. """

    def _setValue(self, value):
        # An empty byte string would make the field vanish: forbid it.
        if not value:
            raise ValueError(
                "Unable to set empty string to a EditableBytes field")
        self._value = value

    def _computeSize(self):
        # Sizes are expressed in bits.
        return 8 * len(self._value)

    def _write(self, output):
        output.writeBytes(self._value)
112
class EditableString(EditableField):
    """ Editable character string (supports the hachoir string formats).

    Constructor: EditableString(parent, name[, format, value], charset=...)
    — the format and value arguments must be supplied together.
    """
    # Maximum payload length encodable by each Pascal length prefix.
    MAX_SIZE = {
        "Pascal8": (1 << 8)-1,
        "Pascal16": (1 << 16)-1,
        "Pascal32": (1 << 32)-1,
    }

    def __init__(self, parent, name, *args, **kw):
        if len(args) == 2:
            value = args[1]
            assert isinstance(value, str) # TODO: support Unicode
        elif not args:
            value = None
        else:
            raise TypeError(
                "Wrong argument count, EditableString constructor prototype is:"
                "(parent, name, [format, value])")
        EditableField.__init__(self, parent, name, value)
        if len(args) == 2:
            # New value supplied: derive serialization parameters from the
            # requested format rather than from the original field.
            self._charset = kw.get('charset', None)
            self._format = args[0]
            if self._format in GenericString.PASCAL_FORMATS:
                self._prefix_size = GenericString.PASCAL_FORMATS[self._format]
            else:
                self._prefix_size = 0
            self._suffix_str = GenericString.staticSuffixStr(
                self._format, self._charset, self._parent.endian)
            self._is_altered = True
        else:
            # Mirror the characteristics of the original parsed string.
            orig = self._parent._getOriginalField(name)
            self._charset = orig.charset
            self._format = orig.format
            self._prefix_size = orig.content_offset
            self._suffix_str = orig.suffix_str

    def _setValue(self, value):
        size = len(value)
        # Pascal strings cannot exceed what their length prefix can encode.
        if self._format in self.MAX_SIZE and self.MAX_SIZE[self._format] < size:
            raise ValueError("String is too big")
        self._value = value

    def _computeSize(self):
        # Size in bits: length prefix + payload + terminator/suffix.
        return (self._prefix_size + len(self._value) + len(self._suffix_str))*8

    def _write(self, output):
        if self._format in GenericString.SUFFIX_FORMAT:
            # Suffix-terminated string (e.g. C string).
            output.writeBytes(self._value)
            output.writeBytes(self._suffix_str)
        elif self._format == "fixed":
            # Fixed-size string: payload only.
            output.writeBytes(self._value)
        else:
            # Pascal string: length prefix then payload.
            assert self._format in GenericString.PASCAL_FORMATS
            size = GenericString.PASCAL_FORMATS[self._format]
            output.writeInteger(len(self._value), False, size, self._parent.endian)
            output.writeBytes(self._value)
168
class EditableCharacter(EditableFixedField):
    """ Editable single-byte character field.

    Constructor: EditableCharacter(parent, name[, value]).
    """
    def __init__(self, parent, name, *args):
        if args:
            # Fix: the arity check used to be `len(args) != 3`, which made
            # it impossible to pass the single optional [value] argument
            # announced by the error message below.
            if len(args) != 1:
                raise TypeError(
                    "Wrong argument count, EditableCharacter "
                    "constructor prototype is: (parent, name, [value])")
            value = args[0]
            if not isinstance(value, str) or len(value) != 1:
                raise TypeError("EditableCharacter needs a character")
        else:
            value = None
        # A character is always exactly 8 bits.
        EditableFixedField.__init__(self, parent, name, value, 8)
        if args:
            self._is_altered = True

    def _setValue(self, value):
        if not isinstance(value, str) or len(value) != 1:
            raise TypeError("EditableCharacter needs a character")
        self._value = value

    def _write(self, output):
        output.writeBytes(self._value)
192
class EditableInteger(EditableFixedField):
    """ Editable integer field, signed or unsigned, 8/16/32 bits wide.

    Constructor: EditableInteger(parent, name[, signed, size, value]) —
    the three optional arguments must be supplied together.
    """
    # Inclusive (min, max) bounds per bit width.
    # Fix: the signed 8-bit entry used to be (-(1 << 8), (1 << 8)-1),
    # i.e. -256..255; a signed byte holds -128..127.
    VALID_VALUE_SIGNED = {
        8: (-(1 << 7), (1 << 7)-1),
        16: (-(1 << 15), (1 << 15)-1),
        32: (-(1 << 31), (1 << 31)-1),
    }
    VALID_VALUE_UNSIGNED = {
        8: (0, (1 << 8)-1),
        16: (0, (1 << 16)-1),
        32: (0, (1 << 32)-1)
    }

    def __init__(self, parent, name, *args):
        if args:
            if len(args) != 3:
                raise TypeError(
                    "Wrong argument count, EditableInteger constructor prototype is: "
                    "(parent, name, [signed, size, value])")
            size = args[1]
            value = args[2]
            # NOTE: 'long' is Python 2 only
            assert isinstance(value, (int, long))
        else:
            size = None
            value = None
        EditableFixedField.__init__(self, parent, name, value, size)
        if args:
            self._signed = args[0]
            self._is_altered = True
        else:
            # Inherit signedness from the original parsed field.
            self._signed = self._parent._getOriginalField(self._name).signed

    def _setValue(self, value):
        ''' Store *value* after range validation; raise ValueError when
        it does not fit in the field's width/signedness. '''
        if self._signed:
            valid = self.VALID_VALUE_SIGNED
        else:
            valid = self.VALID_VALUE_UNSIGNED
        minval, maxval = valid[self._size]
        if not(minval <= value <= maxval):
            raise ValueError("Invalid value, must be in range %s..%s"
                % (minval, maxval))
        self._value = value

    def _write(self, output):
        output.writeInteger(
            self.value, self._signed, self._size//8, self._parent.endian)
238
def createEditableField(fieldset, field):
    """ Pick and instantiate the editable wrapper matching *field*'s type;
    unknown types fall back to the read-only FakeField. """
    klass = field.__class__
    if isInteger(field):
        editable = EditableInteger
    elif isString(field):
        editable = EditableString
    elif klass in (RawBytes, Bytes, PaddingBytes):
        editable = EditableBytes
    elif klass in (RawBits, Bits, Bit, PaddingBits):
        editable = EditableBits
    elif klass is Character:
        editable = EditableCharacter
    else:
        editable = FakeField
    return editable(fieldset, field.name)
253
diff --git a/libmat/images.py b/libmat/images.py
new file mode 100644
index 0000000..67c710f
--- /dev/null
+++ b/libmat/images.py
@@ -0,0 +1,52 @@
1''' Takes care about pictures formats
2
3References:
4 - JFIF: http://www.ecma-international.org/publications/techreports/E-TR-098.htm
5 - PNG: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
6 - PNG: http://www.w3.org/TR/PNG-Chunks.html
7'''
8
9import parser
10
11
class JpegStripper(parser.GenericParser):
    ''' Represents a jpeg file.

    Structural fields, custom Quantization tables and custom Huffman
    tables are treated as non-compromising and preserved; everything
    else is stripped.  NOTE(review): the original docstring claimed the
    custom tables were stripped, but the code below keeps them.
    '''
    # Structural fields that must survive for the image to stay valid.
    _KEEP = frozenset((
        'start_image',  # start of the image
        'app0',         # JFIF data
        'start_frame',  # specify width, height, number of components
        'start_scan',   # specify which slice of data the top-to-bottom scan contains
        'data',         # actual data
        'end_image',    # end of the image
    ))

    def _should_remove(self, field):
        ''' Return True if the field is compromising
        '''
        name = field.name
        if name in self._KEEP:
            return False
        # custom Quantization and Huffman tables are kept as well
        if name.startswith(('quantization[', 'huffman[')):
            return False
        return True
35
36
class PngStripper(parser.GenericParser):
    ''' Represents a png file: structural chunks and pixel data are
    preserved, every other chunk is considered compromising.
    '''
    # Chunks that are structural and must be preserved.
    _KEEP = frozenset((
        'id',
        'header',    # PNG header
        'physical',  # the intended pixel size or aspect ratio
        'end',       # end of the image
    ))

    def _should_remove(self, field):
        ''' Return True if the field is compromising
        '''
        name = field.name
        if name in self._KEEP:
            return False
        if name.startswith('data['):  # pixel data
            return False
        return True
diff --git a/libmat/mat.py b/libmat/mat.py
new file mode 100644
index 0000000..8dfc2dc
--- /dev/null
+++ b/libmat/mat.py
@@ -0,0 +1,186 @@
1#!/usr/bin/env python
2
3''' Metadata anonymisation toolkit library
4'''
5
6import logging
7import mimetypes
8import os
9import subprocess
10import xml.sax
11
12import hachoir_core.cmd_line
13import hachoir_parser
14
15import libmat.exceptions
16
__version__ = '0.5.2'
__author__ = 'jvoisin'

# Silence: only log critical problems, and keep hachoir quiet too.
LOGGING_LEVEL = logging.CRITICAL
hachoir_core.config.quiet = True
# NOTE(review): basicConfig ignores an empty 'filename' and logs to
# stderr instead — presumably the intended default; verify.
fname = ''

# Verbose mode (uncomment to debug):
#LOGGING_LEVEL = logging.DEBUG
#hachoir_core.config.quiet = False
#logname = 'report.log'

logging.basicConfig(filename=fname, level=LOGGING_LEVEL)

import strippers  # this is loaded here because we need LOGGING_LEVEL
33
34
def get_logo():
    ''' Return the path to the logo, or None when no candidate exists.
    Candidates are probed in order: local data dir first, then the
    system-wide pixmap locations.
    '''
    candidates = ('./data/mat.png',
                  '/usr/share/pixmaps/mat.png',
                  '/usr/local/share/pixmaps/mat.png')
    for path in candidates:
        if os.path.isfile(path):
            return path
44
45
def get_datadir():
    ''' Return the path to the data directory, or None when none of the
    known locations exists.  The local directory wins over system-wide
    installs.
    '''
    for path in ('./data/', '/usr/local/share/mat/', '/usr/share/mat/'):
        if os.path.isdir(path):
            return path
55
56
def list_supported_formats():
    ''' Return a list of all locally supported fileformat.
    It parses that FORMATS file, and removes locally
    non-supported formats.
    '''
    handler = XMLParser()
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(handler)
    formats_path = os.path.join(get_datadir(), 'FORMATS')
    with open(formats_path, 'r') as xmlfile:
        sax_parser.parse(xmlfile)

    # keep only the formats for which a stripper is actually available
    return [fmt for fmt in handler.list
            if fmt['mimetype'].split(',')[0] in strippers.STRIPPERS]
75
76
class XMLParser(xml.sax.handler.ContentHandler):
    ''' Parse the supported-formats XML and build a corresponding
    list of dicts, one per <format> element.
    '''
    def __init__(self):
        self.dict = {}        # fields of the <format> currently parsed
        self.list = []        # completed format descriptions
        self.content = ''
        self.key = ''
        self.between = False  # currently inside an element's text?

    def startElement(self, name, attrs):
        ''' Called when entering into xml tag
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        ''' Called when exiting a xml tag
        '''
        if name == 'format':
            # leaving a fileformat section: archive the collected fields
            self.list.append(dict(self.dict))
            self.dict.clear()
        else:
            # replace literal backslash-s sequences by plain spaces
            self.dict[self.key] = self.content.replace('\\s', ' ')
        self.between = False

    def characters(self, characters):
        ''' Concatenate the content between opening and closing tags
        '''
        if self.between:
            self.content += characters
110
111
def secure_remove(filename):
    ''' Securely remove the file.

    Make the file writable first, prefer shred(1) for multi-pass
    overwriting, and fall back to a plain unlink when shred is missing
    or fails.  Raise libmat.exceptions.UnableToWriteFile /
    UnableToRemoveFile on unrecoverable failures; return True otherwise.
    '''
    # I want the file removed, even if it's read-only.
    # Fix: the previous code passed the *decimal* literal 220 (mode
    # 0o334); use a proper octal owner read/write mode instead.
    try:
        os.chmod(filename, 0o600)
    except OSError:
        logging.error('Unable to add write rights to %s' % filename)
        raise libmat.exceptions.UnableToWriteFile

    # Best effort: shred overwrites the content before unlinking.
    try:
        if not subprocess.call(['shred', '--remove', filename]):
            return True
        else:
            raise OSError
    except OSError:
        logging.error('Unable to securely remove %s' % filename)

    # Fallback: plain (non-secure) removal.
    try:
        os.remove(filename)
    except OSError:
        logging.error('Unable to remove %s' % filename)
        raise libmat.exceptions.UnableToRemoveFile

    return True
137
138
def create_class_file(name, backup, **kwargs):
    ''' Return a $FILETYPEStripper() instance
    corresponding to the filetype of the given file.

    name   -- path of the file to process
    backup -- keep a backup copy when True
    Returns None (after logging why) when the file does not exist, is
    unreadable, is empty, cannot be parsed, or has no known stripper.
    '''
    if not os.path.isfile(name):  # check if the file exists
        logging.error('%s is not a valid file' % name)
        return None

    if not os.access(name, os.R_OK):  # check read permissions
        # Fix: message used to read '%s is is not readable'
        logging.error('%s is not readable' % name)
        return None

    if not os.path.getsize(name):
        # check if the file is not empty (hachoir crash on empty files)
        logging.error('%s is empty' % name)
        return None

    try:
        filename = hachoir_core.cmd_line.unicodeFilename(name)
    except TypeError:  # get rid of "decoding Unicode is not supported"
        filename = name

    parser = hachoir_parser.createParser(filename)
    if not parser:
        logging.info('Unable to parse %s' % filename)
        return None

    mime = parser.mime_type

    if mime == 'application/zip':  # some formats are zipped stuff
        # hoisted: guess_type() used to be called twice
        guessed = mimetypes.guess_type(name)[0]
        if guessed:
            mime = guessed

    if mime.startswith('application/vnd.oasis.opendocument'):
        mime = 'application/opendocument'  # opendocument fileformat
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/officeopenxml'  # office openxml

    is_writable = os.access(name, os.W_OK)

    try:
        stripper_class = strippers.STRIPPERS[mime]
    except KeyError:
        logging.info('Don\'t have stripper for %s format' % mime)
        return None

    return stripper_class(filename, parser, mime, backup, is_writable, **kwargs)
diff --git a/libmat/misc.py b/libmat/misc.py
new file mode 100644
index 0000000..450f381
--- /dev/null
+++ b/libmat/misc.py
@@ -0,0 +1,76 @@
1''' Care about misc formats
2'''
3
4import parser
5
6from bencode import bencode
7
8
class TorrentStripper(parser.GenericParser):
    ''' Represent a torrent file with the help
    of the bencode lib from Petru Paler
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(TorrentStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        # Whitelist of structural (non-compromising) torrent keys.
        self.fields = frozenset(['announce', 'info', 'name', 'path', 'piece length', 'pieces',
                                 'length', 'files', 'announce-list', 'nodes', 'httpseeds', 'private', 'root hash'])

    def __get_key_recursively(self, dictionary):
        ''' Get recursively all keys from a dict and its subdicts.

        Fix: the previous implementation returned from inside the loop,
        so only the first item of each dict was ever inspected and
        is_clean() could report a dirty file as clean.
        '''
        keys = set()
        for key, value in dictionary.items():
            keys.add(key)
            if isinstance(value, dict):
                keys |= self.__get_key_recursively(value)
        return keys

    def is_clean(self):
        ''' Check if the file is clean from harmful metadata
        '''
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        return self.fields.issuperset(self.__get_key_recursively(decoded))

    def __get_meta_recursively(self, dictionary):
        ''' Get recursively all harmful metadata
        '''
        d = dict()
        for key, value in dictionary.items():
            if key not in self.fields:
                d[key] = value
            elif isinstance(value, dict):
                # Fix: merge with update() — dict views cannot be
                # concatenated with '+' on Python 3.
                d.update(self.__get_meta_recursively(value))
        return d

    def get_meta(self):
        ''' Return a dict with all the meta of the file
        '''
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        return self.__get_meta_recursively(decoded)

    def __remove_all_recursively(self, dictionary):
        ''' Remove recursively all compromising fields.

        NOTE(review): this helper is not used by remove_all() below.
        The original filtered (key, value) *tuples* against self.fields
        (never true, so it always returned {}); rewritten to filter on
        the keys themselves.
        '''
        d = dict()
        for key, value in dictionary.items():
            if key not in self.fields:
                continue
            if isinstance(value, dict):
                d.update(self.__get_meta_recursively(value))
            else:
                d[key] = value
        return d

    def remove_all(self):
        ''' Remove all compromising fields
        '''
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())

        # keep only the whitelisted structural fields
        cleaned = {i: j for i, j in decoded.items() if i in self.fields}

        with open(self.output, 'w') as f:  # encode the cleaned torrent
            f.write(bencode.bencode(cleaned))  # and write it in self.output

        self.do_backup()
        return True
diff --git a/libmat/mutagenstripper.py b/libmat/mutagenstripper.py
new file mode 100644
index 0000000..403c9a7
--- /dev/null
+++ b/libmat/mutagenstripper.py
@@ -0,0 +1,33 @@
1''' Take care of mutagen-supported formats (audio)
2'''
3
4import parser
5
6
class MutagenStripper(parser.GenericParser):
    ''' Parent class of the mutagen-based audio strippers: subclasses
    only have to provide _create_mfile().
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(MutagenStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        self._create_mfile()

    def _create_mfile(self):
        ''' Instantiate self.mfile — subclass responsibility. '''
        raise NotImplementedError

    def is_clean(self):
        ''' The file is clean when it carries no tags at all. '''
        return not self.mfile.tags

    def remove_all(self):
        ''' Drop every tag and persist the stripped file. '''
        if self.backup:
            self.create_backup_copy()
        self.mfile.delete()
        self.mfile.save()
        return True

    def get_meta(self):
        '''
        Return the content of the metadata block, if present, as a dict.
        '''
        if not self.mfile.tags:
            return {}
        return {key: value for key, value in self.mfile.tags}
diff --git a/libmat/office.py b/libmat/office.py
new file mode 100644
index 0000000..0ca1ff1
--- /dev/null
+++ b/libmat/office.py
@@ -0,0 +1,191 @@
1''' Care about office's formats
2
3'''
4
5import logging
6import os
7import shutil
8import tempfile
9import xml.dom.minidom as minidom
10import zipfile
11
12try:
13 import cairo
14 from gi.repository import Poppler
15except ImportError:
16 logging.info('office.py loaded without PDF support')
17 pass
18
19import parser
20import archive
21
22
class OpenDocumentStripper(archive.TerminalZipStripper):
    ''' An open document file is a zip, with xml file into.
    The one that interest us is meta.xml
    '''

    def get_meta(self):
        ''' Return a dict with all the meta of the file by
        trying to read the meta.xml file.
        '''
        metadata = super(OpenDocumentStripper, self).get_meta()
        # 'with' guarantees the archive is closed even if parsing raises
        # (the previous version leaked the handle on a minidom error).
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            try:
                content = zipin.read('meta.xml')
                dom1 = minidom.parseString(content)
                elements = dom1.getElementsByTagName('office:meta')
                for i in elements[0].childNodes:
                    if i.tagName != 'meta:document-statistic':
                        nodename = ''.join(i.nodeName.split(':')[1:])
                        metadata[nodename] = ''.join([j.data for j in i.childNodes])
                    else:
                        # thank you w3c for not providing a nice
                        # method to get all attributes of a node
                        pass
            except KeyError:  # no meta.xml file found
                logging.debug('%s has no opendocument metadata' % self.filename)
        return metadata

    def remove_all(self):
        ''' Removes metadata
        '''
        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])

    def is_clean(self):
        ''' Check if the file is clean from harmful metadatas
        '''
        if super(OpenDocumentStripper, self).is_clean() is False:
            return False

        # Fix: the previous version returned before zipin.close() when
        # meta.xml was absent, leaking the file handle.
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            try:
                zipin.getinfo('meta.xml')
            except KeyError:  # no meta.xml in the file
                return True
        return False
70
71
class OpenXmlStripper(archive.TerminalZipStripper):
    ''' Represent an office openxml document, which is like
    an opendocument format, with some tricky stuff added.
    It contains mostly xml, but can have media blobs, crap, ...
    (I don't like this format.)
    '''
    def remove_all(self):
        ''' Strip the docProps/ metadata tree, keeping the .rels files. '''
        return super(OpenXmlStripper, self).remove_all(
            beginning_blacklist=('docProps/'), whitelist=('.rels'))

    def is_clean(self):
        ''' Check if the file is clean from harmful metadatas.
        This implementation is faster than something like
        "return this.get_meta() == {}".
        '''
        if super(OpenXmlStripper, self).is_clean() is False:
            return False

        # Fix: the previous version returned from inside the loop
        # without closing the archive, leaking the file handle.
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    return False
        return True

    def get_meta(self):
        ''' Return a dict with all the meta of the file
        '''
        metadata = super(OpenXmlStripper, self).get_meta()
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            for item in zipin.namelist():
                # everything under docProps/ is metadata
                if item.startswith('docProps/'):
                    metadata[item] = 'harmful content'
        return metadata
109
110
class PdfStripper(parser.GenericParser):
    ''' Represent a PDF file.

    Cleaning works by re-rendering every page with Poppler onto a fresh
    cairo PDF surface, then scrubbing the Producer/Creator entries with
    pdfrw.
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        self.uri = 'file://' + os.path.abspath(self.filename)
        # No password support: encrypted PDFs are opened without one.
        self.password = None
        try:
            self.pdf_quality = kwargs['low_pdf_quality']
        except KeyError:
            self.pdf_quality = False

        # Poppler document properties considered compromising.
        self.meta_list = frozenset(['title', 'author', 'subject',
            'keywords', 'creator', 'producer', 'metadata'])

    def is_clean(self):
        ''' Check if the file is clean from harmful metadatas
        '''
        document = Poppler.Document.new_from_file(self.uri, self.password)
        for key in self.meta_list:
            if document.get_property(key):
                return False
        return True

    def remove_all(self):
        ''' Opening the PDF with poppler, then doing a render
        on a cairo pdfsurface for each pages.

        http://cairographics.org/documentation/pycairo/2/

        The use of an intermediate tempfile is necessary because
        python-cairo segfaults on unicode.
        See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457
        '''
        document = Poppler.Document.new_from_file(self.uri, self.password)
        try:
            # NOTE(review): mkstemp()[1] discards the open fd, which is
            # never closed — confirm whether this leak matters here.
            output = tempfile.mkstemp()[1]
            page = document.get_page(0)
            # assume that every pages are the same size
            page_width, page_height = page.get_size()
            surface = cairo.PDFSurface(output, page_width, page_height)
            context = cairo.Context(surface)  # context draws on the surface
            logging.debug('PDF rendering of %s' % self.filename)
            for pagenum in range(document.get_n_pages()):
                page = document.get_page(pagenum)
                context.translate(0, 0)
                if self.pdf_quality:
                    page.render(context)  # render the page on context
                else:
                    page.render_for_printing(context)  # render the page on context
                context.show_page()  # draw context on surface
            surface.finish()
            shutil.move(output, self.output)
        except:
            # Deliberate catch-all: any rendering failure aborts cleaning.
            logging.error('Something went wrong when cleaning %s.' % self.filename)
            return False

        try:
            import pdfrw  # For now, poppler cannot write meta, so we must use pdfrw
            logging.debug('Removing %s\'s superficial metadata' % self.filename)
            trailer = pdfrw.PdfReader(self.output)
            trailer.Info.Producer = None
            trailer.Info.Creator = None
            writer = pdfrw.PdfWriter()
            writer.trailer = trailer
            writer.write(self.output)
            self.do_backup()
        except:
            # Catch-all also covers the ImportError when pdfrw is missing.
            logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
            return False
        return True

    def get_meta(self):
        ''' Return a dict with all the meta of the file
        '''
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            if document.get_property(key):
                metadata[key] = document.get_property(key)
        return metadata
diff --git a/libmat/parser.py b/libmat/parser.py
new file mode 100644
index 0000000..1765da8
--- /dev/null
+++ b/libmat/parser.py
@@ -0,0 +1,135 @@
1''' Parent class of all parser
2'''
3
4import os
5import shutil
6import tempfile
7
8import hachoir_core
9import hachoir_editor
10
11import mat
12
# File extensions that cannot carry any metadata at all.
NOMETA = frozenset((
    '.bmp',   # "raw" image
    '.rdf',   # text
    '.txt',   # plain text
    '.xml',   # formated text (XML)
    '.rels',  # openXML formated text
))

# Sentinel returned by _should_remove(): "recurse into this sub-fieldset".
FIELD = object()
22
23
class GenericParser(object):
    ''' Parent class of all parsers.

    Subclasses implement _should_remove(field), which returns True
    (compromising), False (harmless), or the FIELD sentinel (recurse
    into the sub-fieldset).
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        self.filename = ''
        self.parser = parser
        self.mime = mime
        self.backup = backup
        self.is_writable = is_writable
        self.editor = hachoir_editor.createEditor(parser)
        try:
            self.filename = hachoir_core.cmd_line.unicodeFilename(filename)
        except TypeError:  # get rid of "decoding Unicode is not supported"
            self.filename = filename
        self.basename = os.path.basename(filename)
        # temporary file that receives the cleaned output
        _, output = tempfile.mkstemp()
        self.output = hachoir_core.cmd_line.unicodeFilename(output)

    def __del__(self):
        ''' Remove tempfile if it was not used
        '''
        if os.path.exists(self.output):
            mat.secure_remove(self.output)

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        # cheap pre-check: only do the full recursive scan when at least
        # one top-level field looks compromising
        for field in self.editor:
            if self._should_remove(field):
                return self._is_clean(self.editor)
        return True

    def _is_clean(self, fieldset):
        # recursive part of is_clean()
        for field in fieldset:
            remove = self._should_remove(field)
            if remove is True:
                return False
            if remove is FIELD:
                if not self._is_clean(field):
                    return False
        return True

    def remove_all(self):
        ''' Remove all compromising fields
        '''
        state = self._remove_all(self.editor)
        hachoir_core.field.writeIntoFile(self.editor, self.output)
        self.do_backup()
        return state

    def _remove_all(self, fieldset):
        ''' Recursive way to handle tree metadatas
        '''
        try:
            for field in fieldset:
                remove = self._should_remove(field)
                if remove is True:
                    self._remove(fieldset, field.name)
                if remove is FIELD:
                    self._remove_all(field)
            return True
        except:
            # deliberate best-effort: report failure instead of raising
            return False

    def _remove(self, fieldset, field):
        ''' Delete the given field
        '''
        del fieldset[field]

    def get_meta(self):
        ''' Return a dict with all the meta of the file
        '''
        metadata = {}
        self._get_meta(self.editor, metadata)
        return metadata

    def _get_meta(self, fieldset, metadata):
        ''' Recursive way to handle tree metadatas
        '''
        for field in fieldset:
            remove = self._should_remove(field)
            if remove:
                try:
                    metadata[field.name] = field.value
                except:
                    metadata[field.name] = 'harmful content'
            if remove is FIELD:
                # Fix: used to recurse with metadata=None, which crashed
                # with a TypeError on any nested fieldset.
                self._get_meta(field, metadata)

    def _should_remove(self, key):
        ''' Return True if the field is compromising
        abstract method
        '''
        raise NotImplementedError

    def create_backup_copy(self):
        ''' Create a backup copy
        '''
        shutil.copy2(self.filename, self.filename + '.bak')

    def do_backup(self):
        ''' Keep a backup of the file if asked.

        The process of double-renaming is not very elegant,
        but it greatly simplify new strippers implementation.
        '''
        if self.backup:
            shutil.move(self.filename, self.filename + '.bak')
        else:
            mat.secure_remove(self.filename)
        shutil.move(self.output, self.filename)
diff --git a/libmat/strippers.py b/libmat/strippers.py
new file mode 100644
index 0000000..aea98da
--- /dev/null
+++ b/libmat/strippers.py
@@ -0,0 +1,70 @@
1''' Manage which fileformat can be processed
2'''
3
4import archive
5import audio
6import gi
7import images
8import logging
9import mat
10import misc
11import office
12import subprocess
13
# Map mimetype -> stripper class; optional entries are added below when
# their dependencies are importable.
STRIPPERS = {
    'application/x-tar': archive.TarStripper,
    'application/x-bzip2': archive.Bzip2Stripper,
    'application/x-gzip': archive.GzipStripper,
    'application/zip': archive.ZipStripper,
    'audio/mpeg': audio.MpegAudioStripper,
    'application/x-bittorrent': misc.TorrentStripper,
    'application/opendocument': office.OpenDocumentStripper,
    'application/officeopenxml': office.OpenXmlStripper,
}

logging.basicConfig(level=mat.LOGGING_LEVEL)

# PDF support needs all three of Poppler (rendering), cairo (re-drawing)
# and pdfrw (metadata rewriting); any missing piece disables it.
pdfSupport = True
try:
    from gi.repository import Poppler
except ImportError:
    logging.info('Unable to import Poppler: no PDF support')
    pdfSupport = False

try:
    import cairo
except ImportError:
    logging.info('Unable to import python-cairo: no PDF support')
    pdfSupport = False

try:
    import pdfrw
except ImportError:
    # Fix: message used to read 'no PDf support'
    logging.info('Unable to import python-pdfrw: no PDF support')
    pdfSupport = False

if pdfSupport:
    STRIPPERS['application/x-pdf'] = office.PdfStripper
    STRIPPERS['application/pdf'] = office.PdfStripper


# audio format support with mutagen-python
try:
    import mutagen
    STRIPPERS['audio/x-flac'] = audio.FlacStripper
    STRIPPERS['audio/vorbis'] = audio.OggStripper
    # NOTE: 'audio/mpeg' is already mapped above to the same class;
    # re-assigning it here is harmless.
    STRIPPERS['audio/mpeg'] = audio.MpegAudioStripper
except ImportError:
    logging.info('Unable to import python-mutagen: limited audio format support')

# exiftool gives better jpeg/png handling than the hachoir fallback
try:
    subprocess.check_output(['exiftool', '-ver'])
    import exiftool
    STRIPPERS['image/jpeg'] = exiftool.JpegStripper
    STRIPPERS['image/png'] = exiftool.PngStripper
except OSError:  # if exiftool is not installed, use hachoir instead
    logging.info('Unable to find exiftool: limited images support')
    STRIPPERS['image/jpeg'] = images.JpegStripper
    STRIPPERS['image/png'] = images.PngStripper