summaryrefslogtreecommitdiff
path: root/mat
diff options
context:
space:
mode:
authorjvoisin2011-08-16 18:11:24 +0200
committerjvoisin2011-08-16 18:11:24 +0200
commit4bd3e47da02fde08acfada1795cc55170abdb00a (patch)
treef8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /mat
parentbaf8e080125614326ba9c96ca8f2404fd12b050e (diff)
setup.py now works !
Diffstat (limited to 'mat')
-rw-r--r--mat/__init__.py1
-rw-r--r--mat/archive.py289
-rw-r--r--mat/audio.py98
-rw-r--r--mat/images.py37
-rw-r--r--mat/mat.py162
-rw-r--r--mat/misc.py62
-rw-r--r--mat/office.py280
-rw-r--r--mat/parser.py104
8 files changed, 1033 insertions, 0 deletions
diff --git a/mat/__init__.py b/mat/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/mat/__init__.py
@@ -0,0 +1 @@
diff --git a/mat/archive.py b/mat/archive.py
new file mode 100644
index 0000000..77db71c
--- /dev/null
+++ b/mat/archive.py
@@ -0,0 +1,289 @@
1'''
2 Take care of archives formats
3'''
4
import logging
import os
import shutil
# BUG FIX: was "from tarfile import tarfile", which raises ImportError
# (the tarfile module has no "tarfile" attribute); the code below calls
# tarfile.open(), so import the module itself.
import tarfile
import tempfile
import zipfile

import parser
import mat
14
15
class GenericArchiveStripper(parser.GenericParser):
    '''
    Represent a generic archive
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(GenericArchiveStripper, self).__init__(filename, parser, mime,
            backup, add2archive)
        self.compression = ''
        self.add2archive = add2archive
        # scratch directory where archive members are extracted
        self.tempdir = tempfile.mkdtemp()

    def __del__(self):
        '''
        Securely wipe every file extracted under the temporary
        directory, then delete the directory tree itself.
        '''
        for dirpath, _dirnames, filenames in os.walk(self.tempdir):
            for fname in filenames:
                mat.secure_remove(os.path.join(dirpath, fname))
        shutil.rmtree(self.tempdir)

    def remove_all(self):
        '''
        Strip all metadata using the "normal" (lossless) method.
        '''
        self._remove_all('normal')

    def remove_all_ugly(self):
        '''
        Strip all metadata using the "ugly" method
        (may cause data loss).
        '''
        self._remove_all('ugly')

    def _remove_all(self, method):
        '''
        Abstract: subclasses implement the actual stripping.
        method is either "normal" or "ugly".
        '''
        raise NotImplementedError
56
57
class ZipStripper(GenericArchiveStripper):
    '''
    Represent a zip file
    '''
    def is_file_clean(self, fileinfo):
        '''
        Check if a ZipInfo object is clean of metadatas added
        by zip itself, independently of the corresponding file metadatas.
        Return True if the zipinfo carries no zip-level metadata.
        '''
        # BUG FIX: the original compared with "is not" (object identity),
        # which is not a reliable equality test for strings/ints/tuples;
        # use "!=" instead.
        if fileinfo.comment != '':
            return False
        elif fileinfo.date_time != 0:
            return False
        elif fileinfo.create_system != 0:
            return False
        elif fileinfo.create_version != 0:
            return False
        else:
            return True

    def is_clean(self):
        '''
        Check if the given file is clean from harmful metadata:
        the archive's own comment, plus the metadata of every
        supported file contained in it.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        if zipin.comment != '':
            logging.debug('%s has a comment' % self.filename)
            zipin.close()  # BUG FIX: don't leak the handle on early return
            return False
        for item in zipin.infolist():
            #I have not found a way to remove the crap added by zipfile :/
            #if not self.is_file_clean(item):
            #    logging.debug('%s from %s has compromizing zipinfo' %
            #        (item.filename, self.filename))
            #    return False
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(name):
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    if not cfile.is_clean():
                        zipin.close()  # BUG FIX: close before early return
                        return False
                except Exception:  # BUG FIX: no more bare except
                    #best solution I have found
                    logging.info('%s\'s fileformat is not supported, or is a \
harmless format' % item.filename)
                    _, ext = os.path.splitext(name)
                    bname = os.path.basename(item.filename)
                    if ext not in parser.NOMETA:
                        if bname != 'mimetype' and bname != '.rels':
                            zipin.close()
                            return False
        zipin.close()
        return True

    def get_meta(self):
        '''
        Return all the metadata of a ZipFile (don't return metadatas
        of contained files : should it ?)
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        for field in zipin.infolist():
            zipmeta = {}
            zipmeta['comment'] = field.comment
            zipmeta['modified'] = field.date_time
            zipmeta['system'] = field.create_system
            zipmeta['zip_version'] = field.create_version
            metadata[field.filename] = zipmeta
        metadata["%s comment" % self.filename] = zipin.comment
        zipin.close()
        return metadata

    def _remove_all(self, method):
        '''
        So far, the zipfile module does not allow to write a ZipInfo
        object into a zipfile (and it's a shame !) : so data added
        by zipfile itself could not be removed. It's a big concern.
        Is shiping a patched version of zipfile.py a good idea ?
        method is "normal" or "ugly".
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(name):
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    # BUG FIX: "method is 'normal'" relied on string
                    # interning; use equality.
                    if method == 'normal':
                        cfile.remove_all()
                    else:
                        cfile.remove_all_ugly()
                    logging.debug('Processing %s from %s' % (item.filename,
                        self.filename))
                    zipout.write(name, item.filename)
                except Exception:  # BUG FIX: no more bare except
                    logging.info('%s\'s format is not supported or harmless' %
                        item.filename)
                    _, ext = os.path.splitext(name)
                    if self.add2archive or ext in parser.NOMETA:
                        zipout.write(name, item.filename)
        zipout.comment = ''
        zipin.close()
        zipout.close()
        logging.info('%s treated' % self.filename)
        self.do_backup()
164
165
class TarStripper(GenericArchiveStripper):
    '''
    Represent a tarfile archive
    '''
    def _remove(self, current_file):
        '''
        Remove the meta added by tar itself to the file
        (used as a tarfile.add() filter).
        '''
        current_file.mtime = 0
        current_file.uid = 0
        current_file.gid = 0
        current_file.uname = ''
        current_file.gname = ''
        return current_file

    def _remove_all(self, method):
        '''
        Extract every member, clean the supported ones, and re-add
        them to the output archive through the _remove() filter.
        method is "normal" or "ugly".
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        tarout = tarfile.open(self.output, 'w' + self.compression)
        for item in tarin.getmembers():
            tarin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.name)
            # BUG FIX: compare with "==", not "is" (object identity)
            if item.type == '0':  # is item a regular file ?
                #no backup file
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    if method == 'normal':  # BUG FIX: "is" -> "=="
                        cfile.remove_all()
                    else:
                        cfile.remove_all_ugly()
                    tarout.add(name, item.name, filter=self._remove)
                except Exception:  # BUG FIX: no more bare except
                    logging.info('%s\' format is not supported or harmless' %
                        item.name)
                    _, ext = os.path.splitext(name)
                    if self.add2archive or ext in parser.NOMETA:
                        tarout.add(name, item.name, filter=self._remove)
        tarin.close()
        tarout.close()
        self.do_backup()

    def is_file_clean(self, current_file):
        '''
        Check metadatas added by tar itself on the given TarInfo.
        '''
        # BUG FIX: the original used "is not" (identity) instead of "!=".
        if current_file.mtime != 0:
            return False
        elif current_file.uid != 0:
            return False
        elif current_file.gid != 0:
            return False
        elif current_file.uname != '':
            return False
        elif current_file.gname != '':
            return False
        else:
            return True

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        for item in tarin.getmembers():
            if not self.is_file_clean(item):
                tarin.close()
                return False
            tarin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.name)
            if item.type == '0':  # BUG FIX: "is" -> "==" ; regular file ?
                try:
                    class_file = mat.create_class_file(name,
                        False, self.add2archive)  # no backup file
                    if not class_file.is_clean():
                        tarin.close()
                        return False
                except Exception:
                    # BUG FIX: TarInfo has no "filename" attribute (the
                    # original raised AttributeError here); also fixed the
                    # "foramt" typo.
                    logging.error('%s\'s format is not supported or harmless' %
                        item.name)
                    _, ext = os.path.splitext(name)
                    if ext not in parser.NOMETA:
                        tarin.close()
                        return False
        tarin.close()
        return True

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        metadata = {}
        for current_file in tarin.getmembers():
            if current_file.type == '0':  # BUG FIX: "is" -> "=="
                if not self.is_file_clean(current_file):  # if there is meta
                    current_meta = {}
                    current_meta['mtime'] = current_file.mtime
                    current_meta['uid'] = current_file.uid
                    current_meta['gid'] = current_file.gid
                    current_meta['uname'] = current_file.uname
                    current_meta['gname'] = current_file.gname
                    metadata[current_file.name] = current_meta
        tarin.close()
        return metadata
270
271
class GzipStripper(TarStripper):
    '''
    Represent a tar.gz archive
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(GzipStripper, self).__init__(
            filename, parser, mime, backup, add2archive)
        self.compression = ':gz'  # tarfile open-mode suffix (r:gz / w:gz)
280
281
class Bzip2Stripper(TarStripper):
    '''
    Represents a tar.bz2 archive
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(Bzip2Stripper, self).__init__(
            filename, parser, mime, backup, add2archive)
        self.compression = ':bz2'  # tarfile open-mode suffix (r:bz2 / w:bz2)
diff --git a/mat/audio.py b/mat/audio.py
new file mode 100644
index 0000000..21a94be
--- /dev/null
+++ b/mat/audio.py
@@ -0,0 +1,98 @@
1'''
2 Care about audio fileformat
3'''
4try:
5 from mutagen.flac import FLAC
6 from mutagen.oggvorbis import OggVorbis
7except ImportError:
8 pass
9
10
11import parser
12import shutil
13
14
class MpegAudioStripper(parser.GenericParser):
    '''
    Represent mpeg audio file (mp3, ...)
    '''
    def _should_remove(self, field):
        '''
        Return True if the given hachoir field is compromising:
        the id3v1/id3v2 tag blocks are the ones carrying metadata.
        '''
        return field.name in ("id3v1", "id3v2")
24
25
class OggStripper(parser.GenericParser):
    '''
    Represent an ogg vorbis file
    '''
    def remove_all(self):
        '''
        Drop the whole vorbis-comment block. When a backup is
        requested, the original is kept and the cleaned copy is
        written to self.output.
        '''
        if self.backup is True:
            shutil.copy2(self.filename, self.output)
            self.filename = self.output

        mfile = OggVorbis(self.filename)
        mfile.delete()
        mfile.save()

    def is_clean(self):
        '''
        Check if the "metadata" block is present in the file
        '''
        return OggVorbis(self.filename).tags == []

    def get_meta(self):
        '''
        Return the content of the metadata block if present
        '''
        metadata = {}
        for name, content in OggVorbis(self.filename).tags:
            metadata[name] = content
        return metadata
58
59
class FlacStripper(parser.GenericParser):
    '''
    Represent a Flac audio file
    '''
    def remove_all(self):
        '''
        Strip the vorbis-comment block and every embedded picture.
        When a backup is requested, the original is kept and the
        cleaned copy is written to self.output.
        '''
        if self.backup is True:
            shutil.copy2(self.filename, self.output)
            self.filename = self.output

        mfile = FLAC(self.filename)
        mfile.delete()
        mfile.clear_pictures()
        mfile.save()

    def is_clean(self):
        '''
        Check if the "metadata" block is present in the file
        '''
        mfile = FLAC(self.filename)
        return mfile.tags is None and mfile.pictures == []

    def get_meta(self):
        '''
        Return the content of the metadata block if present
        '''
        metadata = {}
        mfile = FLAC(self.filename)
        if mfile.tags is not None:
            if mfile.pictures != []:
                metadata['picture :'] = 'yes'
            for name, content in mfile.tags:
                metadata[name] = content
        return metadata
diff --git a/mat/images.py b/mat/images.py
new file mode 100644
index 0000000..d090015
--- /dev/null
+++ b/mat/images.py
@@ -0,0 +1,37 @@
1'''
2 Takes care about pictures formats
3'''
4
5import parser
6
7
class JpegStripper(parser.GenericParser):
    '''
    represents a jpeg file
    '''
    def _should_remove(self, field):
        '''
        Return True if the field is compromizing: any comment
        block, or the photoshop/exif/adobe segments.
        '''
        if field.name.startswith('comment'):
            return True
        return field.name in ("photoshop", "exif", "adobe")
22
23
class PngStripper(parser.GenericParser):
    '''
    represents a png file
    '''
    def _should_remove(self, field):
        '''
        Return True if the field is compromizing: tEXt chunks
        (named "text[...]" by hachoir) and the time chunk.
        '''
        if field.name.startswith("text["):
            return True
        # BUG FIX: the original used "is" (object identity), which is not
        # guaranteed to be True even when the strings are equal.
        elif field.name == "time":
            return True
        else:
            return False
diff --git a/mat/mat.py b/mat/mat.py
new file mode 100644
index 0000000..fd13287
--- /dev/null
+++ b/mat/mat.py
@@ -0,0 +1,162 @@
1#!/usr/bin/env python
2
3'''
4 Metadata anonymisation toolkit library
5'''
6
7import os
8import subprocess
9import logging
10import mimetypes
11import xml.sax
12
13import hachoir_core.cmd_line
14import hachoir_parser
15
16import images
17import audio
18import office
19import archive
20import misc
21
22__version__ = '0.1'
23__author__ = 'jvoisin'
24
25LOGGING_LEVEL = logging.DEBUG
26
27logging.basicConfig(level=LOGGING_LEVEL)
28
# Map of mime type -> stripper class; extended below when the optional
# pdf (poppler + cairo) and audio (mutagen) dependencies are importable.
STRIPPERS = {
    'application/x-tar': archive.TarStripper,
    'application/x-gzip': archive.GzipStripper,
    'application/x-bzip2': archive.Bzip2Stripper,
    'application/zip': archive.ZipStripper,
    'audio/mpeg': audio.MpegAudioStripper,
    'image/jpeg': images.JpegStripper,
    'image/png': images.PngStripper,
    'application/x-bittorrent': misc.TorrentStripper,
    'application/opendocument': office.OpenDocumentStripper,
    'application/officeopenxml': office.OpenXmlStripper,
}

# pdf support needs python-poppler (reading/rendering) and python-cairo
# (re-writing the rendered pages)
try:
    import poppler
    import cairo
    STRIPPERS['application/x-pdf'] = office.PdfStripper
    STRIPPERS['application/pdf'] = office.PdfStripper
except ImportError:
    print('Unable to import python-poppler and/or python-cairo: no pdf \
 support')

# flac/ogg vorbis support needs python-mutagen
try:
    import mutagen
    STRIPPERS['audio/x-flac'] = audio.FlacStripper
    STRIPPERS['audio/vorbis'] = audio.OggStripper
except ImportError:
    print('unable to import python-mutagen : limited audio format support')
57
58
class XMLParser(xml.sax.handler.ContentHandler):
    '''
    Parse the supported format xml, and return a corresponding
    list of dict
    '''
    def __init__(self):
        # current section being built: element name -> text content
        self.dict = {}
        # one dict appended per completed <format> section
        self.list = []
        self.content, self.key = '', ''
        # True while inside an element (characters() only accumulates then)
        self.between = False

    def startElement(self, name, attrs):
        '''
        Called when entering into xml balise
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        '''
        Called when exiting a xml balise
        '''
        if name == 'format':  # exiting a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        else:
            # NOTE(review): '\s' here is the literal two-character sequence
            # backslash-s (not a regex class) — presumably present in the
            # source xml files; confirm against them.
            content = self.content.replace('\s', ' ')
            self.dict[self.key] = content
        self.between = False

    def characters(self, characters):
        '''
        Concatenate the content between opening and closing balises
        '''
        if self.between is True:
            self.content += characters
96
97
def secure_remove(filename):
    '''
    Securely remove the file: try to wipe it with shred(1) first;
    if shred is missing or failed, fall back to a plain os.remove().
    '''
    removed = False
    try:
        # BUG FIX: the original ran "shred --remove %s" with shell=True on
        # the raw filename (shell-injection prone) and ignored shred's exit
        # status, so a failed shred was treated as a success and the file
        # was never removed. Use an argv list and check the return code.
        if subprocess.call(['shred', '--remove', filename]) == 0:
            removed = True
    except OSError:  # shred binary not available
        logging.error('Unable to securely remove %s' % filename)

    if removed is False:
        try:
            os.remove(filename)
        except OSError:
            logging.error('Unable to remove %s' % filename)
114
115
def is_secure(filename):
    '''
    Sanity check before processing: the target must be an
    existing regular file. Return True when it is, False (and
    log an error) otherwise.
    '''
    if os.path.isfile(filename):
        return True
    logging.error('%s is not a valid file' % filename)
    return False
125
126
def create_class_file(name, backup, add2archive):
    '''
    Return a $FILETYPEStripper() instance corresponding to the
    filetype of the given file, or None when the file is missing,
    unparseable or of an unsupported format.
    '''
    if not is_secure(name):
        return

    try:
        filename = hachoir_core.cmd_line.unicodeFilename(name)
    except TypeError:  # get rid of "decoding Unicode is not supported"
        filename = name

    parser = hachoir_parser.createParser(filename)
    if not parser:
        logging.info('Unable to parse %s' % filename)
        return

    mime = parser.mime_type

    if mime == 'application/zip':  # some formats are zipped stuff
        mime = mimetypes.guess_type(name)[0]
        # BUG FIX: guess_type() returns None for an unknown extension;
        # the original then crashed on mime.startswith().
        if mime is None:
            logging.info('Unable to guess the mimetype of %s' % name)
            return

    if mime.startswith('application/vnd.oasis.opendocument'):
        mime = 'application/opendocument'  # opendocument fileformat
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/officeopenxml'  # office openxml

    try:
        stripper_class = STRIPPERS[mime]
    except KeyError:
        logging.info('Don\'t have stripper for %s format' % mime)
        return

    return stripper_class(filename, parser, mime, backup, add2archive)
diff --git a/mat/misc.py b/mat/misc.py
new file mode 100644
index 0000000..f7b256f
--- /dev/null
+++ b/mat/misc.py
@@ -0,0 +1,62 @@
1'''
2 Care about misc formats
3'''
4
import parser

# BUG FIX: was "from bencode import bencode", which binds the bencode()
# *function*; the code below calls bencode.bdecode()/bencode.bencode()
# on the *module*, so import the module itself.
import bencode
8
9
class TorrentStripper(parser.GenericParser):
    '''
    Represent a torrent file with the help
    of the bencode lib from Petru Paler
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(TorrentStripper, self).__init__(filename, parser, mime,
            backup, add2archive)
        # top-level bencoded keys that carry metadata
        self.fields = ['comment', 'creation date', 'created by']

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas:
        none of self.fields may be present with a non-empty value.
        '''
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        for key in self.fields:
            # BUG FIX: narrowed the bare "except:" — a missing key is the
            # only expected failure here.
            try:
                if decoded[key] != '':
                    return False
            except KeyError:
                pass
        return True

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        for key in self.fields:
            try:
                if decoded[key] != '':
                    metadata[key] = decoded[key]
            except KeyError:
                pass
        return metadata

    def remove_all(self):
        '''
        Remove all the fields that are compromizing
        '''
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        for key in self.fields:
            # BUG FIX: the original did "decoded[key] = ''", which left the
            # metadata keys in place (and even *added* empty ones when
            # absent); drop the keys entirely instead.
            try:
                del decoded[key]
            except KeyError:
                pass
        with open(self.output, 'w') as f:  # encode the decoded torrent
            f.write(bencode.bencode(decoded))  # and write it in self.output
        self.do_backup()
diff --git a/mat/office.py b/mat/office.py
new file mode 100644
index 0000000..cb9c609
--- /dev/null
+++ b/mat/office.py
@@ -0,0 +1,280 @@
1'''
2 Care about office's formats
3'''
4
5import os
6import logging
7import zipfile
8import fileinput
9
10try:
11 import cairo
12 import poppler
13except ImportError:
14 pass
15
16import mat
17import parser
18import archive
19import pdfrw
20
21
22class OpenDocumentStripper(archive.GenericArchiveStripper):
23 '''
24 An open document file is a zip, with xml file into.
25 The one that interest us is meta.xml
26 '''
27
28 def get_meta(self):
29 '''
30 Return a dict with all the meta of the file by
31 trying to read the meta.xml file.
32 '''
33 zipin = zipfile.ZipFile(self.filename, 'r')
34 metadata = {}
35 try:
36 content = zipin.read('meta.xml')
37 zipin.close()
38 metadata[self.filename] = 'harful meta'
39 except KeyError: # no meta.xml file found
40 logging.debug('%s has no opendocument metadata' % self.filename)
41 return metadata
42
43 def _remove_all(self, method):
44 '''
45 FIXME ?
46 There is a patch implementing the Zipfile.remove()
47 method here : http://bugs.python.org/issue6818
48 '''
49 zipin = zipfile.ZipFile(self.filename, 'r')
50 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
51
52 for item in zipin.namelist():
53 name = os.path.join(self.tempdir, item)
54 _, ext = os.path.splitext(name)
55
56 if item.endswith('manifest.xml'):
57 # contain the list of all files present in the archive
58 zipin.extract(item, self.tempdir)
59 for line in fileinput.input(name, inplace=1):
60 #remove the line which contains "meta.xml"
61 line = line.strip()
62 if not 'meta.xml' in line:
63 print line
64 zipout.write(name, item)
65
66 elif ext in parser.NOMETA or item == 'mimetype':
67 #keep NOMETA files, and the "manifest" file
68 if item != 'meta.xml': # contains the metadata
69 zipin.extract(item, self.tempdir)
70 zipout.write(name, item)
71
72 else:
73 zipin.extract(item, self.tempdir)
74 if os.path.isfile(name):
75 try:
76 cfile = mat.create_class_file(name, False,
77 self.add2archive)
78 if method == 'normal':
79 cfile.remove_all()
80 else:
81 cfile.remove_all_ugly()
82 logging.debug('Processing %s from %s' % (item,
83 self.filename))
84 zipout.write(name, item)
85 except:
86 logging.info('%s\' fileformat is not supported' % item)
87 if self.add2archive:
88 zipout.write(name, item)
89 zipout.comment = ''
90 logging.info('%s treated' % self.filename)
91 zipin.close()
92 zipout.close()
93 self.do_backup()
94
95 def is_clean(self):
96 '''
97 Check if the file is clean from harmful metadatas
98 '''
99 zipin = zipfile.ZipFile(self.filename, 'r')
100 try:
101 zipin.getinfo('meta.xml')
102 except KeyError: # no meta.xml in the file
103 czf = archive.ZipStripper(self.filename, self.parser,
104 'application/zip', self.backup, self.add2archive)
105 if czf.is_clean():
106 zipin.close()
107 return True
108 zipin.close()
109 return False
110
111
class PdfStripper(parser.GenericParser):
    '''
    Represent a pdf file
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(PdfStripper, self).__init__(filename, parser, mime, backup,
            add2archive)
        uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.document = poppler.document_new_from_file(uri, self.password)
        # poppler document properties that may carry metadata
        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
            'producer', 'creation-date', 'mod-date', 'metadata')

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        for key in self.meta_list:
            if key == 'creation-date' or key == 'mod-date':
                # unset dates are reported as -1 by poppler
                if self.document.get_property(key) != -1:
                    return False
            elif self.document.get_property(key) is not None and \
                self.document.get_property(key) != '':
                return False
        return True

    def remove_all(self):
        '''
        Opening the pdf with poppler, then doing a render
        on a cairo pdfsurface for each pages.
        Thanks to Lunar^ for the idea.
        http://cairographics.org/documentation/pycairo/2/
        python-poppler is not documented at all : have fun ;)
        '''
        page = self.document.get_page(0)
        page_width, page_height = page.get_size()
        surface = cairo.PDFSurface(self.output, page_width, page_height)
        context = cairo.Context(surface)  # context draws on the surface
        logging.debug('Pdf rendering of %s' % self.filename)
        for pagenum in xrange(self.document.get_n_pages()):
            page = self.document.get_page(pagenum)
            context.translate(0, 0)
            page.render(context)  # render the page on context
            context.show_page()  # draw context on surface
        surface.finish()

        #For now, poppler cannot write meta, so we must use pdfrw
        logging.debug('Removing %s\'s superficial metadata' % self.filename)
        trailer = pdfrw.PdfReader(self.output)
        trailer.Info.Producer = trailer.Info.Creator = None
        writer = pdfrw.PdfWriter()
        writer.trailer = trailer
        writer.write(self.output)
        self.do_backup()

    def remove_all_ugly(self):
        '''
        For pdf the "ugly" method is the same as the normal one.
        BUG FIX (maintainability): the original duplicated the whole
        body of remove_all() here; delegate instead.
        '''
        self.remove_all()

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        for key in self.meta_list:
            if key == 'creation-date' or key == 'mod-date':
                #creation and modification are set to -1
                if self.document.get_property(key) != -1:
                    metadata[key] = self.document.get_property(key)
            elif self.document.get_property(key) is not None and \
                self.document.get_property(key) != '':
                metadata[key] = self.document.get_property(key)
        return metadata
204
205
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
    Represent an office openxml document, which is like
    an opendocument format, with some tricky stuff added.
    It contains mostly xml, but can have media blobs, crap, ...
    (I don't like this format.)
    '''
    def _remove_all(self, method):
        '''
        FIXME ?
        There is a patch implementing the Zipfile.remove()
        method here : http://bugs.python.org/issue6818

        Rebuild the archive without the docProps/ metadata
        members; method is "normal" or "ugly".
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w',
            allowZip64=True)
        for item in zipin.namelist():
            name = os.path.join(self.tempdir, item)
            _, ext = os.path.splitext(name)
            if item.startswith('docProps/'):  # metadatas
                pass
            elif ext in parser.NOMETA or item == '.rels':
                #keep parser.NOMETA files, and the file named ".rels"
                zipin.extract(item, self.tempdir)
                zipout.write(name, item)
            else:
                zipin.extract(item, self.tempdir)
                if os.path.isfile(name):  # don't care about folders
                    try:
                        cfile = mat.create_class_file(name, False,
                            self.add2archive)
                        if method == 'normal':
                            cfile.remove_all()
                        else:
                            cfile.remove_all_ugly()
                        logging.debug('Processing %s from %s' % (item,
                            self.filename))
                        zipout.write(name, item)
                    except Exception:  # BUG FIX: no more bare except
                        logging.info('%s\' fileformat is not supported' % item)
                        if self.add2archive:
                            zipout.write(name, item)
        zipout.comment = ''
        logging.info('%s treated' % self.filename)
        zipin.close()
        zipout.close()
        self.do_backup()

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas:
        no docProps/ member, and clean as a generic zip.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        for item in zipin.namelist():
            if item.startswith('docProps/'):
                zipin.close()  # BUG FIX: don't leak the handle on early return
                return False
        zipin.close()
        czf = archive.ZipStripper(self.filename, self.parser,
            'application/zip', self.backup, self.add2archive)
        return czf.is_clean()

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        for item in zipin.namelist():
            if item.startswith('docProps/'):
                metadata[item] = 'harmful content'
        zipin.close()
        return metadata
diff --git a/mat/parser.py b/mat/parser.py
new file mode 100644
index 0000000..58dd7fa
--- /dev/null
+++ b/mat/parser.py
@@ -0,0 +1,104 @@
1'''
2 Parent class of all parser
3'''
4
5import hachoir_core
6import hachoir_editor
7
8import os
9
10import mat
11
# File extensions that cannot carry metadata: files with these extensions
# are considered harmless and kept as-is when cleaning archives.
NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels')
#bmp : image
#rdf : text
#txt : plain text
#xml : formatted text
#rels : openxml formatted text
19
class GenericParser(object):
    '''
    Parent class of all parsers
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        self.filename = ''
        self.parser = parser
        self.mime = mime
        self.backup = backup
        self.editor = hachoir_editor.createEditor(parser)
        self.realname = filename
        try:
            # NOTE(review): assumes "import hachoir_core" exposes the
            # cmd_line submodule — confirm an explicit import isn't needed
            self.filename = hachoir_core.cmd_line.unicodeFilename(filename)
        except TypeError:  # get rid of "decoding Unicode is not supported"
            self.filename = filename
        root, extension = os.path.splitext(filename)
        # cleaned copy goes next to the original, e.g. foo.cleaned.jpg
        self.output = root + '.cleaned' + extension
        self.basename = os.path.basename(filename)  # only filename

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        return all(not self._should_remove(field) for field in self.editor)

    def remove_all(self):
        '''
        Remove all the compromizing fields, then write the
        cleaned file to self.output.
        '''
        for current in self.editor:
            if self._should_remove(current):
                self._remove(current.name)
        hachoir_core.field.writeIntoFile(self.editor, self.output)
        self.do_backup()

    def remove_all_ugly(self):
        '''
        If the remove_all() is not efficient enough,
        this method is implemented :
        It is efficient, but destructive.
        In a perfect world, with nice fileformat,
        this method would not exist.
        '''
        self.remove_all()

    def _remove(self, field):
        '''
        Delete the given field
        '''
        del self.editor[field]

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        for field in self.editor:
            if not self._should_remove(field):
                continue
            try:
                metadata[field.name] = field.value
            except:
                metadata[field.name] = 'harmful content'
        return metadata

    def _should_remove(self, key):
        '''
        return True if the field is compromizing
        abstract method
        '''
        raise NotImplementedError

    def do_backup(self):
        '''
        Do a backup of the file if asked,
        and change his creation/access date
        '''
        if self.backup is True:
            os.utime(self.output, (0, 0))
            return
        mat.secure_remove(self.filename)
        os.rename(self.output, self.filename)
        os.utime(self.filename, (0, 0))