summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- MAT/archive.py 25
-rw-r--r-- MAT/audio.py 21
-rw-r--r-- MAT/images.py 45
-rw-r--r-- MAT/mat.py 50
-rw-r--r-- MAT/mutagenstripper.py 4
-rw-r--r-- MAT/office.py 30
-rw-r--r-- MAT/parser.py 33
-rw-r--r-- MAT/strippers.py 11
8 files changed, 102 insertions, 117 deletions
diff --git a/MAT/archive.py b/MAT/archive.py
index 447f068..f07e18c 100644
--- a/MAT/archive.py
+++ b/MAT/archive.py
@@ -1,21 +1,19 @@
1''' 1''' Take care of archives formats
2 Take care of archives formats
3''' 2'''
4 3
5import zipfile
6import shutil
7import os
8import logging 4import logging
5import os
6import shutil
9import tempfile 7import tempfile
8import zipfile
10 9
11import parser
12import mat 10import mat
11import parser
13import tarfile 12import tarfile
14 13
15 14
16class GenericArchiveStripper(parser.GenericParser): 15class GenericArchiveStripper(parser.GenericParser):
17 ''' 16 ''' Represent a generic archive
18 Represent a generic archive
19 ''' 17 '''
20 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): 18 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
21 super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) 19 super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
@@ -24,8 +22,7 @@ class GenericArchiveStripper(parser.GenericParser):
24 self.tempdir = tempfile.mkdtemp() 22 self.tempdir = tempfile.mkdtemp()
25 23
26 def __del__(self): 24 def __del__(self):
27 ''' 25 ''' Remove the files inside the temp dir,
28 Remove the files inside the temp dir,
29 then remove the temp dir 26 then remove the temp dir
30 ''' 27 '''
31 for root, dirs, files in os.walk(self.tempdir): 28 for root, dirs, files in os.walk(self.tempdir):
@@ -35,16 +32,16 @@ class GenericArchiveStripper(parser.GenericParser):
35 shutil.rmtree(self.tempdir) 32 shutil.rmtree(self.tempdir)
36 33
37 def remove_all(self): 34 def remove_all(self):
35 ''' Virtual method to remove all metadata
36 '''
38 raise NotImplementedError 37 raise NotImplementedError
39 38
40 39
41class ZipStripper(GenericArchiveStripper): 40class ZipStripper(GenericArchiveStripper):
42 ''' 41 ''' Represent a zip file
43 Represent a zip file
44 ''' 42 '''
45 def is_file_clean(self, fileinfo): 43 def is_file_clean(self, fileinfo):
46 ''' 44 ''' Check if a ZipInfo object is clean of metadatas added
47 Check if a ZipInfo object is clean of metadatas added
48 by zip itself, independently of the corresponding file metadatas 45 by zip itself, independently of the corresponding file metadatas
49 ''' 46 '''
50 if fileinfo.comment: 47 if fileinfo.comment:
diff --git a/MAT/audio.py b/MAT/audio.py
index 3c6c7bc..dae9d75 100644
--- a/MAT/audio.py
+++ b/MAT/audio.py
@@ -1,5 +1,4 @@
1''' 1''' Care about audio fileformat
2 Care about audio fileformat
3''' 2'''
4 3
5try: 4try:
@@ -13,31 +12,27 @@ import mutagenstripper
13 12
14 13
15class MpegAudioStripper(parser.GenericParser): 14class MpegAudioStripper(parser.GenericParser):
16 ''' 15 ''' Represent mpeg audio file (mp3, ...)
17 Represent mpeg audio file (mp3, ...)
18 ''' 16 '''
19 def _should_remove(self, field): 17 def _should_remove(self, field):
20 return field.name in ("id3v1", "id3v2") 18 return field.name in ("id3v1", "id3v2")
21 19
22 20
23class OggStripper(mutagenstripper.MutagenStripper): 21class OggStripper(mutagenstripper.MutagenStripper):
24 ''' 22 ''' Represent an ogg vorbis file
25 Represent an ogg vorbis file
26 ''' 23 '''
27 def _create_mfile(self): 24 def _create_mfile(self):
28 self.mfile = OggVorbis(self.filename) 25 self.mfile = OggVorbis(self.filename)
29 26
30 27
31class FlacStripper(mutagenstripper.MutagenStripper): 28class FlacStripper(mutagenstripper.MutagenStripper):
32 ''' 29 ''' Represent a Flac audio file
33 Represent a Flac audio file
34 ''' 30 '''
35 def _create_mfile(self): 31 def _create_mfile(self):
36 self.mfile = FLAC(self.filename) 32 self.mfile = FLAC(self.filename)
37 33
38 def remove_all(self): 34 def remove_all(self):
39 ''' 35 ''' Remove the "metadata" block from the file
40 Remove the "metadata" block from the file
41 ''' 36 '''
42 super(FlacStripper, self).remove_all() 37 super(FlacStripper, self).remove_all()
43 self.mfile.clear_pictures() 38 self.mfile.clear_pictures()
@@ -45,14 +40,12 @@ class FlacStripper(mutagenstripper.MutagenStripper):
45 return True 40 return True
46 41
47 def is_clean(self): 42 def is_clean(self):
48 ''' 43 ''' Check if the "metadata" block is present in the file
49 Check if the "metadata" block is present in the file
50 ''' 44 '''
51 return super(FlacStripper, self).is_clean() and not self.mfile.pictures 45 return super(FlacStripper, self).is_clean() and not self.mfile.pictures
52 46
53 def get_meta(self): 47 def get_meta(self):
54 ''' 48 ''' Return the content of the metadata block if present
55 Return the content of the metadata block if present
56 ''' 49 '''
57 metadata = super(FlacStripper, self).get_meta() 50 metadata = super(FlacStripper, self).get_meta()
58 if self.mfile.pictures: 51 if self.mfile.pictures:
diff --git a/MAT/images.py b/MAT/images.py
index 55c1a90..dc96e6a 100644
--- a/MAT/images.py
+++ b/MAT/images.py
@@ -1,41 +1,52 @@
1''' 1''' Takes care about pictures formats
2 Takes care about pictures formats 2
3References:
4 - JFIF: http://www.ecma-international.org/publications/techreports/E-TR-098.htm
5 - PNG: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
6 - PNG: http://www.w3.org/TR/PNG-Chunks.html
3''' 7'''
4 8
5import parser 9import parser
6 10
7 11
8class JpegStripper(parser.GenericParser): 12class JpegStripper(parser.GenericParser):
9 ''' 13 ''' Represents a jpeg file.
10 represents a jpeg file 14 Custom Huffman and Quantization tables
15 are stripped: they may leak
16 some info, and the quality loss is minor.
11 ''' 17 '''
12 def _should_remove(self, field): 18 def _should_remove(self, field):
19 ''' Return True if the field is compromising
13 ''' 20 '''
14 return True if the field is compromising 21 field_list = frozenset([
15 ''' 22 'start_image', # start of the image
16 field_list = frozenset(['start_image', 'app0', 'start_frame', 23 'app0', # JFIF data
17 'start_scan', 'data', 'end_image']) 24 'start_frame', # specify width, height, number of components
25 'start_scan', # specify which slice of data the top-to-bottom scan contains
26 'data', # actual data
27 'end_image']) # end of the image
18 if field.name in field_list: 28 if field.name in field_list:
19 return False 29 return False
20 elif field.name.startswith('quantization['): 30 elif field.name.startswith('quantization['): # custom Quant. tables
21 return False 31 return False
22 elif field.name.startswith('huffman['): 32 elif field.name.startswith('huffman['): # custom Huffman tables
23 return False 33 return False
24 return True 34 return True
25 35
26 36
27class PngStripper(parser.GenericParser): 37class PngStripper(parser.GenericParser):
28 ''' 38 ''' Represents a png file
29 represents a png file
30 see : http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
31 ''' 39 '''
32 def _should_remove(self, field): 40 def _should_remove(self, field):
41 ''' Return True if the field is compromising
33 ''' 42 '''
34 return True if the field is compromising 43 field_list = frozenset([
35 ''' 44 'id',
36 field_list = frozenset(['id', 'header', 'physical', 'end']) 45 'header', # PNG header
46 'physical', # the intended pixel size or aspect ratio
47 'end']) # end of the image
37 if field.name in field_list: 48 if field.name in field_list:
38 return False 49 return False
39 if field.name.startswith('data['): 50 if field.name.startswith('data['): # data
40 return False 51 return False
41 return True 52 return True
diff --git a/MAT/mat.py b/MAT/mat.py
index a1dc111..a669515 100644
--- a/MAT/mat.py
+++ b/MAT/mat.py
@@ -1,13 +1,12 @@
1#!/usr/bin/env python 1#!/usr/bin/env python
2 2
3''' 3''' Metadata anonymisation toolkit library
4 Metadata anonymisation toolkit library
5''' 4'''
6 5
7import os
8import subprocess
9import logging 6import logging
10import mimetypes 7import mimetypes
8import os
9import subprocess
11import xml.sax 10import xml.sax
12 11
13import hachoir_core.cmd_line 12import hachoir_core.cmd_line
@@ -33,6 +32,8 @@ logging.basicConfig(filename=fname, level=LOGGING_LEVEL)
33import strippers # this is loaded here because we need LOGGING_LEVEL 32import strippers # this is loaded here because we need LOGGING_LEVEL
34 33
35def get_logo(): 34def get_logo():
35 ''' Return the path to the logo
36 '''
36 if os.path.isfile('./data/mat.png'): 37 if os.path.isfile('./data/mat.png'):
37 return './data/mat.png' 38 return './data/mat.png'
38 elif os.path.isfile('/usr/share/pixmaps/mat.png'): 39 elif os.path.isfile('/usr/share/pixmaps/mat.png'):
@@ -41,6 +42,8 @@ def get_logo():
41 return '/usr/local/share/pixmaps/mat.png' 42 return '/usr/local/share/pixmaps/mat.png'
42 43
43def get_datadir(): 44def get_datadir():
45 ''' Return the path to the data directory
46 '''
44 if os.path.isdir('./data/'): 47 if os.path.isdir('./data/'):
45 return './data/' 48 return './data/'
46 elif os.path.isdir('/usr/local/share/mat/'): 49 elif os.path.isdir('/usr/local/share/mat/'):
@@ -49,8 +52,9 @@ def get_datadir():
49 return '/usr/share/mat/' 52 return '/usr/share/mat/'
50 53
51def list_supported_formats(): 54def list_supported_formats():
52 ''' 55 ''' Return a list of all locally supported fileformat.
53 Return a list of all locally supported fileformat 56 It parses that FORMATS file, and removes locally
57 non-supported formats.
54 ''' 58 '''
55 handler = XMLParser() 59 handler = XMLParser()
56 parser = xml.sax.make_parser() 60 parser = xml.sax.make_parser()
@@ -67,8 +71,7 @@ def list_supported_formats():
67 return localy_supported 71 return localy_supported
68 72
69class XMLParser(xml.sax.handler.ContentHandler): 73class XMLParser(xml.sax.handler.ContentHandler):
70 ''' 74 ''' Parse the supported format xml, and return a corresponding
71 Parse the supported format xml, and return a corresponding
72 list of dict 75 list of dict
73 ''' 76 '''
74 def __init__(self): 77 def __init__(self):
@@ -78,18 +81,16 @@ class XMLParser(xml.sax.handler.ContentHandler):
78 self.between = False 81 self.between = False
79 82
80 def startElement(self, name, attrs): 83 def startElement(self, name, attrs):
81 ''' 84 ''' Called when entering into xml tag
82 Called when entering into xml tag
83 ''' 85 '''
84 self.between = True 86 self.between = True
85 self.key = name 87 self.key = name
86 self.content = '' 88 self.content = ''
87 89
88 def endElement(self, name): 90 def endElement(self, name):
91 ''' Called when exiting a xml tag
89 ''' 92 '''
90 Called when exiting a xml tag 93 if name == 'format': # leaving a fileformat section
91 '''
92 if name == 'format': # exiting a fileformat section
93 self.list.append(self.dict.copy()) 94 self.list.append(self.dict.copy())
94 self.dict.clear() 95 self.dict.clear()
95 else: 96 else:
@@ -98,19 +99,17 @@ class XMLParser(xml.sax.handler.ContentHandler):
98 self.between = False 99 self.between = False
99 100
100 def characters(self, characters): 101 def characters(self, characters):
101 ''' 102 ''' Concatenate the content between opening and closing tags
102 Concatenate the content between opening and closing tags
103 ''' 103 '''
104 if self.between: 104 if self.between:
105 self.content += characters 105 self.content += characters
106 106
107 107
108def secure_remove(filename): 108def secure_remove(filename):
109 ''' 109 ''' Securely remove the file
110 securely remove the file
111 ''' 110 '''
112 try: 111 try:
113 if subprocess.call(['shred', '--remove', filename]) == 0: 112 if not subprocess.call(['shred', '--remove', filename]):
114 return True 113 return True
115 else: 114 else:
116 raise OSError 115 raise OSError
@@ -126,22 +125,17 @@ def secure_remove(filename):
126 125
127 126
128def create_class_file(name, backup, **kwargs): 127def create_class_file(name, backup, **kwargs):
129 ''' 128 ''' Return a $FILETYPEStripper() class,
130 return a $FILETYPEStripper() class,
131 corresponding to the filetype of the given file 129 corresponding to the filetype of the given file
132 ''' 130 '''
133 if not os.path.isfile(name): 131 if not os.path.isfile(name): # check if the file exists
134 # check if the file exists
135 logging.error('%s is not a valid file' % name) 132 logging.error('%s is not a valid file' % name)
136 return None 133 return None
137 134
138 if not os.access(name, os.R_OK): 135 if not os.access(name, os.R_OK): #check read permissions
139 #check read permissions
140 logging.error('%s is is not readable' % name) 136 logging.error('%s is is not readable' % name)
141 return None 137 return None
142 138
143 is_writable = os.access(name, os.W_OK)
144
145 if not os.path.getsize(name): 139 if not os.path.getsize(name):
146 #check if the file is not empty (hachoir crash on empty files) 140 #check if the file is not empty (hachoir crash on empty files)
147 logging.error('%s is empty' % name) 141 logging.error('%s is empty' % name)
@@ -161,7 +155,7 @@ def create_class_file(name, backup, **kwargs):
161 mime = parser.mime_type 155 mime = parser.mime_type
162 156
163 if mime == 'application/zip': # some formats are zipped stuff 157 if mime == 'application/zip': # some formats are zipped stuff
164 if mimetypes.guess_type(name)[0] is not None: 158 if mimetypes.guess_type(name)[0]:
165 mime = mimetypes.guess_type(name)[0] 159 mime = mimetypes.guess_type(name)[0]
166 160
167 if mime.startswith('application/vnd.oasis.opendocument'): 161 if mime.startswith('application/vnd.oasis.opendocument'):
@@ -169,6 +163,8 @@ def create_class_file(name, backup, **kwargs):
169 elif mime.startswith('application/vnd.openxmlformats-officedocument'): 163 elif mime.startswith('application/vnd.openxmlformats-officedocument'):
170 mime = 'application/officeopenxml' # office openxml 164 mime = 'application/officeopenxml' # office openxml
171 165
166 is_writable = os.access(name, os.W_OK)
167
172 try: 168 try:
173 stripper_class = strippers.STRIPPERS[mime] 169 stripper_class = strippers.STRIPPERS[mime]
174 except KeyError: 170 except KeyError:
diff --git a/MAT/mutagenstripper.py b/MAT/mutagenstripper.py
index ebc6b91..403c9a7 100644
--- a/MAT/mutagenstripper.py
+++ b/MAT/mutagenstripper.py
@@ -1,5 +1,7 @@
1''' Take care of mutagen-supported formats (audio)
2'''
3
1import parser 4import parser
2import shutil
3 5
4 6
5class MutagenStripper(parser.GenericParser): 7class MutagenStripper(parser.GenericParser):
diff --git a/MAT/office.py b/MAT/office.py
index 583e0f9..91e49be 100644
--- a/MAT/office.py
+++ b/MAT/office.py
@@ -1,5 +1,4 @@
1''' 1''' Care about office's formats
2 Care about office's formats
3''' 2'''
4 3
5import os 4import os
@@ -23,14 +22,12 @@ import archive
23 22
24 23
25class OpenDocumentStripper(archive.GenericArchiveStripper): 24class OpenDocumentStripper(archive.GenericArchiveStripper):
26 ''' 25 ''' An open document file is a zip, with xml file into.
27 An open document file is a zip, with xml file into.
28 The one that interest us is meta.xml 26 The one that interest us is meta.xml
29 ''' 27 '''
30 28
31 def get_meta(self): 29 def get_meta(self):
32 ''' 30 ''' Return a dict with all the meta of the file by
33 Return a dict with all the meta of the file by
34 trying to read the meta.xml file. 31 trying to read the meta.xml file.
35 ''' 32 '''
36 zipin = zipfile.ZipFile(self.filename, 'r') 33 zipin = zipfile.ZipFile(self.filename, 'r')
@@ -103,8 +100,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
103 return True 100 return True
104 101
105 def is_clean(self): 102 def is_clean(self):
106 ''' 103 ''' Check if the file is clean from harmful metadatas
107 Check if the file is clean from harmful metadatas
108 ''' 104 '''
109 zipin = zipfile.ZipFile(self.filename, 'r') 105 zipin = zipfile.ZipFile(self.filename, 'r')
110 try: 106 try:
@@ -120,8 +116,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
120 116
121 117
122class PdfStripper(parser.GenericParser): 118class PdfStripper(parser.GenericParser):
123 ''' 119 ''' Represent a PDF file
124 Represent a PDF file
125 ''' 120 '''
126 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): 121 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
127 super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) 122 super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
@@ -137,8 +132,7 @@ class PdfStripper(parser.GenericParser):
137 'producer', 'metadata']) 132 'producer', 'metadata'])
138 133
139 def is_clean(self): 134 def is_clean(self):
140 ''' 135 ''' Check if the file is clean from harmful metadatas
141 Check if the file is clean from harmful metadatas
142 ''' 136 '''
143 for key in self.meta_list: 137 for key in self.meta_list:
144 if self.document.get_property(key): 138 if self.document.get_property(key):
@@ -146,8 +140,7 @@ class PdfStripper(parser.GenericParser):
146 return True 140 return True
147 141
148 def remove_all(self): 142 def remove_all(self):
149 ''' 143 ''' Opening the PDF with poppler, then doing a render
150 Opening the PDF with poppler, then doing a render
151 on a cairo pdfsurface for each pages. 144 on a cairo pdfsurface for each pages.
152 145
153 http://cairographics.org/documentation/pycairo/2/ 146 http://cairographics.org/documentation/pycairo/2/
@@ -195,8 +188,7 @@ pdfrw' % self.output)
195 return True 188 return True
196 189
197 def get_meta(self): 190 def get_meta(self):
198 ''' 191 ''' Return a dict with all the meta of the file
199 Return a dict with all the meta of the file
200 ''' 192 '''
201 metadata = {} 193 metadata = {}
202 for key in self.meta_list: 194 for key in self.meta_list:
@@ -252,8 +244,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
252 return True 244 return True
253 245
254 def is_clean(self): 246 def is_clean(self):
255 ''' 247 ''' Check if the file is clean from harmful metadatas
256 Check if the file is clean from harmful metadatas
257 ''' 248 '''
258 zipin = zipfile.ZipFile(self.filename, 'r') 249 zipin = zipfile.ZipFile(self.filename, 'r')
259 for item in zipin.namelist(): 250 for item in zipin.namelist():
@@ -265,8 +256,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
265 return czf.is_clean() 256 return czf.is_clean()
266 257
267 def get_meta(self): 258 def get_meta(self):
268 ''' 259 ''' Return a dict with all the meta of the file
269 Return a dict with all the meta of the file
270 ''' 260 '''
271 zipin = zipfile.ZipFile(self.filename, 'r') 261 zipin = zipfile.ZipFile(self.filename, 'r')
272 metadata = {} 262 metadata = {}
diff --git a/MAT/parser.py b/MAT/parser.py
index c1c3f4c..ae07d7e 100644
--- a/MAT/parser.py
+++ b/MAT/parser.py
@@ -1,22 +1,22 @@
1''' Parent class of all parser
1''' 2'''
2 Parent class of all parser
3'''
4
5import hachoir_core
6import hachoir_editor
7 3
8import os 4import os
9import tempfile
10import shutil 5import shutil
6import tempfile
7
8import hachoir_core
9import hachoir_editor
11 10
12import mat 11import mat
13 12
14NOMETA = frozenset(('.bmp', # image 13NOMETA = frozenset((
15 '.rdf', # text 14 '.bmp', # "raw" image
16 '.txt', # plain text 15 '.rdf', # text
17 '.xml', # formated text (XML) 16 '.txt', # plain text
18 '.rels', # openXML formated text 17 '.xml', # formated text (XML)
19 )) 18 '.rels', # openXML formated text
19))
20 20
21FIELD = object() 21FIELD = object()
22 22
@@ -92,8 +92,7 @@ class GenericParser(object):
92 del fieldset[field] 92 del fieldset[field]
93 93
94 def get_meta(self): 94 def get_meta(self):
95 ''' 95 ''' Return a dict with all the meta of the file
96 Return a dict with all the meta of the file
97 ''' 96 '''
98 metadata = {} 97 metadata = {}
99 self._get_meta(self.editor, metadata) 98 self._get_meta(self.editor, metadata)
@@ -113,8 +112,7 @@ class GenericParser(object):
113 self._get_meta(field, None) 112 self._get_meta(field, None)
114 113
115 def _should_remove(self, key): 114 def _should_remove(self, key):
116 ''' 115 ''' Return True if the field is compromising
117 Return True if the field is compromising
118 abstract method 116 abstract method
119 ''' 117 '''
120 raise NotImplementedError 118 raise NotImplementedError
@@ -125,8 +123,7 @@ class GenericParser(object):
125 shutil.copy2(self.filename, self.filename + '.bak') 123 shutil.copy2(self.filename, self.filename + '.bak')
126 124
127 def do_backup(self): 125 def do_backup(self):
128 ''' 126 ''' Keep a backup of the file if asked.
129 Keep a backup of the file if asked.
130 127
131 The process of double-renaming is not very elegant, 128 The process of double-renaming is not very elegant,
132 but it greatly simplify new strippers implementation. 129 but it greatly simplify new strippers implementation.
diff --git a/MAT/strippers.py b/MAT/strippers.py
index f6ae899..78113ff 100644
--- a/MAT/strippers.py
+++ b/MAT/strippers.py
@@ -1,16 +1,15 @@
1''' 1''' Manage which fileformat can be processed
2 Manage which fileformat can be processed
3''' 2'''
4 3
5import images 4import archive
6import audio 5import audio
7import gi 6import gi
8import office 7import images
9import archive 8import logging
10import mat 9import mat
11import misc 10import misc
11import office
12import subprocess 12import subprocess
13import logging
14 13
15STRIPPERS = { 14STRIPPERS = {
16 'application/x-tar': archive.TarStripper, 15 'application/x-tar': archive.TarStripper,