8 files changed, 102 insertions, 117 deletions
diff --git a/MAT/archive.py b/MAT/archive.py
index 447f068..f07e18c 100644
--- a/MAT/archive.py
+++ b/MAT/archive.py
@@ -1,21 +1,19 @@
-'''
+''' Take care of archives formats
-    Take care of archives formats
 '''
-import zipfile
-import shutil
-import os
 import logging
+import os
+import shutil
 import tempfile
+import zipfile
-import parser
 import mat
+import parser
 import tarfile
 class GenericArchiveStripper(parser.GenericParser):
-    '''
+    ''' Represent a generic archive
-        Represent a generic archive
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
@@ -24,8 +22,7 @@ class GenericArchiveStripper(parser.GenericParser):
        self.tempdir = tempfile.mkdtemp()
    def __del__(self):
-        '''
+        ''' Remove the files inside the temp dir,
-            Remove the files inside the temp dir,
            then remove the temp dir
        '''
        for root, dirs, files in os.walk(self.tempdir):
@@ -35,16 +32,16 @@ class GenericArchiveStripper(parser.GenericParser):
        shutil.rmtree(self.tempdir)
    def remove_all(self):
+        ''' Virtual method to remove all metadata
+        '''
        raise NotImplementedError
 class ZipStripper(GenericArchiveStripper):
-    '''
+    ''' Represent a zip file
-        Represent a zip file
    '''
    def is_file_clean(self, fileinfo):
-        '''
+        ''' Check if a ZipInfo object is clean of metadatas added
-            Check if a ZipInfo object is clean of metadatas added
            by zip itself, independently of the corresponding file metadatas
        '''
        if fileinfo.comment:
diff --git a/MAT/audio.py b/MAT/audio.py
index 3c6c7bc..dae9d75 100644
--- a/MAT/audio.py
+++ b/MAT/audio.py
@@ -1,5 +1,4 @@
-'''
+''' Care about audio fileformat
-    Care about audio fileformat
 '''
 try:
@@ -13,31 +12,27 @@ import mutagenstripper
 class MpegAudioStripper(parser.GenericParser):
-    '''
+    ''' Represent mpeg audio file (mp3, ...)
-        Represent mpeg audio file (mp3, ...)
    '''
    def _should_remove(self, field):
        return field.name in ("id3v1", "id3v2")
 class OggStripper(mutagenstripper.MutagenStripper):
-    '''
+    ''' Represent an ogg vorbis file
-        Represent an ogg vorbis file
    '''
    def _create_mfile(self):
        self.mfile = OggVorbis(self.filename)
 class FlacStripper(mutagenstripper.MutagenStripper):
-    '''
+    ''' Represent a Flac audio file
-        Represent a Flac audio file
    '''
    def _create_mfile(self):
        self.mfile = FLAC(self.filename)
    def remove_all(self):
-        '''
+        ''' Remove the "metadata" block from the file
-            Remove the "metadata" block from the file
        '''
        super(FlacStripper, self).remove_all()
        self.mfile.clear_pictures()
@@ -45,14 +40,12 @@ class FlacStripper(mutagenstripper.MutagenStripper):
        return True
    def is_clean(self):
-        '''
+        ''' Check if the "metadata" block is present in the file
-            Check if the "metadata" block is present in the file
        '''
        return super(FlacStripper, self).is_clean() and not self.mfile.pictures
    def get_meta(self):
-        '''
+        ''' Return the content of the metadata block if present
-            Return the content of the metadata block if present
        '''
        metadata = super(FlacStripper, self).get_meta()
        if self.mfile.pictures:
diff --git a/MAT/images.py b/MAT/images.py
index 55c1a90..dc96e6a 100644
--- a/MAT/images.py
+++ b/MAT/images.py
@@ -1,41 +1,52 @@
-'''
+''' Takes care about pictures formats
-    Takes care about pictures formats
+References:
+    - JFIF: http://www.ecma-international.org/publications/techreports/E-TR-098.htm
+    - PNG: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
+    - PNG: http://www.w3.org/TR/PNG-Chunks.html
 '''
 import parser
 class JpegStripper(parser.GenericParser):
-    '''
+    ''' Represents a jpeg file.
-        represents a jpeg file
+        Custom Huffman and Quantization tables
+        are stripped: they may leak
+        some info, and the quality loss is minor.
    '''
    def _should_remove(self, field):
+        ''' Return True if the field is compromising
        '''
-            return True if the field is compromising
+        field_list = frozenset([
-        '''
+            'start_image',  # start of the image
-        field_list = frozenset(['start_image', 'app0', 'start_frame',
+            'app0',         # JFIF data
-                'start_scan', 'data', 'end_image'])
+            'start_frame',  # specify width, height, number of components
+            'start_scan',   # specify which slice of data the top-to-bottom scan contains
+            'data',         # actual data
+            'end_image'])   # end of the image
        if field.name in field_list:
            return False
-        elif field.name.startswith('quantization['):
+        elif field.name.startswith('quantization['):  # custom Quant. tables
            return False
-        elif field.name.startswith('huffman['):
+        elif field.name.startswith('huffman['):  # custom Huffman tables
            return False
        return True
 class PngStripper(parser.GenericParser):
-    '''
+    ''' Represents a png file
-        represents a png file
-        see : http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
    '''
    def _should_remove(self, field):
+        ''' Return True if the field is compromising
        '''
-            return True if the field is compromising
+        field_list = frozenset([
-        '''
+            'id',
-        field_list = frozenset(['id', 'header', 'physical', 'end'])
+            'header',   # PNG header
+            'physical', # the intended pixel size or aspect ratio
+            'end'])     # end of the image
        if field.name in field_list:
            return False
-        if field.name.startswith('data['):
+        if field.name.startswith('data['):  # data
            return False
        return True
diff --git a/MAT/mat.py b/MAT/mat.py
index a1dc111..a669515 100644
--- a/MAT/mat.py
+++ b/MAT/mat.py
@@ -1,13 +1,12 @@
 #!/usr/bin/env python
-'''
+''' Metadata anonymisation toolkit library
-    Metadata anonymisation toolkit library
 '''
-import os
-import subprocess
 import logging
 import mimetypes
+import os
+import subprocess
 import xml.sax
 import hachoir_core.cmd_line
@@ -33,6 +32,8 @@ logging.basicConfig(filename=fname, level=LOGGING_LEVEL)
 import strippers  # this is loaded here because we need LOGGING_LEVEL
 def get_logo():
+    ''' Return the path to the logo
+    '''
    if os.path.isfile('./data/mat.png'):
        return './data/mat.png'
    elif os.path.isfile('/usr/share/pixmaps/mat.png'):
@@ -41,6 +42,8 @@ def get_logo():
        return '/usr/local/share/pixmaps/mat.png'
 def get_datadir():
+    ''' Return the path to the data directory
+    '''
    if os.path.isdir('./data/'):
        return './data/'
    elif os.path.isdir('/usr/local/share/mat/'):
@@ -49,8 +52,9 @@ def get_datadir():
        return '/usr/share/mat/'
 def list_supported_formats():
-    '''
+    ''' Return a list of all locally supported fileformat.
-        Return a list of all locally supported fileformat
+        It parses that FORMATS file, and removes locally
+        non-supported formats.
    '''
    handler = XMLParser()
    parser = xml.sax.make_parser()
@@ -67,8 +71,7 @@ def list_supported_formats():
    return localy_supported
 class XMLParser(xml.sax.handler.ContentHandler):
-    '''
+    ''' Parse the supported format xml, and return a corresponding
-        Parse the supported format xml, and return a corresponding
        list of dict
    '''
    def __init__(self):
@@ -78,18 +81,16 @@ class XMLParser(xml.sax.handler.ContentHandler):
        self.between = False
    def startElement(self, name, attrs):
-        '''
+        ''' Called when entering into xml tag
-            Called when entering into xml tag
        '''
        self.between = True
        self.key = name
        self.content = ''
    def endElement(self, name):
+        ''' Called when exiting a xml tag
        '''
-            Called when exiting a xml tag
+        if name == 'format':  # leaving a fileformat section
-        '''
-        if name == 'format':  # exiting a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        else:
@@ -98,19 +99,17 @@ class XMLParser(xml.sax.handler.ContentHandler):
            self.between = False
    def characters(self, characters):
-        '''
+        ''' Concatenate the content between opening and closing tags
-            Concatenate the content between opening and closing tags
        '''
        if self.between:
            self.content += characters
 def secure_remove(filename):
-    '''
+    ''' Securely remove the file
-        securely remove the file
    '''
    try:
-        if subprocess.call(['shred', '--remove', filename]) == 0:
+        if not subprocess.call(['shred', '--remove', filename]):
            return True
        else:
            raise OSError
@@ -126,22 +125,17 @@ def secure_remove(filename):
 def create_class_file(name, backup, **kwargs):
-    '''
+    ''' Return a $FILETYPEStripper() class,
-        return a $FILETYPEStripper() class,
        corresponding to the filetype of the given file
    '''
-    if not os.path.isfile(name):
+    if not os.path.isfile(name):  # check if the file exists
-        # check if the file exists
        logging.error('%s is not a valid file' % name)
        return None
-    if not os.access(name, os.R_OK):
+    if not os.access(name, os.R_OK):  #check read permissions
-        #check read permissions
        logging.error('%s is is not readable' % name)
        return None
-    is_writable = os.access(name, os.W_OK)
    if not os.path.getsize(name):
        #check if the file is not empty (hachoir crash on empty files)
        logging.error('%s is empty' % name)
@@ -161,7 +155,7 @@ def create_class_file(name, backup, **kwargs):
    mime = parser.mime_type
    if mime == 'application/zip':  # some formats are zipped stuff
-        if mimetypes.guess_type(name)[0] is not None:
+        if mimetypes.guess_type(name)[0]:
            mime =  mimetypes.guess_type(name)[0]
    if mime.startswith('application/vnd.oasis.opendocument'):
@@ -169,6 +163,8 @@ def create_class_file(name, backup, **kwargs):
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/officeopenxml'  # office openxml
+    is_writable = os.access(name, os.W_OK)
    try:
        stripper_class = strippers.STRIPPERS[mime]
    except KeyError:
diff --git a/MAT/mutagenstripper.py b/MAT/mutagenstripper.py
index ebc6b91..403c9a7 100644
--- a/MAT/mutagenstripper.py
+++ b/MAT/mutagenstripper.py
@@ -1,5 +1,7 @@
+''' Take care of mutagen-supported formats (audio)
+'''
 import parser
-import shutil
 class MutagenStripper(parser.GenericParser):
diff --git a/MAT/office.py b/MAT/office.py
index 583e0f9..91e49be 100644
--- a/MAT/office.py
+++ b/MAT/office.py
@@ -1,5 +1,4 @@
-'''
+''' Care about office's formats
-    Care about office's formats
 '''
 import os
@@ -23,14 +22,12 @@ import archive
 class OpenDocumentStripper(archive.GenericArchiveStripper):
-    '''
+    ''' An open document file is a zip, with xml file into.
-        An open document file is a zip, with xml file into.
        The one that interest us is meta.xml
    '''
    def get_meta(self):
-        '''
+        ''' Return a dict with all the meta of the file by
-            Return a dict with all the meta of the file by
            trying to read the meta.xml file.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
@@ -103,8 +100,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
        return True
    def is_clean(self):
-        '''
+        ''' Check if the file is clean from harmful metadatas
-            Check if the file is clean from harmful metadatas
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
@@ -120,8 +116,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
 class PdfStripper(parser.GenericParser):
-    '''
+    ''' Represent a PDF file
-        Represent a PDF file
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
@@ -137,8 +132,7 @@ class PdfStripper(parser.GenericParser):
            'producer', 'metadata'])
    def is_clean(self):
-        '''
+        ''' Check if the file is clean from harmful metadatas
-            Check if the file is clean from harmful metadatas
        '''
        for key in self.meta_list:
            if self.document.get_property(key):
@@ -146,8 +140,7 @@ class PdfStripper(parser.GenericParser):
        return True
    def remove_all(self):
-        '''
+        ''' Opening the PDF with poppler, then doing a render
-            Opening the PDF with poppler, then doing a render
            on a cairo pdfsurface for each pages.
            http://cairographics.org/documentation/pycairo/2/
@@ -195,8 +188,7 @@ pdfrw' % self.output)
        return True
    def get_meta(self):
-        '''
+        ''' Return a dict with all the meta of the file
-            Return a dict with all the meta of the file
        '''
        metadata = {}
        for key in self.meta_list:
@@ -252,8 +244,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
        return True
    def is_clean(self):
-        '''
+        ''' Check if the file is clean from harmful metadatas
-            Check if the file is clean from harmful metadatas
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        for item in zipin.namelist():
@@ -265,8 +256,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
        return czf.is_clean()
    def get_meta(self):
-        '''
+        ''' Return a dict with all the meta of the file
-            Return a dict with all the meta of the file
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
diff --git a/MAT/parser.py b/MAT/parser.py
index c1c3f4c..ae07d7e 100644
--- a/MAT/parser.py
+++ b/MAT/parser.py
@@ -1,22 +1,22 @@
+''' Parent class of all parser
 '''
-    Parent class of all parser
-'''
-import hachoir_core
-import hachoir_editor
 import os
-import tempfile
 import shutil
+import tempfile
+import hachoir_core
+import hachoir_editor
 import mat
-NOMETA = frozenset(('.bmp',  # image
+NOMETA = frozenset((
-          '.rdf',  # text
+    '.bmp',  # "raw" image
-          '.txt',  # plain text
+    '.rdf',  # text
-          '.xml',  # formated text (XML)
+    '.txt',  # plain text
-          '.rels',  # openXML formated text
+    '.xml',  # formated text (XML)
-          ))
+    '.rels', # openXML formated text
+))
 FIELD = object()
@@ -92,8 +92,7 @@ class GenericParser(object):
        del fieldset[field]
    def get_meta(self):
-        '''
+        ''' Return a dict with all the meta of the file
-            Return a dict with all the meta of the file
        '''
        metadata = {}
        self._get_meta(self.editor, metadata)
@@ -113,8 +112,7 @@ class GenericParser(object):
                self._get_meta(field, None)
    def _should_remove(self, key):
-        '''
+        ''' Return True if the field is compromising
-            Return True if the field is compromising
            abstract method
        '''
        raise NotImplementedError
@@ -125,8 +123,7 @@ class GenericParser(object):
        shutil.copy2(self.filename, self.filename + '.bak')
    def do_backup(self):
-        '''
+        ''' Keep a backup of the file if asked.
-            Keep a backup of the file if asked.
            The process of double-renaming is not very elegant,
            but it greatly simplify new strippers implementation.
diff --git a/MAT/strippers.py b/MAT/strippers.py
index f6ae899..78113ff 100644
--- a/MAT/strippers.py
+++ b/MAT/strippers.py
@@ -1,16 +1,15 @@
-'''
+''' Manage which fileformat can be processed
-    Manage which fileformat can be processed
 '''
-import images
+import archive
 import audio
 import gi
-import office
+import images
-import archive
+import logging
 import mat
 import misc
+import office
 import subprocess
-import logging
 STRIPPERS = {
    'application/x-tar': archive.TarStripper,

diff --git a/MAT/archive.py b/MAT/archive.py index 447f068..f07e18c 100644 --- a/MAT/archive.py +++ b/MAT/archive.py
@@ -1,21 +1,19 @@
1	'''	1	''' Take care of archives formats
2	Take care of archives formats
3	'''	2	'''
4		3
5	import zipfile
6	import shutil
7	import os
8	import logging	4	import logging
		5	import os
		6	import shutil
9	import tempfile	7	import tempfile
		8	import zipfile
10		9
11	import parser
12	import mat	10	import mat
		11	import parser
13	import tarfile	12	import tarfile
14		13
15		14
16	class GenericArchiveStripper(parser.GenericParser):	15	class GenericArchiveStripper(parser.GenericParser):
17	'''	16	''' Represent a generic archive
18	Represent a generic archive
19	'''	17	'''
20	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):	18	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
21	super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)	19	super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
@@ -24,8 +22,7 @@ class GenericArchiveStripper(parser.GenericParser):
24	self.tempdir = tempfile.mkdtemp()	22	self.tempdir = tempfile.mkdtemp()
25		23
26	def __del__(self):	24	def __del__(self):
27	'''	25	''' Remove the files inside the temp dir,
28	Remove the files inside the temp dir,
29	then remove the temp dir	26	then remove the temp dir
30	'''	27	'''
31	for root, dirs, files in os.walk(self.tempdir):	28	for root, dirs, files in os.walk(self.tempdir):
@@ -35,16 +32,16 @@ class GenericArchiveStripper(parser.GenericParser):
35	shutil.rmtree(self.tempdir)	32	shutil.rmtree(self.tempdir)
36		33
37	def remove_all(self):	34	def remove_all(self):
		35	''' Virtual method to remove all metadata
		36	'''
38	raise NotImplementedError	37	raise NotImplementedError
39		38
40		39
41	class ZipStripper(GenericArchiveStripper):	40	class ZipStripper(GenericArchiveStripper):
42	'''	41	''' Represent a zip file
43	Represent a zip file
44	'''	42	'''
45	def is_file_clean(self, fileinfo):	43	def is_file_clean(self, fileinfo):
46	'''	44	''' Check if a ZipInfo object is clean of metadatas added
47	Check if a ZipInfo object is clean of metadatas added
48	by zip itself, independently of the corresponding file metadatas	45	by zip itself, independently of the corresponding file metadatas
49	'''	46	'''
50	if fileinfo.comment:	47	if fileinfo.comment:


diff --git a/MAT/audio.py b/MAT/audio.py index 3c6c7bc..dae9d75 100644 --- a/MAT/audio.py +++ b/MAT/audio.py
@@ -1,5 +1,4 @@
1	'''	1	''' Care about audio fileformat
2	Care about audio fileformat
3	'''	2	'''
4		3
5	try:	4	try:
@@ -13,31 +12,27 @@ import mutagenstripper
13		12
14		13
15	class MpegAudioStripper(parser.GenericParser):	14	class MpegAudioStripper(parser.GenericParser):
16	'''	15	''' Represent mpeg audio file (mp3, ...)
17	Represent mpeg audio file (mp3, ...)
18	'''	16	'''
19	def _should_remove(self, field):	17	def _should_remove(self, field):
20	return field.name in ("id3v1", "id3v2")	18	return field.name in ("id3v1", "id3v2")
21		19
22		20
23	class OggStripper(mutagenstripper.MutagenStripper):	21	class OggStripper(mutagenstripper.MutagenStripper):
24	'''	22	''' Represent an ogg vorbis file
25	Represent an ogg vorbis file
26	'''	23	'''
27	def _create_mfile(self):	24	def _create_mfile(self):
28	self.mfile = OggVorbis(self.filename)	25	self.mfile = OggVorbis(self.filename)
29		26
30		27
31	class FlacStripper(mutagenstripper.MutagenStripper):	28	class FlacStripper(mutagenstripper.MutagenStripper):
32	'''	29	''' Represent a Flac audio file
33	Represent a Flac audio file
34	'''	30	'''
35	def _create_mfile(self):	31	def _create_mfile(self):
36	self.mfile = FLAC(self.filename)	32	self.mfile = FLAC(self.filename)
37		33
38	def remove_all(self):	34	def remove_all(self):
39	'''	35	''' Remove the "metadata" block from the file
40	Remove the "metadata" block from the file
41	'''	36	'''
42	super(FlacStripper, self).remove_all()	37	super(FlacStripper, self).remove_all()
43	self.mfile.clear_pictures()	38	self.mfile.clear_pictures()
@@ -45,14 +40,12 @@ class FlacStripper(mutagenstripper.MutagenStripper):
45	return True	40	return True
46		41
47	def is_clean(self):	42	def is_clean(self):
48	'''	43	''' Check if the "metadata" block is present in the file
49	Check if the "metadata" block is present in the file
50	'''	44	'''
51	return super(FlacStripper, self).is_clean() and not self.mfile.pictures	45	return super(FlacStripper, self).is_clean() and not self.mfile.pictures
52		46
53	def get_meta(self):	47	def get_meta(self):
54	'''	48	''' Return the content of the metadata block if present
55	Return the content of the metadata block if present
56	'''	49	'''
57	metadata = super(FlacStripper, self).get_meta()	50	metadata = super(FlacStripper, self).get_meta()
58	if self.mfile.pictures:	51	if self.mfile.pictures:


diff --git a/MAT/images.py b/MAT/images.py index 55c1a90..dc96e6a 100644 --- a/MAT/images.py +++ b/MAT/images.py
@@ -1,41 +1,52 @@
1	'''	1	''' Takes care about pictures formats
2	Takes care about pictures formats	2
		3	References:
		4	- JFIF: http://www.ecma-international.org/publications/techreports/E-TR-098.htm
		5	- PNG: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
		6	- PNG: http://www.w3.org/TR/PNG-Chunks.html
3	'''	7	'''
4		8
5	import parser	9	import parser
6		10
7		11
8	class JpegStripper(parser.GenericParser):	12	class JpegStripper(parser.GenericParser):
9	'''	13	''' Represents a jpeg file.
10	represents a jpeg file	14	Custom Huffman and Quantization tables
		15	are stripped: they may leak
		16	some info, and the quality loss is minor.
11	'''	17	'''
12	def _should_remove(self, field):	18	def _should_remove(self, field):
		19	''' Return True if the field is compromising
13	'''	20	'''
14	return True if the field is compromising	21	field_list = frozenset([
15	'''	22	'start_image', # start of the image
16	field_list = frozenset(['start_image', 'app0', 'start_frame',	23	'app0', # JFIF data
17	'start_scan', 'data', 'end_image'])	24	'start_frame', # specify width, height, number of components
		25	'start_scan', # specify which slice of data the top-to-bottom scan contains
		26	'data', # actual data
		27	'end_image']) # end of the image
18	if field.name in field_list:	28	if field.name in field_list:
19	return False	29	return False
20	elif field.name.startswith('quantization['):	30	elif field.name.startswith('quantization['): # custom Quant. tables
21	return False	31	return False
22	elif field.name.startswith('huffman['):	32	elif field.name.startswith('huffman['): # custom Huffman tables
23	return False	33	return False
24	return True	34	return True
25		35
26		36
27	class PngStripper(parser.GenericParser):	37	class PngStripper(parser.GenericParser):
28	'''	38	''' Represents a png file
29	represents a png file
30	see : http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
31	'''	39	'''
32	def _should_remove(self, field):	40	def _should_remove(self, field):
		41	''' Return True if the field is compromising
33	'''	42	'''
34	return True if the field is compromising	43	field_list = frozenset([
35	'''	44	'id',
36	field_list = frozenset(['id', 'header', 'physical', 'end'])	45	'header', # PNG header
		46	'physical', # the intended pixel size or aspect ratio
		47	'end']) # end of the image
37	if field.name in field_list:	48	if field.name in field_list:
38	return False	49	return False
39	if field.name.startswith('data['):	50	if field.name.startswith('data['): # data
40	return False	51	return False
41	return True	52	return True


diff --git a/MAT/mat.py b/MAT/mat.py index a1dc111..a669515 100644 --- a/MAT/mat.py +++ b/MAT/mat.py
@@ -1,13 +1,12 @@
1	#!/usr/bin/env python	1	#!/usr/bin/env python
2		2
3	'''	3	''' Metadata anonymisation toolkit library
4	Metadata anonymisation toolkit library
5	'''	4	'''
6		5
7	import os
8	import subprocess
9	import logging	6	import logging
10	import mimetypes	7	import mimetypes
		8	import os
		9	import subprocess
11	import xml.sax	10	import xml.sax
12		11
13	import hachoir_core.cmd_line	12	import hachoir_core.cmd_line
@@ -33,6 +32,8 @@ logging.basicConfig(filename=fname, level=LOGGING_LEVEL)
33	import strippers # this is loaded here because we need LOGGING_LEVEL	32	import strippers # this is loaded here because we need LOGGING_LEVEL
34		33
35	def get_logo():	34	def get_logo():
		35	''' Return the path to the logo
		36	'''
36	if os.path.isfile('./data/mat.png'):	37	if os.path.isfile('./data/mat.png'):
37	return './data/mat.png'	38	return './data/mat.png'
38	elif os.path.isfile('/usr/share/pixmaps/mat.png'):	39	elif os.path.isfile('/usr/share/pixmaps/mat.png'):
@@ -41,6 +42,8 @@ def get_logo():
41	return '/usr/local/share/pixmaps/mat.png'	42	return '/usr/local/share/pixmaps/mat.png'
42		43
43	def get_datadir():	44	def get_datadir():
		45	''' Return the path to the data directory
		46	'''
44	if os.path.isdir('./data/'):	47	if os.path.isdir('./data/'):
45	return './data/'	48	return './data/'
46	elif os.path.isdir('/usr/local/share/mat/'):	49	elif os.path.isdir('/usr/local/share/mat/'):
@@ -49,8 +52,9 @@ def get_datadir():
49	return '/usr/share/mat/'	52	return '/usr/share/mat/'
50		53
51	def list_supported_formats():	54	def list_supported_formats():
52	'''	55	''' Return a list of all locally supported fileformat.
53	Return a list of all locally supported fileformat	56	It parses that FORMATS file, and removes locally
		57	non-supported formats.
54	'''	58	'''
55	handler = XMLParser()	59	handler = XMLParser()
56	parser = xml.sax.make_parser()	60	parser = xml.sax.make_parser()
@@ -67,8 +71,7 @@ def list_supported_formats():
67	return localy_supported	71	return localy_supported
68		72
69	class XMLParser(xml.sax.handler.ContentHandler):	73	class XMLParser(xml.sax.handler.ContentHandler):
70	'''	74	''' Parse the supported format xml, and return a corresponding
71	Parse the supported format xml, and return a corresponding
72	list of dict	75	list of dict
73	'''	76	'''
74	def __init__(self):	77	def __init__(self):
@@ -78,18 +81,16 @@ class XMLParser(xml.sax.handler.ContentHandler):
78	self.between = False	81	self.between = False
79		82
80	def startElement(self, name, attrs):	83	def startElement(self, name, attrs):
81	'''	84	''' Called when entering into xml tag
82	Called when entering into xml tag
83	'''	85	'''
84	self.between = True	86	self.between = True
85	self.key = name	87	self.key = name
86	self.content = ''	88	self.content = ''
87		89
88	def endElement(self, name):	90	def endElement(self, name):
		91	''' Called when exiting a xml tag
89	'''	92	'''
90	Called when exiting a xml tag	93	if name == 'format': # leaving a fileformat section
91	'''
92	if name == 'format': # exiting a fileformat section
93	self.list.append(self.dict.copy())	94	self.list.append(self.dict.copy())
94	self.dict.clear()	95	self.dict.clear()
95	else:	96	else:
@@ -98,19 +99,17 @@ class XMLParser(xml.sax.handler.ContentHandler):
98	self.between = False	99	self.between = False
99		100
100	def characters(self, characters):	101	def characters(self, characters):
101	'''	102	''' Concatenate the content between opening and closing tags
102	Concatenate the content between opening and closing tags
103	'''	103	'''
104	if self.between:	104	if self.between:
105	self.content += characters	105	self.content += characters
106		106
107		107
108	def secure_remove(filename):	108	def secure_remove(filename):
109	'''	109	''' Securely remove the file
110	securely remove the file
111	'''	110	'''
112	try:	111	try:
113	if subprocess.call(['shred', '--remove', filename]) == 0:	112	if not subprocess.call(['shred', '--remove', filename]):
114	return True	113	return True
115	else:	114	else:
116	raise OSError	115	raise OSError
@@ -126,22 +125,17 @@ def secure_remove(filename):
126		125
127		126
128	def create_class_file(name, backup, **kwargs):	127	def create_class_file(name, backup, **kwargs):
129	'''	128	''' Return a $FILETYPEStripper() class,
130	return a $FILETYPEStripper() class,
131	corresponding to the filetype of the given file	129	corresponding to the filetype of the given file
132	'''	130	'''
133	if not os.path.isfile(name):	131	if not os.path.isfile(name): # check if the file exists
134	# check if the file exists
135	logging.error('%s is not a valid file' % name)	132	logging.error('%s is not a valid file' % name)
136	return None	133	return None
137		134
138	if not os.access(name, os.R_OK):	135	if not os.access(name, os.R_OK): #check read permissions
139	#check read permissions
140	logging.error('%s is is not readable' % name)	136	logging.error('%s is is not readable' % name)
141	return None	137	return None
142		138
143	is_writable = os.access(name, os.W_OK)
144
145	if not os.path.getsize(name):	139	if not os.path.getsize(name):
146	#check if the file is not empty (hachoir crash on empty files)	140	#check if the file is not empty (hachoir crash on empty files)
147	logging.error('%s is empty' % name)	141	logging.error('%s is empty' % name)
@@ -161,7 +155,7 @@ def create_class_file(name, backup, **kwargs):
161	mime = parser.mime_type	155	mime = parser.mime_type
162		156
163	if mime == 'application/zip': # some formats are zipped stuff	157	if mime == 'application/zip': # some formats are zipped stuff
164	if mimetypes.guess_type(name)[0] is not None:	158	if mimetypes.guess_type(name)[0]:
165	mime = mimetypes.guess_type(name)[0]	159	mime = mimetypes.guess_type(name)[0]
166		160
167	if mime.startswith('application/vnd.oasis.opendocument'):	161	if mime.startswith('application/vnd.oasis.opendocument'):
@@ -169,6 +163,8 @@ def create_class_file(name, backup, **kwargs):
169	elif mime.startswith('application/vnd.openxmlformats-officedocument'):	163	elif mime.startswith('application/vnd.openxmlformats-officedocument'):
170	mime = 'application/officeopenxml' # office openxml	164	mime = 'application/officeopenxml' # office openxml
171		165
		166	is_writable = os.access(name, os.W_OK)
		167
172	try:	168	try:
173	stripper_class = strippers.STRIPPERS[mime]	169	stripper_class = strippers.STRIPPERS[mime]
174	except KeyError:	170	except KeyError:


diff --git a/MAT/mutagenstripper.py b/MAT/mutagenstripper.py index ebc6b91..403c9a7 100644 --- a/MAT/mutagenstripper.py +++ b/MAT/mutagenstripper.py
@@ -1,5 +1,7 @@
		1	''' Take care of mutagen-supported formats (audio)
		2	'''
		3
1	import parser	4	import parser
2	import shutil
3		5
4		6
5	class MutagenStripper(parser.GenericParser):	7	class MutagenStripper(parser.GenericParser):


diff --git a/MAT/office.py b/MAT/office.py index 583e0f9..91e49be 100644 --- a/MAT/office.py +++ b/MAT/office.py
@@ -1,5 +1,4 @@
1	'''	1	''' Care about office's formats
2	Care about office's formats
3	'''	2	'''
4		3
5	import os	4	import os
@@ -23,14 +22,12 @@ import archive
23		22
24		23
25	class OpenDocumentStripper(archive.GenericArchiveStripper):	24	class OpenDocumentStripper(archive.GenericArchiveStripper):
26	'''	25	''' An open document file is a zip, with xml file into.
27	An open document file is a zip, with xml file into.
28	The one that interest us is meta.xml	26	The one that interest us is meta.xml
29	'''	27	'''
30		28
31	def get_meta(self):	29	def get_meta(self):
32	'''	30	''' Return a dict with all the meta of the file by
33	Return a dict with all the meta of the file by
34	trying to read the meta.xml file.	31	trying to read the meta.xml file.
35	'''	32	'''
36	zipin = zipfile.ZipFile(self.filename, 'r')	33	zipin = zipfile.ZipFile(self.filename, 'r')
@@ -103,8 +100,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
103	return True	100	return True
104		101
105	def is_clean(self):	102	def is_clean(self):
106	'''	103	''' Check if the file is clean from harmful metadatas
107	Check if the file is clean from harmful metadatas
108	'''	104	'''
109	zipin = zipfile.ZipFile(self.filename, 'r')	105	zipin = zipfile.ZipFile(self.filename, 'r')
110	try:	106	try:
@@ -120,8 +116,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
120		116
121		117
122	class PdfStripper(parser.GenericParser):	118	class PdfStripper(parser.GenericParser):
123	'''	119	''' Represent a PDF file
124	Represent a PDF file
125	'''	120	'''
126	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):	121	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
127	super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)	122	super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
@@ -137,8 +132,7 @@ class PdfStripper(parser.GenericParser):
137	'producer', 'metadata'])	132	'producer', 'metadata'])
138		133
139	def is_clean(self):	134	def is_clean(self):
140	'''	135	''' Check if the file is clean from harmful metadatas
141	Check if the file is clean from harmful metadatas
142	'''	136	'''
143	for key in self.meta_list:	137	for key in self.meta_list:
144	if self.document.get_property(key):	138	if self.document.get_property(key):
@@ -146,8 +140,7 @@ class PdfStripper(parser.GenericParser):
146	return True	140	return True
147		141
148	def remove_all(self):	142	def remove_all(self):
149	'''	143	''' Opening the PDF with poppler, then doing a render
150	Opening the PDF with poppler, then doing a render
151	on a cairo pdfsurface for each pages.	144	on a cairo pdfsurface for each pages.
152		145
153	http://cairographics.org/documentation/pycairo/2/	146	http://cairographics.org/documentation/pycairo/2/
@@ -195,8 +188,7 @@ pdfrw' % self.output)
195	return True	188	return True
196		189
197	def get_meta(self):	190	def get_meta(self):
198	'''	191	''' Return a dict with all the meta of the file
199	Return a dict with all the meta of the file
200	'''	192	'''
201	metadata = {}	193	metadata = {}
202	for key in self.meta_list:	194	for key in self.meta_list:
@@ -252,8 +244,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
252	return True	244	return True
253		245
254	def is_clean(self):	246	def is_clean(self):
255	'''	247	''' Check if the file is clean from harmful metadatas
256	Check if the file is clean from harmful metadatas
257	'''	248	'''
258	zipin = zipfile.ZipFile(self.filename, 'r')	249	zipin = zipfile.ZipFile(self.filename, 'r')
259	for item in zipin.namelist():	250	for item in zipin.namelist():
@@ -265,8 +256,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
265	return czf.is_clean()	256	return czf.is_clean()
266		257
267	def get_meta(self):	258	def get_meta(self):
268	'''	259	''' Return a dict with all the meta of the file
269	Return a dict with all the meta of the file
270	'''	260	'''
271	zipin = zipfile.ZipFile(self.filename, 'r')	261	zipin = zipfile.ZipFile(self.filename, 'r')
272	metadata = {}	262	metadata = {}


diff --git a/MAT/parser.py b/MAT/parser.py index c1c3f4c..ae07d7e 100644 --- a/MAT/parser.py +++ b/MAT/parser.py
@@ -1,22 +1,22 @@
		1	''' Parent class of all parser
1	'''	2	'''
2	Parent class of all parser
3	'''
4
5	import hachoir_core
6	import hachoir_editor
7		3
8	import os	4	import os
9	import tempfile
10	import shutil	5	import shutil
		6	import tempfile
		7
		8	import hachoir_core
		9	import hachoir_editor
11		10
12	import mat	11	import mat
13		12
14	NOMETA = frozenset(('.bmp', # image	13	NOMETA = frozenset((
15	'.rdf', # text	14	'.bmp', # "raw" image
16	'.txt', # plain text	15	'.rdf', # text
17	'.xml', # formated text (XML)	16	'.txt', # plain text
18	'.rels', # openXML formated text	17	'.xml', # formated text (XML)
19	))	18	'.rels', # openXML formated text
		19	))
20		20
21	FIELD = object()	21	FIELD = object()
22		22
@@ -92,8 +92,7 @@ class GenericParser(object):
92	del fieldset[field]	92	del fieldset[field]
93		93
94	def get_meta(self):	94	def get_meta(self):
95	'''	95	''' Return a dict with all the meta of the file
96	Return a dict with all the meta of the file
97	'''	96	'''
98	metadata = {}	97	metadata = {}
99	self._get_meta(self.editor, metadata)	98	self._get_meta(self.editor, metadata)
@@ -113,8 +112,7 @@ class GenericParser(object):
113	self._get_meta(field, None)	112	self._get_meta(field, None)
114		113
115	def _should_remove(self, key):	114	def _should_remove(self, key):
116	'''	115	''' Return True if the field is compromising
117	Return True if the field is compromising
118	abstract method	116	abstract method
119	'''	117	'''
120	raise NotImplementedError	118	raise NotImplementedError
@@ -125,8 +123,7 @@ class GenericParser(object):
125	shutil.copy2(self.filename, self.filename + '.bak')	123	shutil.copy2(self.filename, self.filename + '.bak')
126		124
127	def do_backup(self):	125	def do_backup(self):
128	'''	126	''' Keep a backup of the file if asked.
129	Keep a backup of the file if asked.
130		127
131	The process of double-renaming is not very elegant,	128	The process of double-renaming is not very elegant,
132	but it greatly simplify new strippers implementation.	129	but it greatly simplify new strippers implementation.


diff --git a/MAT/strippers.py b/MAT/strippers.py index f6ae899..78113ff 100644 --- a/MAT/strippers.py +++ b/MAT/strippers.py
@@ -1,16 +1,15 @@
1	'''	1	''' Manage which fileformat can be processed
2	Manage which fileformat can be processed
3	'''	2	'''
4		3
5	import images	4	import archive
6	import audio	5	import audio
7	import gi	6	import gi
8	import office	7	import images
9	import archive	8	import logging
10	import mat	9	import mat
11	import misc	10	import misc
		11	import office
12	import subprocess	12	import subprocess
13	import logging
14		13
15	STRIPPERS = {	14	STRIPPERS = {
16	'application/x-tar': archive.TarStripper,	15	'application/x-tar': archive.TarStripper,