From 4c81e731a485d3ea84049ef6d568153c8b10e90b Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 27 Oct 2013 23:01:20 +0000 Subject: Improves documentation --- MAT/archive.py | 25 +++++++++++-------------- MAT/audio.py | 21 +++++++-------------- MAT/images.py | 45 ++++++++++++++++++++++++++++----------------- MAT/mat.py | 50 +++++++++++++++++++++++--------------------------- MAT/mutagenstripper.py | 4 +++- MAT/office.py | 30 ++++++++++-------------------- MAT/parser.py | 33 +++++++++++++++------------------ MAT/strippers.py | 11 +++++------ 8 files changed, 102 insertions(+), 117 deletions(-) diff --git a/MAT/archive.py b/MAT/archive.py index 447f068..f07e18c 100644 --- a/MAT/archive.py +++ b/MAT/archive.py @@ -1,21 +1,19 @@ -''' - Take care of archives formats +''' Take care of archives formats ''' -import zipfile -import shutil -import os import logging +import os +import shutil import tempfile +import zipfile -import parser import mat +import parser import tarfile class GenericArchiveStripper(parser.GenericParser): - ''' - Represent a generic archive + ''' Represent a generic archive ''' def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): super(GenericArchiveStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) @@ -24,8 +22,7 @@ class GenericArchiveStripper(parser.GenericParser): self.tempdir = tempfile.mkdtemp() def __del__(self): - ''' - Remove the files inside the temp dir, + ''' Remove the files inside the temp dir, then remove the temp dir ''' for root, dirs, files in os.walk(self.tempdir): @@ -35,16 +32,16 @@ class GenericArchiveStripper(parser.GenericParser): shutil.rmtree(self.tempdir) def remove_all(self): + ''' Virtual method to remove all metadata + ''' raise NotImplementedError class ZipStripper(GenericArchiveStripper): - ''' - Represent a zip file + ''' Represent a zip file ''' def is_file_clean(self, fileinfo): - ''' - Check if a ZipInfo object is clean of metadatas added + ''' Check if a ZipInfo object is clean of metadatas added by zip itself, independently of the corresponding file metadatas ''' if fileinfo.comment: diff --git a/MAT/audio.py b/MAT/audio.py index 3c6c7bc..dae9d75 100644 --- a/MAT/audio.py +++ b/MAT/audio.py @@ -1,5 +1,4 @@ -''' - Care about audio fileformat +''' Care about audio fileformat ''' try: @@ -13,31 +12,27 @@ import mutagenstripper class MpegAudioStripper(parser.GenericParser): - ''' - Represent mpeg audio file (mp3, ...) + ''' Represent mpeg audio file (mp3, ...) ''' def _should_remove(self, field): return field.name in ("id3v1", "id3v2") class OggStripper(mutagenstripper.MutagenStripper): - ''' - Represent an ogg vorbis file + ''' Represent an ogg vorbis file ''' def _create_mfile(self): self.mfile = OggVorbis(self.filename) class FlacStripper(mutagenstripper.MutagenStripper): - ''' - Represent a Flac audio file + ''' Represent a Flac audio file ''' def _create_mfile(self): self.mfile = FLAC(self.filename) def remove_all(self): - ''' - Remove the "metadata" block from the file + ''' Remove the "metadata" block from the file ''' super(FlacStripper, self).remove_all() self.mfile.clear_pictures() @@ -45,14 +40,12 @@ class FlacStripper(mutagenstripper.MutagenStripper): return True def is_clean(self): - ''' - Check if the "metadata" block is present in the file + ''' Check if the "metadata" block is present in the file ''' return super(FlacStripper, self).is_clean() and not self.mfile.pictures def get_meta(self): - ''' - Return the content of the metadata block if present + ''' Return the content of the metadata block if present ''' metadata = super(FlacStripper, self).get_meta() if self.mfile.pictures: diff --git a/MAT/images.py b/MAT/images.py index 55c1a90..dc96e6a 100644 --- a/MAT/images.py +++ b/MAT/images.py @@ -1,41 +1,52 @@ -''' - Takes care about pictures formats +''' Takes care about pictures formats + +References: + - JFIF: http://www.ecma-international.org/publications/techreports/E-TR-098.htm + - PNG: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html + - PNG: http://www.w3.org/TR/PNG-Chunks.html ''' import parser class JpegStripper(parser.GenericParser): - ''' - represents a jpeg file + ''' Represents a jpeg file. + Custom Huffman and Quantization tables + are stripped: they may leak + some info, and the quality loss is minor. ''' def _should_remove(self, field): + ''' Return True if the field is compromising ''' - return True if the field is compromising - ''' - field_list = frozenset(['start_image', 'app0', 'start_frame', - 'start_scan', 'data', 'end_image']) + field_list = frozenset([ + 'start_image', # start of the image + 'app0', # JFIF data + 'start_frame', # specify width, height, number of components + 'start_scan', # specify which slice of data the top-to-bottom scan contains + 'data', # actual data + 'end_image']) # end of the image if field.name in field_list: return False - elif field.name.startswith('quantization['): + elif field.name.startswith('quantization['): # custom Quant. tables return False - elif field.name.startswith('huffman['): + elif field.name.startswith('huffman['): # custom Huffman tables return False return True class PngStripper(parser.GenericParser): - ''' - represents a png file - see : http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html + ''' Represents a png file ''' def _should_remove(self, field): + ''' Return True if the field is compromising ''' - return True if the field is compromising - ''' - field_list = frozenset(['id', 'header', 'physical', 'end']) + field_list = frozenset([ + 'id', + 'header', # PNG header + 'physical', # the intended pixel size or aspect ratio + 'end']) # end of the image if field.name in field_list: return False - if field.name.startswith('data['): + if field.name.startswith('data['): # data return False return True diff --git a/MAT/mat.py b/MAT/mat.py index a1dc111..a669515 100644 --- a/MAT/mat.py +++ b/MAT/mat.py @@ -1,13 +1,12 @@ #!/usr/bin/env python -''' - Metadata anonymisation toolkit library +''' Metadata anonymisation toolkit library ''' -import os -import subprocess import logging import mimetypes +import os +import subprocess import xml.sax import hachoir_core.cmd_line @@ -33,6 +32,8 @@ logging.basicConfig(filename=fname, level=LOGGING_LEVEL) import strippers # this is loaded here because we need LOGGING_LEVEL def get_logo(): + ''' Return the path to the logo + ''' if os.path.isfile('./data/mat.png'): return './data/mat.png' elif os.path.isfile('/usr/share/pixmaps/mat.png'): @@ -41,6 +42,8 @@ def get_logo(): return '/usr/local/share/pixmaps/mat.png' def get_datadir(): + ''' Return the path to the data directory + ''' if os.path.isdir('./data/'): return './data/' elif os.path.isdir('/usr/local/share/mat/'): @@ -49,8 +52,9 @@ def get_datadir(): return '/usr/share/mat/' def list_supported_formats(): - ''' - Return a list of all locally supported fileformat + ''' Return a list of all locally supported fileformat. + It parses that FORMATS file, and removes locally + non-supported formats. ''' handler = XMLParser() parser = xml.sax.make_parser() @@ -67,8 +71,7 @@ def list_supported_formats(): return localy_supported class XMLParser(xml.sax.handler.ContentHandler): - ''' - Parse the supported format xml, and return a corresponding + ''' Parse the supported format xml, and return a corresponding list of dict ''' def __init__(self): @@ -78,18 +81,16 @@ class XMLParser(xml.sax.handler.ContentHandler): self.between = False def startElement(self, name, attrs): - ''' - Called when entering into xml tag + ''' Called when entering into xml tag ''' self.between = True self.key = name self.content = '' def endElement(self, name): + ''' Called when exiting a xml tag ''' - Called when exiting a xml tag - ''' - if name == 'format': # exiting a fileformat section + if name == 'format': # leaving a fileformat section self.list.append(self.dict.copy()) self.dict.clear() else: @@ -98,19 +99,17 @@ class XMLParser(xml.sax.handler.ContentHandler): self.between = False def characters(self, characters): - ''' - Concatenate the content between opening and closing tags + ''' Concatenate the content between opening and closing tags ''' if self.between: self.content += characters def secure_remove(filename): - ''' - securely remove the file + ''' Securely remove the file ''' try: - if subprocess.call(['shred', '--remove', filename]) == 0: + if not subprocess.call(['shred', '--remove', filename]): return True else: raise OSError @@ -126,22 +125,17 @@ def secure_remove(filename): def create_class_file(name, backup, **kwargs): - ''' - return a $FILETYPEStripper() class, + ''' Return a $FILETYPEStripper() class, corresponding to the filetype of the given file ''' - if not os.path.isfile(name): - # check if the file exists + if not os.path.isfile(name): # check if the file exists logging.error('%s is not a valid file' % name) return None - if not os.access(name, os.R_OK): - #check read permissions + if not os.access(name, os.R_OK): #check read permissions logging.error('%s is is not readable' % name) return None - is_writable = os.access(name, os.W_OK) - if not os.path.getsize(name): #check if the file is not empty (hachoir crash on empty files) logging.error('%s is empty' % name) @@ -161,7 +155,7 @@ def create_class_file(name, backup, **kwargs): mime = parser.mime_type if mime == 'application/zip': # some formats are zipped stuff - if mimetypes.guess_type(name)[0] is not None: + if mimetypes.guess_type(name)[0]: mime = mimetypes.guess_type(name)[0] if mime.startswith('application/vnd.oasis.opendocument'): @@ -169,6 +163,8 @@ def create_class_file(name, backup, **kwargs): elif mime.startswith('application/vnd.openxmlformats-officedocument'): mime = 'application/officeopenxml' # office openxml + is_writable = os.access(name, os.W_OK) + try: stripper_class = strippers.STRIPPERS[mime] except KeyError: diff --git a/MAT/mutagenstripper.py b/MAT/mutagenstripper.py index ebc6b91..403c9a7 100644 --- a/MAT/mutagenstripper.py +++ b/MAT/mutagenstripper.py @@ -1,5 +1,7 @@ +''' Take care of mutagen-supported formats (audio) +''' + import parser -import shutil class MutagenStripper(parser.GenericParser): diff --git a/MAT/office.py b/MAT/office.py index 583e0f9..91e49be 100644 --- a/MAT/office.py +++ b/MAT/office.py @@ -1,5 +1,4 @@ -''' - Care about office's formats +''' Care about office's formats ''' import os @@ -23,14 +22,12 @@ import archive class OpenDocumentStripper(archive.GenericArchiveStripper): - ''' - An open document file is a zip, with xml file into. + ''' An open document file is a zip, with xml file into. The one that interest us is meta.xml ''' def get_meta(self): - ''' - Return a dict with all the meta of the file by + ''' Return a dict with all the meta of the file by trying to read the meta.xml file. ''' zipin = zipfile.ZipFile(self.filename, 'r') @@ -103,8 +100,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): return True def is_clean(self): - ''' - Check if the file is clean from harmful metadatas + ''' Check if the file is clean from harmful metadatas ''' zipin = zipfile.ZipFile(self.filename, 'r') try: @@ -120,8 +116,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): class PdfStripper(parser.GenericParser): - ''' - Represent a PDF file + ''' Represent a PDF file ''' def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) @@ -137,8 +132,7 @@ class PdfStripper(parser.GenericParser): 'producer', 'metadata']) def is_clean(self): - ''' - Check if the file is clean from harmful metadatas + ''' Check if the file is clean from harmful metadatas ''' for key in self.meta_list: if self.document.get_property(key): @@ -146,8 +140,7 @@ class PdfStripper(parser.GenericParser): return True def remove_all(self): - ''' - Opening the PDF with poppler, then doing a render + ''' Opening the PDF with poppler, then doing a render on a cairo pdfsurface for each pages. http://cairographics.org/documentation/pycairo/2/ @@ -195,8 +188,7 @@ pdfrw' % self.output) return True def get_meta(self): - ''' - Return a dict with all the meta of the file + ''' Return a dict with all the meta of the file ''' metadata = {} for key in self.meta_list: @@ -252,8 +244,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): return True def is_clean(self): - ''' - Check if the file is clean from harmful metadatas + ''' Check if the file is clean from harmful metadatas ''' zipin = zipfile.ZipFile(self.filename, 'r') for item in zipin.namelist(): @@ -265,8 +256,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): return czf.is_clean() def get_meta(self): - ''' - Return a dict with all the meta of the file + ''' Return a dict with all the meta of the file ''' zipin = zipfile.ZipFile(self.filename, 'r') metadata = {} diff --git a/MAT/parser.py b/MAT/parser.py index c1c3f4c..ae07d7e 100644 --- a/MAT/parser.py +++ b/MAT/parser.py @@ -1,22 +1,22 @@ +''' Parent class of all parser ''' - Parent class of all parser -''' - -import hachoir_core -import hachoir_editor import os -import tempfile import shutil +import tempfile + +import hachoir_core +import hachoir_editor import mat -NOMETA = frozenset(('.bmp', # image - '.rdf', # text - '.txt', # plain text - '.xml', # formated text (XML) - '.rels', # openXML formated text - )) +NOMETA = frozenset(( + '.bmp', # "raw" image + '.rdf', # text + '.txt', # plain text + '.xml', # formated text (XML) + '.rels', # openXML formated text +)) FIELD = object() @@ -92,8 +92,7 @@ class GenericParser(object): del fieldset[field] def get_meta(self): - ''' - Return a dict with all the meta of the file + ''' Return a dict with all the meta of the file ''' metadata = {} self._get_meta(self.editor, metadata) @@ -113,8 +112,7 @@ class GenericParser(object): self._get_meta(field, None) def _should_remove(self, key): - ''' - Return True if the field is compromising + ''' Return True if the field is compromising abstract method ''' raise NotImplementedError @@ -125,8 +123,7 @@ class GenericParser(object): shutil.copy2(self.filename, self.filename + '.bak') def do_backup(self): - ''' - Keep a backup of the file if asked. + ''' Keep a backup of the file if asked. The process of double-renaming is not very elegant, but it greatly simplify new strippers implementation. diff --git a/MAT/strippers.py b/MAT/strippers.py index f6ae899..78113ff 100644 --- a/MAT/strippers.py +++ b/MAT/strippers.py @@ -1,16 +1,15 @@ -''' - Manage which fileformat can be processed +''' Manage which fileformat can be processed ''' -import images +import archive import audio import gi -import office -import archive +import images +import logging import mat import misc +import office import subprocess -import logging STRIPPERS = { 'application/x-tar': archive.TarStripper, -- cgit v1.3