From e62ae6a87f630cbd389cf1b75672b06cd56973c8 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Tue, 26 Jul 2011 14:06:38 +0200 Subject: Pyflakes and pep8 validation --- cli.py | 27 +++++++++++++-------- gui.py | 74 +++++++++++++++++++++++++++++++--------------------------- lib/archive.py | 20 +++++++++------- lib/audio.py | 4 ++++ lib/images.py | 4 +--- lib/mat.py | 19 ++++++++------- lib/misc.py | 48 +++++++++++++++++++++++++++++++++++++ lib/office.py | 24 +++++++++---------- lib/parser.py | 21 +++++++---------- 9 files changed, 151 insertions(+), 90 deletions(-) create mode 100644 lib/misc.py diff --git a/cli.py b/cli.py index b9c8a5c..bfedbf6 100755 --- a/cli.py +++ b/cli.py @@ -10,10 +10,11 @@ import hachoir_core __version__ = '0.1' + def parse(): parser = optparse.OptionParser(usage='%prog [options] filename') - parser.add_option('--add2archive', '-a', action='store_true', default=False, - help='Add to outputed archive non-supported filetypes') + parser.add_option('--add2archive', '-a', action='store_true', + default=False, help='Add to outputed archive non-supported filetypes') parser.add_option('--backup', '-b', action='store_true', default=False, help='Keep a backup copy') parser.add_option('--check', '-c', action='store_true', default=False, @@ -31,15 +32,17 @@ def parse(): sys.exit(0) return values, arguments + def display_version(*args): print('Metadata Anonymisation Toolkit version %s') % mat.__version__ print('CLI version %s') % __version__ print('Hachoir version %s') % hachoir_core.__version__ sys.exit(0) + def list_meta(class_file, filename): ''' - Print all the meta of 'filename' on stdout + Print all the meta of 'filename' on stdout ''' print('[+] File %s :' % filename) if class_file.is_clean(): @@ -48,18 +51,20 @@ def list_meta(class_file, filename): for key, value in class_file.get_meta().iteritems(): print(key + ' : ' + str(value)) + def is_clean(class_file, filename): ''' - Say if 'filename' is clean or not + Say if 'filename' is clean or not ''' if class_file.is_clean(): print('[+] %s is clean' % filename) else: print('[+] %s is not clean' % filename) + def clean_meta(class_file, filename): ''' - Clean the file 'filename' + Clean the file 'filename' ''' print('[+] Cleaning %s' % filename) if class_file.is_clean(): @@ -68,9 +73,10 @@ def clean_meta(class_file, filename): class_file.remove_all() print('%s cleaned !' % filename) + def clean_meta_ugly(class_file, filename): ''' - Clean the file 'filename', ugly way + Clean the file 'filename', ugly way ''' print('[+] Cleaning %s' % filename) if class_file.is_clean(): @@ -79,17 +85,18 @@ def clean_meta_ugly(class_file, filename): class_file.remove_all_ugly() print('%s cleaned' % filename) + def main(): args, filenames = parse() #func receive the function correponding to the options given as parameters - if args.display is True: #only print metadatas + if args.display is True: # only print metadatas func = list_meta - elif args.check is True: #only check if the file is clean + elif args.check is True: # only check if the file is clean func = is_clean - elif args.ugly is True: #destructive anonymisation method + elif args.ugly is True: # destructive anonymisation method func = clean_meta_ugly - else: #clean the file + else: # clean the file func = clean_meta for filename in filenames: diff --git a/gui.py b/gui.py index 978bd4b..550010a 100644 --- a/gui.py +++ b/gui.py @@ -1,21 +1,21 @@ #!/usr/bin/env python -from gi.repository import Gtk, GObject, Gdk +from gi.repository import Gtk, GObject import os -import glob import logging from lib import mat __version__ = '0.1' __author__ = 'jvoisin' -logging.basicConfig(level = mat.LOGGING_LEVEL) +logging.basicConfig(level=mat.LOGGING_LEVEL) SUPPORTED = (('image/png', 'image/jpeg', 'image/gif', 'misc/pdf'), ('*.jpg', '*.jpeg', '*.png', '*.bmp', '*.pdf', '*.tar', '*.tar.bz2', '*.tar.gz', '*.mp3')) + class cfile(GObject.GObject): ''' Contain the class-file of the file "path" @@ -29,6 +29,7 @@ class cfile(GObject.GObject): except: self.file = None + class ListStoreApp: ''' Main GUI class @@ -40,7 +41,8 @@ class ListStoreApp: self.add2archive = True self.window = Gtk.Window() - self.window.set_title('Metadata Anonymisation Toolkit %s' % __version__) + self.window.set_title('Metadata Anonymisation Toolkit %s' % + __version__) self.window.connect('destroy', Gtk.main_quit) self.window.set_default_size(800, 600) @@ -55,13 +57,12 @@ class ListStoreApp: vbox.pack_start(content, True, True, 0) #parser.class - name - type - cleaned - self.liststore= Gtk.ListStore(cfile ,str, str, str) + self.liststore= Gtk.ListStore(cfile, str, str, str) treeview = Gtk.TreeView(model=self.liststore) - treeview.set_search_column(1) #name column is searchable - treeview.set_rules_hint(True) #alternate colors for rows - treeview.set_rubber_banding(True) #mouse selection - treeview.drag_dest_set(Gtk.DestDefaults.ALL, None, Gdk.DragAction.COPY) + treeview.set_search_column(1) # name column is searchable + treeview.set_rules_hint(True) # alternate colors for rows + treeview.set_rubber_banding(True) # mouse selection self.add_columns(treeview) self.selection = treeview.get_selection() self.selection.set_mode(Gtk.SelectionMode.MULTIPLE) @@ -80,12 +81,12 @@ class ListStoreApp: ''' toolbar = Gtk.Toolbar() - toolbutton = Gtk.ToolButton(label = 'Add', stock_id=Gtk.STOCK_ADD) + toolbutton = Gtk.ToolButton(label='Add', stock_id=Gtk.STOCK_ADD) toolbutton.connect('clicked', self.add_files) toolbutton.set_tooltip_text('Add files') toolbar.add(toolbutton) - toolbutton = Gtk.ToolButton(label = 'Clean', + toolbutton = Gtk.ToolButton(label='Clean', stock_id=Gtk.STOCK_PRINT_REPORT) toolbutton.connect('clicked', self.mat_clean) toolbutton.set_tooltip_text('Clean selected files without data loss') @@ -93,7 +94,8 @@ class ListStoreApp: toolbutton = Gtk.ToolButton(label='Brute Clean', stock_id=Gtk.STOCK_PRINT_WARNING) - toolbutton.set_tooltip_text('Clean selected files with possible data loss') + toolbutton.set_tooltip_text('Clean selected files with possible data \ + loss') toolbar.add(toolbutton) toolbutton = Gtk.ToolButton(label='Check', stock_id=Gtk.STOCK_FIND) @@ -117,8 +119,8 @@ class ListStoreApp: for i, j in enumerate(colname): filenameColumn = Gtk.CellRendererText() - column = Gtk.TreeViewColumn(j, filenameColumn, text=i+1) - column.set_sort_column_id(i+1) + column = Gtk.TreeViewColumn(j, filenameColumn, text=i + 1) + column.set_sort_column_id(i + 1) treeview.append_column(column) def create_menu_item(self, name, func, menu, pix): @@ -156,8 +158,8 @@ class ListStoreApp: Gtk.STOCK_QUIT) edit_menu = self.create_sub_menu('Edit', menubar) - self.create_menu_item('Clear the filelist', self.clear_model, edit_menu, - Gtk.STOCK_REMOVE) + self.create_menu_item('Clear the filelist', self.clear_model, + edit_menu, Gtk.STOCK_REMOVE) self.create_menu_item('Preferences', self.preferences, edit_menu, Gtk.STOCK_PREFERENCES) @@ -181,9 +183,9 @@ class ListStoreApp: ''' filter = Gtk.FileFilter() filter.set_name('Supported files') - for item in SUPPORTED[0]: #add by mime + for item in SUPPORTED[0]: # add by mime filter.add_mime_type(item) - for item in SUPPORTED[1]: #add by extension + for item in SUPPORTED[1]: # add by extension filter.add_pattern(item) return filter @@ -195,8 +197,7 @@ class ListStoreApp: title='Choose files', parent=None, action=Gtk.FileChooserAction.OPEN, - buttons=(Gtk.STOCK_OK, 0, Gtk.STOCK_CANCEL, 1) - ) + buttons=(Gtk.STOCK_OK, 0, Gtk.STOCK_CANCEL, 1)) chooser.set_default_response(0) chooser.set_select_multiple(True) @@ -208,15 +209,15 @@ class ListStoreApp: response = chooser.run() - if response is 0: #Gtk.STOCK_OK + if response is 0: # Gtk.STOCK_OK filenames = chooser.get_filenames() chooser.destroy() for item in filenames: - if os.path.isdir(item): #directory + if os.path.isdir(item): # directory for root, dirs, files in os.walk(item): for name in files: self.populate(os.path.join(root, name)) - else: #regular file + else: # regular file self.populate(item) chooser.destroy() @@ -226,7 +227,8 @@ class ListStoreApp: ''' cf = cfile(item, self.backup, self.add2archive) if cf.file is not None: - self.liststore.append([cf, cf.file.filename, cf.file.mime,'unknow']) + self.liststore.append([cf, cf.file.filename, + cf.file.mime, 'unknow']) def about(self, button=None): w = Gtk.AboutDialog() @@ -235,7 +237,7 @@ class ListStoreApp: w.set_comments('This software was coded during the GSoC 2011') w.set_website('https://gitweb.torproject.org/user/jvoisin/mat.git') w.set_website_label('Website') - w.set_authors(['Julien (jvoisin) Voisin',]) + w.set_authors(['Julien (jvoisin) Voisin', ]) w.set_program_name('Metadata Anonymistion Toolkit') click = w.run() if click: @@ -250,29 +252,30 @@ class ListStoreApp: hbox = Gtk.HBox() content_area.pack_start(hbox, False, False, 0) icon = Gtk.Image(stock=Gtk.STOCK_PREFERENCES, - icon_size=Gtk.IconSize.DIALOG)#the little picture on the left + icon_size=Gtk.IconSize.DIALOG) # the little picture on the left hbox.pack_start(icon, False, False, 0) - table = Gtk.Table(3, 2, False)#nb rows, nb lines + table = Gtk.Table(3, 2, False) # nb rows, nb lines table.set_row_spacings(4) table.set_col_spacings(4) hbox.pack_start(table, True, True, 0) force = Gtk.CheckButton('Force Clean', False) force.connect('toggled', self.invert, 'force') - force.set_tooltip_text('Do not check if already clean before cleaning.') + force.set_tooltip_text('Do not check if already clean before cleaning') force.set_active(self.force) backup = Gtk.CheckButton('Backup', False) backup.connect('toggled', self.invert, 'backup') - backup.set_tooltip_text('Keep a backup copy.') + backup.set_tooltip_text('Keep a backup copy') backup.set_active(self.backup) - add2archive = Gtk.CheckButton('Add unsupported file to archives', False) + add2archive = Gtk.CheckButton('Add unsupported file to archives', + False) add2archive.connect('toggled', self.invert, 'add2archive') - add2archive.set_tooltip_text('Add non-supported (and so non-anonymised)\ - file to outputed archive.') + add2archive.set_tooltip_text('Add non-supported (and so \ +non-anonymised) file to outputed archive') add2archive.set_active(self.add2archive) table.attach_defaults(force, 0, 1, 0, 1) @@ -281,10 +284,10 @@ class ListStoreApp: hbox.show_all() response = dialog.run() - if response is 0:#Gtk.STOCK_OK + if response is 0: # Gtk.STOCK_OK dialog.destroy() - def invert(self, button, name): #Still not better :/ + def invert(self, button, name): # still not better :/ if name is 'force': self.force = not self.force elif name is 'ugly': @@ -338,8 +341,9 @@ class ListStoreApp: self.liststore[i][0].file.remove_all_ugly() self.liststore[i][3] = 'clean' + def main(): - app = ListStoreApp() + ListStoreApp() Gtk.main() if __name__ == '__main__': diff --git a/lib/archive.py b/lib/archive.py index f22af39..f11506a 100644 --- a/lib/archive.py +++ b/lib/archive.py @@ -9,11 +9,13 @@ import tempfile import parser import mat + class GenericArchiveStripper(parser.Generic_parser): ''' Represent a generic archive ''' - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): super(GenericArchiveStripper, self).__init__(realname, filename, parser, editor, backup, add2archive) self.compression = '' @@ -32,6 +34,7 @@ class GenericArchiveStripper(parser.Generic_parser): def remove_all_ugly(self): self._remove_all('ugly') + class ZipStripper(GenericArchiveStripper): ''' Represent a zip file @@ -94,7 +97,6 @@ harmless format' % item.filename) zipin.close() return metadata - def _remove_all(self, method): ''' So far, the zipfile module does not allow to write a ZipInfo @@ -150,7 +152,7 @@ class TarStripper(GenericArchiveStripper): for item in tarin.getmembers(): tarin.extract(item, self.tempdir) name = os.path.join(self.tempdir, item.name) - if item.type is '0': #is item a regular file ? + if item.type is '0': # is item a regular file ? #no backup file try: cfile = mat.create_class_file(name, False, @@ -164,7 +166,7 @@ class TarStripper(GenericArchiveStripper): logging.info('%s\' format is not supported' % item.name) if self.add2archive: - tarout.add(name, item.name,filter=self._remove) + tarout.add(name, item.name, filter=self._remove) mat.secure_remove(name) tarin.close() tarout.close() @@ -194,7 +196,7 @@ class TarStripper(GenericArchiveStripper): return False tarin.extract(item, self.tempdir) name = os.path.join(self.tempdir, item.name) - if item.type is '0': #is item a regular file ? + if item.type is '0': # is item a regular file ? #no backup file try: class_file = mat.create_class_file(name, @@ -216,7 +218,7 @@ class TarStripper(GenericArchiveStripper): metadata = {} for current_file in tarin.getmembers(): if current_file.type is '0': - if not self.is_file_clean(current_file):#if there is meta + if not self.is_file_clean(current_file): # if there is meta current_meta = {} current_meta['mtime'] = current_file.mtime current_meta['uid'] = current_file.uid @@ -229,14 +231,16 @@ class TarStripper(GenericArchiveStripper): class GzipStripper(TarStripper): - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): super(GzipStripper, self).__init__(realname, filename, parser, editor, backup, add2archive) self.compression = ':gz' class Bzip2Stripper(TarStripper): - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): super(Bzip2Stripper, self).__init__(realname, filename, parser, editor, backup, add2archive) self.compression = ':bz2' diff --git a/lib/audio.py b/lib/audio.py index 6d653bc..35d4fde 100644 --- a/lib/audio.py +++ b/lib/audio.py @@ -1,6 +1,10 @@ import parser + class MpegAudioStripper(parser.Generic_parser): + ''' + mpeg audio file (mp3, ...) + ''' def _should_remove(self, field): if field.name in ("id3v1", "id3v2"): return True diff --git a/lib/images.py b/lib/images.py index 4441b70..bab0bfb 100644 --- a/lib/images.py +++ b/lib/images.py @@ -1,8 +1,5 @@ import parser -class BmpStripper(parser.Generic_parser): - def _should_remove(self, field): - return False class JpegStripper(parser.Generic_parser): def _should_remove(self, field): @@ -13,6 +10,7 @@ class JpegStripper(parser.Generic_parser): else: return False + class PngStripper(parser.Generic_parser): def _should_remove(self, field): if field.name.startswith("text["): diff --git a/lib/mat.py b/lib/mat.py index ccf653f..e4371ce 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -23,12 +23,11 @@ __author__ = 'jvoisin' LOGGING_LEVEL = logging.DEBUG -logging.basicConfig(level = LOGGING_LEVEL) +logging.basicConfig(level=LOGGING_LEVEL) strippers = { hachoir_parser.image.JpegFile: images.JpegStripper, hachoir_parser.image.PngFile: images.PngStripper, - hachoir_parser.image.bmp.BmpFile: images.BmpStripper, hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, hachoir_parser.misc.PDFDocument: office.PdfStripper, hachoir_parser.archive.TarFile: archive.TarStripper, @@ -37,6 +36,7 @@ strippers = { hachoir_parser.archive.zip.ZipFile: archive.ZipStripper, } + def secure_remove(filename): ''' securely remove the file @@ -52,10 +52,11 @@ def is_secure(filename): Prevent shell injection ''' - if not(os.path.isfile(filename)): #check if the file exist + if not(os.path.isfile(filename)): # check if the file exist logging.error('Error: %s is not a valid file' % filename) return False + def create_class_file(name, backup, add2archive): ''' return a $FILETYPEStripper() class, @@ -68,7 +69,7 @@ def create_class_file(name, backup, add2archive): realname = name try: filename = hachoir_core.cmd_line.unicodeFilename(name) - except TypeError:# get rid of "TypeError: decoding Unicode is not supported" + except TypeError: # get rid of "decoding Unicode is not supported" filename = name parser = hachoir_parser.createParser(filename) if not parser: @@ -88,22 +89,22 @@ def create_class_file(name, backup, add2archive): logging.info('Don\'t have stripper for format %s' % editor.description) return - if editor.input.__class__ == hachoir_parser.misc.PDFDocument:#pdf + if editor.input.__class__ == hachoir_parser.misc.PDFDocument: # pdf return stripper_class(filename, realname, backup) elif editor.input.__class__ == hachoir_parser.archive.zip.ZipFile: #zip based format mime = mimetypes.guess_type(filename)[0] - try:#Ugly workaround, cleaning open document delete mime (wtf?) + try: # ugly workaround, cleaning open document delete mime (wtf?) if mime.startswith('application/vnd.oasis.opendocument'): return office.OpenDocumentStripper(realname, filename, parser, editor, backup, add2archive) - else:#normal zip + else: # normal zip return stripper_class(realname, filename, parser, editor, backup, add2archive) - except:#normal zip file + except: # normal zip return stripper_class(realname, filename, parser, editor, backup, add2archive) - else:#normal handling + else: # normal handling return stripper_class(realname, filename, parser, editor, backup, add2archive) diff --git a/lib/misc.py b/lib/misc.py new file mode 100644 index 0000000..ce14313 --- /dev/null +++ b/lib/misc.py @@ -0,0 +1,48 @@ +import hachoir_core +import parser + + +class TorrentStripper(parser.Generic_parser): + ''' + A torrent file looks like: + -root + -start + -announce + -announce-list + -comment + -created_by + -creation_date + -encoding + -info + -end + ''' + def remove_all(self): + for field in self.editor['root']: + if self._should_remove(field): + #FIXME : hachoir does not support torrent metadata editing :< + del self.editor['/root/' + field.name] + hachoir_core.field.writeIntoFile(self.editor, + self.filename + parser.POSTFIX) + self.do_backup() + + def is_clean(self): + for field in self.editor['root']: + if self._should_remove(field): + return False + return True + + def get_meta(self): + metadata = {} + for field in self.editor['root']: + if self._should_remove(field): + try: # FIXME + metadata[field.name] = field.value + except: + metadata[field.name] = 'harmful content' + return metadata + + def _should_remove(self, field): + if field.name in ('comment', 'created_by', 'creation_date', 'info'): + return True + else: + return False diff --git a/lib/office.py b/lib/office.py index 27677d2..432bc0b 100644 --- a/lib/office.py +++ b/lib/office.py @@ -5,17 +5,16 @@ import tempfile import glob import logging import zipfile -import shutil import re from xml.etree import ElementTree -import hachoir_core import pdfrw import mat import parser import archive + class OpenDocumentStripper(archive.GenericArchiveStripper): ''' An open document file is a zip, with xml file into. @@ -32,11 +31,10 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): for node in tree.iter(): key = re.sub('{.*}', '', node.tag) metadata[key] = node.text - except KeyError:#no meta.xml file found + except KeyError: # no meta.xml file found logging.debug('%s has no opendocument metadata' % self.filename) return metadata - def _remove_all(self, method): ''' FIXME ? @@ -50,7 +48,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): name = os.path.join(self.tempdir, item) if item.endswith('.xml') or item == 'mimetype': #keep .xml files, and the "manifest" file - if item != 'meta.xml':#contains the metadata + if item != 'meta.xml': # contains the metadata zipin.extract(item, self.tempdir) zipout.write(name, item) mat.secure_remove(name) @@ -73,7 +71,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): self.filename)) zipout.write(name, item) except: - logging.info('%s\' fileformat is not supported' % item) + logging.info('%s\' fileformat is not supported' % item) if self.add2archive: zipout.write(name, item) mat.secure_remove(name) @@ -88,7 +86,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): try: zipin.getinfo('meta.xml') return False - except KeyError:#no meta.xml in the file + except KeyError: # no meta.xml in the file zipin.close() czf = archive.ZipStripper(self.realname, self.filename, self.parser, self.editor, self.backup, self.add2archive) @@ -104,7 +102,7 @@ class PdfStripper(parser.Generic_parser): Represent a pdf file, with the help of pdfrw ''' def __init__(self, filename, realname, backup): - name, path = os.path.splitext(filename) + name, ext = os.path.splitext(filename) self.output = name + '.cleaned' + ext self.filename = filename self.backup = backup @@ -137,7 +135,7 @@ class PdfStripper(parser.Generic_parser): ''' _, self.tmpdir = tempfile.mkstemp() subprocess.call(self.convert % (self.filename, self.tmpdir + - 'temp.jpg'), shell=True)#Convert pages to jpg + 'temp.jpg'), shell=True) # Convert pages to jpg for current_file in glob.glob(self.tmpdir + 'temp*'): #Clean every jpg image @@ -145,18 +143,18 @@ class PdfStripper(parser.Generic_parser): class_file.remove_all() subprocess.call(self.convert % (self.tmpdir + - 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf + 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf for current_file in glob.glob(self.tmpdir + 'temp*'): #remove jpg files mat.secure_remove(current_file) if self.backup is False: - mat.secure_remove(self.filename) #remove the old file - os.rename(self.output, self.filename)#rename the new + mat.secure_remove(self.filename) # remove the old file + os.rename(self.output, self.filename) # rename the new name = self.realname else: - name = output_file + name = self.output class_file = mat.create_class_file(name, False) class_file.remove_all() diff --git a/lib/parser.py b/lib/parser.py index aa7e7f1..28e0849 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -2,27 +2,25 @@ Parent class of all parser ''' -import hachoir_core.error -import hachoir_parser -import hachoir_editor +import hachoir_core -import sys import os -import subprocess import mimetypes import mat -NOMETA = ('.txt', '.bmp', '.py', '.xml', '.rdf') +NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') + class Generic_parser(object): - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): basename, ext = os.path.splitext(filename) self.output = basename + '.cleaned' + ext - self.filename = filename #path + filename - self.realname = realname #path + filename - self.basename = os.path.basename(filename) #only filename - self.mime = mimetypes.guess_type(filename)[0] #mimetype + self.filename = filename # path + filename + self.realname = realname # path + filename + self.basename = os.path.basename(filename) # only filename + self.mime = mimetypes.guess_type(filename)[0] # mimetype self.parser = parser self.editor = editor self.backup = backup @@ -56,7 +54,6 @@ class Generic_parser(object): ''' self.remove_all() - def _remove(self, field): ''' Delete the given field -- cgit v1.3