From 2cba152e7c00ff2c422d5e1c911f17ea07f346ed Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 6 Feb 2012 02:05:05 +0100 Subject: Merge the two processing mode into a unique one --- lib/archive.py | 31 ++++++-------------------- lib/mat.py | 2 +- lib/office.py | 68 +++++++++++++--------------------------------------------- lib/parser.py | 10 --------- mat | 15 ------------- mat-gui | 63 +++++++++-------------------------------------------- mat.1 | 3 --- 7 files changed, 32 insertions(+), 160 deletions(-) diff --git a/lib/archive.py b/lib/archive.py index 9993102..a749b29 100644 --- a/lib/archive.py +++ b/lib/archive.py @@ -36,22 +36,9 @@ class GenericArchiveStripper(parser.GenericParser): shutil.rmtree(self.tempdir) def remove_all(self): - ''' - Call _remove_all() with in argument : "normal" - ''' - return self._remove_all('normal') + return self._remove_all() - def remove_all_strict(self): - ''' - call remove_all() with in argument : "strict" - ''' - return self._remove_all('strict') - - def _remove_all(self, method): - ''' - Remove all meta, normal way if method is "normal", - else, use the strict way (with possible data loss) - ''' + def _remove_all(self): raise NotImplementedError @@ -127,7 +114,7 @@ harmless format' % item.filename) zipin.close() return metadata - def _remove_all(self, method): + def _remove_all(self): ''' So far, the zipfile module does not allow to write a ZipInfo object into a zipfile (and it's a shame !) : so data added @@ -143,10 +130,7 @@ harmless format' % item.filename) try: cfile = mat.create_class_file(name, False, self.add2archive) - if method is 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() logging.debug('Processing %s from %s' % (item.filename, self.filename)) zipout.write(name, item.filename) @@ -179,7 +163,7 @@ class TarStripper(GenericArchiveStripper): current_file.gname = '' return current_file - def _remove_all(self, method): + def _remove_all(self): tarin = tarfile.open(self.filename, 'r' + self.compression) tarout = tarfile.open(self.output, 'w' + self.compression) for item in tarin.getmembers(): @@ -190,10 +174,7 @@ class TarStripper(GenericArchiveStripper): try: cfile = mat.create_class_file(name, False, self.add2archive) - if method is 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() tarout.add(name, item.name, filter=self._remove) except: logging.info('%s\' format is not supported or harmless' % diff --git a/lib/mat.py b/lib/mat.py index 53d02d8..dfcfc57 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -24,7 +24,7 @@ hachoir_core.config.quiet = True fname = '' #Verbose -#LOGGING_LEVEL = logging.DEBUG +LOGGING_LEVEL = logging.DEBUG #hachoir_core.config.quiet = False #logname = 'report.log' diff --git a/lib/office.py b/lib/office.py index e1d738e..82b817e 100644 --- a/lib/office.py +++ b/lib/office.py @@ -49,7 +49,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): logging.debug('%s has no opendocument metadata' % self.filename) return metadata - def _remove_all(self, method): + def _remove_all(self): ''' FIXME ? There is a patch implementing the Zipfile.remove() @@ -84,10 +84,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): try: cfile = mat.create_class_file(name, False, self.add2archive) - if method == 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() logging.debug('Processing %s from %s' % (item, self.filename)) zipout.write(name, item) @@ -137,20 +134,17 @@ class PdfStripper(parser.GenericParser): Check if the file is clean from harmful metadatas ''' for key in self.meta_list: - if self.document.get_property(key) is not None and \ - self.document.get_property(key) != '': + if self.document.get_property(key) != None: return False return True - def remove_all(self): ''' Remove supperficial ''' return self._remove_meta() - - def remove_all_strict(self): + def _remove_meta(self): ''' Opening the PDF with poppler, then doing a render on a cairo pdfsurface for each pages. @@ -166,54 +160,26 @@ class PdfStripper(parser.GenericParser): for pagenum in xrange(self.document.get_n_pages()): page = self.document.get_page(pagenum) context.translate(0, 0) - page.render(context) # render the page on context + page.render_for_printing(context) # render the page on context context.show_page() # draw context on surface surface.finish() - return self._remove_meta() - def _remove_meta(self): - ''' - Remove superficial/external metadata - from a PDF file, using exiftool, - of pdfrw if exiftool is not installed - ''' - processed = False - try:# try with pdfrw + try: import pdfrw #For now, poppler cannot write meta, so we must use pdfrw logging.debug('Removing %s\'s superficial metadata' % self.filename) trailer = pdfrw.PdfReader(self.output) - trailer.Info.Producer = trailer.Author = trailer.Info.Creator = None + trailer.Info.Producer = None + trailer.Info.Creator = None writer = pdfrw.PdfWriter() writer.trailer = trailer writer.write(self.output) self.do_backup() - processed = True - except: - pass - - try: # try with exiftool - subprocess.Popen('exiftool', stdout=open('/dev/null')) - import exiftool - # Note: '-All=' must be followed by a known exiftool option. - if self.backup: - process = subprocess.Popen(['exiftool', '-m', '-All=', - '-out', self.output, self.filename], stdout=open('/dev/null')) - process.wait() - else: - # Note: '-All=' must be followed by a known exiftool option. - process = subprocess.Popen( - ['exiftool', '-All=', '-overwrite_original', self.filename], - stdout=open('/dev/null')) - process.wait() - processed = True + return True except: - pass - - if processed is False: - logging.error('Please install either pdfrw, or exiftool to\ - fully handle PDF files') - return processed + print('Unable to remove all metadata from %s, please install\ + pdfrw' % self.output) + return False def get_meta(self): ''' @@ -221,8 +187,7 @@ class PdfStripper(parser.GenericParser): ''' metadata = {} for key in self.meta_list: - if self.document.get_property(key) is not None and \ - self.document.get_property(key) != '': + if self.document.get_property(key) is not None: metadata[key] = self.document.get_property(key) return metadata @@ -234,7 +199,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): It contains mostly xml, but can have media blobs, crap, ... (I don't like this format.) ''' - def _remove_all(self, method): + def _remove_all(self): ''' FIXME ? There is a patch implementing the Zipfile.remove() @@ -258,10 +223,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): try: cfile = mat.create_class_file(name, False, self.add2archive) - if method == 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() logging.debug('Processing %s from %s' % (item, self.filename)) zipout.write(name, item) diff --git a/lib/parser.py b/lib/parser.py index 6dc5d0b..d2eaf9c 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -78,16 +78,6 @@ class GenericParser(object): except: return False - def remove_all_strict(self): - ''' - If the remove_all() is not efficient enough, - this method is implemented : - It is efficient, but destructive. - In a perfect world, with nice fileformat, - this method would not exist. - ''' - self.remove_all() - def _remove(self, fieldset, field): ''' Delete the given field diff --git a/mat b/mat index ef83d84..468c76a 100755 --- a/mat +++ b/mat @@ -26,8 +26,6 @@ The default behaviour is to clean files given in argument') help='Keep a backup copy') options.add_option('--force', '-f', action='store_true', default=False, help='Don\'t check if files are clean before cleaning') - options.add_option('--strict', '-u', action='store_true', default=False, - help='Strict cleaning mode : loss can occur') info = optparse.OptionGroup(parser, 'Informations') info.add_option('--check', '-c', action='store_true', default=False, @@ -97,17 +95,6 @@ def clean_meta(class_file, filename, force): else: print('Unable to clean %s', filename) -def clean_meta_strict(class_file, filename, force): - ''' - Clean the file 'filename', strict way - ''' - print('[+] Cleaning %s' % filename) - if force is False and class_file.is_clean(): - print('%s is already clean' % filename) - else: - class_file.remove_all_strict() - print('%s cleaned' % filename) - def list_supported(): ''' @@ -142,8 +129,6 @@ def main(): func = list_meta elif args.check is True: # only check if the file is clean func = is_clean - elif args.strict is True: # destructive anonymisation method - func = clean_meta_strict elif args.list is True: # print the list of all supported format list_supported() else: # clean the file diff --git a/mat-gui b/mat-gui index db007e5..5c28732 100755 --- a/mat-gui +++ b/mat-gui @@ -103,18 +103,9 @@ class GUI: toolbar.add(toolbutton) toolbutton = gtk.ToolButton(gtk.STOCK_PRINT_REPORT) - toolbutton.set_label(_('Clean (lossless)')) + toolbutton.set_label(_('Clean')) toolbutton.connect('clicked', self.__process_files, self.__mat_clean) - toolbutton.set_tooltip_text(_('Clean selected files without possible \ -data loss')) - toolbar.add(toolbutton) - - toolbutton = gtk.ToolButton(gtk.STOCK_PRINT_WARNING) - toolbutton.set_label(_('Clean (strict)')) - toolbutton.connect('clicked', self.__process_files, - self.__mat_clean_strict) - toolbutton.set_tooltip_text(_('Clean selected files with possible \ -data loss, but clean more efficiently')) + toolbutton.set_tooltip_text(_('Clean selected files')) toolbar.add(toolbutton) toolbutton = gtk.ToolButton(gtk.STOCK_FIND) @@ -203,21 +194,10 @@ data loss, but clean more efficiently')) picture = gtk.Image() picture.set_from_stock(gtk.STOCK_PRINT_REPORT, gtk.ICON_SIZE_MENU) item.set_image(picture) - item.set_label(_('Clean (lossless)')) + item.set_label(_('Clean')) item.connect('activate', self.__process_files, self.__mat_clean) process_menu.append(item) - item = gtk.ImageMenuItem() - key, mod = gtk.accelerator_parse('S') - item.add_accelerator('activate', self.accelerator, - key, mod, gtk.ACCEL_VISIBLE) - picture = gtk.Image() - picture.set_from_stock(gtk.STOCK_PRINT_WARNING, gtk.ICON_SIZE_MENU) - item.set_image(picture) - item.set_label(_('Clean (strict)')) - item.connect('activate', self.__process_files, self.__mat_clean_strict) - process_menu.append(item) - item = gtk.ImageMenuItem() key, mod = gtk.accelerator_parse('h') item.add_accelerator('activate', self.accelerator, @@ -276,7 +256,6 @@ data loss, but clean more efficiently')) for root, dirs, files in os.walk(filename): for item in files: path_to_file = os.path.join(root, item) - if self.__add_file_to_treeview(path_to_file): not_supported.append(item) else: # filename is a regular file @@ -493,11 +472,11 @@ non-anonymised) file to output archive')) ''' for line in iterator: # for each file in selection self.statusbar.push(0, _('Checking %s...') % self.liststore[line][1]) - if self.liststore[line][3] != _('Clean (strict)'): + if self.force is True or self.liststore[line][3] != _('Clean'): if self.liststore[line][0].file.is_clean(): - string = _('Clean (lossless)') + string = _('Clean') else: - string = _('dirty') + string = _('Dirty') logging.info('%s is %s' % (self.liststore[line][1], string)) self.liststore[line][3] = string yield True @@ -509,33 +488,11 @@ non-anonymised) file to output archive')) Clean selected elements ''' for line in iterator: # for each file in selection - logging.info('Cleaning (lossless) %s' % self.liststore[line][1]) - self.statusbar.push(0, _('Cleaning %s...') % self.liststore[line][1]) - if self.liststore[line][3] != _('Clean (strict)'): - # if the file is not already strict cleaned - if self.force or not self.liststore[line][0].file.is_clean(): - if self.liststore[line][0].file.remove_all(): - # if everything went fine - self.liststore[line][3] = _('Clean (lossless)') - if self.backup: # the backup copy state - self.liststore[line][4] = self.liststore[line][0].file.output - yield True - self.statusbar.push(0, _('Ready')) - yield False - - def __mat_clean_strict(self, iterator): - ''' - Clean selected elements (ugly way) - ''' - for line in iterator: # for each file in selection - logging.info(_('Cleaning (strict) %s') % self.liststore[line][1]) + logging.info('Cleaning %s' % self.liststore[line][1]) self.statusbar.push(0, _('Cleaning %s...') % self.liststore[line][1]) - if self.liststore[line][3] != _('Clean (strict)'): - # if the file is not already strict cleaned - if self.force or not self.liststore[line][0].file.is_clean(): - if self.liststore[line][0].file.remove_all_strict(): - # if everything went fine - self.liststore[line][3] = _('Clean (strict)') + if self.force is True or self.liststore[line][3] != _('Clean'): + if self.liststore[line][0].file.remove_all(): + self.liststore[line][3] = _('Clean') if self.backup: # the backup copy state self.liststore[line][4] = self.liststore[line][0].file.output yield True diff --git a/mat.1 b/mat.1 index 178aa64..3e266f9 100644 --- a/mat.1 +++ b/mat.1 @@ -42,9 +42,6 @@ Don't check if files are clean before cleaning \fB\-l\fR, \fB\-\-list\fR List all supported fileformat .TP -\fB\-u\fR, \fB\-\-strict\fR -Remove harmful meta, but loss can occure -.TP \fB\-v\fR, \fB\-\-version\fR Display version and exit -- cgit v1.3