From 2cba152e7c00ff2c422d5e1c911f17ea07f346ed Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 6 Feb 2012 02:05:05 +0100 Subject: Merge the two processing mode into a unique one --- lib/archive.py | 31 ++++++-------------------- lib/mat.py | 2 +- lib/office.py | 68 +++++++++++++--------------------------------------------- lib/parser.py | 10 --------- 4 files changed, 22 insertions(+), 89 deletions(-) (limited to 'lib') diff --git a/lib/archive.py b/lib/archive.py index 9993102..a749b29 100644 --- a/lib/archive.py +++ b/lib/archive.py @@ -36,22 +36,9 @@ class GenericArchiveStripper(parser.GenericParser): shutil.rmtree(self.tempdir) def remove_all(self): - ''' - Call _remove_all() with in argument : "normal" - ''' - return self._remove_all('normal') + return self._remove_all() - def remove_all_strict(self): - ''' - call remove_all() with in argument : "strict" - ''' - return self._remove_all('strict') - - def _remove_all(self, method): - ''' - Remove all meta, normal way if method is "normal", - else, use the strict way (with possible data loss) - ''' + def _remove_all(self): raise NotImplementedError @@ -127,7 +114,7 @@ harmless format' % item.filename) zipin.close() return metadata - def _remove_all(self, method): + def _remove_all(self): ''' So far, the zipfile module does not allow to write a ZipInfo object into a zipfile (and it's a shame !) : so data added @@ -143,10 +130,7 @@ harmless format' % item.filename) try: cfile = mat.create_class_file(name, False, self.add2archive) - if method is 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() logging.debug('Processing %s from %s' % (item.filename, self.filename)) zipout.write(name, item.filename) @@ -179,7 +163,7 @@ class TarStripper(GenericArchiveStripper): current_file.gname = '' return current_file - def _remove_all(self, method): + def _remove_all(self): tarin = tarfile.open(self.filename, 'r' + self.compression) tarout = tarfile.open(self.output, 'w' + self.compression) for item in tarin.getmembers(): @@ -190,10 +174,7 @@ class TarStripper(GenericArchiveStripper): try: cfile = mat.create_class_file(name, False, self.add2archive) - if method is 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() tarout.add(name, item.name, filter=self._remove) except: logging.info('%s\' format is not supported or harmless' % diff --git a/lib/mat.py b/lib/mat.py index 53d02d8..dfcfc57 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -24,7 +24,7 @@ hachoir_core.config.quiet = True fname = '' #Verbose -#LOGGING_LEVEL = logging.DEBUG +LOGGING_LEVEL = logging.DEBUG #hachoir_core.config.quiet = False #logname = 'report.log' diff --git a/lib/office.py b/lib/office.py index e1d738e..82b817e 100644 --- a/lib/office.py +++ b/lib/office.py @@ -49,7 +49,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): logging.debug('%s has no opendocument metadata' % self.filename) return metadata - def _remove_all(self, method): + def _remove_all(self): ''' FIXME ? There is a patch implementing the Zipfile.remove() @@ -84,10 +84,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): try: cfile = mat.create_class_file(name, False, self.add2archive) - if method == 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() logging.debug('Processing %s from %s' % (item, self.filename)) zipout.write(name, item) @@ -137,20 +134,17 @@ class PdfStripper(parser.GenericParser): Check if the file is clean from harmful metadatas ''' for key in self.meta_list: - if self.document.get_property(key) is not None and \ - self.document.get_property(key) != '': + if self.document.get_property(key) != None: return False return True - def remove_all(self): ''' Remove supperficial ''' return self._remove_meta() - - def remove_all_strict(self): + def _remove_meta(self): ''' Opening the PDF with poppler, then doing a render on a cairo pdfsurface for each pages. @@ -166,54 +160,26 @@ class PdfStripper(parser.GenericParser): for pagenum in xrange(self.document.get_n_pages()): page = self.document.get_page(pagenum) context.translate(0, 0) - page.render(context) # render the page on context + page.render_for_printing(context) # render the page on context context.show_page() # draw context on surface surface.finish() - return self._remove_meta() - def _remove_meta(self): - ''' - Remove superficial/external metadata - from a PDF file, using exiftool, - of pdfrw if exiftool is not installed - ''' - processed = False - try:# try with pdfrw + try: import pdfrw #For now, poppler cannot write meta, so we must use pdfrw logging.debug('Removing %s\'s superficial metadata' % self.filename) trailer = pdfrw.PdfReader(self.output) - trailer.Info.Producer = trailer.Author = trailer.Info.Creator = None + trailer.Info.Producer = None + trailer.Info.Creator = None writer = pdfrw.PdfWriter() writer.trailer = trailer writer.write(self.output) self.do_backup() - processed = True - except: - pass - - try: # try with exiftool - subprocess.Popen('exiftool', stdout=open('/dev/null')) - import exiftool - # Note: '-All=' must be followed by a known exiftool option. - if self.backup: - process = subprocess.Popen(['exiftool', '-m', '-All=', - '-out', self.output, self.filename], stdout=open('/dev/null')) - process.wait() - else: - # Note: '-All=' must be followed by a known exiftool option. - process = subprocess.Popen( - ['exiftool', '-All=', '-overwrite_original', self.filename], - stdout=open('/dev/null')) - process.wait() - processed = True + return True except: - pass - - if processed is False: - logging.error('Please install either pdfrw, or exiftool to\ - fully handle PDF files') - return processed + print('Unable to remove all metadata from %s, please install\ + pdfrw' % self.output) + return False def get_meta(self): ''' @@ -221,8 +187,7 @@ class PdfStripper(parser.GenericParser): ''' metadata = {} for key in self.meta_list: - if self.document.get_property(key) is not None and \ - self.document.get_property(key) != '': + if self.document.get_property(key) is not None: metadata[key] = self.document.get_property(key) return metadata @@ -234,7 +199,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): It contains mostly xml, but can have media blobs, crap, ... (I don't like this format.) ''' - def _remove_all(self, method): + def _remove_all(self): ''' FIXME ? There is a patch implementing the Zipfile.remove() @@ -258,10 +223,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): try: cfile = mat.create_class_file(name, False, self.add2archive) - if method == 'normal': - cfile.remove_all() - else: - cfile.remove_all_strict() + cfile.remove_all() logging.debug('Processing %s from %s' % (item, self.filename)) zipout.write(name, item) diff --git a/lib/parser.py b/lib/parser.py index 6dc5d0b..d2eaf9c 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -78,16 +78,6 @@ class GenericParser(object): except: return False - def remove_all_strict(self): - ''' - If the remove_all() is not efficient enough, - this method is implemented : - It is efficient, but destructive. - In a perfect world, with nice fileformat, - this method would not exist. - ''' - self.remove_all() - def _remove(self, fieldset, field): ''' Delete the given field -- cgit v1.3