From cffdcc1b1e78d48cad62c54432a9d8ce41f4d616 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 27 Dec 2012 17:01:30 +0100 Subject: Refactor the archive parser Refactoring of the archive and office parser, in order to simplify the code and reduce abstraction --- MAT/archive.py | 7 ++----- MAT/office.py | 10 ++-------- MAT/parser.py | 14 ++++++++++---- 3 files changed, 14 insertions(+), 17 deletions(-) (limited to 'MAT') diff --git a/MAT/archive.py b/MAT/archive.py index 69c8f1b..1dcddef 100644 --- a/MAT/archive.py +++ b/MAT/archive.py @@ -35,9 +35,6 @@ class GenericArchiveStripper(parser.GenericParser): shutil.rmtree(self.tempdir) def remove_all(self): - return self._remove_all() - - def _remove_all(self): raise NotImplementedError @@ -113,7 +110,7 @@ harmless format' % item.filename) zipin.close() return metadata - def _remove_all(self): + def remove_all(self): ''' So far, the zipfile module does not allow to write a ZipInfo object into a zipfile (and it's a shame !) : so data added @@ -162,7 +159,7 @@ class TarStripper(GenericArchiveStripper): current_file.gname = '' return current_file - def _remove_all(self): + def remove_all(self): tarin = tarfile.open(self.filename, 'r' + self.compression) tarout = tarfile.open(self.output, 'w' + self.compression) for item in tarin.getmembers(): diff --git a/MAT/office.py b/MAT/office.py index e7ce661..20664d2 100644 --- a/MAT/office.py +++ b/MAT/office.py @@ -49,7 +49,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): logging.debug('%s has no opendocument metadata' % self.filename) return metadata - def _remove_all(self): + def remove_all(self): ''' FIXME ? There is a patch implementing the Zipfile.remove() @@ -139,12 +139,6 @@ class PdfStripper(parser.GenericParser): return True def remove_all(self): - ''' - Remove metadata - ''' - return self._remove_meta() - - def _remove_meta(self): ''' Opening the PDF with poppler, then doing a render on a cairo pdfsurface for each pages. @@ -202,7 +196,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): It contains mostly xml, but can have media blobs, crap, ... (I don't like this format.) ''' - def _remove_all(self): + def remove_all(self): ''' FIXME ? There is a patch implementing the Zipfile.remove() diff --git a/MAT/parser.py b/MAT/parser.py index 6be2b03..e5acbf8 100644 --- a/MAT/parser.py +++ b/MAT/parser.py @@ -59,7 +59,7 @@ class GenericParser(object): def remove_all(self): ''' - Remove all the files that are compromizing + Remove all compromising fields ''' state = self._remove_all(self.editor) hachoir_core.field.writeIntoFile(self.editor, self.output) @@ -67,6 +67,9 @@ class GenericParser(object): return state def _remove_all(self, fieldset): + ''' + Recursive way to handle tree metadatas + ''' try: for field in fieldset: remove = self._should_remove(field) @@ -93,6 +96,9 @@ class GenericParser(object): return metadata def _get_meta(self, fieldset, metadata): + ''' + Recursive way to handle tree metadatas + ''' for field in fieldset: remove = self._should_remove(field) if remove is True: @@ -101,11 +107,11 @@ class GenericParser(object): except: metadata[field.name] = 'harmful content' if remove is FIELD: - self._get_meta(field) + self._get_meta(field, None) def _should_remove(self, key): ''' - return True if the field is compromizing + return True if the field is compromising abstract method ''' raise NotImplementedError @@ -115,6 +121,6 @@ class GenericParser(object): Do a backup of the file if asked, and change his creation/access date ''' - if self.backup is False: + if not self.backup: mat.secure_remove(self.filename) os.rename(self.output, self.filename) -- cgit v1.3