From 7bec354973580216c64889b925e1f7d6a224d7dd Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 25 Jul 2011 03:03:12 +0200 Subject: more abstraction, and changed the name of the outputed file --- lib/archive.py | 14 +++++--------- lib/office.py | 24 +++++++++--------------- lib/parser.py | 20 ++++++++++++++------ 3 files changed, 28 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/archive.py b/lib/archive.py index 8a305d5..21bc5c5 100644 --- a/lib/archive.py +++ b/lib/archive.py @@ -83,7 +83,7 @@ class ZipStripper(GenericArchiveStripper): def _remove_all(self, method): zipin = zipfile.ZipFile(self.filename, 'r') - zipout = zipfile.ZipFile(self.filename + parser.POSTFIX, 'w', + zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) for item in zipin.infolist(): zipin.extract(item, self.tempdir) @@ -109,6 +109,7 @@ class ZipStripper(GenericArchiveStripper): logging.info('%s treated' % self.filename) zipin.close() zipout.close() + self.do_backup() class TarStripper(GenericArchiveStripper): @@ -125,8 +126,7 @@ class TarStripper(GenericArchiveStripper): def _remove_all(self, method): tarin = tarfile.open(self.filename, 'r' + self.compression) - tarout = tarfile.open(self.filename + parser.POSTFIX, - 'w' + self.compression) + tarout = tarfile.open(self.output, 'w' + self.compression) for item in tarin.getmembers(): tarin.extract(item, self.tempdir) name = os.path.join(self.tempdir, item.name) @@ -148,10 +148,7 @@ class TarStripper(GenericArchiveStripper): mat.secure_remove(name) tarin.close() tarout.close() - - if self.backup is False: - mat.secure_remove(self.filename) - os.rename(self.filename + parser.POSTFIX, self.filename) + self.do_backup() def is_file_clean(self, current_file): ''' @@ -179,8 +176,7 @@ class TarStripper(GenericArchiveStripper): name = os.path.join(self.tempdir, item.name) if item.type is '0': #is item a regular file ? #no backup file - class_file = mat.create_class_file(name, False, - self.add2archive) + class_file = mat.create_class_file(name, False,self.add2archive) mat.secure_remove(name) if not class_file.is_clean():#if the extracted file is not clean return False diff --git a/lib/office.py b/lib/office.py index f87f357..2302dbc 100644 --- a/lib/office.py +++ b/lib/office.py @@ -27,7 +27,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): method here : http://bugs.python.org/issue6818 ''' zipin = zipfile.ZipFile(self.filename, 'r') - zipout = zipfile.ZipFile(self.filename + parser.POSTFIX, 'w', + zipout = zipfile.ZipFile(self.basename + parser.POSTFIX + self.ext, 'w', allowZip64=True) for item in zipin.namelist(): name = os.path.join(self.tempdir, item) @@ -65,10 +65,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): logging.info('%s treated' % self.filename) zipin.close() zipout.close() - - if self.backup is False: - mat.secure_remove(self.filename) #remove the old file - os.rename(self.filename + parser.POSTFIX, self.filename) + self.do_backup() def is_clean(self): zipin = zipfile.ZipFile(self.filename, 'r') @@ -106,9 +103,7 @@ class TorrentStripper(parser.Generic_parser): del self.editor['/root/' + field.name] hachoir_core.field.writeIntoFile(self.editor, self.filename + parser.POSTFIX) - if self.backup is False: - mat.secure_remove(self.filename) #remove the old file - os.rename(self.filename + parser.POSTFIX, self.filename) + self.do_backup() def is_clean(self): for field in self.editor['root']: @@ -138,6 +133,8 @@ class PdfStripper(parser.Generic_parser): Represent a pdf file, with the help of pdfrw ''' def __init__(self, filename, realname, backup): + name, path = os.path.splitext(filename) + self.output = name + '.cleaned.' + ext self.filename = filename self.backup = backup self.realname = realname @@ -159,17 +156,14 @@ class PdfStripper(parser.Generic_parser): self.trailer.Info.ModDate = '' self.writer.trailer = self.trailer - self.writer.write(self.filename + parser.POSTFIX) - if self.backup is False: - mat.secure_remove(self.filename) #remove the old file - os.rename(self.filename + parser.POSTFIX, self.filename) + self.writer.write(self.output) + self.do_backup() def remove_all_ugly(self): ''' Transform each pages into a jpg, clean them, then re-assemble them into a new pdf ''' - output_file = self.realname + parser.POSTFIX + '.pdf' _, self.tmpdir = tempfile.mkstemp() subprocess.call(self.convert % (self.filename, self.tmpdir + 'temp.jpg'), shell=True)#Convert pages to jpg @@ -180,7 +174,7 @@ class PdfStripper(parser.Generic_parser): class_file.remove_all() subprocess.call(self.convert % (self.tmpdir + - 'temp.jpg*', output_file), shell=True)#Assemble jpg into pdf + 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf for current_file in glob.glob(self.tmpdir + 'temp*'): #remove jpg files @@ -188,7 +182,7 @@ class PdfStripper(parser.Generic_parser): if self.backup is False: mat.secure_remove(self.filename) #remove the old file - os.rename(output_file, self.filename)#rename the new + os.rename(self.output, self.filename)#rename the new name = self.realname else: name = output_file diff --git a/lib/parser.py b/lib/parser.py index ba4981d..11e776e 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -13,10 +13,12 @@ import mimetypes import mat -POSTFIX = ".cleaned" +NOMETA = ('*.txt', '*.bmp', '*.py') class Generic_parser(object): def __init__(self, realname, filename, parser, editor, backup, add2archive): + basename, ext = os.path.splitext(filename) + self.output = basename + '.cleaned.' + ext self.filename = filename #path + filename self.realname = realname #path + filename self.shortname = os.path.basename(filename) #only filename @@ -41,10 +43,8 @@ class Generic_parser(object): for field in self.editor: if self._should_remove(field): self._remove(field.name) - hachoir_core.field.writeIntoFile(self.editor, self.filename + POSTFIX) - if self.backup is False: - mat.secure_remove(self.filename) #remove the old file - os.rename(self.filename+ POSTFIX, self.filename) #rename the new + hachoir_core.field.writeIntoFile(self.editor, self.output) + self.do_backup() def remove_all_ugly(self): ''' @@ -73,7 +73,7 @@ class Generic_parser(object): try: metadata[field.name] = field.value except: - metadata[field.name] = "harmful content" + metadata[field.name] = 'harmful content' return metadata def _should_remove(self, key): @@ -82,3 +82,11 @@ class Generic_parser(object): abstract method ''' raise NotImplementedError() + + def do_backup(self): + ''' + Do a backup of the file if asked + ''' + if self.backup is False: + mat.secure_remove(self.filename) + os.rename(self.output, self.filename) -- cgit v1.3