From e62ae6a87f630cbd389cf1b75672b06cd56973c8 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Tue, 26 Jul 2011 14:06:38 +0200 Subject: Pyflakes and pep8 validation --- lib/archive.py | 20 ++++++++++++-------- lib/audio.py | 4 ++++ lib/images.py | 4 +--- lib/mat.py | 19 ++++++++++--------- lib/misc.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/office.py | 24 +++++++++++------------- lib/parser.py | 21 +++++++++------------ 7 files changed, 95 insertions(+), 45 deletions(-) create mode 100644 lib/misc.py (limited to 'lib') diff --git a/lib/archive.py b/lib/archive.py index f22af39..f11506a 100644 --- a/lib/archive.py +++ b/lib/archive.py @@ -9,11 +9,13 @@ import tempfile import parser import mat + class GenericArchiveStripper(parser.Generic_parser): ''' Represent a generic archive ''' - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): super(GenericArchiveStripper, self).__init__(realname, filename, parser, editor, backup, add2archive) self.compression = '' @@ -32,6 +34,7 @@ class GenericArchiveStripper(parser.Generic_parser): def remove_all_ugly(self): self._remove_all('ugly') + class ZipStripper(GenericArchiveStripper): ''' Represent a zip file @@ -94,7 +97,6 @@ harmless format' % item.filename) zipin.close() return metadata - def _remove_all(self, method): ''' So far, the zipfile module does not allow to write a ZipInfo @@ -150,7 +152,7 @@ class TarStripper(GenericArchiveStripper): for item in tarin.getmembers(): tarin.extract(item, self.tempdir) name = os.path.join(self.tempdir, item.name) - if item.type is '0': #is item a regular file ? + if item.type is '0': # is item a regular file ? #no backup file try: cfile = mat.create_class_file(name, False, @@ -164,7 +166,7 @@ class TarStripper(GenericArchiveStripper): logging.info('%s\' format is not supported' % item.name) if self.add2archive: - tarout.add(name, item.name,filter=self._remove) + tarout.add(name, item.name, filter=self._remove) mat.secure_remove(name) tarin.close() tarout.close() @@ -194,7 +196,7 @@ class TarStripper(GenericArchiveStripper): return False tarin.extract(item, self.tempdir) name = os.path.join(self.tempdir, item.name) - if item.type is '0': #is item a regular file ? + if item.type is '0': # is item a regular file ? #no backup file try: class_file = mat.create_class_file(name, @@ -216,7 +218,7 @@ class TarStripper(GenericArchiveStripper): metadata = {} for current_file in tarin.getmembers(): if current_file.type is '0': - if not self.is_file_clean(current_file):#if there is meta + if not self.is_file_clean(current_file): # if there is meta current_meta = {} current_meta['mtime'] = current_file.mtime current_meta['uid'] = current_file.uid @@ -229,14 +231,16 @@ class TarStripper(GenericArchiveStripper): class GzipStripper(TarStripper): - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): super(GzipStripper, self).__init__(realname, filename, parser, editor, backup, add2archive) self.compression = ':gz' class Bzip2Stripper(TarStripper): - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): super(Bzip2Stripper, self).__init__(realname, filename, parser, editor, backup, add2archive) self.compression = ':bz2' diff --git a/lib/audio.py b/lib/audio.py index 6d653bc..35d4fde 100644 --- a/lib/audio.py +++ b/lib/audio.py @@ -1,6 +1,10 @@ import parser + class MpegAudioStripper(parser.Generic_parser): + ''' + mpeg audio file (mp3, ...) + ''' def _should_remove(self, field): if field.name in ("id3v1", "id3v2"): return True diff --git a/lib/images.py b/lib/images.py index 4441b70..bab0bfb 100644 --- a/lib/images.py +++ b/lib/images.py @@ -1,8 +1,5 @@ import parser -class BmpStripper(parser.Generic_parser): - def _should_remove(self, field): - return False class JpegStripper(parser.Generic_parser): def _should_remove(self, field): @@ -13,6 +10,7 @@ class JpegStripper(parser.Generic_parser): else: return False + class PngStripper(parser.Generic_parser): def _should_remove(self, field): if field.name.startswith("text["): diff --git a/lib/mat.py b/lib/mat.py index ccf653f..e4371ce 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -23,12 +23,11 @@ __author__ = 'jvoisin' LOGGING_LEVEL = logging.DEBUG -logging.basicConfig(level = LOGGING_LEVEL) +logging.basicConfig(level=LOGGING_LEVEL) strippers = { hachoir_parser.image.JpegFile: images.JpegStripper, hachoir_parser.image.PngFile: images.PngStripper, - hachoir_parser.image.bmp.BmpFile: images.BmpStripper, hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, hachoir_parser.misc.PDFDocument: office.PdfStripper, hachoir_parser.archive.TarFile: archive.TarStripper, @@ -37,6 +36,7 @@ strippers = { hachoir_parser.archive.zip.ZipFile: archive.ZipStripper, } + def secure_remove(filename): ''' securely remove the file @@ -52,10 +52,11 @@ def is_secure(filename): Prevent shell injection ''' - if not(os.path.isfile(filename)): #check if the file exist + if not(os.path.isfile(filename)): # check if the file exist logging.error('Error: %s is not a valid file' % filename) return False + def create_class_file(name, backup, add2archive): ''' return a $FILETYPEStripper() class, @@ -68,7 +69,7 @@ def create_class_file(name, backup, add2archive): realname = name try: filename = hachoir_core.cmd_line.unicodeFilename(name) - except TypeError:# get rid of "TypeError: decoding Unicode is not supported" + except TypeError: # get rid of "decoding Unicode is not supported" filename = name parser = hachoir_parser.createParser(filename) if not parser: @@ -88,22 +89,22 @@ def create_class_file(name, backup, add2archive): logging.info('Don\'t have stripper for format %s' % editor.description) return - if editor.input.__class__ == hachoir_parser.misc.PDFDocument:#pdf + if editor.input.__class__ == hachoir_parser.misc.PDFDocument: # pdf return stripper_class(filename, realname, backup) elif editor.input.__class__ == hachoir_parser.archive.zip.ZipFile: #zip based format mime = mimetypes.guess_type(filename)[0] - try:#Ugly workaround, cleaning open document delete mime (wtf?) + try: # ugly workaround, cleaning open document delete mime (wtf?) if mime.startswith('application/vnd.oasis.opendocument'): return office.OpenDocumentStripper(realname, filename, parser, editor, backup, add2archive) - else:#normal zip + else: # normal zip return stripper_class(realname, filename, parser, editor, backup, add2archive) - except:#normal zip file + except: # normal zip return stripper_class(realname, filename, parser, editor, backup, add2archive) - else:#normal handling + else: # normal handling return stripper_class(realname, filename, parser, editor, backup, add2archive) diff --git a/lib/misc.py b/lib/misc.py new file mode 100644 index 0000000..ce14313 --- /dev/null +++ b/lib/misc.py @@ -0,0 +1,48 @@ +import hachoir_core +import parser + + +class TorrentStripper(parser.Generic_parser): + ''' + A torrent file looks like: + -root + -start + -announce + -announce-list + -comment + -created_by + -creation_date + -encoding + -info + -end + ''' + def remove_all(self): + for field in self.editor['root']: + if self._should_remove(field): + #FIXME : hachoir does not support torrent metadata editing :< + del self.editor['/root/' + field.name] + hachoir_core.field.writeIntoFile(self.editor, + self.filename + parser.POSTFIX) + self.do_backup() + + def is_clean(self): + for field in self.editor['root']: + if self._should_remove(field): + return False + return True + + def get_meta(self): + metadata = {} + for field in self.editor['root']: + if self._should_remove(field): + try: # FIXME + metadata[field.name] = field.value + except: + metadata[field.name] = 'harmful content' + return metadata + + def _should_remove(self, field): + if field.name in ('comment', 'created_by', 'creation_date', 'info'): + return True + else: + return False diff --git a/lib/office.py b/lib/office.py index 27677d2..432bc0b 100644 --- a/lib/office.py +++ b/lib/office.py @@ -5,17 +5,16 @@ import tempfile import glob import logging import zipfile -import shutil import re from xml.etree import ElementTree -import hachoir_core import pdfrw import mat import parser import archive + class OpenDocumentStripper(archive.GenericArchiveStripper): ''' An open document file is a zip, with xml file into. @@ -32,11 +31,10 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): for node in tree.iter(): key = re.sub('{.*}', '', node.tag) metadata[key] = node.text - except KeyError:#no meta.xml file found + except KeyError: # no meta.xml file found logging.debug('%s has no opendocument metadata' % self.filename) return metadata - def _remove_all(self, method): ''' FIXME ? @@ -50,7 +48,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): name = os.path.join(self.tempdir, item) if item.endswith('.xml') or item == 'mimetype': #keep .xml files, and the "manifest" file - if item != 'meta.xml':#contains the metadata + if item != 'meta.xml': # contains the metadata zipin.extract(item, self.tempdir) zipout.write(name, item) mat.secure_remove(name) @@ -73,7 +71,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): self.filename)) zipout.write(name, item) except: - logging.info('%s\' fileformat is not supported' % item) + logging.info('%s\' fileformat is not supported' % item) if self.add2archive: zipout.write(name, item) mat.secure_remove(name) @@ -88,7 +86,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): try: zipin.getinfo('meta.xml') return False - except KeyError:#no meta.xml in the file + except KeyError: # no meta.xml in the file zipin.close() czf = archive.ZipStripper(self.realname, self.filename, self.parser, self.editor, self.backup, self.add2archive) @@ -104,7 +102,7 @@ class PdfStripper(parser.Generic_parser): Represent a pdf file, with the help of pdfrw ''' def __init__(self, filename, realname, backup): - name, path = os.path.splitext(filename) + name, ext = os.path.splitext(filename) self.output = name + '.cleaned' + ext self.filename = filename self.backup = backup @@ -137,7 +135,7 @@ class PdfStripper(parser.Generic_parser): ''' _, self.tmpdir = tempfile.mkstemp() subprocess.call(self.convert % (self.filename, self.tmpdir + - 'temp.jpg'), shell=True)#Convert pages to jpg + 'temp.jpg'), shell=True) # Convert pages to jpg for current_file in glob.glob(self.tmpdir + 'temp*'): #Clean every jpg image @@ -145,18 +143,18 @@ class PdfStripper(parser.Generic_parser): class_file.remove_all() subprocess.call(self.convert % (self.tmpdir + - 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf + 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf for current_file in glob.glob(self.tmpdir + 'temp*'): #remove jpg files mat.secure_remove(current_file) if self.backup is False: - mat.secure_remove(self.filename) #remove the old file - os.rename(self.output, self.filename)#rename the new + mat.secure_remove(self.filename) # remove the old file + os.rename(self.output, self.filename) # rename the new name = self.realname else: - name = output_file + name = self.output class_file = mat.create_class_file(name, False) class_file.remove_all() diff --git a/lib/parser.py b/lib/parser.py index aa7e7f1..28e0849 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -2,27 +2,25 @@ Parent class of all parser ''' -import hachoir_core.error -import hachoir_parser -import hachoir_editor +import hachoir_core -import sys import os -import subprocess import mimetypes import mat -NOMETA = ('.txt', '.bmp', '.py', '.xml', '.rdf') +NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') + class Generic_parser(object): - def __init__(self, realname, filename, parser, editor, backup, add2archive): + def __init__(self, realname, filename, parser, editor, backup, + add2archive): basename, ext = os.path.splitext(filename) self.output = basename + '.cleaned' + ext - self.filename = filename #path + filename - self.realname = realname #path + filename - self.basename = os.path.basename(filename) #only filename - self.mime = mimetypes.guess_type(filename)[0] #mimetype + self.filename = filename # path + filename + self.realname = realname # path + filename + self.basename = os.path.basename(filename) # only filename + self.mime = mimetypes.guess_type(filename)[0] # mimetype self.parser = parser self.editor = editor self.backup = backup @@ -56,7 +54,6 @@ class Generic_parser(object): ''' self.remove_all() - def _remove(self, field): ''' Delete the given field -- cgit v1.3