From 962e9aec5ffcdaae39e06f277dd47d1943205c37 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Tue, 26 Jul 2011 15:14:48 +0200 Subject: Bugfixes (especially for pdf), and more pylint conformity --- lib/archive.py | 40 +++++++++++++++++++++++++++++++++------- lib/audio.py | 2 +- lib/images.py | 10 ++++++++-- lib/mat.py | 4 ++-- lib/misc.py | 5 ++--- lib/office.py | 23 +++++++++++++++-------- lib/parser.py | 4 ++-- 7 files changed, 63 insertions(+), 25 deletions(-) diff --git a/lib/archive.py b/lib/archive.py index f11506a..1aaf74b 100644 --- a/lib/archive.py +++ b/lib/archive.py @@ -10,7 +10,7 @@ import parser import mat -class GenericArchiveStripper(parser.Generic_parser): +class GenericArchiveStripper(parser.GenericParser): ''' Represent a generic archive ''' @@ -29,24 +29,40 @@ class GenericArchiveStripper(parser.Generic_parser): shutil.rmtree(self.tempdir) def remove_all(self): + ''' + Call _remove_all() with in argument : "normal" + ''' self._remove_all('normal') def remove_all_ugly(self): + ''' + call remove_all() with in argument : "ugly" + ''' self._remove_all('ugly') + def _remove_all(self, method): + ''' + Remove all meta, normal way if method is "normal", + else, use the ugly way (with possible data loss) + ''' + raise NotImplementedError class ZipStripper(GenericArchiveStripper): ''' Represent a zip file ''' - def is_file_clean(self, file): - if file.comment is not '': + def is_file_clean(self, fileinfo): + ''' + Check if a ZipInfo object is clean of metadatas added + by zip itself, independently of the corresponding file metadatas + ''' + if fileinfo.comment is not '': return False - elif file.date_time is not 0: + elif fileinfo.date_time is not 0: return False - elif file.create_system is not 0: + elif fileinfo.create_system is not 0: return False - elif file.create_version is not 0: + elif fileinfo.create_version is not 0: return False else: return True @@ -74,7 +90,7 @@ class ZipStripper(GenericArchiveStripper): #best solution I have found logging.info('%s\'s fileformat is not supported, or is a \ harmless format' % item.filename) - base, ext = os.path.splitext(name) + _, ext = os.path.splitext(name) bname = os.path.basename(item.filename) if ext not in parser.NOMETA: if bname != 'mimetype': @@ -84,6 +100,10 @@ harmless format' % item.filename) return True def get_meta(self): + ''' + Return all the metadata of a ZipFile (don't return metadatas + of contained files : should it ?) + ''' zipin = zipfile.ZipFile(self.filename, 'r') metadata = {} for field in zipin.infolist(): @@ -231,6 +251,9 @@ class TarStripper(GenericArchiveStripper): class GzipStripper(TarStripper): + ''' + Represent a tar.gz archive + ''' def __init__(self, realname, filename, parser, editor, backup, add2archive): super(GzipStripper, self).__init__(realname, @@ -239,6 +262,9 @@ class GzipStripper(TarStripper): class Bzip2Stripper(TarStripper): + ''' + Represents a tar.bz2 archive + ''' def __init__(self, realname, filename, parser, editor, backup, add2archive): super(Bzip2Stripper, self).__init__(realname, diff --git a/lib/audio.py b/lib/audio.py index 35d4fde..d77efd9 100644 --- a/lib/audio.py +++ b/lib/audio.py @@ -1,7 +1,7 @@ import parser -class MpegAudioStripper(parser.Generic_parser): +class MpegAudioStripper(parser.GenericParser): ''' mpeg audio file (mp3, ...) ''' diff --git a/lib/images.py b/lib/images.py index bab0bfb..df3d256 100644 --- a/lib/images.py +++ b/lib/images.py @@ -1,7 +1,10 @@ import parser -class JpegStripper(parser.Generic_parser): +class JpegStripper(parser.GenericParser): + ''' + Represents a .jpeg file + ''' def _should_remove(self, field): if field.name.startswith('comment'): return True @@ -11,7 +14,10 @@ class JpegStripper(parser.Generic_parser): return False -class PngStripper(parser.Generic_parser): +class PngStripper(parser.GenericParser): + ''' + Represents a .png file + ''' def _should_remove(self, field): if field.name.startswith("text["): return True diff --git a/lib/mat.py b/lib/mat.py index e4371ce..8d01e05 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -25,7 +25,7 @@ LOGGING_LEVEL = logging.DEBUG logging.basicConfig(level=LOGGING_LEVEL) -strippers = { +STRIPPERS = { hachoir_parser.image.JpegFile: images.JpegStripper, hachoir_parser.image.PngFile: images.PngStripper, hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, @@ -83,7 +83,7 @@ def create_class_file(name, backup, add2archive): (which herits from the "file" class), based on the editor of given file (name) ''' - stripper_class = strippers[editor.input.__class__] + stripper_class = STRIPPERS[editor.input.__class__] except KeyError: #Place for another lib than hachoir logging.info('Don\'t have stripper for format %s' % editor.description) diff --git a/lib/misc.py b/lib/misc.py index ce14313..f846388 100644 --- a/lib/misc.py +++ b/lib/misc.py @@ -2,7 +2,7 @@ import hachoir_core import parser -class TorrentStripper(parser.Generic_parser): +class TorrentStripper(parser.GenericParser): ''' A torrent file looks like: -root @@ -21,8 +21,7 @@ class TorrentStripper(parser.Generic_parser): if self._should_remove(field): #FIXME : hachoir does not support torrent metadata editing :< del self.editor['/root/' + field.name] - hachoir_core.field.writeIntoFile(self.editor, - self.filename + parser.POSTFIX) + hachoir_core.field.writeIntoFile(self.editor, self.output) self.do_backup() def is_clean(self): diff --git a/lib/office.py b/lib/office.py index 432bc0b..5fa475d 100644 --- a/lib/office.py +++ b/lib/office.py @@ -6,6 +6,7 @@ import glob import logging import zipfile import re +import shutil from xml.etree import ElementTree @@ -97,7 +98,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): return True -class PdfStripper(parser.Generic_parser): +class PdfStripper(parser.GenericParser): ''' Represent a pdf file, with the help of pdfrw ''' @@ -109,10 +110,17 @@ class PdfStripper(parser.Generic_parser): self.realname = realname self.shortname = os.path.basename(filename) self.mime = mimetypes.guess_type(filename)[0] + self.tempdir = tempfile.mkdtemp() self.trailer = pdfrw.PdfReader(self.filename) self.writer = pdfrw.PdfWriter() self.convert = 'gm convert -antialias -enhance %s %s' + def __del__(self): + ''' + Remove the temp dir + ''' + shutil.rmtree(self.tempdir) + def remove_all(self): ''' Remove all the meta fields that are compromizing @@ -133,19 +141,18 @@ class PdfStripper(parser.Generic_parser): Transform each pages into a jpg, clean them, then re-assemble them into a new pdf ''' - _, self.tmpdir = tempfile.mkstemp() - subprocess.call(self.convert % (self.filename, self.tmpdir + + subprocess.call(self.convert % (self.filename, self.tempdir + 'temp.jpg'), shell=True) # Convert pages to jpg - for current_file in glob.glob(self.tmpdir + 'temp*'): + for current_file in glob.glob(self.tempdir + 'temp*'): #Clean every jpg image - class_file = mat.create_class_file(current_file, False) + class_file = mat.create_class_file(current_file, False, False) class_file.remove_all() - subprocess.call(self.convert % (self.tmpdir + + subprocess.call(self.convert % (self.tempdir + 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf - for current_file in glob.glob(self.tmpdir + 'temp*'): + for current_file in glob.glob(self.tempdir + 'temp*'): #remove jpg files mat.secure_remove(current_file) @@ -155,7 +162,7 @@ class PdfStripper(parser.Generic_parser): name = self.realname else: name = self.output - class_file = mat.create_class_file(name, False) + class_file = mat.create_class_file(name, False, False) class_file.remove_all() def is_clean(self): diff --git a/lib/parser.py b/lib/parser.py index 28e0849..ae647fe 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -12,7 +12,7 @@ import mat NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') -class Generic_parser(object): +class GenericParser(object): def __init__(self, realname, filename, parser, editor, backup, add2archive): basename, ext = os.path.splitext(filename) @@ -78,7 +78,7 @@ class Generic_parser(object): return True if the field is compromizing abstract method ''' - raise NotImplementedError() + raise NotImplementedError def do_backup(self): ''' -- cgit v1.3