diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/archive.py | 40 | ||||
| -rw-r--r-- | lib/audio.py | 2 | ||||
| -rw-r--r-- | lib/images.py | 10 | ||||
| -rw-r--r-- | lib/mat.py | 4 | ||||
| -rw-r--r-- | lib/misc.py | 5 | ||||
| -rw-r--r-- | lib/office.py | 23 | ||||
| -rw-r--r-- | lib/parser.py | 4 |
7 files changed, 63 insertions, 25 deletions
diff --git a/lib/archive.py b/lib/archive.py index f11506a..1aaf74b 100644 --- a/lib/archive.py +++ b/lib/archive.py | |||
| @@ -10,7 +10,7 @@ import parser | |||
| 10 | import mat | 10 | import mat |
| 11 | 11 | ||
| 12 | 12 | ||
| 13 | class GenericArchiveStripper(parser.Generic_parser): | 13 | class GenericArchiveStripper(parser.GenericParser): |
| 14 | ''' | 14 | ''' |
| 15 | Represent a generic archive | 15 | Represent a generic archive |
| 16 | ''' | 16 | ''' |
| @@ -29,24 +29,40 @@ class GenericArchiveStripper(parser.Generic_parser): | |||
| 29 | shutil.rmtree(self.tempdir) | 29 | shutil.rmtree(self.tempdir) |
| 30 | 30 | ||
| 31 | def remove_all(self): | 31 | def remove_all(self): |
| 32 | ''' | ||
| 33 | Call _remove_all() with in argument : "normal" | ||
| 34 | ''' | ||
| 32 | self._remove_all('normal') | 35 | self._remove_all('normal') |
| 33 | 36 | ||
| 34 | def remove_all_ugly(self): | 37 | def remove_all_ugly(self): |
| 38 | ''' | ||
| 39 | call remove_all() with in argument : "ugly" | ||
| 40 | ''' | ||
| 35 | self._remove_all('ugly') | 41 | self._remove_all('ugly') |
| 36 | 42 | ||
| 43 | def _remove_all(self, method): | ||
| 44 | ''' | ||
| 45 | Remove all meta, normal way if method is "normal", | ||
| 46 | else, use the ugly way (with possible data loss) | ||
| 47 | ''' | ||
| 48 | raise NotImplementedError | ||
| 37 | 49 | ||
| 38 | class ZipStripper(GenericArchiveStripper): | 50 | class ZipStripper(GenericArchiveStripper): |
| 39 | ''' | 51 | ''' |
| 40 | Represent a zip file | 52 | Represent a zip file |
| 41 | ''' | 53 | ''' |
| 42 | def is_file_clean(self, file): | 54 | def is_file_clean(self, fileinfo): |
| 43 | if file.comment is not '': | 55 | ''' |
| 56 | Check if a ZipInfo object is clean of metadatas added | ||
| 57 | by zip itself, independently of the corresponding file metadatas | ||
| 58 | ''' | ||
| 59 | if fileinfo.comment is not '': | ||
| 44 | return False | 60 | return False |
| 45 | elif file.date_time is not 0: | 61 | elif fileinfo.date_time is not 0: |
| 46 | return False | 62 | return False |
| 47 | elif file.create_system is not 0: | 63 | elif fileinfo.create_system is not 0: |
| 48 | return False | 64 | return False |
| 49 | elif file.create_version is not 0: | 65 | elif fileinfo.create_version is not 0: |
| 50 | return False | 66 | return False |
| 51 | else: | 67 | else: |
| 52 | return True | 68 | return True |
| @@ -74,7 +90,7 @@ class ZipStripper(GenericArchiveStripper): | |||
| 74 | #best solution I have found | 90 | #best solution I have found |
| 75 | logging.info('%s\'s fileformat is not supported, or is a \ | 91 | logging.info('%s\'s fileformat is not supported, or is a \ |
| 76 | harmless format' % item.filename) | 92 | harmless format' % item.filename) |
| 77 | base, ext = os.path.splitext(name) | 93 | _, ext = os.path.splitext(name) |
| 78 | bname = os.path.basename(item.filename) | 94 | bname = os.path.basename(item.filename) |
| 79 | if ext not in parser.NOMETA: | 95 | if ext not in parser.NOMETA: |
| 80 | if bname != 'mimetype': | 96 | if bname != 'mimetype': |
| @@ -84,6 +100,10 @@ harmless format' % item.filename) | |||
| 84 | return True | 100 | return True |
| 85 | 101 | ||
| 86 | def get_meta(self): | 102 | def get_meta(self): |
| 103 | ''' | ||
| 104 | Return all the metadata of a ZipFile (don't return metadatas | ||
| 105 | of contained files : should it ?) | ||
| 106 | ''' | ||
| 87 | zipin = zipfile.ZipFile(self.filename, 'r') | 107 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 88 | metadata = {} | 108 | metadata = {} |
| 89 | for field in zipin.infolist(): | 109 | for field in zipin.infolist(): |
| @@ -231,6 +251,9 @@ class TarStripper(GenericArchiveStripper): | |||
| 231 | 251 | ||
| 232 | 252 | ||
| 233 | class GzipStripper(TarStripper): | 253 | class GzipStripper(TarStripper): |
| 254 | ''' | ||
| 255 | Represent a tar.gz archive | ||
| 256 | ''' | ||
| 234 | def __init__(self, realname, filename, parser, editor, backup, | 257 | def __init__(self, realname, filename, parser, editor, backup, |
| 235 | add2archive): | 258 | add2archive): |
| 236 | super(GzipStripper, self).__init__(realname, | 259 | super(GzipStripper, self).__init__(realname, |
| @@ -239,6 +262,9 @@ class GzipStripper(TarStripper): | |||
| 239 | 262 | ||
| 240 | 263 | ||
| 241 | class Bzip2Stripper(TarStripper): | 264 | class Bzip2Stripper(TarStripper): |
| 265 | ''' | ||
| 266 | Represents a tar.bz2 archive | ||
| 267 | ''' | ||
| 242 | def __init__(self, realname, filename, parser, editor, backup, | 268 | def __init__(self, realname, filename, parser, editor, backup, |
| 243 | add2archive): | 269 | add2archive): |
| 244 | super(Bzip2Stripper, self).__init__(realname, | 270 | super(Bzip2Stripper, self).__init__(realname, |
diff --git a/lib/audio.py b/lib/audio.py index 35d4fde..d77efd9 100644 --- a/lib/audio.py +++ b/lib/audio.py | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | import parser | 1 | import parser |
| 2 | 2 | ||
| 3 | 3 | ||
| 4 | class MpegAudioStripper(parser.Generic_parser): | 4 | class MpegAudioStripper(parser.GenericParser): |
| 5 | ''' | 5 | ''' |
| 6 | mpeg audio file (mp3, ...) | 6 | mpeg audio file (mp3, ...) |
| 7 | ''' | 7 | ''' |
diff --git a/lib/images.py b/lib/images.py index bab0bfb..df3d256 100644 --- a/lib/images.py +++ b/lib/images.py | |||
| @@ -1,7 +1,10 @@ | |||
| 1 | import parser | 1 | import parser |
| 2 | 2 | ||
| 3 | 3 | ||
| 4 | class JpegStripper(parser.Generic_parser): | 4 | class JpegStripper(parser.GenericParser): |
| 5 | ''' | ||
| 6 | Represents a .jpeg file | ||
| 7 | ''' | ||
| 5 | def _should_remove(self, field): | 8 | def _should_remove(self, field): |
| 6 | if field.name.startswith('comment'): | 9 | if field.name.startswith('comment'): |
| 7 | return True | 10 | return True |
| @@ -11,7 +14,10 @@ class JpegStripper(parser.Generic_parser): | |||
| 11 | return False | 14 | return False |
| 12 | 15 | ||
| 13 | 16 | ||
| 14 | class PngStripper(parser.Generic_parser): | 17 | class PngStripper(parser.GenericParser): |
| 18 | ''' | ||
| 19 | Represents a .png file | ||
| 20 | ''' | ||
| 15 | def _should_remove(self, field): | 21 | def _should_remove(self, field): |
| 16 | if field.name.startswith("text["): | 22 | if field.name.startswith("text["): |
| 17 | return True | 23 | return True |
diff --git a/lib/mat.py b/lib/mat.py --- a/lib/mat.py +++ b/lib/mat.py | |||
| @@ -25,7 +25,7 @@ LOGGING_LEVEL = logging.DEBUG | |||
| 25 | 25 | ||
| 26 | logging.basicConfig(level=LOGGING_LEVEL) | 26 | logging.basicConfig(level=LOGGING_LEVEL) |
| 27 | 27 | ||
| 28 | strippers = { | 28 | STRIPPERS = { |
| 29 | hachoir_parser.image.JpegFile: images.JpegStripper, | 29 | hachoir_parser.image.JpegFile: images.JpegStripper, |
| 30 | hachoir_parser.image.PngFile: images.PngStripper, | 30 | hachoir_parser.image.PngFile: images.PngStripper, |
| 31 | hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, | 31 | hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, |
| @@ -83,7 +83,7 @@ def create_class_file(name, backup, add2archive): | |||
| 83 | (which herits from the "file" class), based on the editor | 83 | (which herits from the "file" class), based on the editor |
| 84 | of given file (name) | 84 | of given file (name) |
| 85 | ''' | 85 | ''' |
| 86 | stripper_class = strippers[editor.input.__class__] | 86 | stripper_class = STRIPPERS[editor.input.__class__] |
| 87 | except KeyError: | 87 | except KeyError: |
| 88 | #Place for another lib than hachoir | 88 | #Place for another lib than hachoir |
| 89 | logging.info('Don\'t have stripper for format %s' % editor.description) | 89 | logging.info('Don\'t have stripper for format %s' % editor.description) |
diff --git a/lib/misc.py b/lib/misc.py index ce14313..f846388 100644 --- a/lib/misc.py +++ b/lib/misc.py | |||
| @@ -2,7 +2,7 @@ import hachoir_core | |||
| 2 | import parser | 2 | import parser |
| 3 | 3 | ||
| 4 | 4 | ||
| 5 | class TorrentStripper(parser.Generic_parser): | 5 | class TorrentStripper(parser.GenericParser): |
| 6 | ''' | 6 | ''' |
| 7 | A torrent file looks like: | 7 | A torrent file looks like: |
| 8 | -root | 8 | -root |
| @@ -21,8 +21,7 @@ class TorrentStripper(parser.Generic_parser): | |||
| 21 | if self._should_remove(field): | 21 | if self._should_remove(field): |
| 22 | #FIXME : hachoir does not support torrent metadata editing :< | 22 | #FIXME : hachoir does not support torrent metadata editing :< |
| 23 | del self.editor['/root/' + field.name] | 23 | del self.editor['/root/' + field.name] |
| 24 | hachoir_core.field.writeIntoFile(self.editor, | 24 | hachoir_core.field.writeIntoFile(self.editor, self.output) |
| 25 | self.filename + parser.POSTFIX) | ||
| 26 | self.do_backup() | 25 | self.do_backup() |
| 27 | 26 | ||
| 28 | def is_clean(self): | 27 | def is_clean(self): |
diff --git a/lib/office.py b/lib/office.py index 432bc0b..5fa475d 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -6,6 +6,7 @@ import glob | |||
| 6 | import logging | 6 | import logging |
| 7 | import zipfile | 7 | import zipfile |
| 8 | import re | 8 | import re |
| 9 | import shutil | ||
| 9 | from xml.etree import ElementTree | 10 | from xml.etree import ElementTree |
| 10 | 11 | ||
| 11 | 12 | ||
| @@ -97,7 +98,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 97 | return True | 98 | return True |
| 98 | 99 | ||
| 99 | 100 | ||
| 100 | class PdfStripper(parser.Generic_parser): | 101 | class PdfStripper(parser.GenericParser): |
| 101 | ''' | 102 | ''' |
| 102 | Represent a pdf file, with the help of pdfrw | 103 | Represent a pdf file, with the help of pdfrw |
| 103 | ''' | 104 | ''' |
| @@ -109,10 +110,17 @@ class PdfStripper(parser.Generic_parser): | |||
| 109 | self.realname = realname | 110 | self.realname = realname |
| 110 | self.shortname = os.path.basename(filename) | 111 | self.shortname = os.path.basename(filename) |
| 111 | self.mime = mimetypes.guess_type(filename)[0] | 112 | self.mime = mimetypes.guess_type(filename)[0] |
| 113 | self.tempdir = tempfile.mkdtemp() | ||
| 112 | self.trailer = pdfrw.PdfReader(self.filename) | 114 | self.trailer = pdfrw.PdfReader(self.filename) |
| 113 | self.writer = pdfrw.PdfWriter() | 115 | self.writer = pdfrw.PdfWriter() |
| 114 | self.convert = 'gm convert -antialias -enhance %s %s' | 116 | self.convert = 'gm convert -antialias -enhance %s %s' |
| 115 | 117 | ||
| 118 | def __del__(self): | ||
| 119 | ''' | ||
| 120 | Remove the temp dir | ||
| 121 | ''' | ||
| 122 | shutil.rmtree(self.tempdir) | ||
| 123 | |||
| 116 | def remove_all(self): | 124 | def remove_all(self): |
| 117 | ''' | 125 | ''' |
| 118 | Remove all the meta fields that are compromizing | 126 | Remove all the meta fields that are compromizing |
| @@ -133,19 +141,18 @@ class PdfStripper(parser.Generic_parser): | |||
| 133 | Transform each pages into a jpg, clean them, | 141 | Transform each pages into a jpg, clean them, |
| 134 | then re-assemble them into a new pdf | 142 | then re-assemble them into a new pdf |
| 135 | ''' | 143 | ''' |
| 136 | _, self.tmpdir = tempfile.mkstemp() | 144 | subprocess.call(self.convert % (self.filename, self.tempdir + |
| 137 | subprocess.call(self.convert % (self.filename, self.tmpdir + | ||
| 138 | 'temp.jpg'), shell=True) # Convert pages to jpg | 145 | 'temp.jpg'), shell=True) # Convert pages to jpg |
| 139 | 146 | ||
| 140 | for current_file in glob.glob(self.tmpdir + 'temp*'): | 147 | for current_file in glob.glob(self.tempdir + 'temp*'): |
| 141 | #Clean every jpg image | 148 | #Clean every jpg image |
| 142 | class_file = mat.create_class_file(current_file, False) | 149 | class_file = mat.create_class_file(current_file, False, False) |
| 143 | class_file.remove_all() | 150 | class_file.remove_all() |
| 144 | 151 | ||
| 145 | subprocess.call(self.convert % (self.tmpdir + | 152 | subprocess.call(self.convert % (self.tempdir + |
| 146 | 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf | 153 | 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf |
| 147 | 154 | ||
| 148 | for current_file in glob.glob(self.tmpdir + 'temp*'): | 155 | for current_file in glob.glob(self.tempdir + 'temp*'): |
| 149 | #remove jpg files | 156 | #remove jpg files |
| 150 | mat.secure_remove(current_file) | 157 | mat.secure_remove(current_file) |
| 151 | 158 | ||
| @@ -155,7 +162,7 @@ class PdfStripper(parser.Generic_parser): | |||
| 155 | name = self.realname | 162 | name = self.realname |
| 156 | else: | 163 | else: |
| 157 | name = self.output | 164 | name = self.output |
| 158 | class_file = mat.create_class_file(name, False) | 165 | class_file = mat.create_class_file(name, False, False) |
| 159 | class_file.remove_all() | 166 | class_file.remove_all() |
| 160 | 167 | ||
| 161 | def is_clean(self): | 168 | def is_clean(self): |
diff --git a/lib/parser.py b/lib/parser.py index 28e0849..ae647fe 100644 --- a/lib/parser.py +++ b/lib/parser.py | |||
| @@ -12,7 +12,7 @@ import mat | |||
| 12 | NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') | 12 | NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml') |
| 13 | 13 | ||
| 14 | 14 | ||
| 15 | class Generic_parser(object): | 15 | class GenericParser(object): |
| 16 | def __init__(self, realname, filename, parser, editor, backup, | 16 | def __init__(self, realname, filename, parser, editor, backup, |
| 17 | add2archive): | 17 | add2archive): |
| 18 | basename, ext = os.path.splitext(filename) | 18 | basename, ext = os.path.splitext(filename) |
| @@ -78,7 +78,7 @@ class Generic_parser(object): | |||
| 78 | return True if the field is compromizing | 78 | return True if the field is compromizing |
| 79 | abstract method | 79 | abstract method |
| 80 | ''' | 80 | ''' |
| 81 | raise NotImplementedError() | 81 | raise NotImplementedError |
| 82 | 82 | ||
| 83 | def do_backup(self): | 83 | def do_backup(self): |
| 84 | ''' | 84 | ''' |
