diff options
| author | jvoisin | 2011-07-25 03:03:12 +0200 |
|---|---|---|
| committer | jvoisin | 2011-07-25 03:03:12 +0200 |
| commit | 7bec354973580216c64889b925e1f7d6a224d7dd (patch) | |
| tree | 7ddf33ae6a1ffd5c9d03522ae508f67632f638cb /lib | |
| parent | ac248b5b4979aafa0c05f8253e2f9e1bdba305e6 (diff) | |
more abstraction, and changed the name of the output file
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/archive.py | 14 | ||||
| -rw-r--r-- | lib/office.py | 24 | ||||
| -rw-r--r-- | lib/parser.py | 20 |
3 files changed, 28 insertions, 30 deletions
diff --git a/lib/archive.py b/lib/archive.py index 8a305d5..21bc5c5 100644 --- a/lib/archive.py +++ b/lib/archive.py | |||
| @@ -83,7 +83,7 @@ class ZipStripper(GenericArchiveStripper): | |||
| 83 | 83 | ||
| 84 | def _remove_all(self, method): | 84 | def _remove_all(self, method): |
| 85 | zipin = zipfile.ZipFile(self.filename, 'r') | 85 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 86 | zipout = zipfile.ZipFile(self.filename + parser.POSTFIX, 'w', | 86 | zipout = zipfile.ZipFile(self.output, 'w', |
| 87 | allowZip64=True) | 87 | allowZip64=True) |
| 88 | for item in zipin.infolist(): | 88 | for item in zipin.infolist(): |
| 89 | zipin.extract(item, self.tempdir) | 89 | zipin.extract(item, self.tempdir) |
| @@ -109,6 +109,7 @@ class ZipStripper(GenericArchiveStripper): | |||
| 109 | logging.info('%s treated' % self.filename) | 109 | logging.info('%s treated' % self.filename) |
| 110 | zipin.close() | 110 | zipin.close() |
| 111 | zipout.close() | 111 | zipout.close() |
| 112 | self.do_backup() | ||
| 112 | 113 | ||
| 113 | 114 | ||
| 114 | class TarStripper(GenericArchiveStripper): | 115 | class TarStripper(GenericArchiveStripper): |
| @@ -125,8 +126,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 125 | 126 | ||
| 126 | def _remove_all(self, method): | 127 | def _remove_all(self, method): |
| 127 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 128 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 128 | tarout = tarfile.open(self.filename + parser.POSTFIX, | 129 | tarout = tarfile.open(self.output, 'w' + self.compression) |
| 129 | 'w' + self.compression) | ||
| 130 | for item in tarin.getmembers(): | 130 | for item in tarin.getmembers(): |
| 131 | tarin.extract(item, self.tempdir) | 131 | tarin.extract(item, self.tempdir) |
| 132 | name = os.path.join(self.tempdir, item.name) | 132 | name = os.path.join(self.tempdir, item.name) |
| @@ -148,10 +148,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 148 | mat.secure_remove(name) | 148 | mat.secure_remove(name) |
| 149 | tarin.close() | 149 | tarin.close() |
| 150 | tarout.close() | 150 | tarout.close() |
| 151 | 151 | self.do_backup() | |
| 152 | if self.backup is False: | ||
| 153 | mat.secure_remove(self.filename) | ||
| 154 | os.rename(self.filename + parser.POSTFIX, self.filename) | ||
| 155 | 152 | ||
| 156 | def is_file_clean(self, current_file): | 153 | def is_file_clean(self, current_file): |
| 157 | ''' | 154 | ''' |
| @@ -179,8 +176,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 179 | name = os.path.join(self.tempdir, item.name) | 176 | name = os.path.join(self.tempdir, item.name) |
| 180 | if item.type is '0': #is item a regular file ? | 177 | if item.type is '0': #is item a regular file ? |
| 181 | #no backup file | 178 | #no backup file |
| 182 | class_file = mat.create_class_file(name, False, | 179 | class_file = mat.create_class_file(name, False,self.add2archive) |
| 183 | self.add2archive) | ||
| 184 | mat.secure_remove(name) | 180 | mat.secure_remove(name) |
| 185 | if not class_file.is_clean():#if the extracted file is not clean | 181 | if not class_file.is_clean():#if the extracted file is not clean |
| 186 | return False | 182 | return False |
diff --git a/lib/office.py b/lib/office.py index f87f357..2302dbc 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -27,7 +27,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 27 | method here : http://bugs.python.org/issue6818 | 27 | method here : http://bugs.python.org/issue6818 |
| 28 | ''' | 28 | ''' |
| 29 | zipin = zipfile.ZipFile(self.filename, 'r') | 29 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 30 | zipout = zipfile.ZipFile(self.filename + parser.POSTFIX, 'w', | 30 | zipout = zipfile.ZipFile(self.basename + parser.POSTFIX + self.ext, 'w', |
| 31 | allowZip64=True) | 31 | allowZip64=True) |
| 32 | for item in zipin.namelist(): | 32 | for item in zipin.namelist(): |
| 33 | name = os.path.join(self.tempdir, item) | 33 | name = os.path.join(self.tempdir, item) |
| @@ -65,10 +65,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 65 | logging.info('%s treated' % self.filename) | 65 | logging.info('%s treated' % self.filename) |
| 66 | zipin.close() | 66 | zipin.close() |
| 67 | zipout.close() | 67 | zipout.close() |
| 68 | 68 | self.do_backup() | |
| 69 | if self.backup is False: | ||
| 70 | mat.secure_remove(self.filename) #remove the old file | ||
| 71 | os.rename(self.filename + parser.POSTFIX, self.filename) | ||
| 72 | 69 | ||
| 73 | def is_clean(self): | 70 | def is_clean(self): |
| 74 | zipin = zipfile.ZipFile(self.filename, 'r') | 71 | zipin = zipfile.ZipFile(self.filename, 'r') |
| @@ -106,9 +103,7 @@ class TorrentStripper(parser.Generic_parser): | |||
| 106 | del self.editor['/root/' + field.name] | 103 | del self.editor['/root/' + field.name] |
| 107 | hachoir_core.field.writeIntoFile(self.editor, | 104 | hachoir_core.field.writeIntoFile(self.editor, |
| 108 | self.filename + parser.POSTFIX) | 105 | self.filename + parser.POSTFIX) |
| 109 | if self.backup is False: | 106 | self.do_backup() |
| 110 | mat.secure_remove(self.filename) #remove the old file | ||
| 111 | os.rename(self.filename + parser.POSTFIX, self.filename) | ||
| 112 | 107 | ||
| 113 | def is_clean(self): | 108 | def is_clean(self): |
| 114 | for field in self.editor['root']: | 109 | for field in self.editor['root']: |
| @@ -138,6 +133,8 @@ class PdfStripper(parser.Generic_parser): | |||
| 138 | Represent a pdf file, with the help of pdfrw | 133 | Represent a pdf file, with the help of pdfrw |
| 139 | ''' | 134 | ''' |
| 140 | def __init__(self, filename, realname, backup): | 135 | def __init__(self, filename, realname, backup): |
| 136 | name, path = os.path.splitext(filename) | ||
| 137 | self.output = name + '.cleaned.' + ext | ||
| 141 | self.filename = filename | 138 | self.filename = filename |
| 142 | self.backup = backup | 139 | self.backup = backup |
| 143 | self.realname = realname | 140 | self.realname = realname |
| @@ -159,17 +156,14 @@ class PdfStripper(parser.Generic_parser): | |||
| 159 | self.trailer.Info.ModDate = '' | 156 | self.trailer.Info.ModDate = '' |
| 160 | 157 | ||
| 161 | self.writer.trailer = self.trailer | 158 | self.writer.trailer = self.trailer |
| 162 | self.writer.write(self.filename + parser.POSTFIX) | 159 | self.writer.write(self.output) |
| 163 | if self.backup is False: | 160 | self.do_backup() |
| 164 | mat.secure_remove(self.filename) #remove the old file | ||
| 165 | os.rename(self.filename + parser.POSTFIX, self.filename) | ||
| 166 | 161 | ||
| 167 | def remove_all_ugly(self): | 162 | def remove_all_ugly(self): |
| 168 | ''' | 163 | ''' |
| 169 | Transform each pages into a jpg, clean them, | 164 | Transform each pages into a jpg, clean them, |
| 170 | then re-assemble them into a new pdf | 165 | then re-assemble them into a new pdf |
| 171 | ''' | 166 | ''' |
| 172 | output_file = self.realname + parser.POSTFIX + '.pdf' | ||
| 173 | _, self.tmpdir = tempfile.mkstemp() | 167 | _, self.tmpdir = tempfile.mkstemp() |
| 174 | subprocess.call(self.convert % (self.filename, self.tmpdir + | 168 | subprocess.call(self.convert % (self.filename, self.tmpdir + |
| 175 | 'temp.jpg'), shell=True)#Convert pages to jpg | 169 | 'temp.jpg'), shell=True)#Convert pages to jpg |
| @@ -180,7 +174,7 @@ class PdfStripper(parser.Generic_parser): | |||
| 180 | class_file.remove_all() | 174 | class_file.remove_all() |
| 181 | 175 | ||
| 182 | subprocess.call(self.convert % (self.tmpdir + | 176 | subprocess.call(self.convert % (self.tmpdir + |
| 183 | 'temp.jpg*', output_file), shell=True)#Assemble jpg into pdf | 177 | 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf |
| 184 | 178 | ||
| 185 | for current_file in glob.glob(self.tmpdir + 'temp*'): | 179 | for current_file in glob.glob(self.tmpdir + 'temp*'): |
| 186 | #remove jpg files | 180 | #remove jpg files |
| @@ -188,7 +182,7 @@ class PdfStripper(parser.Generic_parser): | |||
| 188 | 182 | ||
| 189 | if self.backup is False: | 183 | if self.backup is False: |
| 190 | mat.secure_remove(self.filename) #remove the old file | 184 | mat.secure_remove(self.filename) #remove the old file |
| 191 | os.rename(output_file, self.filename)#rename the new | 185 | os.rename(self.output, self.filename)#rename the new |
| 192 | name = self.realname | 186 | name = self.realname |
| 193 | else: | 187 | else: |
| 194 | name = output_file | 188 | name = output_file |
diff --git a/lib/parser.py b/lib/parser.py index ba4981d..11e776e 100644 --- a/lib/parser.py +++ b/lib/parser.py | |||
| @@ -13,10 +13,12 @@ import mimetypes | |||
| 13 | 13 | ||
| 14 | import mat | 14 | import mat |
| 15 | 15 | ||
| 16 | POSTFIX = ".cleaned" | 16 | NOMETA = ('*.txt', '*.bmp', '*.py') |
| 17 | 17 | ||
| 18 | class Generic_parser(object): | 18 | class Generic_parser(object): |
| 19 | def __init__(self, realname, filename, parser, editor, backup, add2archive): | 19 | def __init__(self, realname, filename, parser, editor, backup, add2archive): |
| 20 | basename, ext = os.path.splitext(filename) | ||
| 21 | self.output = basename + '.cleaned.' + ext | ||
| 20 | self.filename = filename #path + filename | 22 | self.filename = filename #path + filename |
| 21 | self.realname = realname #path + filename | 23 | self.realname = realname #path + filename |
| 22 | self.shortname = os.path.basename(filename) #only filename | 24 | self.shortname = os.path.basename(filename) #only filename |
| @@ -41,10 +43,8 @@ class Generic_parser(object): | |||
| 41 | for field in self.editor: | 43 | for field in self.editor: |
| 42 | if self._should_remove(field): | 44 | if self._should_remove(field): |
| 43 | self._remove(field.name) | 45 | self._remove(field.name) |
| 44 | hachoir_core.field.writeIntoFile(self.editor, self.filename + POSTFIX) | 46 | hachoir_core.field.writeIntoFile(self.editor, self.output) |
| 45 | if self.backup is False: | 47 | self.do_backup() |
| 46 | mat.secure_remove(self.filename) #remove the old file | ||
| 47 | os.rename(self.filename+ POSTFIX, self.filename) #rename the new | ||
| 48 | 48 | ||
| 49 | def remove_all_ugly(self): | 49 | def remove_all_ugly(self): |
| 50 | ''' | 50 | ''' |
| @@ -73,7 +73,7 @@ class Generic_parser(object): | |||
| 73 | try: | 73 | try: |
| 74 | metadata[field.name] = field.value | 74 | metadata[field.name] = field.value |
| 75 | except: | 75 | except: |
| 76 | metadata[field.name] = "harmful content" | 76 | metadata[field.name] = 'harmful content' |
| 77 | return metadata | 77 | return metadata |
| 78 | 78 | ||
| 79 | def _should_remove(self, key): | 79 | def _should_remove(self, key): |
| @@ -82,3 +82,11 @@ class Generic_parser(object): | |||
| 82 | abstract method | 82 | abstract method |
| 83 | ''' | 83 | ''' |
| 84 | raise NotImplementedError() | 84 | raise NotImplementedError() |
| 85 | |||
| 86 | def do_backup(self): | ||
| 87 | ''' | ||
| 88 | Do a backup of the file if asked | ||
| 89 | ''' | ||
| 90 | if self.backup is False: | ||
| 91 | mat.secure_remove(self.filename) | ||
| 92 | os.rename(self.output, self.filename) | ||
