summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author jvoisin 2011-07-26 14:06:38 +0200
committer jvoisin 2011-07-26 14:06:38 +0200
commit e62ae6a87f630cbd389cf1b75672b06cd56973c8 (patch)
tree 5433e5bde0d0448795626190f8014c61b38ac1c5 /lib
parent f6e3d57173604dab7228c830e84415ead02e169b (diff)
Pyflakes and pep8 validation
Diffstat (limited to 'lib')
-rw-r--r-- lib/archive.py 20
-rw-r--r-- lib/audio.py 4
-rw-r--r-- lib/images.py 4
-rw-r--r-- lib/mat.py 19
-rw-r--r-- lib/misc.py 48
-rw-r--r-- lib/office.py 24
-rw-r--r-- lib/parser.py 21
7 files changed, 95 insertions, 45 deletions
diff --git a/lib/archive.py b/lib/archive.py
index f22af39..f11506a 100644
--- a/lib/archive.py
+++ b/lib/archive.py
@@ -9,11 +9,13 @@ import tempfile
9import parser 9import parser
10import mat 10import mat
11 11
12
12class GenericArchiveStripper(parser.Generic_parser): 13class GenericArchiveStripper(parser.Generic_parser):
13 ''' 14 '''
14 Represent a generic archive 15 Represent a generic archive
15 ''' 16 '''
16 def __init__(self, realname, filename, parser, editor, backup, add2archive): 17 def __init__(self, realname, filename, parser, editor, backup,
18 add2archive):
17 super(GenericArchiveStripper, self).__init__(realname, 19 super(GenericArchiveStripper, self).__init__(realname,
18 filename, parser, editor, backup, add2archive) 20 filename, parser, editor, backup, add2archive)
19 self.compression = '' 21 self.compression = ''
@@ -32,6 +34,7 @@ class GenericArchiveStripper(parser.Generic_parser):
32 def remove_all_ugly(self): 34 def remove_all_ugly(self):
33 self._remove_all('ugly') 35 self._remove_all('ugly')
34 36
37
35class ZipStripper(GenericArchiveStripper): 38class ZipStripper(GenericArchiveStripper):
36 ''' 39 '''
37 Represent a zip file 40 Represent a zip file
@@ -94,7 +97,6 @@ harmless format' % item.filename)
94 zipin.close() 97 zipin.close()
95 return metadata 98 return metadata
96 99
97
98 def _remove_all(self, method): 100 def _remove_all(self, method):
99 ''' 101 '''
100 So far, the zipfile module does not allow to write a ZipInfo 102 So far, the zipfile module does not allow to write a ZipInfo
@@ -150,7 +152,7 @@ class TarStripper(GenericArchiveStripper):
150 for item in tarin.getmembers(): 152 for item in tarin.getmembers():
151 tarin.extract(item, self.tempdir) 153 tarin.extract(item, self.tempdir)
152 name = os.path.join(self.tempdir, item.name) 154 name = os.path.join(self.tempdir, item.name)
153 if item.type is '0': #is item a regular file ? 155 if item.type is '0': # is item a regular file ?
154 #no backup file 156 #no backup file
155 try: 157 try:
156 cfile = mat.create_class_file(name, False, 158 cfile = mat.create_class_file(name, False,
@@ -164,7 +166,7 @@ class TarStripper(GenericArchiveStripper):
164 logging.info('%s\' format is not supported' % 166 logging.info('%s\' format is not supported' %
165 item.name) 167 item.name)
166 if self.add2archive: 168 if self.add2archive:
167 tarout.add(name, item.name,filter=self._remove) 169 tarout.add(name, item.name, filter=self._remove)
168 mat.secure_remove(name) 170 mat.secure_remove(name)
169 tarin.close() 171 tarin.close()
170 tarout.close() 172 tarout.close()
@@ -194,7 +196,7 @@ class TarStripper(GenericArchiveStripper):
194 return False 196 return False
195 tarin.extract(item, self.tempdir) 197 tarin.extract(item, self.tempdir)
196 name = os.path.join(self.tempdir, item.name) 198 name = os.path.join(self.tempdir, item.name)
197 if item.type is '0': #is item a regular file ? 199 if item.type is '0': # is item a regular file ?
198 #no backup file 200 #no backup file
199 try: 201 try:
200 class_file = mat.create_class_file(name, 202 class_file = mat.create_class_file(name,
@@ -216,7 +218,7 @@ class TarStripper(GenericArchiveStripper):
216 metadata = {} 218 metadata = {}
217 for current_file in tarin.getmembers(): 219 for current_file in tarin.getmembers():
218 if current_file.type is '0': 220 if current_file.type is '0':
219 if not self.is_file_clean(current_file):#if there is meta 221 if not self.is_file_clean(current_file): # if there is meta
220 current_meta = {} 222 current_meta = {}
221 current_meta['mtime'] = current_file.mtime 223 current_meta['mtime'] = current_file.mtime
222 current_meta['uid'] = current_file.uid 224 current_meta['uid'] = current_file.uid
@@ -229,14 +231,16 @@ class TarStripper(GenericArchiveStripper):
229 231
230 232
231class GzipStripper(TarStripper): 233class GzipStripper(TarStripper):
232 def __init__(self, realname, filename, parser, editor, backup, add2archive): 234 def __init__(self, realname, filename, parser, editor, backup,
235 add2archive):
233 super(GzipStripper, self).__init__(realname, 236 super(GzipStripper, self).__init__(realname,
234 filename, parser, editor, backup, add2archive) 237 filename, parser, editor, backup, add2archive)
235 self.compression = ':gz' 238 self.compression = ':gz'
236 239
237 240
238class Bzip2Stripper(TarStripper): 241class Bzip2Stripper(TarStripper):
239 def __init__(self, realname, filename, parser, editor, backup, add2archive): 242 def __init__(self, realname, filename, parser, editor, backup,
243 add2archive):
240 super(Bzip2Stripper, self).__init__(realname, 244 super(Bzip2Stripper, self).__init__(realname,
241 filename, parser, editor, backup, add2archive) 245 filename, parser, editor, backup, add2archive)
242 self.compression = ':bz2' 246 self.compression = ':bz2'
diff --git a/lib/audio.py b/lib/audio.py
index 6d653bc..35d4fde 100644
--- a/lib/audio.py
+++ b/lib/audio.py
@@ -1,6 +1,10 @@
1import parser 1import parser
2 2
3
3class MpegAudioStripper(parser.Generic_parser): 4class MpegAudioStripper(parser.Generic_parser):
5 '''
6 mpeg audio file (mp3, ...)
7 '''
4 def _should_remove(self, field): 8 def _should_remove(self, field):
5 if field.name in ("id3v1", "id3v2"): 9 if field.name in ("id3v1", "id3v2"):
6 return True 10 return True
diff --git a/lib/images.py b/lib/images.py
index 4441b70..bab0bfb 100644
--- a/lib/images.py
+++ b/lib/images.py
@@ -1,8 +1,5 @@
1import parser 1import parser
2 2
3class BmpStripper(parser.Generic_parser):
4 def _should_remove(self, field):
5 return False
6 3
7class JpegStripper(parser.Generic_parser): 4class JpegStripper(parser.Generic_parser):
8 def _should_remove(self, field): 5 def _should_remove(self, field):
@@ -13,6 +10,7 @@ class JpegStripper(parser.Generic_parser):
13 else: 10 else:
14 return False 11 return False
15 12
13
16class PngStripper(parser.Generic_parser): 14class PngStripper(parser.Generic_parser):
17 def _should_remove(self, field): 15 def _should_remove(self, field):
18 if field.name.startswith("text["): 16 if field.name.startswith("text["):
diff --git a/lib/mat.py b/lib/mat.py
index ccf653f..e4371ce 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -23,12 +23,11 @@ __author__ = 'jvoisin'
23 23
24LOGGING_LEVEL = logging.DEBUG 24LOGGING_LEVEL = logging.DEBUG
25 25
26logging.basicConfig(level = LOGGING_LEVEL) 26logging.basicConfig(level=LOGGING_LEVEL)
27 27
28strippers = { 28strippers = {
29 hachoir_parser.image.JpegFile: images.JpegStripper, 29 hachoir_parser.image.JpegFile: images.JpegStripper,
30 hachoir_parser.image.PngFile: images.PngStripper, 30 hachoir_parser.image.PngFile: images.PngStripper,
31 hachoir_parser.image.bmp.BmpFile: images.BmpStripper,
32 hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, 31 hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper,
33 hachoir_parser.misc.PDFDocument: office.PdfStripper, 32 hachoir_parser.misc.PDFDocument: office.PdfStripper,
34 hachoir_parser.archive.TarFile: archive.TarStripper, 33 hachoir_parser.archive.TarFile: archive.TarStripper,
@@ -37,6 +36,7 @@ strippers = {
37 hachoir_parser.archive.zip.ZipFile: archive.ZipStripper, 36 hachoir_parser.archive.zip.ZipFile: archive.ZipStripper,
38} 37}
39 38
39
40def secure_remove(filename): 40def secure_remove(filename):
41 ''' 41 '''
42 securely remove the file 42 securely remove the file
@@ -52,10 +52,11 @@ def is_secure(filename):
52 Prevent shell injection 52 Prevent shell injection
53 ''' 53 '''
54 54
55 if not(os.path.isfile(filename)): #check if the file exist 55 if not(os.path.isfile(filename)): # check if the file exist
56 logging.error('Error: %s is not a valid file' % filename) 56 logging.error('Error: %s is not a valid file' % filename)
57 return False 57 return False
58 58
59
59def create_class_file(name, backup, add2archive): 60def create_class_file(name, backup, add2archive):
60 ''' 61 '''
61 return a $FILETYPEStripper() class, 62 return a $FILETYPEStripper() class,
@@ -68,7 +69,7 @@ def create_class_file(name, backup, add2archive):
68 realname = name 69 realname = name
69 try: 70 try:
70 filename = hachoir_core.cmd_line.unicodeFilename(name) 71 filename = hachoir_core.cmd_line.unicodeFilename(name)
71 except TypeError:# get rid of "TypeError: decoding Unicode is not supported" 72 except TypeError: # get rid of "decoding Unicode is not supported"
72 filename = name 73 filename = name
73 parser = hachoir_parser.createParser(filename) 74 parser = hachoir_parser.createParser(filename)
74 if not parser: 75 if not parser:
@@ -88,22 +89,22 @@ def create_class_file(name, backup, add2archive):
88 logging.info('Don\'t have stripper for format %s' % editor.description) 89 logging.info('Don\'t have stripper for format %s' % editor.description)
89 return 90 return
90 91
91 if editor.input.__class__ == hachoir_parser.misc.PDFDocument:#pdf 92 if editor.input.__class__ == hachoir_parser.misc.PDFDocument: # pdf
92 return stripper_class(filename, realname, backup) 93 return stripper_class(filename, realname, backup)
93 94
94 elif editor.input.__class__ == hachoir_parser.archive.zip.ZipFile: 95 elif editor.input.__class__ == hachoir_parser.archive.zip.ZipFile:
95 #zip based format 96 #zip based format
96 mime = mimetypes.guess_type(filename)[0] 97 mime = mimetypes.guess_type(filename)[0]
97 try:#Ugly workaround, cleaning open document delete mime (wtf?) 98 try: # ugly workaround, cleaning open document delete mime (wtf?)
98 if mime.startswith('application/vnd.oasis.opendocument'): 99 if mime.startswith('application/vnd.oasis.opendocument'):
99 return office.OpenDocumentStripper(realname, filename, parser, 100 return office.OpenDocumentStripper(realname, filename, parser,
100 editor, backup, add2archive) 101 editor, backup, add2archive)
101 else:#normal zip 102 else: # normal zip
102 return stripper_class(realname, filename, parser, editor, 103 return stripper_class(realname, filename, parser, editor,
103 backup, add2archive) 104 backup, add2archive)
104 except:#normal zip file 105 except: # normal zip
105 return stripper_class(realname, filename, parser, editor, backup, 106 return stripper_class(realname, filename, parser, editor, backup,
106 add2archive) 107 add2archive)
107 else:#normal handling 108 else: # normal handling
108 return stripper_class(realname, filename, parser, editor, backup, 109 return stripper_class(realname, filename, parser, editor, backup,
109 add2archive) 110 add2archive)
diff --git a/lib/misc.py b/lib/misc.py
new file mode 100644
index 0000000..ce14313
--- /dev/null
+++ b/lib/misc.py
@@ -0,0 +1,48 @@
1import hachoir_core
2import parser
3
4
5class TorrentStripper(parser.Generic_parser):
6 '''
7 A torrent file looks like:
8 -root
9 -start
10 -announce
11 -announce-list
12 -comment
13 -created_by
14 -creation_date
15 -encoding
16 -info
17 -end
18 '''
19 def remove_all(self):
20 for field in self.editor['root']:
21 if self._should_remove(field):
22 #FIXME : hachoir does not support torrent metadata editing :<
23 del self.editor['/root/' + field.name]
24 hachoir_core.field.writeIntoFile(self.editor,
25 self.filename + parser.POSTFIX)
26 self.do_backup()
27
28 def is_clean(self):
29 for field in self.editor['root']:
30 if self._should_remove(field):
31 return False
32 return True
33
34 def get_meta(self):
35 metadata = {}
36 for field in self.editor['root']:
37 if self._should_remove(field):
38 try: # FIXME
39 metadata[field.name] = field.value
40 except:
41 metadata[field.name] = 'harmful content'
42 return metadata
43
44 def _should_remove(self, field):
45 if field.name in ('comment', 'created_by', 'creation_date', 'info'):
46 return True
47 else:
48 return False
diff --git a/lib/office.py b/lib/office.py
index 27677d2..432bc0b 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -5,17 +5,16 @@ import tempfile
5import glob 5import glob
6import logging 6import logging
7import zipfile 7import zipfile
8import shutil
9import re 8import re
10from xml.etree import ElementTree 9from xml.etree import ElementTree
11 10
12import hachoir_core
13 11
14import pdfrw 12import pdfrw
15import mat 13import mat
16import parser 14import parser
17import archive 15import archive
18 16
17
19class OpenDocumentStripper(archive.GenericArchiveStripper): 18class OpenDocumentStripper(archive.GenericArchiveStripper):
20 ''' 19 '''
21 An open document file is a zip, with xml file into. 20 An open document file is a zip, with xml file into.
@@ -32,11 +31,10 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
32 for node in tree.iter(): 31 for node in tree.iter():
33 key = re.sub('{.*}', '', node.tag) 32 key = re.sub('{.*}', '', node.tag)
34 metadata[key] = node.text 33 metadata[key] = node.text
35 except KeyError:#no meta.xml file found 34 except KeyError: # no meta.xml file found
36 logging.debug('%s has no opendocument metadata' % self.filename) 35 logging.debug('%s has no opendocument metadata' % self.filename)
37 return metadata 36 return metadata
38 37
39
40 def _remove_all(self, method): 38 def _remove_all(self, method):
41 ''' 39 '''
42 FIXME ? 40 FIXME ?
@@ -50,7 +48,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
50 name = os.path.join(self.tempdir, item) 48 name = os.path.join(self.tempdir, item)
51 if item.endswith('.xml') or item == 'mimetype': 49 if item.endswith('.xml') or item == 'mimetype':
52 #keep .xml files, and the "manifest" file 50 #keep .xml files, and the "manifest" file
53 if item != 'meta.xml':#contains the metadata 51 if item != 'meta.xml': # contains the metadata
54 zipin.extract(item, self.tempdir) 52 zipin.extract(item, self.tempdir)
55 zipout.write(name, item) 53 zipout.write(name, item)
56 mat.secure_remove(name) 54 mat.secure_remove(name)
@@ -73,7 +71,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
73 self.filename)) 71 self.filename))
74 zipout.write(name, item) 72 zipout.write(name, item)
75 except: 73 except:
76 logging.info('%s\' fileformat is not supported' % item) 74 logging.info('%s\' fileformat is not supported' % item)
77 if self.add2archive: 75 if self.add2archive:
78 zipout.write(name, item) 76 zipout.write(name, item)
79 mat.secure_remove(name) 77 mat.secure_remove(name)
@@ -88,7 +86,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
88 try: 86 try:
89 zipin.getinfo('meta.xml') 87 zipin.getinfo('meta.xml')
90 return False 88 return False
91 except KeyError:#no meta.xml in the file 89 except KeyError: # no meta.xml in the file
92 zipin.close() 90 zipin.close()
93 czf = archive.ZipStripper(self.realname, self.filename, 91 czf = archive.ZipStripper(self.realname, self.filename,
94 self.parser, self.editor, self.backup, self.add2archive) 92 self.parser, self.editor, self.backup, self.add2archive)
@@ -104,7 +102,7 @@ class PdfStripper(parser.Generic_parser):
104 Represent a pdf file, with the help of pdfrw 102 Represent a pdf file, with the help of pdfrw
105 ''' 103 '''
106 def __init__(self, filename, realname, backup): 104 def __init__(self, filename, realname, backup):
107 name, path = os.path.splitext(filename) 105 name, ext = os.path.splitext(filename)
108 self.output = name + '.cleaned' + ext 106 self.output = name + '.cleaned' + ext
109 self.filename = filename 107 self.filename = filename
110 self.backup = backup 108 self.backup = backup
@@ -137,7 +135,7 @@ class PdfStripper(parser.Generic_parser):
137 ''' 135 '''
138 _, self.tmpdir = tempfile.mkstemp() 136 _, self.tmpdir = tempfile.mkstemp()
139 subprocess.call(self.convert % (self.filename, self.tmpdir + 137 subprocess.call(self.convert % (self.filename, self.tmpdir +
140 'temp.jpg'), shell=True)#Convert pages to jpg 138 'temp.jpg'), shell=True) # Convert pages to jpg
141 139
142 for current_file in glob.glob(self.tmpdir + 'temp*'): 140 for current_file in glob.glob(self.tmpdir + 'temp*'):
143 #Clean every jpg image 141 #Clean every jpg image
@@ -145,18 +143,18 @@ class PdfStripper(parser.Generic_parser):
145 class_file.remove_all() 143 class_file.remove_all()
146 144
147 subprocess.call(self.convert % (self.tmpdir + 145 subprocess.call(self.convert % (self.tmpdir +
148 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf 146 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf
149 147
150 for current_file in glob.glob(self.tmpdir + 'temp*'): 148 for current_file in glob.glob(self.tmpdir + 'temp*'):
151 #remove jpg files 149 #remove jpg files
152 mat.secure_remove(current_file) 150 mat.secure_remove(current_file)
153 151
154 if self.backup is False: 152 if self.backup is False:
155 mat.secure_remove(self.filename) #remove the old file 153 mat.secure_remove(self.filename) # remove the old file
156 os.rename(self.output, self.filename)#rename the new 154 os.rename(self.output, self.filename) # rename the new
157 name = self.realname 155 name = self.realname
158 else: 156 else:
159 name = output_file 157 name = self.output
160 class_file = mat.create_class_file(name, False) 158 class_file = mat.create_class_file(name, False)
161 class_file.remove_all() 159 class_file.remove_all()
162 160
diff --git a/lib/parser.py b/lib/parser.py
index aa7e7f1..28e0849 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -2,27 +2,25 @@
2 Parent class of all parser 2 Parent class of all parser
3''' 3'''
4 4
5import hachoir_core.error 5import hachoir_core
6import hachoir_parser
7import hachoir_editor
8 6
9import sys
10import os 7import os
11import subprocess
12import mimetypes 8import mimetypes
13 9
14import mat 10import mat
15 11
16NOMETA = ('.txt', '.bmp', '.py', '.xml', '.rdf') 12NOMETA = ('.bmp', 'html', '.py', '.rdf', '.txt', '.xml')
13
17 14
18class Generic_parser(object): 15class Generic_parser(object):
19 def __init__(self, realname, filename, parser, editor, backup, add2archive): 16 def __init__(self, realname, filename, parser, editor, backup,
17 add2archive):
20 basename, ext = os.path.splitext(filename) 18 basename, ext = os.path.splitext(filename)
21 self.output = basename + '.cleaned' + ext 19 self.output = basename + '.cleaned' + ext
22 self.filename = filename #path + filename 20 self.filename = filename # path + filename
23 self.realname = realname #path + filename 21 self.realname = realname # path + filename
24 self.basename = os.path.basename(filename) #only filename 22 self.basename = os.path.basename(filename) # only filename
25 self.mime = mimetypes.guess_type(filename)[0] #mimetype 23 self.mime = mimetypes.guess_type(filename)[0] # mimetype
26 self.parser = parser 24 self.parser = parser
27 self.editor = editor 25 self.editor = editor
28 self.backup = backup 26 self.backup = backup
@@ -56,7 +54,6 @@ class Generic_parser(object):
56 ''' 54 '''
57 self.remove_all() 55 self.remove_all()
58 56
59
60 def _remove(self, field): 57 def _remove(self, field):
61 ''' 58 '''
62 Delete the given field 59 Delete the given field