summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorjvoisin2011-07-25 03:03:12 +0200
committerjvoisin2011-07-25 03:03:12 +0200
commit7bec354973580216c64889b925e1f7d6a224d7dd (patch)
tree7ddf33ae6a1ffd5c9d03522ae508f67632f638cb /lib
parentac248b5b4979aafa0c05f8253e2f9e1bdba305e6 (diff)
more abstraction, and changed the name of the output file
Diffstat (limited to 'lib')
-rw-r--r--lib/archive.py14
-rw-r--r--lib/office.py24
-rw-r--r--lib/parser.py20
3 files changed, 28 insertions, 30 deletions
diff --git a/lib/archive.py b/lib/archive.py
index 8a305d5..21bc5c5 100644
--- a/lib/archive.py
+++ b/lib/archive.py
@@ -83,7 +83,7 @@ class ZipStripper(GenericArchiveStripper):
83 83
84 def _remove_all(self, method): 84 def _remove_all(self, method):
85 zipin = zipfile.ZipFile(self.filename, 'r') 85 zipin = zipfile.ZipFile(self.filename, 'r')
86 zipout = zipfile.ZipFile(self.filename + parser.POSTFIX, 'w', 86 zipout = zipfile.ZipFile(self.output, 'w',
87 allowZip64=True) 87 allowZip64=True)
88 for item in zipin.infolist(): 88 for item in zipin.infolist():
89 zipin.extract(item, self.tempdir) 89 zipin.extract(item, self.tempdir)
@@ -109,6 +109,7 @@ class ZipStripper(GenericArchiveStripper):
109 logging.info('%s treated' % self.filename) 109 logging.info('%s treated' % self.filename)
110 zipin.close() 110 zipin.close()
111 zipout.close() 111 zipout.close()
112 self.do_backup()
112 113
113 114
114class TarStripper(GenericArchiveStripper): 115class TarStripper(GenericArchiveStripper):
@@ -125,8 +126,7 @@ class TarStripper(GenericArchiveStripper):
125 126
126 def _remove_all(self, method): 127 def _remove_all(self, method):
127 tarin = tarfile.open(self.filename, 'r' + self.compression) 128 tarin = tarfile.open(self.filename, 'r' + self.compression)
128 tarout = tarfile.open(self.filename + parser.POSTFIX, 129 tarout = tarfile.open(self.output, 'w' + self.compression)
129 'w' + self.compression)
130 for item in tarin.getmembers(): 130 for item in tarin.getmembers():
131 tarin.extract(item, self.tempdir) 131 tarin.extract(item, self.tempdir)
132 name = os.path.join(self.tempdir, item.name) 132 name = os.path.join(self.tempdir, item.name)
@@ -148,10 +148,7 @@ class TarStripper(GenericArchiveStripper):
148 mat.secure_remove(name) 148 mat.secure_remove(name)
149 tarin.close() 149 tarin.close()
150 tarout.close() 150 tarout.close()
151 151 self.do_backup()
152 if self.backup is False:
153 mat.secure_remove(self.filename)
154 os.rename(self.filename + parser.POSTFIX, self.filename)
155 152
156 def is_file_clean(self, current_file): 153 def is_file_clean(self, current_file):
157 ''' 154 '''
@@ -179,8 +176,7 @@ class TarStripper(GenericArchiveStripper):
179 name = os.path.join(self.tempdir, item.name) 176 name = os.path.join(self.tempdir, item.name)
180 if item.type is '0': #is item a regular file ? 177 if item.type is '0': #is item a regular file ?
181 #no backup file 178 #no backup file
182 class_file = mat.create_class_file(name, False, 179 class_file = mat.create_class_file(name, False,self.add2archive)
183 self.add2archive)
184 mat.secure_remove(name) 180 mat.secure_remove(name)
185 if not class_file.is_clean():#if the extracted file is not clean 181 if not class_file.is_clean():#if the extracted file is not clean
186 return False 182 return False
diff --git a/lib/office.py b/lib/office.py
index f87f357..2302dbc 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -27,7 +27,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
27 method here : http://bugs.python.org/issue6818 27 method here : http://bugs.python.org/issue6818
28 ''' 28 '''
29 zipin = zipfile.ZipFile(self.filename, 'r') 29 zipin = zipfile.ZipFile(self.filename, 'r')
30 zipout = zipfile.ZipFile(self.filename + parser.POSTFIX, 'w', 30 zipout = zipfile.ZipFile(self.basename + parser.POSTFIX + self.ext, 'w',
31 allowZip64=True) 31 allowZip64=True)
32 for item in zipin.namelist(): 32 for item in zipin.namelist():
33 name = os.path.join(self.tempdir, item) 33 name = os.path.join(self.tempdir, item)
@@ -65,10 +65,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
65 logging.info('%s treated' % self.filename) 65 logging.info('%s treated' % self.filename)
66 zipin.close() 66 zipin.close()
67 zipout.close() 67 zipout.close()
68 68 self.do_backup()
69 if self.backup is False:
70 mat.secure_remove(self.filename) #remove the old file
71 os.rename(self.filename + parser.POSTFIX, self.filename)
72 69
73 def is_clean(self): 70 def is_clean(self):
74 zipin = zipfile.ZipFile(self.filename, 'r') 71 zipin = zipfile.ZipFile(self.filename, 'r')
@@ -106,9 +103,7 @@ class TorrentStripper(parser.Generic_parser):
106 del self.editor['/root/' + field.name] 103 del self.editor['/root/' + field.name]
107 hachoir_core.field.writeIntoFile(self.editor, 104 hachoir_core.field.writeIntoFile(self.editor,
108 self.filename + parser.POSTFIX) 105 self.filename + parser.POSTFIX)
109 if self.backup is False: 106 self.do_backup()
110 mat.secure_remove(self.filename) #remove the old file
111 os.rename(self.filename + parser.POSTFIX, self.filename)
112 107
113 def is_clean(self): 108 def is_clean(self):
114 for field in self.editor['root']: 109 for field in self.editor['root']:
@@ -138,6 +133,8 @@ class PdfStripper(parser.Generic_parser):
138 Represent a pdf file, with the help of pdfrw 133 Represent a pdf file, with the help of pdfrw
139 ''' 134 '''
140 def __init__(self, filename, realname, backup): 135 def __init__(self, filename, realname, backup):
136 name, path = os.path.splitext(filename)
137 self.output = name + '.cleaned.' + ext
141 self.filename = filename 138 self.filename = filename
142 self.backup = backup 139 self.backup = backup
143 self.realname = realname 140 self.realname = realname
@@ -159,17 +156,14 @@ class PdfStripper(parser.Generic_parser):
159 self.trailer.Info.ModDate = '' 156 self.trailer.Info.ModDate = ''
160 157
161 self.writer.trailer = self.trailer 158 self.writer.trailer = self.trailer
162 self.writer.write(self.filename + parser.POSTFIX) 159 self.writer.write(self.output)
163 if self.backup is False: 160 self.do_backup()
164 mat.secure_remove(self.filename) #remove the old file
165 os.rename(self.filename + parser.POSTFIX, self.filename)
166 161
167 def remove_all_ugly(self): 162 def remove_all_ugly(self):
168 ''' 163 '''
169 Transform each pages into a jpg, clean them, 164 Transform each pages into a jpg, clean them,
170 then re-assemble them into a new pdf 165 then re-assemble them into a new pdf
171 ''' 166 '''
172 output_file = self.realname + parser.POSTFIX + '.pdf'
173 _, self.tmpdir = tempfile.mkstemp() 167 _, self.tmpdir = tempfile.mkstemp()
174 subprocess.call(self.convert % (self.filename, self.tmpdir + 168 subprocess.call(self.convert % (self.filename, self.tmpdir +
175 'temp.jpg'), shell=True)#Convert pages to jpg 169 'temp.jpg'), shell=True)#Convert pages to jpg
@@ -180,7 +174,7 @@ class PdfStripper(parser.Generic_parser):
180 class_file.remove_all() 174 class_file.remove_all()
181 175
182 subprocess.call(self.convert % (self.tmpdir + 176 subprocess.call(self.convert % (self.tmpdir +
183 'temp.jpg*', output_file), shell=True)#Assemble jpg into pdf 177 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf
184 178
185 for current_file in glob.glob(self.tmpdir + 'temp*'): 179 for current_file in glob.glob(self.tmpdir + 'temp*'):
186 #remove jpg files 180 #remove jpg files
@@ -188,7 +182,7 @@ class PdfStripper(parser.Generic_parser):
188 182
189 if self.backup is False: 183 if self.backup is False:
190 mat.secure_remove(self.filename) #remove the old file 184 mat.secure_remove(self.filename) #remove the old file
191 os.rename(output_file, self.filename)#rename the new 185 os.rename(self.output, self.filename)#rename the new
192 name = self.realname 186 name = self.realname
193 else: 187 else:
194 name = output_file 188 name = output_file
diff --git a/lib/parser.py b/lib/parser.py
index ba4981d..11e776e 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -13,10 +13,12 @@ import mimetypes
13 13
14import mat 14import mat
15 15
16POSTFIX = ".cleaned" 16NOMETA = ('*.txt', '*.bmp', '*.py')
17 17
18class Generic_parser(object): 18class Generic_parser(object):
19 def __init__(self, realname, filename, parser, editor, backup, add2archive): 19 def __init__(self, realname, filename, parser, editor, backup, add2archive):
20 basename, ext = os.path.splitext(filename)
21 self.output = basename + '.cleaned.' + ext
20 self.filename = filename #path + filename 22 self.filename = filename #path + filename
21 self.realname = realname #path + filename 23 self.realname = realname #path + filename
22 self.shortname = os.path.basename(filename) #only filename 24 self.shortname = os.path.basename(filename) #only filename
@@ -41,10 +43,8 @@ class Generic_parser(object):
41 for field in self.editor: 43 for field in self.editor:
42 if self._should_remove(field): 44 if self._should_remove(field):
43 self._remove(field.name) 45 self._remove(field.name)
44 hachoir_core.field.writeIntoFile(self.editor, self.filename + POSTFIX) 46 hachoir_core.field.writeIntoFile(self.editor, self.output)
45 if self.backup is False: 47 self.do_backup()
46 mat.secure_remove(self.filename) #remove the old file
47 os.rename(self.filename+ POSTFIX, self.filename) #rename the new
48 48
49 def remove_all_ugly(self): 49 def remove_all_ugly(self):
50 ''' 50 '''
@@ -73,7 +73,7 @@ class Generic_parser(object):
73 try: 73 try:
74 metadata[field.name] = field.value 74 metadata[field.name] = field.value
75 except: 75 except:
76 metadata[field.name] = "harmful content" 76 metadata[field.name] = 'harmful content'
77 return metadata 77 return metadata
78 78
79 def _should_remove(self, key): 79 def _should_remove(self, key):
@@ -82,3 +82,11 @@ class Generic_parser(object):
82 abstract method 82 abstract method
83 ''' 83 '''
84 raise NotImplementedError() 84 raise NotImplementedError()
85
86 def do_backup(self):
87 '''
88 Do a backup of the file if asked
89 '''
90 if self.backup is False:
91 mat.secure_remove(self.filename)
92 os.rename(self.output, self.filename)