summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/archive.py31
-rw-r--r--lib/mat.py2
-rw-r--r--lib/office.py68
-rw-r--r--lib/parser.py10
4 files changed, 22 insertions, 89 deletions
diff --git a/lib/archive.py b/lib/archive.py
index 9993102..a749b29 100644
--- a/lib/archive.py
+++ b/lib/archive.py
@@ -36,22 +36,9 @@ class GenericArchiveStripper(parser.GenericParser):
36 shutil.rmtree(self.tempdir) 36 shutil.rmtree(self.tempdir)
37 37
38 def remove_all(self): 38 def remove_all(self):
39 ''' 39 return self._remove_all()
40 Call _remove_all() with in argument : "normal"
41 '''
42 return self._remove_all('normal')
43 40
44 def remove_all_strict(self): 41 def _remove_all(self):
45 '''
46 call remove_all() with in argument : "strict"
47 '''
48 return self._remove_all('strict')
49
50 def _remove_all(self, method):
51 '''
52 Remove all meta, normal way if method is "normal",
53 else, use the strict way (with possible data loss)
54 '''
55 raise NotImplementedError 42 raise NotImplementedError
56 43
57 44
@@ -127,7 +114,7 @@ harmless format' % item.filename)
127 zipin.close() 114 zipin.close()
128 return metadata 115 return metadata
129 116
130 def _remove_all(self, method): 117 def _remove_all(self):
131 ''' 118 '''
132 So far, the zipfile module does not allow to write a ZipInfo 119 So far, the zipfile module does not allow to write a ZipInfo
133 object into a zipfile (and it's a shame !) : so data added 120 object into a zipfile (and it's a shame !) : so data added
@@ -143,10 +130,7 @@ harmless format' % item.filename)
143 try: 130 try:
144 cfile = mat.create_class_file(name, False, 131 cfile = mat.create_class_file(name, False,
145 self.add2archive) 132 self.add2archive)
146 if method is 'normal': 133 cfile.remove_all()
147 cfile.remove_all()
148 else:
149 cfile.remove_all_strict()
150 logging.debug('Processing %s from %s' % (item.filename, 134 logging.debug('Processing %s from %s' % (item.filename,
151 self.filename)) 135 self.filename))
152 zipout.write(name, item.filename) 136 zipout.write(name, item.filename)
@@ -179,7 +163,7 @@ class TarStripper(GenericArchiveStripper):
179 current_file.gname = '' 163 current_file.gname = ''
180 return current_file 164 return current_file
181 165
182 def _remove_all(self, method): 166 def _remove_all(self):
183 tarin = tarfile.open(self.filename, 'r' + self.compression) 167 tarin = tarfile.open(self.filename, 'r' + self.compression)
184 tarout = tarfile.open(self.output, 'w' + self.compression) 168 tarout = tarfile.open(self.output, 'w' + self.compression)
185 for item in tarin.getmembers(): 169 for item in tarin.getmembers():
@@ -190,10 +174,7 @@ class TarStripper(GenericArchiveStripper):
190 try: 174 try:
191 cfile = mat.create_class_file(name, False, 175 cfile = mat.create_class_file(name, False,
192 self.add2archive) 176 self.add2archive)
193 if method is 'normal': 177 cfile.remove_all()
194 cfile.remove_all()
195 else:
196 cfile.remove_all_strict()
197 tarout.add(name, item.name, filter=self._remove) 178 tarout.add(name, item.name, filter=self._remove)
198 except: 179 except:
199 logging.info('%s\' format is not supported or harmless' % 180 logging.info('%s\' format is not supported or harmless' %
diff --git a/lib/mat.py b/lib/mat.py
index 53d02d8..dfcfc57 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -24,7 +24,7 @@ hachoir_core.config.quiet = True
24fname = '' 24fname = ''
25 25
26#Verbose 26#Verbose
27#LOGGING_LEVEL = logging.DEBUG 27LOGGING_LEVEL = logging.DEBUG
28#hachoir_core.config.quiet = False 28#hachoir_core.config.quiet = False
29#logname = 'report.log' 29#logname = 'report.log'
30 30
diff --git a/lib/office.py b/lib/office.py
index e1d738e..82b817e 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -49,7 +49,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
49 logging.debug('%s has no opendocument metadata' % self.filename) 49 logging.debug('%s has no opendocument metadata' % self.filename)
50 return metadata 50 return metadata
51 51
52 def _remove_all(self, method): 52 def _remove_all(self):
53 ''' 53 '''
54 FIXME ? 54 FIXME ?
55 There is a patch implementing the Zipfile.remove() 55 There is a patch implementing the Zipfile.remove()
@@ -84,10 +84,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
84 try: 84 try:
85 cfile = mat.create_class_file(name, False, 85 cfile = mat.create_class_file(name, False,
86 self.add2archive) 86 self.add2archive)
87 if method == 'normal': 87 cfile.remove_all()
88 cfile.remove_all()
89 else:
90 cfile.remove_all_strict()
91 logging.debug('Processing %s from %s' % (item, 88 logging.debug('Processing %s from %s' % (item,
92 self.filename)) 89 self.filename))
93 zipout.write(name, item) 90 zipout.write(name, item)
@@ -137,20 +134,17 @@ class PdfStripper(parser.GenericParser):
137 Check if the file is clean from harmful metadatas 134 Check if the file is clean from harmful metadatas
138 ''' 135 '''
139 for key in self.meta_list: 136 for key in self.meta_list:
140 if self.document.get_property(key) is not None and \ 137 if self.document.get_property(key) != None:
141 self.document.get_property(key) != '':
142 return False 138 return False
143 return True 139 return True
144 140
145
146 def remove_all(self): 141 def remove_all(self):
147 ''' 142 '''
148 Remove supperficial 143 Remove supperficial
149 ''' 144 '''
150 return self._remove_meta() 145 return self._remove_meta()
151 146
152 147 def _remove_meta(self):
153 def remove_all_strict(self):
154 ''' 148 '''
155 Opening the PDF with poppler, then doing a render 149 Opening the PDF with poppler, then doing a render
156 on a cairo pdfsurface for each pages. 150 on a cairo pdfsurface for each pages.
@@ -166,54 +160,26 @@ class PdfStripper(parser.GenericParser):
166 for pagenum in xrange(self.document.get_n_pages()): 160 for pagenum in xrange(self.document.get_n_pages()):
167 page = self.document.get_page(pagenum) 161 page = self.document.get_page(pagenum)
168 context.translate(0, 0) 162 context.translate(0, 0)
169 page.render(context) # render the page on context 163 page.render_for_printing(context) # render the page on context
170 context.show_page() # draw context on surface 164 context.show_page() # draw context on surface
171 surface.finish() 165 surface.finish()
172 return self._remove_meta()
173 166
174 def _remove_meta(self): 167 try:
175 '''
176 Remove superficial/external metadata
177 from a PDF file, using exiftool,
178 of pdfrw if exiftool is not installed
179 '''
180 processed = False
181 try:# try with pdfrw
182 import pdfrw 168 import pdfrw
183 #For now, poppler cannot write meta, so we must use pdfrw 169 #For now, poppler cannot write meta, so we must use pdfrw
184 logging.debug('Removing %s\'s superficial metadata' % self.filename) 170 logging.debug('Removing %s\'s superficial metadata' % self.filename)
185 trailer = pdfrw.PdfReader(self.output) 171 trailer = pdfrw.PdfReader(self.output)
186 trailer.Info.Producer = trailer.Author = trailer.Info.Creator = None 172 trailer.Info.Producer = None
173 trailer.Info.Creator = None
187 writer = pdfrw.PdfWriter() 174 writer = pdfrw.PdfWriter()
188 writer.trailer = trailer 175 writer.trailer = trailer
189 writer.write(self.output) 176 writer.write(self.output)
190 self.do_backup() 177 self.do_backup()
191 processed = True 178 return True
192 except:
193 pass
194
195 try: # try with exiftool
196 subprocess.Popen('exiftool', stdout=open('/dev/null'))
197 import exiftool
198 # Note: '-All=' must be followed by a known exiftool option.
199 if self.backup:
200 process = subprocess.Popen(['exiftool', '-m', '-All=',
201 '-out', self.output, self.filename], stdout=open('/dev/null'))
202 process.wait()
203 else:
204 # Note: '-All=' must be followed by a known exiftool option.
205 process = subprocess.Popen(
206 ['exiftool', '-All=', '-overwrite_original', self.filename],
207 stdout=open('/dev/null'))
208 process.wait()
209 processed = True
210 except: 179 except:
211 pass 180 print('Unable to remove all metadata from %s, please install\
212 181 pdfrw' % self.output)
213 if processed is False: 182 return False
214 logging.error('Please install either pdfrw, or exiftool to\
215 fully handle PDF files')
216 return processed
217 183
218 def get_meta(self): 184 def get_meta(self):
219 ''' 185 '''
@@ -221,8 +187,7 @@ class PdfStripper(parser.GenericParser):
221 ''' 187 '''
222 metadata = {} 188 metadata = {}
223 for key in self.meta_list: 189 for key in self.meta_list:
224 if self.document.get_property(key) is not None and \ 190 if self.document.get_property(key) is not None:
225 self.document.get_property(key) != '':
226 metadata[key] = self.document.get_property(key) 191 metadata[key] = self.document.get_property(key)
227 return metadata 192 return metadata
228 193
@@ -234,7 +199,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
234 It contains mostly xml, but can have media blobs, crap, ... 199 It contains mostly xml, but can have media blobs, crap, ...
235 (I don't like this format.) 200 (I don't like this format.)
236 ''' 201 '''
237 def _remove_all(self, method): 202 def _remove_all(self):
238 ''' 203 '''
239 FIXME ? 204 FIXME ?
240 There is a patch implementing the Zipfile.remove() 205 There is a patch implementing the Zipfile.remove()
@@ -258,10 +223,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
258 try: 223 try:
259 cfile = mat.create_class_file(name, False, 224 cfile = mat.create_class_file(name, False,
260 self.add2archive) 225 self.add2archive)
261 if method == 'normal': 226 cfile.remove_all()
262 cfile.remove_all()
263 else:
264 cfile.remove_all_strict()
265 logging.debug('Processing %s from %s' % (item, 227 logging.debug('Processing %s from %s' % (item,
266 self.filename)) 228 self.filename))
267 zipout.write(name, item) 229 zipout.write(name, item)
diff --git a/lib/parser.py b/lib/parser.py
index 6dc5d0b..d2eaf9c 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -78,16 +78,6 @@ class GenericParser(object):
78 except: 78 except:
79 return False 79 return False
80 80
81 def remove_all_strict(self):
82 '''
83 If the remove_all() is not efficient enough,
84 this method is implemented :
85 It is efficient, but destructive.
86 In a perfect world, with nice fileformat,
87 this method would not exist.
88 '''
89 self.remove_all()
90
91 def _remove(self, fieldset, field): 81 def _remove(self, fieldset, field):
92 ''' 82 '''
93 Delete the given field 83 Delete the given field