diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/archive.py | 31 | ||||
| -rw-r--r-- | lib/mat.py | 2 | ||||
| -rw-r--r-- | lib/office.py | 68 | ||||
| -rw-r--r-- | lib/parser.py | 10 |
4 files changed, 22 insertions, 89 deletions
diff --git a/lib/archive.py b/lib/archive.py index 9993102..a749b29 100644 --- a/lib/archive.py +++ b/lib/archive.py | |||
| @@ -36,22 +36,9 @@ class GenericArchiveStripper(parser.GenericParser): | |||
| 36 | shutil.rmtree(self.tempdir) | 36 | shutil.rmtree(self.tempdir) |
| 37 | 37 | ||
| 38 | def remove_all(self): | 38 | def remove_all(self): |
| 39 | ''' | 39 | return self._remove_all() |
| 40 | Call _remove_all() with in argument : "normal" | ||
| 41 | ''' | ||
| 42 | return self._remove_all('normal') | ||
| 43 | 40 | ||
| 44 | def remove_all_strict(self): | 41 | def _remove_all(self): |
| 45 | ''' | ||
| 46 | call remove_all() with in argument : "strict" | ||
| 47 | ''' | ||
| 48 | return self._remove_all('strict') | ||
| 49 | |||
| 50 | def _remove_all(self, method): | ||
| 51 | ''' | ||
| 52 | Remove all meta, normal way if method is "normal", | ||
| 53 | else, use the strict way (with possible data loss) | ||
| 54 | ''' | ||
| 55 | raise NotImplementedError | 42 | raise NotImplementedError |
| 56 | 43 | ||
| 57 | 44 | ||
| @@ -127,7 +114,7 @@ harmless format' % item.filename) | |||
| 127 | zipin.close() | 114 | zipin.close() |
| 128 | return metadata | 115 | return metadata |
| 129 | 116 | ||
| 130 | def _remove_all(self, method): | 117 | def _remove_all(self): |
| 131 | ''' | 118 | ''' |
| 132 | So far, the zipfile module does not allow to write a ZipInfo | 119 | So far, the zipfile module does not allow to write a ZipInfo |
| 133 | object into a zipfile (and it's a shame !) : so data added | 120 | object into a zipfile (and it's a shame !) : so data added |
| @@ -143,10 +130,7 @@ harmless format' % item.filename) | |||
| 143 | try: | 130 | try: |
| 144 | cfile = mat.create_class_file(name, False, | 131 | cfile = mat.create_class_file(name, False, |
| 145 | self.add2archive) | 132 | self.add2archive) |
| 146 | if method is 'normal': | 133 | cfile.remove_all() |
| 147 | cfile.remove_all() | ||
| 148 | else: | ||
| 149 | cfile.remove_all_strict() | ||
| 150 | logging.debug('Processing %s from %s' % (item.filename, | 134 | logging.debug('Processing %s from %s' % (item.filename, |
| 151 | self.filename)) | 135 | self.filename)) |
| 152 | zipout.write(name, item.filename) | 136 | zipout.write(name, item.filename) |
| @@ -179,7 +163,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 179 | current_file.gname = '' | 163 | current_file.gname = '' |
| 180 | return current_file | 164 | return current_file |
| 181 | 165 | ||
| 182 | def _remove_all(self, method): | 166 | def _remove_all(self): |
| 183 | tarin = tarfile.open(self.filename, 'r' + self.compression) | 167 | tarin = tarfile.open(self.filename, 'r' + self.compression) |
| 184 | tarout = tarfile.open(self.output, 'w' + self.compression) | 168 | tarout = tarfile.open(self.output, 'w' + self.compression) |
| 185 | for item in tarin.getmembers(): | 169 | for item in tarin.getmembers(): |
| @@ -190,10 +174,7 @@ class TarStripper(GenericArchiveStripper): | |||
| 190 | try: | 174 | try: |
| 191 | cfile = mat.create_class_file(name, False, | 175 | cfile = mat.create_class_file(name, False, |
| 192 | self.add2archive) | 176 | self.add2archive) |
| 193 | if method is 'normal': | 177 | cfile.remove_all() |
| 194 | cfile.remove_all() | ||
| 195 | else: | ||
| 196 | cfile.remove_all_strict() | ||
| 197 | tarout.add(name, item.name, filter=self._remove) | 178 | tarout.add(name, item.name, filter=self._remove) |
| 198 | except: | 179 | except: |
| 199 | logging.info('%s\' format is not supported or harmless' % | 180 | logging.info('%s\' format is not supported or harmless' % |
diff --git a/lib/mat.py b/lib/mat.py --- a/lib/mat.py +++ b/lib/mat.py | |||
| @@ -24,7 +24,7 @@ hachoir_core.config.quiet = True | |||
| 24 | fname = '' | 24 | fname = '' |
| 25 | 25 | ||
| 26 | #Verbose | 26 | #Verbose |
| 27 | #LOGGING_LEVEL = logging.DEBUG | 27 | LOGGING_LEVEL = logging.DEBUG |
| 28 | #hachoir_core.config.quiet = False | 28 | #hachoir_core.config.quiet = False |
| 29 | #logname = 'report.log' | 29 | #logname = 'report.log' |
| 30 | 30 | ||
diff --git a/lib/office.py b/lib/office.py index e1d738e..82b817e 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -49,7 +49,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 49 | logging.debug('%s has no opendocument metadata' % self.filename) | 49 | logging.debug('%s has no opendocument metadata' % self.filename) |
| 50 | return metadata | 50 | return metadata |
| 51 | 51 | ||
| 52 | def _remove_all(self, method): | 52 | def _remove_all(self): |
| 53 | ''' | 53 | ''' |
| 54 | FIXME ? | 54 | FIXME ? |
| 55 | There is a patch implementing the Zipfile.remove() | 55 | There is a patch implementing the Zipfile.remove() |
| @@ -84,10 +84,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 84 | try: | 84 | try: |
| 85 | cfile = mat.create_class_file(name, False, | 85 | cfile = mat.create_class_file(name, False, |
| 86 | self.add2archive) | 86 | self.add2archive) |
| 87 | if method == 'normal': | 87 | cfile.remove_all() |
| 88 | cfile.remove_all() | ||
| 89 | else: | ||
| 90 | cfile.remove_all_strict() | ||
| 91 | logging.debug('Processing %s from %s' % (item, | 88 | logging.debug('Processing %s from %s' % (item, |
| 92 | self.filename)) | 89 | self.filename)) |
| 93 | zipout.write(name, item) | 90 | zipout.write(name, item) |
| @@ -137,20 +134,17 @@ class PdfStripper(parser.GenericParser): | |||
| 137 | Check if the file is clean from harmful metadatas | 134 | Check if the file is clean from harmful metadatas |
| 138 | ''' | 135 | ''' |
| 139 | for key in self.meta_list: | 136 | for key in self.meta_list: |
| 140 | if self.document.get_property(key) is not None and \ | 137 | if self.document.get_property(key) != None: |
| 141 | self.document.get_property(key) != '': | ||
| 142 | return False | 138 | return False |
| 143 | return True | 139 | return True |
| 144 | 140 | ||
| 145 | |||
| 146 | def remove_all(self): | 141 | def remove_all(self): |
| 147 | ''' | 142 | ''' |
| 148 | Remove supperficial | 143 | Remove supperficial |
| 149 | ''' | 144 | ''' |
| 150 | return self._remove_meta() | 145 | return self._remove_meta() |
| 151 | 146 | ||
| 152 | 147 | def _remove_meta(self): | |
| 153 | def remove_all_strict(self): | ||
| 154 | ''' | 148 | ''' |
| 155 | Opening the PDF with poppler, then doing a render | 149 | Opening the PDF with poppler, then doing a render |
| 156 | on a cairo pdfsurface for each pages. | 150 | on a cairo pdfsurface for each pages. |
| @@ -166,54 +160,26 @@ class PdfStripper(parser.GenericParser): | |||
| 166 | for pagenum in xrange(self.document.get_n_pages()): | 160 | for pagenum in xrange(self.document.get_n_pages()): |
| 167 | page = self.document.get_page(pagenum) | 161 | page = self.document.get_page(pagenum) |
| 168 | context.translate(0, 0) | 162 | context.translate(0, 0) |
| 169 | page.render(context) # render the page on context | 163 | page.render_for_printing(context) # render the page on context |
| 170 | context.show_page() # draw context on surface | 164 | context.show_page() # draw context on surface |
| 171 | surface.finish() | 165 | surface.finish() |
| 172 | return self._remove_meta() | ||
| 173 | 166 | ||
| 174 | def _remove_meta(self): | 167 | try: |
| 175 | ''' | ||
| 176 | Remove superficial/external metadata | ||
| 177 | from a PDF file, using exiftool, | ||
| 178 | of pdfrw if exiftool is not installed | ||
| 179 | ''' | ||
| 180 | processed = False | ||
| 181 | try:# try with pdfrw | ||
| 182 | import pdfrw | 168 | import pdfrw |
| 183 | #For now, poppler cannot write meta, so we must use pdfrw | 169 | #For now, poppler cannot write meta, so we must use pdfrw |
| 184 | logging.debug('Removing %s\'s superficial metadata' % self.filename) | 170 | logging.debug('Removing %s\'s superficial metadata' % self.filename) |
| 185 | trailer = pdfrw.PdfReader(self.output) | 171 | trailer = pdfrw.PdfReader(self.output) |
| 186 | trailer.Info.Producer = trailer.Author = trailer.Info.Creator = None | 172 | trailer.Info.Producer = None |
| 173 | trailer.Info.Creator = None | ||
| 187 | writer = pdfrw.PdfWriter() | 174 | writer = pdfrw.PdfWriter() |
| 188 | writer.trailer = trailer | 175 | writer.trailer = trailer |
| 189 | writer.write(self.output) | 176 | writer.write(self.output) |
| 190 | self.do_backup() | 177 | self.do_backup() |
| 191 | processed = True | 178 | return True |
| 192 | except: | ||
| 193 | pass | ||
| 194 | |||
| 195 | try: # try with exiftool | ||
| 196 | subprocess.Popen('exiftool', stdout=open('/dev/null')) | ||
| 197 | import exiftool | ||
| 198 | # Note: '-All=' must be followed by a known exiftool option. | ||
| 199 | if self.backup: | ||
| 200 | process = subprocess.Popen(['exiftool', '-m', '-All=', | ||
| 201 | '-out', self.output, self.filename], stdout=open('/dev/null')) | ||
| 202 | process.wait() | ||
| 203 | else: | ||
| 204 | # Note: '-All=' must be followed by a known exiftool option. | ||
| 205 | process = subprocess.Popen( | ||
| 206 | ['exiftool', '-All=', '-overwrite_original', self.filename], | ||
| 207 | stdout=open('/dev/null')) | ||
| 208 | process.wait() | ||
| 209 | processed = True | ||
| 210 | except: | 179 | except: |
| 211 | pass | 180 | print('Unable to remove all metadata from %s, please install\ |
| 212 | 181 | pdfrw' % self.output) | |
| 213 | if processed is False: | 182 | return False |
| 214 | logging.error('Please install either pdfrw, or exiftool to\ | ||
| 215 | fully handle PDF files') | ||
| 216 | return processed | ||
| 217 | 183 | ||
| 218 | def get_meta(self): | 184 | def get_meta(self): |
| 219 | ''' | 185 | ''' |
| @@ -221,8 +187,7 @@ class PdfStripper(parser.GenericParser): | |||
| 221 | ''' | 187 | ''' |
| 222 | metadata = {} | 188 | metadata = {} |
| 223 | for key in self.meta_list: | 189 | for key in self.meta_list: |
| 224 | if self.document.get_property(key) is not None and \ | 190 | if self.document.get_property(key) is not None: |
| 225 | self.document.get_property(key) != '': | ||
| 226 | metadata[key] = self.document.get_property(key) | 191 | metadata[key] = self.document.get_property(key) |
| 227 | return metadata | 192 | return metadata |
| 228 | 193 | ||
| @@ -234,7 +199,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): | |||
| 234 | It contains mostly xml, but can have media blobs, crap, ... | 199 | It contains mostly xml, but can have media blobs, crap, ... |
| 235 | (I don't like this format.) | 200 | (I don't like this format.) |
| 236 | ''' | 201 | ''' |
| 237 | def _remove_all(self, method): | 202 | def _remove_all(self): |
| 238 | ''' | 203 | ''' |
| 239 | FIXME ? | 204 | FIXME ? |
| 240 | There is a patch implementing the Zipfile.remove() | 205 | There is a patch implementing the Zipfile.remove() |
| @@ -258,10 +223,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): | |||
| 258 | try: | 223 | try: |
| 259 | cfile = mat.create_class_file(name, False, | 224 | cfile = mat.create_class_file(name, False, |
| 260 | self.add2archive) | 225 | self.add2archive) |
| 261 | if method == 'normal': | 226 | cfile.remove_all() |
| 262 | cfile.remove_all() | ||
| 263 | else: | ||
| 264 | cfile.remove_all_strict() | ||
| 265 | logging.debug('Processing %s from %s' % (item, | 227 | logging.debug('Processing %s from %s' % (item, |
| 266 | self.filename)) | 228 | self.filename)) |
| 267 | zipout.write(name, item) | 229 | zipout.write(name, item) |
diff --git a/lib/parser.py b/lib/parser.py index 6dc5d0b..d2eaf9c 100644 --- a/lib/parser.py +++ b/lib/parser.py | |||
| @@ -78,16 +78,6 @@ class GenericParser(object): | |||
| 78 | except: | 78 | except: |
| 79 | return False | 79 | return False |
| 80 | 80 | ||
| 81 | def remove_all_strict(self): | ||
| 82 | ''' | ||
| 83 | If the remove_all() is not efficient enough, | ||
| 84 | this method is implemented : | ||
| 85 | It is efficient, but destructive. | ||
| 86 | In a perfect world, with nice fileformat, | ||
| 87 | this method would not exist. | ||
| 88 | ''' | ||
| 89 | self.remove_all() | ||
| 90 | |||
| 91 | def _remove(self, fieldset, field): | 81 | def _remove(self, fieldset, field): |
| 92 | ''' | 82 | ''' |
| 93 | Delete the given field | 83 | Delete the given field |
