diff options
| author | jvoisin | 2011-10-01 20:24:59 +0200 |
|---|---|---|
| committer | jvoisin | 2011-10-01 20:24:59 +0200 |
| commit | be6becb15e24d7044fc05a236f288b141fa46a92 (patch) | |
| tree | cc003853d0c5ad96c2f60361e50f747824a9189d | |
| parent | 33e441ce348e728f218c74cf527f0cd3e949ab73 (diff) | |
Remove internal pdfrw
| -rw-r--r-- | README | 1 | ||||
| -rw-r--r-- | mat/office.py | 82 |
2 files changed, 43 insertions, 40 deletions
| @@ -23,6 +23,7 @@ WARNING : | |||
| 23 | DEPENDENCIES: | 23 | DEPENDENCIES: |
| 24 | python2.6 (at least) | 24 | python2.6 (at least) |
| 25 | python-hachoir-core and python-hachoir-parser | 25 | python-hachoir-core and python-hachoir-parser |
| 26 | python-pdfrw or exiftool for full pdf support | ||
| 26 | shred (should be already installed) | 27 | shred (should be already installed) |
| 27 | 28 | ||
| 28 | 29 | ||
diff --git a/mat/office.py b/mat/office.py index 0b36fe7..b8a235f 100644 --- a/mat/office.py +++ b/mat/office.py | |||
| @@ -6,6 +6,7 @@ import os | |||
| 6 | import logging | 6 | import logging |
| 7 | import zipfile | 7 | import zipfile |
| 8 | import fileinput | 8 | import fileinput |
| 9 | import subprocess | ||
| 9 | 10 | ||
| 10 | try: | 11 | try: |
| 11 | import cairo | 12 | import cairo |
| @@ -16,7 +17,6 @@ except ImportError: | |||
| 16 | import mat | 17 | import mat |
| 17 | import parser | 18 | import parser |
| 18 | import archive | 19 | import archive |
| 19 | import pdfrw | ||
| 20 | 20 | ||
| 21 | 21 | ||
| 22 | class OpenDocumentStripper(archive.GenericArchiveStripper): | 22 | class OpenDocumentStripper(archive.GenericArchiveStripper): |
| @@ -120,45 +120,27 @@ class PdfStripper(parser.GenericParser): | |||
| 120 | self.password = None | 120 | self.password = None |
| 121 | self.document = poppler.document_new_from_file(uri, self.password) | 121 | self.document = poppler.document_new_from_file(uri, self.password) |
| 122 | self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator', | 122 | self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator', |
| 123 | 'producer', 'creation-date', 'mod-date', 'metadata') | 123 | 'producer', 'metadata') |
| 124 | 124 | ||
| 125 | def is_clean(self): | 125 | def is_clean(self): |
| 126 | ''' | 126 | ''' |
| 127 | Check if the file is clean from harmful metadatas | 127 | Check if the file is clean from harmful metadatas |
| 128 | ''' | 128 | ''' |
| 129 | for key in self.meta_list: | 129 | for key in self.meta_list: |
| 130 | if key == 'creation-date' or key == 'mod-date': | 130 | if self.document.get_property(key) is not None and \ |
| 131 | if self.document.get_property(key) != -1: | ||
| 132 | return False | ||
| 133 | elif self.document.get_property(key) is not None and \ | ||
| 134 | self.document.get_property(key) != '': | 131 | self.document.get_property(key) != '': |
| 135 | return False | 132 | return False |
| 136 | return True | 133 | return True |
| 137 | 134 | ||
| 138 | def remove_all_ugly(self): | ||
| 139 | page = self.document.get_page(0) | ||
| 140 | page_width, page_height = page.get_size() | ||
| 141 | surface = cairo.PDFSurface(self.output, page_width, page_height) | ||
| 142 | context = cairo.Context(surface) # context draws on the surface | ||
| 143 | logging.debug('Pdf rendering of %s' % self.filename) | ||
| 144 | for pagenum in xrange(self.document.get_n_pages()): | ||
| 145 | page = self.document.get_page(pagenum) | ||
| 146 | context.translate(0, 0) | ||
| 147 | page.render(context) # render the page on context | ||
| 148 | context.show_page() # draw context on surface | ||
| 149 | surface.finish() | ||
| 150 | 135 | ||
| 151 | #For now, poppler cannot write meta, so we must use pdfrw | 136 | def remove_all(self): |
| 152 | logging.debug('Removing %s\'s superficial metadata' % self.filename) | 137 | ''' |
| 153 | trailer = pdfrw.PdfReader(self.output) | 138 | Remove superficial metadata |
| 154 | trailer.Info.Producer = trailer.Info.Creator = None | 139 | ''' |
| 155 | writer = pdfrw.PdfWriter() | 140 | self._remove_superficial_meta() |
| 156 | writer.trailer = trailer | ||
| 157 | writer.write(self.output) | ||
| 158 | self.do_backup() | ||
| 159 | 141 | ||
| 160 | 142 | ||
| 161 | def remove_all(self): | 143 | def remove_all_ugly(self): |
| 162 | ''' | 144 | ''' |
| 163 | Opening the pdf with poppler, then doing a render | 145 | Opening the pdf with poppler, then doing a render |
| 164 | on a cairo pdfsurface for each pages. | 146 | on a cairo pdfsurface for each pages. |
| @@ -177,15 +159,39 @@ class PdfStripper(parser.GenericParser): | |||
| 177 | page.render(context) # render the page on context | 159 | page.render(context) # render the page on context |
| 178 | context.show_page() # draw context on surface | 160 | context.show_page() # draw context on surface |
| 179 | surface.finish() | 161 | surface.finish() |
| 162 | self._remove_superficial_meta() | ||
| 180 | 163 | ||
| 181 | #For now, poppler cannot write meta, so we must use pdfrw | 164 | def _remove_superficial_meta(self): |
| 182 | logging.debug('Removing %s\'s superficial metadata' % self.filename) | 165 | ''' |
| 183 | trailer = pdfrw.PdfReader(self.output) | 166 | Remove superficial/external metadata |
| 184 | trailer.Info.Producer = trailer.Info.Creator = None | 167 | from a pdf file, using exiftool, |
| 185 | writer = pdfrw.PdfWriter() | 168 | or pdfrw if exiftool is not installed |
| 186 | writer.trailer = trailer | 169 | ''' |
| 187 | writer.write(self.output) | 170 | try: |
| 188 | self.do_backup() | 171 | import exiftool |
| 172 | if self.backup: | ||
| 173 | process = subprocess.Popen(['exiftool', '-all=', | ||
| 174 | '-o %s' % self.output, self.filename], | ||
| 175 | stdout=open('/dev/null')) | ||
| 176 | process.wait() | ||
| 177 | else: | ||
| 178 | process = subprocess.Popen(['exiftool', '-overwrite_original', | ||
| 179 | '-all=', self.filename], stdout=open('/dev/null')) | ||
| 180 | process.wait() | ||
| 181 | except: | ||
| 182 | try: | ||
| 183 | import pdfrw | ||
| 184 | #For now, poppler cannot write meta, so we must use pdfrw | ||
| 185 | logging.debug('Removing %s\'s superficial metadata' % self.filename) | ||
| 186 | trailer = pdfrw.PdfReader(self.output) | ||
| 187 | trailer.Info.Producer = trailer.Info.Creator = None | ||
| 188 | writer = pdfrw.PdfWriter() | ||
| 189 | writer.trailer = trailer | ||
| 190 | writer.write(self.output) | ||
| 191 | self.do_backup() | ||
| 192 | except: | ||
| 193 | logging.error('You don\'t have either python-pdfrw, or\ | ||
| 194 | exiftool: processed pdf are not totally clean !') | ||
| 189 | 195 | ||
| 190 | def get_meta(self): | 196 | def get_meta(self): |
| 191 | ''' | 197 | ''' |
| @@ -193,11 +199,7 @@ class PdfStripper(parser.GenericParser): | |||
| 193 | ''' | 199 | ''' |
| 194 | metadata = {} | 200 | metadata = {} |
| 195 | for key in self.meta_list: | 201 | for key in self.meta_list: |
| 196 | if key == 'creation-date' or key == 'mod-date': | 202 | if self.document.get_property(key) is not None and \ |
| 197 | #creation and modification are set to -1 | ||
| 198 | if self.document.get_property(key) != -1: | ||
| 199 | metadata[key] = self.document.get_property(key) | ||
| 200 | elif self.document.get_property(key) is not None and \ | ||
| 201 | self.document.get_property(key) != '': | 203 | self.document.get_property(key) != '': |
| 202 | metadata[key] = self.document.get_property(key) | 204 | metadata[key] = self.document.get_property(key) |
| 203 | return metadata | 205 | return metadata |
