diff options
| -rw-r--r-- | lib/office.py | 22 |
1 files changed, 10 insertions, 12 deletions
diff --git a/lib/office.py b/lib/office.py index 82b817e..8350244 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -19,6 +19,7 @@ import mat | |||
| 19 | import parser | 19 | import parser |
| 20 | import archive | 20 | import archive |
| 21 | 21 | ||
| 22 | |||
| 22 | class OpenDocumentStripper(archive.GenericArchiveStripper): | 23 | class OpenDocumentStripper(archive.GenericArchiveStripper): |
| 23 | ''' | 24 | ''' |
| 24 | An open document file is a zip archive containing XML files. | 25 | An open document file is a zip archive containing XML files. |
| @@ -126,21 +127,21 @@ class PdfStripper(parser.GenericParser): | |||
| 126 | uri = 'file://' + os.path.abspath(self.filename) | 127 | uri = 'file://' + os.path.abspath(self.filename) |
| 127 | self.password = None | 128 | self.password = None |
| 128 | self.document = poppler.document_new_from_file(uri, self.password) | 129 | self.document = poppler.document_new_from_file(uri, self.password) |
| 129 | self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator', | 130 | self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator', |
| 130 | 'producer', 'metadata') | 131 | 'producer', 'metadata']) |
| 131 | 132 | ||
| 132 | def is_clean(self): | 133 | def is_clean(self): |
| 133 | ''' | 134 | ''' |
| 134 | Check if the file is clean of harmful metadata | 135 | Check if the file is clean of harmful metadata |
| 135 | ''' | 136 | ''' |
| 136 | for key in self.meta_list: | 137 | for key in self.meta_list: |
| 137 | if self.document.get_property(key) != None: | 138 | if self.document.get_property(key): |
| 138 | return False | 139 | return False |
| 139 | return True | 140 | return True |
| 140 | 141 | ||
| 141 | def remove_all(self): | 142 | def remove_all(self): |
| 142 | ''' | 143 | ''' |
| 143 | Remove supperficial | 144 | Remove metadata |
| 144 | ''' | 145 | ''' |
| 145 | return self._remove_meta() | 146 | return self._remove_meta() |
| 146 | 147 | ||
| @@ -148,11 +149,12 @@ class PdfStripper(parser.GenericParser): | |||
| 148 | ''' | 149 | ''' |
| 149 | Opening the PDF with poppler, then doing a render | 150 | Opening the PDF with poppler, then doing a render |
| 150 | on a cairo pdfsurface for each page. | 151 | on a cairo pdfsurface for each page. |
| 151 | Thanks to Lunar^for the idea. | 152 | |
| 152 | http://cairographics.org/documentation/pycairo/2/ | 153 | http://cairographics.org/documentation/pycairo/2/ |
| 153 | python-poppler is not documented at all : have fun ;) | 154 | python-poppler is not documented at all : have fun ;) |
| 154 | ''' | 155 | ''' |
| 155 | page = self.document.get_page(0) | 156 | page = self.document.get_page(0) |
| 157 | # assume that all pages are the same size | |
| 156 | page_width, page_height = page.get_size() | 158 | page_width, page_height = page.get_size() |
| 157 | surface = cairo.PDFSurface(self.output, page_width, page_height) | 159 | surface = cairo.PDFSurface(self.output, page_width, page_height) |
| 158 | context = cairo.Context(surface) # context draws on the surface | 160 | context = cairo.Context(surface) # context draws on the surface |
| @@ -165,8 +167,7 @@ class PdfStripper(parser.GenericParser): | |||
| 165 | surface.finish() | 167 | surface.finish() |
| 166 | 168 | ||
| 167 | try: | 169 | try: |
| 168 | import pdfrw | 170 | import pdfrw # For now, poppler cannot write meta, so we must use pdfrw |
| 169 | #For now, poppler cannot write meta, so we must use pdfrw | ||
| 170 | logging.debug('Removing %s\'s superficial metadata' % self.filename) | 171 | logging.debug('Removing %s\'s superficial metadata' % self.filename) |
| 171 | trailer = pdfrw.PdfReader(self.output) | 172 | trailer = pdfrw.PdfReader(self.output) |
| 172 | trailer.Info.Producer = None | 173 | trailer.Info.Producer = None |
| @@ -187,7 +188,7 @@ class PdfStripper(parser.GenericParser): | |||
| 187 | ''' | 188 | ''' |
| 188 | metadata = {} | 189 | metadata = {} |
| 189 | for key in self.meta_list: | 190 | for key in self.meta_list: |
| 190 | if self.document.get_property(key) is not None: | 191 | if self.document.get_property(key): |
| 191 | metadata[key] = self.document.get_property(key) | 192 | metadata[key] = self.document.get_property(key) |
| 192 | return metadata | 193 | return metadata |
| 193 | 194 | ||
| @@ -249,10 +250,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper): | |||
| 249 | zipin.close() | 250 | zipin.close() |
| 250 | czf = archive.ZipStripper(self.filename, self.parser, | 251 | czf = archive.ZipStripper(self.filename, self.parser, |
| 251 | 'application/zip', self.backup, self.add2archive) | 252 | 'application/zip', self.backup, self.add2archive) |
| 252 | if not czf.is_clean(): | 253 | return czf.is_clean() |
| 253 | return False | ||
| 254 | else: | ||
| 255 | return True | ||
| 256 | 254 | ||
| 257 | def get_meta(self): | 255 | def get_meta(self): |
| 258 | ''' | 256 | ''' |
