diff options
| -rw-r--r-- | lib/office.py | 65 |
1 files changed, 41 insertions, 24 deletions
diff --git a/lib/office.py b/lib/office.py index cfee3aa..370aa3f 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -22,6 +22,7 @@ except ImportError: | |||
| 22 | import mat | 22 | import mat |
| 23 | import parser | 23 | import parser |
| 24 | import archive | 24 | import archive |
| 25 | import pdfrw | ||
| 25 | 26 | ||
| 26 | 27 | ||
| 27 | class OpenDocumentStripper(archive.GenericArchiveStripper): | 28 | class OpenDocumentStripper(archive.GenericArchiveStripper): |
| @@ -111,48 +112,64 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 111 | return False | 112 | return False |
| 112 | return True | 113 | return True |
| 113 | 114 | ||
| 115 | |||
| 114 | class PdfStripper(parser.GenericParser): | 116 | class PdfStripper(parser.GenericParser): |
| 115 | ''' | 117 | ''' |
| 116 | Represent a pdf file | 118 | Represent a pdf file |
| 117 | ''' | 119 | ''' |
| 120 | def __init__(self, filename, parser, mime, backup, add2archive): | ||
| 121 | super(PdfStripper, self).__init__(filename, parser, mime, backup, | ||
| 122 | add2archive) | ||
| 123 | uri = 'file://' + self.filename | ||
| 124 | self.password = None | ||
| 125 | self.document = poppler.document_new_from_file(uri, self.password) | ||
| 126 | self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator', | ||
| 127 | 'producer', 'creation-date', 'mod-date', 'metadata') | ||
| 128 | |||
| 118 | def is_clean(self): | 129 | def is_clean(self): |
| 119 | #FIXME | 130 | ''' |
| 120 | return False | 131 | Check if the file is clean from harmful metadata |
| 132 | ''' | ||
| 133 | for key in self.meta_list: | ||
| 134 | if key != 'creation-date' and key != 'mod-date': | ||
| 135 | if self.document.get_property(key) is not None: | ||
| 136 | return False | ||
| 137 | else: | ||
| 138 | if self.document.get_property(key) != -1: | ||
| 139 | return False | ||
| 140 | return True | ||
| 121 | 141 | ||
| 122 | def remove_all(self): | 142 | def remove_all(self): |
| 123 | #FIXME | ||
| 124 | self.remove_all_ugly() | ||
| 125 | |||
| 126 | def remove_all_ugly(self): | ||
| 127 | ''' | 143 | ''' |
| 128 | Opening the pdf with poppler, then doing a render | 144 | Opening the pdf with poppler, then doing a render |
| 129 | on a cairo pdfsurface. | 145 | on a cairo pdfsurface for each page. |
| 146 | http://cairographics.org/documentation/pycairo/2/ | ||
| 147 | python-poppler is not documented at all : have fun ;) | ||
| 130 | ''' | 148 | ''' |
| 131 | uri = 'file://' + self.filename | 149 | page = self.document.get_page(0) |
| 132 | password = None | ||
| 133 | document = poppler.document_new_from_file(uri, password) | ||
| 134 | page = document.get_page(0) | ||
| 135 | page_width, page_height = page.get_size() | 150 | page_width, page_height = page.get_size() |
| 136 | surface = cairo.PDFSurface(self.output, page_width, page_height) | 151 | surface = cairo.PDFSurface(self.output, page_width, page_height) |
| 137 | context = cairo.Context(surface) | 152 | context = cairo.Context(surface) |
| 138 | for i in xrange(document.get_n_pages()): | 153 | for pagenum in xrange(self.document.get_n_pages()): |
| 139 | page = document.get_page(i) | 154 | page = self.document.get_page(pagenum) |
| 140 | context.translate(0, 0) | 155 | context.translate(0, 0) |
| 141 | page.render(context) | 156 | page.render(context) |
| 142 | context.show_page() | 157 | context.show_page() |
| 143 | surface.finish() | 158 | surface.finish() |
| 159 | #For now, poppler cannot write meta, so we must use pdfrw | ||
| 160 | trailer = pdfrw.PdfReader(self.output) | ||
| 161 | trailer.Info.Producer = '' | ||
| 162 | trailer.Info.Creator = '' | ||
| 163 | writer = pdfrw.PdfWriter() | ||
| 164 | writer.trailer = trailer | ||
| 165 | writer.write(self.output) | ||
| 144 | 166 | ||
| 145 | def get_meta(self): | 167 | def get_meta(self): |
| 168 | ''' | ||
| 169 | Return a dict with all the meta of the file | ||
| 170 | ''' | ||
| 146 | metadata={} | 171 | metadata={} |
| 147 | meta_list=('title', 'author', 'subject', 'keywords', 'creator', | 172 | for key in self.meta_list: |
| 148 | 'producer', 'creation-date', 'mod-date', 'metadata') | 173 | if self.document.get_property(key) is not None: |
| 149 | uri = 'file://' + self.filename | 174 | metadata[key] = self.document.get_property(key) |
| 150 | password = None | ||
| 151 | document = poppler.document_new_from_file(uri, password) | ||
| 152 | for key in meta_list: | ||
| 153 | self._get_meta(document, metadata, key) | ||
| 154 | return metadata | 175 | return metadata |
| 155 | |||
| 156 | def _get_meta(self, document, metadata, key): | ||
| 157 | if document.get_property(key) is not None: | ||
| 158 | metadata[key] = document.get_property(key) | ||
