summary refs log tree commit diff
path: root/lib/office.py
diff options
context:
space:
mode:
author jvoisin 2011-07-30 19:15:23 +0200
committer jvoisin 2011-07-30 19:15:23 +0200
commit e5a1635364acb8c84efa627a924a0a4a1c558d4b (patch)
tree 227088ab3164220a38e3ecd540c8916e2184796a /lib/office.py
parent 158fbf02f5f349d2f9a7b1976306804224ad92da (diff)
Full support of pdf files, yeah !
Diffstat (limited to 'lib/office.py')
-rw-r--r-- lib/office.py 65
1 files changed, 41 insertions, 24 deletions
diff --git a/lib/office.py b/lib/office.py
index cfee3aa..370aa3f 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -22,6 +22,7 @@ except ImportError:
22import mat 22import mat
23import parser 23import parser
24import archive 24import archive
25import pdfrw
25 26
26 27
27class OpenDocumentStripper(archive.GenericArchiveStripper): 28class OpenDocumentStripper(archive.GenericArchiveStripper):
@@ -111,48 +112,64 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
111 return False 112 return False
112 return True 113 return True
113 114
115
114class PdfStripper(parser.GenericParser): 116class PdfStripper(parser.GenericParser):
115 ''' 117 '''
116 Represent a pdf file 118 Represent a pdf file
117 ''' 119 '''
120 def __init__(self, filename, parser, mime, backup, add2archive):
121 super(PdfStripper, self).__init__(filename, parser, mime, backup,
122 add2archive)
123 uri = 'file://' + self.filename
124 self.password = None
125 self.document = poppler.document_new_from_file(uri, self.password)
126 self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
127 'producer', 'creation-date', 'mod-date', 'metadata')
128
118 def is_clean(self): 129 def is_clean(self):
119 #FIXME 130 '''
120 return False 131 Check if the file is clean from harmful metadatas
132 '''
133 for key in self.meta_list:
134 if key != 'creation-date' and key != 'mod-date':
135 if self.document.get_property(key) is not None:
136 return False
137 else:
138 if self.document.get_property(key) != -1:
139 return False
140 return True
121 141
122 def remove_all(self): 142 def remove_all(self):
123 #FIXME
124 self.remove_all_ugly()
125
126 def remove_all_ugly(self):
127 ''' 143 '''
128 Opening the pdf with poppler, then doing a render 144 Opening the pdf with poppler, then doing a render
129 on a cairo pdfsurface. 145 on a cairo pdfsurface for each pages.
146 http://cairographics.org/documentation/pycairo/2/
147 python-poppler is not documented at all : have fun ;)
130 ''' 148 '''
131 uri = 'file://' + self.filename 149 page = self.document.get_page(0)
132 password = None
133 document = poppler.document_new_from_file(uri, password)
134 page = document.get_page(0)
135 page_width, page_height = page.get_size() 150 page_width, page_height = page.get_size()
136 surface = cairo.PDFSurface(self.output, page_width, page_height) 151 surface = cairo.PDFSurface(self.output, page_width, page_height)
137 context = cairo.Context(surface) 152 context = cairo.Context(surface)
138 for i in xrange(document.get_n_pages()): 153 for pagenum in xrange(self.document.get_n_pages()):
139 page = document.get_page(i) 154 page = self.document.get_page(pagenum)
140 context.translate(0, 0) 155 context.translate(0, 0)
141 page.render(context) 156 page.render(context)
142 context.show_page() 157 context.show_page()
143 surface.finish() 158 surface.finish()
159 #For now, poppler cannot write meta, so we must use pdfrw
160 trailer = pdfrw.PdfReader(self.output)
161 trailer.Info.Producer = ''
162 trailer.Info.Creator = ''
163 writer = pdfrw.PdfWriter()
164 writer.trailer = trailer
165 writer.write(self.output)
144 166
145 def get_meta(self): 167 def get_meta(self):
168 '''
169 Return a dict with all the meta of the file
170 '''
146 metadata={} 171 metadata={}
147 meta_list=('title', 'author', 'subject', 'keywords', 'creator', 172 for key in self.meta_list:
148 'producer', 'creation-date', 'mod-date', 'metadata') 173 if self.document.get_property(key) is not None:
149 uri = 'file://' + self.filename 174 metadata[key] = self.document.get_property(key)
150 password = None
151 document = poppler.document_new_from_file(uri, password)
152 for key in meta_list:
153 self._get_meta(document, metadata, key)
154 return metadata 175 return metadata
155
156 def _get_meta(self, document, metadata, key):
157 if document.get_property(key) is not None:
158 metadata[key] = document.get_property(key)