summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: jvoisin, 2012-02-08 19:18:24 +0100
committer: jvoisin, 2012-02-08 19:18:24 +0100
commit: 97faad76d385d78643c2d3752b2a1378b9bb2591 (patch)
tree: 2d3553455953a31ac21f31302c0a5a99b41b0e7e /lib
parent: a86f9a31726f5afb7c2f3f958305af03878583b4 (diff)
Some improvements for office formats
Diffstat (limited to 'lib')
-rw-r--r-- lib/office.py 22
1 files changed, 10 insertions, 12 deletions
diff --git a/lib/office.py b/lib/office.py
index 82b817e..8350244 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -19,6 +19,7 @@ import mat
19import parser 19import parser
20import archive 20import archive
21 21
22
22class OpenDocumentStripper(archive.GenericArchiveStripper): 23class OpenDocumentStripper(archive.GenericArchiveStripper):
23 ''' 24 '''
24 An open document file is a zip, with xml file into. 25 An open document file is a zip, with xml file into.
@@ -126,21 +127,21 @@ class PdfStripper(parser.GenericParser):
126 uri = 'file://' + os.path.abspath(self.filename) 127 uri = 'file://' + os.path.abspath(self.filename)
127 self.password = None 128 self.password = None
128 self.document = poppler.document_new_from_file(uri, self.password) 129 self.document = poppler.document_new_from_file(uri, self.password)
129 self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator', 130 self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator',
130 'producer', 'metadata') 131 'producer', 'metadata'])
131 132
132 def is_clean(self): 133 def is_clean(self):
133 ''' 134 '''
134 Check if the file is clean from harmful metadatas 135 Check if the file is clean from harmful metadatas
135 ''' 136 '''
136 for key in self.meta_list: 137 for key in self.meta_list:
137 if self.document.get_property(key) != None: 138 if self.document.get_property(key):
138 return False 139 return False
139 return True 140 return True
140 141
141 def remove_all(self): 142 def remove_all(self):
142 ''' 143 '''
143 Remove supperficial 144 Remove metadata
144 ''' 145 '''
145 return self._remove_meta() 146 return self._remove_meta()
146 147
@@ -148,11 +149,12 @@ class PdfStripper(parser.GenericParser):
148 ''' 149 '''
149 Opening the PDF with poppler, then doing a render 150 Opening the PDF with poppler, then doing a render
150 on a cairo pdfsurface for each pages. 151 on a cairo pdfsurface for each pages.
151 Thanks to Lunar^for the idea. 152
152 http://cairographics.org/documentation/pycairo/2/ 153 http://cairographics.org/documentation/pycairo/2/
153 python-poppler is not documented at all : have fun ;) 154 python-poppler is not documented at all : have fun ;)
154 ''' 155 '''
155 page = self.document.get_page(0) 156 page = self.document.get_page(0)
157 # assume that every pages are the same size
156 page_width, page_height = page.get_size() 158 page_width, page_height = page.get_size()
157 surface = cairo.PDFSurface(self.output, page_width, page_height) 159 surface = cairo.PDFSurface(self.output, page_width, page_height)
158 context = cairo.Context(surface) # context draws on the surface 160 context = cairo.Context(surface) # context draws on the surface
@@ -165,8 +167,7 @@ class PdfStripper(parser.GenericParser):
165 surface.finish() 167 surface.finish()
166 168
167 try: 169 try:
168 import pdfrw 170 import pdfrw # For now, poppler cannot write meta, so we must use pdfrw
169 #For now, poppler cannot write meta, so we must use pdfrw
170 logging.debug('Removing %s\'s superficial metadata' % self.filename) 171 logging.debug('Removing %s\'s superficial metadata' % self.filename)
171 trailer = pdfrw.PdfReader(self.output) 172 trailer = pdfrw.PdfReader(self.output)
172 trailer.Info.Producer = None 173 trailer.Info.Producer = None
@@ -187,7 +188,7 @@ class PdfStripper(parser.GenericParser):
187 ''' 188 '''
188 metadata = {} 189 metadata = {}
189 for key in self.meta_list: 190 for key in self.meta_list:
190 if self.document.get_property(key) is not None: 191 if self.document.get_property(key):
191 metadata[key] = self.document.get_property(key) 192 metadata[key] = self.document.get_property(key)
192 return metadata 193 return metadata
193 194
@@ -249,10 +250,7 @@ class OpenXmlStripper(archive.GenericArchiveStripper):
249 zipin.close() 250 zipin.close()
250 czf = archive.ZipStripper(self.filename, self.parser, 251 czf = archive.ZipStripper(self.filename, self.parser,
251 'application/zip', self.backup, self.add2archive) 252 'application/zip', self.backup, self.add2archive)
252 if not czf.is_clean(): 253 return czf.is_clean()
253 return False
254 else:
255 return True
256 254
257 def get_meta(self): 255 def get_meta(self):
258 ''' 256 '''