summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README1
-rw-r--r--mat/office.py82
2 files changed, 43 insertions, 40 deletions
diff --git a/README b/README
index d65defc..2c34cda 100644
--- a/README
+++ b/README
@@ -23,6 +23,7 @@ WARNING :
23DEPENDENCIES: 23DEPENDENCIES:
24 python2.6 (at least) 24 python2.6 (at least)
25 python-hachoir-core and python-hachoir-parser 25 python-hachoir-core and python-hachoir-parser
26 python-pdfrw or exiftool for full pdf support
26 shred (should be already installed) 27 shred (should be already installed)
27 28
28 29
diff --git a/mat/office.py b/mat/office.py
index 0b36fe7..b8a235f 100644
--- a/mat/office.py
+++ b/mat/office.py
@@ -6,6 +6,7 @@ import os
6import logging 6import logging
7import zipfile 7import zipfile
8import fileinput 8import fileinput
9import subprocess
9 10
10try: 11try:
11 import cairo 12 import cairo
@@ -16,7 +17,6 @@ except ImportError:
16import mat 17import mat
17import parser 18import parser
18import archive 19import archive
19import pdfrw
20 20
21 21
22class OpenDocumentStripper(archive.GenericArchiveStripper): 22class OpenDocumentStripper(archive.GenericArchiveStripper):
@@ -120,45 +120,27 @@ class PdfStripper(parser.GenericParser):
120 self.password = None 120 self.password = None
121 self.document = poppler.document_new_from_file(uri, self.password) 121 self.document = poppler.document_new_from_file(uri, self.password)
122 self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator', 122 self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
123 'producer', 'creation-date', 'mod-date', 'metadata') 123 'producer', 'metadata')
124 124
125 def is_clean(self): 125 def is_clean(self):
126 ''' 126 '''
127 Check if the file is clean from harmful metadatas 127 Check if the file is clean from harmful metadatas
128 ''' 128 '''
129 for key in self.meta_list: 129 for key in self.meta_list:
130 if key == 'creation-date' or key == 'mod-date': 130 if self.document.get_property(key) is not None and \
131 if self.document.get_property(key) != -1:
132 return False
133 elif self.document.get_property(key) is not None and \
134 self.document.get_property(key) != '': 131 self.document.get_property(key) != '':
135 return False 132 return False
136 return True 133 return True
137 134
138 def remove_all_ugly(self):
139 page = self.document.get_page(0)
140 page_width, page_height = page.get_size()
141 surface = cairo.PDFSurface(self.output, page_width, page_height)
142 context = cairo.Context(surface) # context draws on the surface
143 logging.debug('Pdf rendering of %s' % self.filename)
144 for pagenum in xrange(self.document.get_n_pages()):
145 page = self.document.get_page(pagenum)
146 context.translate(0, 0)
147 page.render(context) # render the page on context
148 context.show_page() # draw context on surface
149 surface.finish()
150 135
151 #For now, poppler cannot write meta, so we must use pdfrw 136 def remove_all(self):
152 logging.debug('Removing %s\'s superficial metadata' % self.filename) 137 '''
153 trailer = pdfrw.PdfReader(self.output) 138 Remove supperficial
154 trailer.Info.Producer = trailer.Info.Creator = None 139 '''
155 writer = pdfrw.PdfWriter() 140 self._remove_superficial_meta()
156 writer.trailer = trailer
157 writer.write(self.output)
158 self.do_backup()
159 141
160 142
161 def remove_all(self): 143 def remove_all_ugly(self):
162 ''' 144 '''
163 Opening the pdf with poppler, then doing a render 145 Opening the pdf with poppler, then doing a render
164 on a cairo pdfsurface for each pages. 146 on a cairo pdfsurface for each pages.
@@ -177,15 +159,39 @@ class PdfStripper(parser.GenericParser):
177 page.render(context) # render the page on context 159 page.render(context) # render the page on context
178 context.show_page() # draw context on surface 160 context.show_page() # draw context on surface
179 surface.finish() 161 surface.finish()
162 self._remove_superficial_meta()
180 163
181 #For now, poppler cannot write meta, so we must use pdfrw 164 def _remove_superficial_meta(self):
182 logging.debug('Removing %s\'s superficial metadata' % self.filename) 165 '''
183 trailer = pdfrw.PdfReader(self.output) 166 Remove superficial/external metadata
184 trailer.Info.Producer = trailer.Info.Creator = None 167 from a pdf file, using exiftool,
185 writer = pdfrw.PdfWriter() 168 of pdfrw if exiftool is not installed
186 writer.trailer = trailer 169 '''
187 writer.write(self.output) 170 try:
188 self.do_backup() 171 import exiftool
172 if self.backup:
173 process = subprocess.Popen(['exiftool', '-all=',
174 '-o %s' % self.output, self.filename],
175 stdout=open('/dev/null'))
176 process.wait()
177 else:
178 process = subprocess.Popen(['exiftool', '-overwrite_original',
179 '-all=', self.filename], stdout=open('/dev/null'))
180 process.wait()
181 except:
182 try:
183 import pdfrw
184 #For now, poppler cannot write meta, so we must use pdfrw
185 logging.debug('Removing %s\'s superficial metadata' % self.filename)
186 trailer = pdfrw.PdfReader(self.output)
187 trailer.Info.Producer = trailer.Info.Creator = None
188 writer = pdfrw.PdfWriter()
189 writer.trailer = trailer
190 writer.write(self.output)
191 self.do_backup()
192 except:
193 logging.error('You don\'t have either python-pdfrw, or\
194 exiftool: processed pdf are not totally clean !')
189 195
190 def get_meta(self): 196 def get_meta(self):
191 ''' 197 '''
@@ -193,11 +199,7 @@ class PdfStripper(parser.GenericParser):
193 ''' 199 '''
194 metadata = {} 200 metadata = {}
195 for key in self.meta_list: 201 for key in self.meta_list:
196 if key == 'creation-date' or key == 'mod-date': 202 if self.document.get_property(key) is not None and \
197 #creation and modification are set to -1
198 if self.document.get_property(key) != -1:
199 metadata[key] = self.document.get_property(key)
200 elif self.document.get_property(key) is not None and \
201 self.document.get_property(key) != '': 203 self.document.get_property(key) != '':
202 metadata[key] = self.document.get_property(key) 204 metadata[key] = self.document.get_property(key)
203 return metadata 205 return metadata