summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: jvoisin, 2011-07-29 19:18:37 +0200
committer: jvoisin, 2011-07-29 19:18:37 +0200
commit: 8f889fead81b2046d289402b831e18f8ddb00276 (patch)
tree: c65736ba7f1b79b76aed9cac7e06317e1fb61f00 /lib
parent: 4ce3a446bb7d053962053895195e0feab18160a4 (diff)
preliminary clean/smooth support of pdf files, with help of poppler and cairo
Diffstat (limited to 'lib')
-rw-r--r--  lib/mat.py     11
-rw-r--r--  lib/office.py  122
2 files changed, 58 insertions, 75 deletions
diff --git a/lib/mat.py b/lib/mat.py
index 0283fbc..fa6cf96 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -32,11 +32,18 @@ STRIPPERS = {
32 'audio/mpeg': audio.MpegAudioStripper, 32 'audio/mpeg': audio.MpegAudioStripper,
33 'image/jpeg': images.JpegStripper, 33 'image/jpeg': images.JpegStripper,
34 'image/png': images.PngStripper, 34 'image/png': images.PngStripper,
35 'application/x-pdf ': office.PdfStripper,
36 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, 35 'application/vnd.oasis.opendocument': office.OpenDocumentStripper,
37} 36}
38 37
39try: 38try:
39 import poppler
40 import cairo
41 STRIPPERS['application/x-pdf'] = office.PdfStripper
42 STRIPPERS['application/pdf'] = office.PdfStripper
43except ImportError:
44 print('Unable to import python-poppler and/or python-cairo: no pdf support')
45
46try:
40 import mutagen 47 import mutagen
41 STRIPPERS['audio/x-flac'] = audio.FlacStripper 48 STRIPPERS['audio/x-flac'] = audio.FlacStripper
42 STRIPPERS['audio/x-ape'] = audio.Apev2Stripper 49 STRIPPERS['audio/x-ape'] = audio.Apev2Stripper
@@ -100,6 +107,8 @@ def create_class_file(name, backup, add2archive):
100 if mime.startswith('application/vnd.oasis.opendocument'): 107 if mime.startswith('application/vnd.oasis.opendocument'):
101 mime = 'application/vnd.oasis.opendocument' # opendocument fileformat 108 mime = 'application/vnd.oasis.opendocument' # opendocument fileformat
102 109
110 #stripper_class = STRIPPERS[mime]
111
103 try: 112 try:
104 stripper_class = STRIPPERS[mime] 113 stripper_class = STRIPPERS[mime]
105 except KeyError: 114 except KeyError:
diff --git a/lib/office.py b/lib/office.py
index 00fce3c..cfee3aa 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -1,3 +1,7 @@
1'''
2 Care about office's formats
3'''
4
1import os 5import os
2import mimetypes 6import mimetypes
3import subprocess 7import subprocess
@@ -9,8 +13,12 @@ import re
9import shutil 13import shutil
10from xml.etree import ElementTree 14from xml.etree import ElementTree
11 15
16try:
17 import cairo
18 import poppler
19except ImportError:
20 pass
12 21
13import pdfrw
14import mat 22import mat
15import parser 23import parser
16import archive 24import archive
@@ -23,6 +31,9 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
23 ''' 31 '''
24 32
25 def get_meta(self): 33 def get_meta(self):
34 '''
35 Return a dict with all the meta of the file
36 '''
26 zipin = zipfile.ZipFile(self.filename, 'r') 37 zipin = zipfile.ZipFile(self.filename, 'r')
27 metadata = {} 38 metadata = {}
28 try: 39 try:
@@ -83,6 +94,9 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
83 self.do_backup() 94 self.do_backup()
84 95
85 def is_clean(self): 96 def is_clean(self):
97 '''
98 Check if the file is clean from harmful metadatas
99 '''
86 zipin = zipfile.ZipFile(self.filename, 'r') 100 zipin = zipfile.ZipFile(self.filename, 'r')
87 try: 101 try:
88 zipin.getinfo('meta.xml') 102 zipin.getinfo('meta.xml')
@@ -97,88 +111,48 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
97 return False 111 return False
98 return True 112 return True
99 113
100
101class PdfStripper(parser.GenericParser): 114class PdfStripper(parser.GenericParser):
102 ''' 115 '''
103 Represent a pdf file, with the help of pdfrw 116 Represent a pdf file
104 ''' 117 '''
105 def __init__(self, filename, parser, mime, backup, add2archive): 118 def is_clean(self):
106 name, ext = os.path.splitext(filename) 119 #FIXME
107 self.output = name + '.cleaned' + ext 120 return False
108 self.filename = filename
109 self.backup = backup
110 self.realname = realname
111 self.shortname = os.path.basename(filename)
112 self.mime = mime
113 self.tempdir = tempfile.mkdtemp()
114 self.trailer = pdfrw.PdfReader(self.filename)
115 self.writer = pdfrw.PdfWriter()
116 self.convert = 'gm convert -antialias -enhance %s %s'
117
118 def __del__(self):
119 '''
120 Remove the temp dir
121 '''
122 shutil.rmtree(self.tempdir)
123 121
124 def remove_all(self): 122 def remove_all(self):
125 ''' 123 #FIXME
126 Remove all the meta fields that are compromizing 124 self.remove_all_ugly()
127 '''
128 self.trailer.Info.Title = ''
129 self.trailer.Info.Author = ''
130 self.trailer.Info.Producer = ''
131 self.trailer.Info.Creator = ''
132 self.trailer.Info.CreationDate = ''
133 self.trailer.Info.ModDate = ''
134
135 self.writer.trailer = self.trailer
136 self.writer.write(self.output)
137 self.do_backup()
138 125
139 def remove_all_ugly(self): 126 def remove_all_ugly(self):
140 ''' 127 '''
141 Transform each pages into a jpg, clean them, 128 Opening the pdf with poppler, then doing a render
142 then re-assemble them into a new pdf 129 on a cairo pdfsurface.
143 ''' 130 '''
144 subprocess.call(self.convert % (self.filename, self.tempdir + 131 uri = 'file://' + self.filename
145 'temp.jpg'), shell=True) # Convert pages to jpg 132 password = None
146 133 document = poppler.document_new_from_file(uri, password)
147 for current_file in glob.glob(self.tempdir + 'temp*'): 134 page = document.get_page(0)
148 #Clean every jpg image 135 page_width, page_height = page.get_size()
149 class_file = mat.create_class_file(current_file, False, False) 136 surface = cairo.PDFSurface(self.output, page_width, page_height)
150 class_file.remove_all() 137 context = cairo.Context(surface)
151 138 for i in xrange(document.get_n_pages()):
152 subprocess.call(self.convert % (self.tempdir + 139 page = document.get_page(i)
153 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf 140 context.translate(0, 0)
154 141 page.render(context)
155 for current_file in glob.glob(self.tempdir + 'temp*'): 142 context.show_page()
156 #remove jpg files 143 surface.finish()
157 mat.secure_remove(current_file)
158
159 if self.backup is False:
160 mat.secure_remove(self.filename) # remove the old file
161 os.rename(self.output, self.filename) # rename the new
162 name = self.realname
163 else:
164 name = self.output
165 class_file = mat.create_class_file(name, False, False)
166 class_file.remove_all()
167
168 def is_clean(self):
169 '''
170 Check if the file is clean from harmful metadatas
171 '''
172 for field in self.trailer.Info:
173 if field != '':
174 return False
175 return True
176 144
177 def get_meta(self): 145 def get_meta(self):
178 ''' 146 metadata={}
179 return a dict with all the meta of the file 147 meta_list=('title', 'author', 'subject', 'keywords', 'creator',
180 ''' 148 'producer', 'creation-date', 'mod-date', 'metadata')
181 metadata = {} 149 uri = 'file://' + self.filename
182 for key, value in self.trailer.Info.iteritems(): 150 password = None
183 metadata[key[1:]] = value[1:-1] 151 document = poppler.document_new_from_file(uri, password)
152 for key in meta_list:
153 self._get_meta(document, metadata, key)
184 return metadata 154 return metadata
155
156 def _get_meta(self, document, metadata, key):
157 if document.get_property(key) is not None:
158 metadata[key] = document.get_property(key)