diff options
| author | jvoisin | 2011-07-29 19:18:37 +0200 |
|---|---|---|
| committer | jvoisin | 2011-07-29 19:18:37 +0200 |
| commit | 8f889fead81b2046d289402b831e18f8ddb00276 (patch) | |
| tree | c65736ba7f1b79b76aed9cac7e06317e1fb61f00 /lib | |
| parent | 4ce3a446bb7d053962053895195e0feab18160a4 (diff) | |
Preliminary clean/smooth support for PDF files, with the help of poppler and cairo
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/mat.py | 11 | ||||
| -rw-r--r-- | lib/office.py | 122 |
2 files changed, 58 insertions, 75 deletions
| @@ -32,11 +32,18 @@ STRIPPERS = { | |||
| 32 | 'audio/mpeg': audio.MpegAudioStripper, | 32 | 'audio/mpeg': audio.MpegAudioStripper, |
| 33 | 'image/jpeg': images.JpegStripper, | 33 | 'image/jpeg': images.JpegStripper, |
| 34 | 'image/png': images.PngStripper, | 34 | 'image/png': images.PngStripper, |
| 35 | 'application/x-pdf ': office.PdfStripper, | ||
| 36 | 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, | 35 | 'application/vnd.oasis.opendocument': office.OpenDocumentStripper, |
| 37 | } | 36 | } |
| 38 | 37 | ||
| 39 | try: | 38 | try: |
| 39 | import poppler | ||
| 40 | import cairo | ||
| 41 | STRIPPERS['application/x-pdf'] = office.PdfStripper | ||
| 42 | STRIPPERS['application/pdf'] = office.PdfStripper | ||
| 43 | except ImportError: | ||
| 44 | print('Unable to import python-poppler and/or python-cairo: no pdf support') | ||
| 45 | |||
| 46 | try: | ||
| 40 | import mutagen | 47 | import mutagen |
| 41 | STRIPPERS['audio/x-flac'] = audio.FlacStripper | 48 | STRIPPERS['audio/x-flac'] = audio.FlacStripper |
| 42 | STRIPPERS['audio/x-ape'] = audio.Apev2Stripper | 49 | STRIPPERS['audio/x-ape'] = audio.Apev2Stripper |
| @@ -100,6 +107,8 @@ def create_class_file(name, backup, add2archive): | |||
| 100 | if mime.startswith('application/vnd.oasis.opendocument'): | 107 | if mime.startswith('application/vnd.oasis.opendocument'): |
| 101 | mime = 'application/vnd.oasis.opendocument' # opendocument fileformat | 108 | mime = 'application/vnd.oasis.opendocument' # opendocument fileformat |
| 102 | 109 | ||
| 110 | #stripper_class = STRIPPERS[mime] | ||
| 111 | |||
| 103 | try: | 112 | try: |
| 104 | stripper_class = STRIPPERS[mime] | 113 | stripper_class = STRIPPERS[mime] |
| 105 | except KeyError: | 114 | except KeyError: |
diff --git a/lib/office.py b/lib/office.py index 00fce3c..cfee3aa 100644 --- a/lib/office.py +++ b/lib/office.py | |||
| @@ -1,3 +1,7 @@ | |||
| 1 | ''' | ||
| 2 | Care about office's formats | ||
| 3 | ''' | ||
| 4 | |||
| 1 | import os | 5 | import os |
| 2 | import mimetypes | 6 | import mimetypes |
| 3 | import subprocess | 7 | import subprocess |
| @@ -9,8 +13,12 @@ import re | |||
| 9 | import shutil | 13 | import shutil |
| 10 | from xml.etree import ElementTree | 14 | from xml.etree import ElementTree |
| 11 | 15 | ||
| 16 | try: | ||
| 17 | import cairo | ||
| 18 | import poppler | ||
| 19 | except ImportError: | ||
| 20 | pass | ||
| 12 | 21 | ||
| 13 | import pdfrw | ||
| 14 | import mat | 22 | import mat |
| 15 | import parser | 23 | import parser |
| 16 | import archive | 24 | import archive |
| @@ -23,6 +31,9 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 23 | ''' | 31 | ''' |
| 24 | 32 | ||
| 25 | def get_meta(self): | 33 | def get_meta(self): |
| 34 | ''' | ||
| 35 | Return a dict with all the meta of the file | ||
| 36 | ''' | ||
| 26 | zipin = zipfile.ZipFile(self.filename, 'r') | 37 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 27 | metadata = {} | 38 | metadata = {} |
| 28 | try: | 39 | try: |
| @@ -83,6 +94,9 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 83 | self.do_backup() | 94 | self.do_backup() |
| 84 | 95 | ||
| 85 | def is_clean(self): | 96 | def is_clean(self): |
| 97 | ''' | ||
| 98 | Check if the file is clean from harmful metadatas | ||
| 99 | ''' | ||
| 86 | zipin = zipfile.ZipFile(self.filename, 'r') | 100 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 87 | try: | 101 | try: |
| 88 | zipin.getinfo('meta.xml') | 102 | zipin.getinfo('meta.xml') |
| @@ -97,88 +111,48 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 97 | return False | 111 | return False |
| 98 | return True | 112 | return True |
| 99 | 113 | ||
| 100 | |||
| 101 | class PdfStripper(parser.GenericParser): | 114 | class PdfStripper(parser.GenericParser): |
| 102 | ''' | 115 | ''' |
| 103 | Represent a pdf file, with the help of pdfrw | 116 | Represent a pdf file |
| 104 | ''' | 117 | ''' |
| 105 | def __init__(self, filename, parser, mime, backup, add2archive): | 118 | def is_clean(self): |
| 106 | name, ext = os.path.splitext(filename) | 119 | #FIXME |
| 107 | self.output = name + '.cleaned' + ext | 120 | return False |
| 108 | self.filename = filename | ||
| 109 | self.backup = backup | ||
| 110 | self.realname = realname | ||
| 111 | self.shortname = os.path.basename(filename) | ||
| 112 | self.mime = mime | ||
| 113 | self.tempdir = tempfile.mkdtemp() | ||
| 114 | self.trailer = pdfrw.PdfReader(self.filename) | ||
| 115 | self.writer = pdfrw.PdfWriter() | ||
| 116 | self.convert = 'gm convert -antialias -enhance %s %s' | ||
| 117 | |||
| 118 | def __del__(self): | ||
| 119 | ''' | ||
| 120 | Remove the temp dir | ||
| 121 | ''' | ||
| 122 | shutil.rmtree(self.tempdir) | ||
| 123 | 121 | ||
| 124 | def remove_all(self): | 122 | def remove_all(self): |
| 125 | ''' | 123 | #FIXME |
| 126 | Remove all the meta fields that are compromizing | 124 | self.remove_all_ugly() |
| 127 | ''' | ||
| 128 | self.trailer.Info.Title = '' | ||
| 129 | self.trailer.Info.Author = '' | ||
| 130 | self.trailer.Info.Producer = '' | ||
| 131 | self.trailer.Info.Creator = '' | ||
| 132 | self.trailer.Info.CreationDate = '' | ||
| 133 | self.trailer.Info.ModDate = '' | ||
| 134 | |||
| 135 | self.writer.trailer = self.trailer | ||
| 136 | self.writer.write(self.output) | ||
| 137 | self.do_backup() | ||
| 138 | 125 | ||
| 139 | def remove_all_ugly(self): | 126 | def remove_all_ugly(self): |
| 140 | ''' | 127 | ''' |
| 141 | Transform each pages into a jpg, clean them, | 128 | Opening the pdf with poppler, then doing a render |
| 142 | then re-assemble them into a new pdf | 129 | on a cairo pdfsurface. |
| 143 | ''' | 130 | ''' |
| 144 | subprocess.call(self.convert % (self.filename, self.tempdir + | 131 | uri = 'file://' + self.filename |
| 145 | 'temp.jpg'), shell=True) # Convert pages to jpg | 132 | password = None |
| 146 | 133 | document = poppler.document_new_from_file(uri, password) | |
| 147 | for current_file in glob.glob(self.tempdir + 'temp*'): | 134 | page = document.get_page(0) |
| 148 | #Clean every jpg image | 135 | page_width, page_height = page.get_size() |
| 149 | class_file = mat.create_class_file(current_file, False, False) | 136 | surface = cairo.PDFSurface(self.output, page_width, page_height) |
| 150 | class_file.remove_all() | 137 | context = cairo.Context(surface) |
| 151 | 138 | for i in xrange(document.get_n_pages()): | |
| 152 | subprocess.call(self.convert % (self.tempdir + | 139 | page = document.get_page(i) |
| 153 | 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf | 140 | context.translate(0, 0) |
| 154 | 141 | page.render(context) | |
| 155 | for current_file in glob.glob(self.tempdir + 'temp*'): | 142 | context.show_page() |
| 156 | #remove jpg files | 143 | surface.finish() |
| 157 | mat.secure_remove(current_file) | ||
| 158 | |||
| 159 | if self.backup is False: | ||
| 160 | mat.secure_remove(self.filename) # remove the old file | ||
| 161 | os.rename(self.output, self.filename) # rename the new | ||
| 162 | name = self.realname | ||
| 163 | else: | ||
| 164 | name = self.output | ||
| 165 | class_file = mat.create_class_file(name, False, False) | ||
| 166 | class_file.remove_all() | ||
| 167 | |||
| 168 | def is_clean(self): | ||
| 169 | ''' | ||
| 170 | Check if the file is clean from harmful metadatas | ||
| 171 | ''' | ||
| 172 | for field in self.trailer.Info: | ||
| 173 | if field != '': | ||
| 174 | return False | ||
| 175 | return True | ||
| 176 | 144 | ||
| 177 | def get_meta(self): | 145 | def get_meta(self): |
| 178 | ''' | 146 | metadata={} |
| 179 | return a dict with all the meta of the file | 147 | meta_list=('title', 'author', 'subject', 'keywords', 'creator', |
| 180 | ''' | 148 | 'producer', 'creation-date', 'mod-date', 'metadata') |
| 181 | metadata = {} | 149 | uri = 'file://' + self.filename |
| 182 | for key, value in self.trailer.Info.iteritems(): | 150 | password = None |
| 183 | metadata[key[1:]] = value[1:-1] | 151 | document = poppler.document_new_from_file(uri, password) |
| 152 | for key in meta_list: | ||
| 153 | self._get_meta(document, metadata, key) | ||
| 184 | return metadata | 154 | return metadata |
| 155 | |||
| 156 | def _get_meta(self, document, metadata, key): | ||
| 157 | if document.get_property(key) is not None: | ||
| 158 | metadata[key] = document.get_property(key) | ||
