diff options
| -rw-r--r-- | src/parsers/pdf.py | 37 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 5 |
2 files changed, 18 insertions, 24 deletions
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py index a77eabd..26985c6 100644 --- a/src/parsers/pdf.py +++ b/src/parsers/pdf.py | |||
| @@ -7,17 +7,13 @@ import logging | |||
| 7 | import tempfile | 7 | import tempfile |
| 8 | import shutil | 8 | import shutil |
| 9 | import io | 9 | import io |
| 10 | import tempfile | ||
| 10 | 11 | ||
| 11 | import cairo | 12 | import cairo |
| 12 | import gi | 13 | import gi |
| 13 | gi.require_version('Poppler', '0.18') | 14 | gi.require_version('Poppler', '0.18') |
| 14 | from gi.repository import Poppler | 15 | from gi.repository import Poppler |
| 15 | 16 | ||
| 16 | try: | ||
| 17 | from PIL import Image | ||
| 18 | except ImportError: | ||
| 19 | Image = None | ||
| 20 | |||
| 21 | from . import abstract | 17 | from . import abstract |
| 22 | 18 | ||
| 23 | logging.basicConfig(level=logging.DEBUG) | 19 | logging.basicConfig(level=logging.DEBUG) |
| @@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG) | |||
| 26 | class PDFParser(abstract.AbstractParser): | 22 | class PDFParser(abstract.AbstractParser): |
| 27 | def __init__(self, filename): | 23 | def __init__(self, filename): |
| 28 | super().__init__(filename) | 24 | super().__init__(filename) |
| 29 | self.meta_list = {'title', 'author', 'subject', | 25 | self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', |
| 30 | 'keywords', 'creator', 'producer', 'metadata'} | 26 | 'metadata', 'mod-date', 'producer', 'subject', 'title', |
| 27 | 'viewer-preferences'} | ||
| 31 | self.uri = 'file://' + os.path.abspath(self.filename) | 28 | self.uri = 'file://' + os.path.abspath(self.filename) |
| 32 | self.password = None | 29 | self.password = None |
| 33 | 30 | ||
| @@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser): | |||
| 37 | and shove those PNG into a new PDF. Metadata from the new | 34 | and shove those PNG into a new PDF. Metadata from the new |
| 38 | PDF are removed via Poppler, because there is no way to tell | 35 | PDF are removed via Poppler, because there is no way to tell |
| 39 | cairo to not add "created by cairo" during rendering. | 36 | cairo to not add "created by cairo" during rendering. |
| 40 | |||
| 41 | TODO: Improve the resolution | ||
| 42 | TODO: Don't use a temp file | ||
| 43 | """ | 37 | """ |
| 44 | document = Poppler.Document.new_from_file(self.uri, self.password) | 38 | document = Poppler.Document.new_from_file(self.uri, self.password) |
| 39 | pages_count = document.get_n_pages() | ||
| 45 | 40 | ||
| 46 | pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) | 41 | _, tmp_path = tempfile.mkstemp() |
| 42 | pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) | ||
| 47 | pdf_context = cairo.Context(pdf_surface) | 43 | pdf_context = cairo.Context(pdf_surface) |
| 48 | 44 | ||
| 49 | for pagenum in range(document.get_n_pages()): | 45 | for pagenum in range(pages_count): |
| 50 | page = document.get_page(pagenum) | 46 | page = document.get_page(pagenum) |
| 51 | page_width, page_height = page.get_size() | 47 | page_width, page_height = page.get_size() |
| 52 | logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages()) | 48 | logging.info("Rendering page %d/%d", pagenum + 1, pages_count) |
| 53 | 49 | ||
| 54 | img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) | 50 | img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) |
| 55 | img_context = cairo.Context(img_surface) | 51 | img_context = cairo.Context(img_surface) |
| 56 | 52 | ||
| 57 | img_context.scale(2, 2) | 53 | img_context.scale(2, 2) |
| 58 | page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT) | 54 | page.render_for_printing(img_context) |
| 59 | img_context.show_page() | 55 | img_context.show_page() |
| 60 | 56 | ||
| 61 | buf = io.BytesIO() | 57 | buf = io.BytesIO() |
| @@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser): | |||
| 63 | img_surface.finish() | 59 | img_surface.finish() |
| 64 | buf.seek(0) | 60 | buf.seek(0) |
| 65 | 61 | ||
| 66 | #buf = self.__optimize_image_size(buf) | ||
| 67 | |||
| 68 | img = cairo.ImageSurface.create_from_png(buf) | 62 | img = cairo.ImageSurface.create_from_png(buf) |
| 69 | pdf_surface.set_size(page_width*2, page_height*2) | 63 | pdf_surface.set_size(page_width*2, page_height*2) |
| 70 | pdf_context.set_source_surface(img, 0, 0) | 64 | pdf_context.set_source_surface(img, 0, 0) |
| @@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser): | |||
| 73 | 67 | ||
| 74 | pdf_surface.finish() | 68 | pdf_surface.finish() |
| 75 | 69 | ||
| 76 | # This is removing metadata | 70 | # This is removing metadata added by Poppler |
| 77 | #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) | 71 | document = Poppler.Document.new_from_file('file://' + tmp_path) |
| 78 | #document.set_producer('totally not MAT2 ;)') | 72 | document.set_producer('') |
| 79 | #document.set_creator('') | 73 | document.set_creator('') |
| 80 | #document.save('file://' + os.path.abspath("OUT_clean.pdf")) | 74 | document.save('file://' + os.path.abspath(self.output_filename)) |
| 75 | os.remove(tmp_path) | ||
| 81 | 76 | ||
| 82 | return True | 77 | return True |
| 83 | 78 | ||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 4751aa4..4b36270 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase): | |||
| 31 | self.assertTrue(ret) | 31 | self.assertTrue(ret) |
| 32 | 32 | ||
| 33 | p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') | 33 | p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') |
| 34 | remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', | 34 | expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} |
| 35 | 'producer': 'cairo 1.14.10 (http://cairographics.org)'} | 35 | self.assertEqual(p.get_meta(), expected_meta) |
| 36 | self.assertEqual(p.get_meta(), remaining_meta) | ||
