summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/parsers/pdf.py 37
-rw-r--r-- tests/test_libmat2.py 5
2 files changed, 18 insertions, 24 deletions
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index a77eabd..26985c6 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -7,17 +7,13 @@ import logging
7import tempfile 7import tempfile
8import shutil 8import shutil
9import io 9import io
10import tempfile
10 11
11import cairo 12import cairo
12import gi 13import gi
13gi.require_version('Poppler', '0.18') 14gi.require_version('Poppler', '0.18')
14from gi.repository import Poppler 15from gi.repository import Poppler
15 16
16try:
17 from PIL import Image
18except ImportError:
19 Image = None
20
21from . import abstract 17from . import abstract
22 18
23logging.basicConfig(level=logging.DEBUG) 19logging.basicConfig(level=logging.DEBUG)
@@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG)
26class PDFParser(abstract.AbstractParser): 22class PDFParser(abstract.AbstractParser):
27 def __init__(self, filename): 23 def __init__(self, filename):
28 super().__init__(filename) 24 super().__init__(filename)
29 self.meta_list = {'title', 'author', 'subject', 25 self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
30 'keywords', 'creator', 'producer', 'metadata'} 26 'metadata', 'mod-date', 'producer', 'subject', 'title',
27 'viewer-preferences'}
31 self.uri = 'file://' + os.path.abspath(self.filename) 28 self.uri = 'file://' + os.path.abspath(self.filename)
32 self.password = None 29 self.password = None
33 30
@@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser):
37 and shove those PNG into a new PDF. Metadata from the new 34 and shove those PNG into a new PDF. Metadata from the new
38 PDF are removed via Poppler, because there is no way to tell 35 PDF are removed via Poppler, because there is no way to tell
39 cairo to not add "created by cairo" during rendering. 36 cairo to not add "created by cairo" during rendering.
40
41 TODO: Improve the resolution
42 TODO: Don't use a temp file
43 """ 37 """
44 document = Poppler.Document.new_from_file(self.uri, self.password) 38 document = Poppler.Document.new_from_file(self.uri, self.password)
39 pages_count = document.get_n_pages()
45 40
46 pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) 41 _, tmp_path = tempfile.mkstemp()
42 pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
47 pdf_context = cairo.Context(pdf_surface) 43 pdf_context = cairo.Context(pdf_surface)
48 44
49 for pagenum in range(document.get_n_pages()): 45 for pagenum in range(pages_count):
50 page = document.get_page(pagenum) 46 page = document.get_page(pagenum)
51 page_width, page_height = page.get_size() 47 page_width, page_height = page.get_size()
52 logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages()) 48 logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
53 49
54 img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) 50 img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
55 img_context = cairo.Context(img_surface) 51 img_context = cairo.Context(img_surface)
56 52
57 img_context.scale(2, 2) 53 img_context.scale(2, 2)
58 page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT) 54 page.render_for_printing(img_context)
59 img_context.show_page() 55 img_context.show_page()
60 56
61 buf = io.BytesIO() 57 buf = io.BytesIO()
@@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser):
63 img_surface.finish() 59 img_surface.finish()
64 buf.seek(0) 60 buf.seek(0)
65 61
66 #buf = self.__optimize_image_size(buf)
67
68 img = cairo.ImageSurface.create_from_png(buf) 62 img = cairo.ImageSurface.create_from_png(buf)
69 pdf_surface.set_size(page_width*2, page_height*2) 63 pdf_surface.set_size(page_width*2, page_height*2)
70 pdf_context.set_source_surface(img, 0, 0) 64 pdf_context.set_source_surface(img, 0, 0)
@@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser):
73 67
74 pdf_surface.finish() 68 pdf_surface.finish()
75 69
76 # This is removing metadata 70 # This is removing metadata added by Poppler
77 #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) 71 document = Poppler.Document.new_from_file('file://' + tmp_path)
78 #document.set_producer('totally not MAT2 ;)') 72 document.set_producer('')
79 #document.set_creator('') 73 document.set_creator('')
80 #document.save('file://' + os.path.abspath("OUT_clean.pdf")) 74 document.save('file://' + os.path.abspath(self.output_filename))
75 os.remove(tmp_path)
81 76
82 return True 77 return True
83 78
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 4751aa4..4b36270 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase):
31 self.assertTrue(ret) 31 self.assertTrue(ret)
32 32
33 p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') 33 p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
34 remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', 34 expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
35 'producer': 'cairo 1.14.10 (http://cairographics.org)'} 35 self.assertEqual(p.get_meta(), expected_meta)
36 self.assertEqual(p.get_meta(), remaining_meta)