summaryrefslogtreecommitdiff
path: root/src/pdf.py
diff options
context:
space:
mode:
author: jvoisin, 2018-04-14 21:23:31 +0200
committer: jvoisin, 2018-04-14 21:23:31 +0200
commit: 96299c6a5350f59eab022a09400eddcc347daede (patch)
tree: 492df3a7637b2d1cb45424615ab2777043043eab /src/pdf.py
parent: 6f4ed2490fbcde0b74e7b8251ad71e29b430b8ef (diff)
Add lightweight processing for PDF
Diffstat (limited to 'src/pdf.py')
-rw-r--r-- src/pdf.py 45
1 files changed, 37 insertions, 8 deletions
diff --git a/src/pdf.py b/src/pdf.py
index c119449..6e639cd 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
29 self.uri = 'file://' + os.path.abspath(self.filename) 29 self.uri = 'file://' + os.path.abspath(self.filename)
30 self.__scale = 2 # how much precision do we want for the render 30 self.__scale = 2 # how much precision do we want for the render
31 31
def remove_all_lightweight(self):
    """
    Load the document into Poppler and render its pages directly on a
    new PDFSurface backed by a temporary file.

    The temporary PDF is then passed to `__remove_superficial_meta`,
    which blanks its producer/creator fields and writes the result to
    `self.output_filename`. The temporary file is always deleted, even
    when rendering or saving fails.

    :return: True once the cleaned file has been written.
    """
    document = Poppler.Document.new_from_file(self.uri, None)
    pages_count = document.get_n_pages()

    # mkstemp() returns an *open* OS-level descriptor together with the
    # path. cairo opens the file by name, so the descriptor is not needed
    # and must be closed here to avoid leaking it.
    tmp_fd, tmp_path = tempfile.mkstemp()
    os.close(tmp_fd)

    try:
        # The initial size is a placeholder: it is reset for every page.
        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

        for pagenum in range(pages_count):
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            pdf_surface.set_size(page_width, page_height)
            pdf_context.save()
            page.render_for_printing(pdf_context)
            pdf_context.restore()
            pdf_context.show_page()  # draw pdf_context on pdf_surface
        pdf_surface.finish()

        self.__remove_superficial_meta(tmp_path, self.output_filename)
    finally:
        # Do not leave the intermediate render behind, whatever happened.
        os.remove(tmp_path)

    return True
58
32 def remove_all(self): 59 def remove_all(self):
33 """ 60 """
34 Load the document into Poppler, render pages on PNG, 61 Load the document into Poppler, render pages on PNG,
35 and shove those PNG into a new PDF. Metadata from the new 62 and shove those PNG into a new PDF.
36 PDF are removed via Poppler, because there is no way to tell
37 cairo to not add "created by cairo" during rendering.
38 """ 63 """
39 document = Poppler.Document.new_from_file(self.uri, None) 64 document = Poppler.Document.new_from_file(self.uri, None)
40 pages_count = document.get_n_pages() 65 pages_count = document.get_n_pages()
41 66
42 _, tmp_path = tempfile.mkstemp() 67 _, tmp_path = tempfile.mkstemp()
43 pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) 68 pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
44 pdf_context = cairo.Context(pdf_surface) 69 pdf_context = cairo.Context(pdf_surface)
45 70
46 for pagenum in range(pages_count): 71 for pagenum in range(pages_count):
@@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
69 pdf_surface.finish() 94 pdf_surface.finish()
70 95
71 # Removes metadata added by Poppler 96 # Removes metadata added by Poppler
72 document = Poppler.Document.new_from_file('file://' + tmp_path) 97 self.__remove_superficial_meta(tmp_path, self.output_filename)
73 document.set_producer('')
74 document.set_creator('')
75 document.save('file://' + os.path.abspath(self.output_filename))
76 os.remove(tmp_path) 98 os.remove(tmp_path)
77 99
78 return True 100 return True
79 101
def __remove_superficial_meta(self, in_file: str, out_file: str) -> bool:
    """
    Blank the producer and creator fields of a PDF.

    :param in_file: path of the PDF to load.
    :param out_file: path where the modified PDF is saved.
    :return: always True.
    """
    import pathlib

    # Poppler takes URIs, not paths. as_uri() percent-encodes characters
    # such as '%', '#', '?' and spaces, which a bare 'file://' + path
    # concatenation would leave to be misread as URI syntax. It requires
    # an absolute path, hence the abspath() on both ends.
    in_uri = pathlib.Path(os.path.abspath(in_file)).as_uri()
    out_uri = pathlib.Path(os.path.abspath(out_file)).as_uri()

    document = Poppler.Document.new_from_file(in_uri)
    document.set_producer('')
    document.set_creator('')
    document.save(out_uri)
    return True
108
80 109
81 def __parse_metadata_field(self, data:str) -> dict: 110 def __parse_metadata_field(self, data:str) -> dict:
82 metadata = {} 111 metadata = {}