summaryrefslogtreecommitdiff
path: root/src/parsers/pdf.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/parsers/pdf.py')
-rw-r--r--src/parsers/pdf.py106
1 files changed, 106 insertions, 0 deletions
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
new file mode 100644
index 0000000..c25b324
--- /dev/null
+++ b/src/parsers/pdf.py
@@ -0,0 +1,106 @@
1""" Handle PDF
2
3"""
4
import io
import logging
import os
import pathlib
import shutil
import tempfile
10
11import cairo
12import gi
13gi.require_version('Poppler', '0.18')
14from gi.repository import Poppler
15
16try:
17 from PIL import Image
18except ImportError:
19 Image = None
20
21from . import abstract
22
23logging.basicConfig(level=logging.DEBUG)
24
25
class PDFParser(abstract.AbstractParser):
    """Parser that reads and removes the metadata of a PDF file.

    Metadata is read through Poppler. Cleaning is done by rasterising
    every page with cairo and assembling the bitmaps into a new PDF.
    """

    # Pages are rendered at this multiple of their size as reported by
    # Poppler's ``page.get_size()``.
    _RENDER_SCALE = 2

    def __init__(self, filename):
        """Remember the file to process and the properties to look up.

        :param filename: path of the PDF file, relative or absolute.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        self.uri = self._path_to_uri(self.filename)
        self.password = None

    @staticmethod
    def _path_to_uri(path) -> str:
        """Return the percent-encoded ``file://`` URI of *path*.

        Poppler takes URIs, not paths. Plain string concatenation would
        yield a broken URI for names containing ``#``, ``?``, ``%`` or
        spaces, hence the use of :meth:`pathlib.PurePath.as_uri`.
        """
        return pathlib.Path(os.path.abspath(path)).as_uri()

    def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
        """Re-encode a PNG with Pillow's ``optimize`` flag.

        The image is resampled to its own dimensions, so only the
        encoding changes. The stream is returned untouched when Pillow
        is not installed.

        :param img: stream holding the PNG, positioned at its start.
        :return: a new stream, rewound to the start, with the
            re-encoded PNG (or `img` itself without Pillow).
        """
        if Image is None:
            return img
        ret = io.BytesIO()
        im = Image.open(img)
        # LANCZOS is the filter formerly exposed as ANTIALIAS, which
        # was removed in Pillow 10.
        resized = im.resize(im.size, Image.LANCZOS)
        resized.save(ret, optimize=True, format="PNG")
        ret.seek(0)
        return ret

    def remove_all(self) -> bool:
        """Write a metadata-free copy of the document to ``OUT_clean.pdf``.

        Load the document into Poppler, render every page to a PNG and
        put those PNG into a new PDF. The producer and creator fields of
        that new PDF are then overwritten via Poppler, because there is
        no way to tell cairo to not add "created by cairo" during
        rendering.

        The intermediate PDF lives in a temporary file that is removed
        before returning, even on error.

        TODO: Improve the resolution
        TODO: Don't use a temp file

        :return: ``True`` once the cleaned file has been saved.
        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        page_count = document.get_n_pages()
        scale = self._RENDER_SCALE

        fd, tmp_path = tempfile.mkstemp(suffix='.pdf')
        os.close(fd)  # cairo opens the path itself
        try:
            # The initial size is a placeholder; it is set per page below.
            pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
            pdf_context = cairo.Context(pdf_surface)

            for pagenum in range(page_count):
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                logging.info("Rendering page %d/%d", pagenum + 1, page_count)

                img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                                 int(page_width) * scale,
                                                 int(page_height) * scale)
                img_context = cairo.Context(img_surface)

                img_context.scale(scale, scale)
                page.render_for_printing_with_options(
                    img_context, Poppler.PrintFlags.DOCUMENT)
                img_context.show_page()

                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)

                # NOTE: the PNG is not passed through
                # __optimize_image_size(); that step is disabled.

                img = cairo.ImageSurface.create_from_png(buf)
                pdf_surface.set_size(page_width * scale, page_height * scale)
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
                pdf_context.show_page()

            pdf_surface.finish()

            # Reopen the rendered file to overwrite what cairo wrote.
            document = Poppler.Document.new_from_file(
                self._path_to_uri(tmp_path), self.password)
            document.set_producer('totally not MAT2 ;)')
            document.set_creator('')
            document.save(self._path_to_uri("OUT_clean.pdf"))
        finally:
            # Never leave the intermediate rendering behind.
            os.remove(tmp_path)

        return True

    def get_meta(self) -> dict:
        """Return a dict with all the meta of the file.

        Only the properties listed in ``self.meta_list`` are looked up,
        and those with an empty or missing value are left out.
        """
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = value
        return metadata