summaryrefslogtreecommitdiff
path: root/libmat2/pdf.py
diff options
context:
space:
mode:
author jvoisin 2018-05-18 23:52:40 +0200
committer jvoisin 2018-05-18 23:52:40 +0200
commit 38fae60b8beaf9c7b37c65325d2d285e62b6cb85 (patch)
tree e6bd4f699d6190dfada7618ebd04455eb7de9660 /libmat2/pdf.py
parent 57d5cd04284276c49899034a9ad321b680624d8f (diff)
Rename some files to simplify packaging
- the `src` folder is now `libmat2`
- the `main.py` script is now `mat2.py`
Diffstat (limited to 'libmat2/pdf.py')
-rw-r--r-- libmat2/pdf.py 135
1 files changed, 135 insertions, 0 deletions
diff --git a/libmat2/pdf.py b/libmat2/pdf.py
new file mode 100644
index 0000000..5b99192
--- /dev/null
+++ b/libmat2/pdf.py
@@ -0,0 +1,135 @@
1""" Handle PDF
2
3"""
4
5import os
6import re
7import logging
8import tempfile
9import io
10
11import cairo
12import gi
13gi.require_version('Poppler', '0.18')
14from gi.repository import Poppler, GLib
15
16from . import abstract
17
18logging.basicConfig(level=logging.DEBUG)
19
20
class PDFParser(abstract.AbstractParser):
    """Parser for PDF files: reads their metadata and writes cleaned copies.

    Both cleaning methods re-render every page of the source document with
    Poppler and cairo into a brand new PDF, then blank the producer, creator
    and creation-date fields of that new file before saving it to
    ``self.output_filename`` (not set in this class; presumably provided by
    ``abstract.AbstractParser`` -- confirm there).
    """

    mimetypes = {'application/pdf', }
    # Poppler document properties that get_meta() reports when they are set.
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """Remember the file's URI and check that Poppler can open it.

        Raises:
            ValueError: if Poppler raises ``GLib.GError`` while opening the
                file (e.g. it is not a valid PDF).
        """
        super().__init__(filename)
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.__scale = 2  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as err:  # Invalid PDF
            raise ValueError(
                "%r could not be opened as a PDF" % self.filename) from err

    @staticmethod
    def __create_tmp_path() -> str:
        """Create an empty temporary file and return its path.

        ``tempfile.mkstemp`` hands back an already-open OS-level file
        descriptor; it is closed right away because the callers only need
        the path, and leaving it open would leak one descriptor per call.
        """
        handle, tmp_path = tempfile.mkstemp()
        os.close(handle)
        return tmp_path

    def remove_all_lightweight(self):
        """
            Load the document into Poppler, render pages on a new PDFSurface.

            Returns True once the cleaned file has been written to
            ``self.output_filename``.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__create_tmp_path()
        try:
            # The initial size is a placeholder: each page resizes the
            # surface to its own dimensions below.
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
            pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

            for pagenum in range(pages_count):
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                pdf_surface.set_size(page_width, page_height)
                pdf_context.save()
                page.render_for_printing(pdf_context)
                pdf_context.restore()
                pdf_context.show_page()  # draw pdf_context on pdf_surface
            pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    def remove_all(self):
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.

            Returns True once the cleaned file has been written to
            ``self.output_filename``.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__create_tmp_path()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            pdf_context = cairo.Context(pdf_surface)

            for pagenum in range(pages_count):
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                # Rasterise the page at `scale` times its nominal size.
                img_surface = cairo.ImageSurface(
                    cairo.FORMAT_ARGB32,
                    int(page_width) * self.__scale,
                    int(page_height) * self.__scale)
                img_context = cairo.Context(img_surface)

                img_context.scale(self.__scale, self.__scale)
                page.render_for_printing(img_context)
                img_context.show_page()

                # Round-trip the raster through an in-memory PNG so that
                # only pixel data is carried over into the new PDF.
                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)

                img = cairo.ImageSurface.create_from_png(buf)
                pdf_surface.set_size(page_width * self.__scale,
                                     page_height * self.__scale)
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
                pdf_context.show_page()

            pdf_surface.finish()

            # Blank the producer/creator/creation-date fields of the new PDF
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """Copy `in_file` to `out_file` with producer, creator and creation
        date blanked.

        `in_file` is prefixed with ``file://`` as-is, so it must already be
        an absolute path; `out_file` is made absolute here.
        """
        document = Poppler.Document.new_from_file('file://' + in_file, None)
        document.set_producer('')
        document.set_creator('')
        # NOTE(review): -1 is understood to make Poppler drop the
        # CreationDate entry -- confirm against the Poppler GLib reference.
        document.set_creation_date(-1)
        document.save('file://' + os.path.abspath(out_file))
        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> dict:
        """Extract ``{tag: text}`` pairs from the raw `data` metadata string.

        Only elements written as ``<prefix:tag>text</prefix:tag>`` on a
        single line, with prefix xmp, pdfx, pdf or xmpMM (matched
        case-insensitively), are picked up.
        """
        metadata = {}
        for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
            metadata[key] = value
        return metadata

    def get_meta(self):
        """ Return a dict with all the meta of the file

        Only properties from `meta_list` that have a truthy value are
        included. When the 'metadata' property is present, the fields
        parsed out of it are merged in as well and take precedence over
        same-named document properties.
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)  # look each property up once
            if value:
                metadata[key] = value
        if 'metadata' in metadata:
            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
            return {**metadata, **parsed_meta}
        return metadata