diff options
| author | jvoisin | 2018-05-18 23:52:40 +0200 |
|---|---|---|
| committer | jvoisin | 2018-05-18 23:52:40 +0200 |
| commit | 38fae60b8beaf9c7b37c65325d2d285e62b6cb85 (patch) | |
| tree | e6bd4f699d6190dfada7618ebd04455eb7de9660 /libmat2/pdf.py | |
| parent | 57d5cd04284276c49899034a9ad321b680624d8f (diff) | |
Rename some files to simplify packaging
- the `src` folder is now `libmat2`
- the `main.py` script is now `mat2.py`
Diffstat (limited to 'libmat2/pdf.py')
| -rw-r--r-- | libmat2/pdf.py | 135 |
1 files changed, 135 insertions, 0 deletions
diff --git a/libmat2/pdf.py b/libmat2/pdf.py new file mode 100644 index 0000000..5b99192 --- /dev/null +++ b/libmat2/pdf.py | |||
| @@ -0,0 +1,135 @@ | |||
| 1 | """ Handle PDF | ||
| 2 | |||
| 3 | """ | ||
| 4 | |||
| 5 | import os | ||
| 6 | import re | ||
| 7 | import logging | ||
| 8 | import tempfile | ||
| 9 | import io | ||
| 10 | |||
| 11 | import cairo | ||
| 12 | import gi | ||
| 13 | gi.require_version('Poppler', '0.18') | ||
| 14 | from gi.repository import Poppler, GLib | ||
| 15 | |||
| 16 | from . import abstract | ||
| 17 | |||
| 18 | logging.basicConfig(level=logging.DEBUG) | ||
| 19 | |||
| 20 | |||
class PDFParser(abstract.AbstractParser):
    """ Read and remove the metadata of PDF files.

    The cleaning methods re-render every page of the document through
    Poppler onto a brand new cairo PDF surface, then blank the
    producer/creator/creation-date fields of the resulting file.
    """
    mimetypes = {'application/pdf', }
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """ Open `filename` once to make sure Poppler can parse it.

        Raises:
            ValueError: if Poppler refuses to load the file.
        """
        super().__init__(filename)
        self.uri = self.__path_to_uri(self.filename)
        self.__scale = 2  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as err:  # Invalid PDF
            raise ValueError("%s is not a valid PDF" % self.filename) from err

    @staticmethod
    def __path_to_uri(path: str) -> str:
        """ Return a properly escaped `file://` URI for `path`.

        Plain string concatenation yields a malformed URI as soon as the
        path contains characters like `#` or `%`, hence the GLib helper.
        """
        return GLib.filename_to_uri(os.path.abspath(path), None)

    def remove_all_lightweight(self) -> bool:
        """
            Load the document into Poppler, render pages on a new PDFSurface.

            Always returns True; the cleaned document is written to
            `self.output_filename`.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        # mkstemp hands back an *open* descriptor: close it right away,
        # since cairo reopens the file by path.
        tmp_fd, tmp_path = tempfile.mkstemp()
        os.close(tmp_fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
            pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

            for pagenum in range(pages_count):
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                pdf_surface.set_size(page_width, page_height)
                pdf_context.save()
                page.render_for_printing(pdf_context)
                pdf_context.restore()
                pdf_context.show_page()  # draw pdf_context on pdf_surface
            pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Do not leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    def remove_all(self) -> bool:
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.

            Always returns True; the cleaned document is written to
            `self.output_filename`.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        # mkstemp hands back an *open* descriptor: close it right away,
        # since cairo reopens the file by path.
        tmp_fd, tmp_path = tempfile.mkstemp()
        os.close(tmp_fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            pdf_context = cairo.Context(pdf_surface)

            for pagenum in range(pages_count):
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                # Rasterise the page at `__scale` times its nominal size.
                img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                                 int(page_width) * self.__scale,
                                                 int(page_height) * self.__scale)
                img_context = cairo.Context(img_surface)

                img_context.scale(self.__scale, self.__scale)
                page.render_for_printing(img_context)
                img_context.show_page()

                # Round-trip through an in-memory PNG before painting the
                # picture on the output PDF page.
                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)

                img = cairo.ImageSurface.create_from_png(buf)
                pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
                pdf_context.show_page()

            pdf_surface.finish()

            # Blank the producer/creator/date fields of the rendered file
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Do not leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """ Copy `in_file` to `out_file` with the producer and creator set
        to an empty string, and the creation date set to -1.
        """
        document = Poppler.Document.new_from_file(PDFParser.__path_to_uri(in_file))
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save(PDFParser.__path_to_uri(out_file))
        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> dict:
        """ Extract `<prefix:key>value</prefix:key>` pairs from `data`, for
        the xmp, pdfx, pdf and xmpMM prefixes (case-insensitive), and return
        them as a `{key: value}` dict. A later duplicate key wins.
        """
        metadata = {}
        for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
            metadata[key] = value
        return metadata

    def get_meta(self) -> dict:
        """ Return a dict with all the meta of the file

        Only the properties listed in `meta_list` that hold a truthy value
        are reported. When the raw `metadata` property is present, the
        fields parsed out of it are merged in as well.
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)  # query Poppler only once
            if value:
                metadata[key] = value
        if 'metadata' in metadata:
            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
            return {**metadata, **parsed_meta}
        return metadata
