# Provenance: libmat2/pdf.py as created by commit
# 38fae60b8beaf9c7b37c65325d2d285e62b6cb85 (jvoisin, Fri, 18 May 2018
# 23:52:40 +0200), "Rename some files to simplify packaging":
#   - the `src` folder is now `libmat2`
#   - the `main.py` script is now `mat2.py`
""" Handle PDF

"""

import os
import re
import logging
import tempfile
import io

import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, GLib

from . import abstract

# NOTE(review): configuring the root logger at import time is a side effect
# for every importer of this module; kept because callers may rely on it.
logging.basicConfig(level=logging.DEBUG)


def _path_to_uri(path: str) -> str:
    """ Return a properly escaped `file://` URI for `path`.

    Plain string concatenation ('file://' + path) yields an invalid URI as
    soon as the path contains characters such as spaces, `%` or `#`.
    """
    return GLib.filename_to_uri(os.path.abspath(path), None)


class PDFParser(abstract.AbstractParser):
    """ Parser able to read and remove the metadata of PDF files. """
    mimetypes = {'application/pdf', }
    # Names of the Poppler document properties queried by `get_meta`.
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """ Open `filename` and make sure that Poppler can load it.

        :raises ValueError: if Poppler can not load the file as a PDF.
        """
        super().__init__(filename)
        self.uri = _path_to_uri(self.filename)
        self.__scale = 2  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as err:  # Invalid PDF
            raise ValueError from err

    def remove_all_lightweight(self):
        """
            Load the document into Poppler, render pages on a new PDFSurface.

            The result is written to `self.output_filename` (presumably set
            by the parent class; it is not defined in this file).
            Always returns True.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_fd, tmp_path = tempfile.mkstemp()
        # Only the path is needed: cairo opens the file by name, so the
        # descriptor returned by mkstemp must be closed or it leaks.
        os.close(tmp_fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
            pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

            for pagenum in range(pages_count):
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                pdf_surface.set_size(page_width, page_height)
                pdf_context.save()
                page.render_for_printing(pdf_context)
                pdf_context.restore()
                pdf_context.show_page()  # draw pdf_context on pdf_surface
            pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    def remove_all(self):
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.

            The result is written to `self.output_filename` (presumably set
            by the parent class; it is not defined in this file).
            Always returns True.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_fd, tmp_path = tempfile.mkstemp()
        # Only the path is needed: cairo opens the file by name, so the
        # descriptor returned by mkstemp must be closed or it leaks.
        os.close(tmp_fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            pdf_context = cairo.Context(pdf_surface)

            for pagenum in range(pages_count):
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                                 int(page_width) * self.__scale,
                                                 int(page_height) * self.__scale)
                img_context = cairo.Context(img_surface)

                img_context.scale(self.__scale, self.__scale)
                page.render_for_printing(img_context)
                img_context.show_page()

                # Round-trip the rendered page through an in-memory PNG.
                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)

                img = cairo.ImageSurface.create_from_png(buf)
                pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
                pdf_context.show_page()

            pdf_surface.finish()

            # Removes metadata added by Poppler
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """ Copy `in_file` to `out_file`, with the producer and creator
        fields emptied and the creation date set to -1.

        Always returns True.
        """
        document = Poppler.Document.new_from_file(_path_to_uri(in_file), None)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save(_path_to_uri(out_file))
        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> dict:
        """ Extract `<ns:key>value</ns:key>` pairs from `data`, for the
        xmp, pdfx, pdf and xmpMM namespaces (case-insensitive).

        Only elements opened and closed on the same line are matched,
        since `.` does not match newlines here.
        """
        metadata = {}
        # The closing tag is anchored with backreferences, so that the
        # captured value does not swallow `</ns:key>`.
        pattern = r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>"
        for (_, key, value) in re.findall(pattern, data, re.I):
            metadata[key] = value
        return metadata

    def get_meta(self):
        """ Return a dict with all the meta of the file

        Only the properties of `meta_list` with a truthy value are kept.
        If a 'metadata' property is present, the fields parsed out of it
        are merged in, and take precedence over same-named properties.
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = value
        if 'metadata' in metadata:
            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
            return {**metadata, **parsed_meta}
        return metadata