# Recovered from a cgit patch view of commit
# 38fae60b8beaf9c7b37c65325d2d285e62b6cb85 (jvoisin, Fri, 18 May 2018
# 23:52:40 +0200), "Rename some files to simplify packaging":
#   - the `src` folder is now `libmat2`
#   - the `main.py` script is now `mat2.py`
# That commit deleted this file (src/pdf.py, 135 lines).
""" Handle PDF

"""

import os
import re
import logging
import tempfile
import io

import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, GLib

from . import abstract

logging.basicConfig(level=logging.DEBUG)


class PDFParser(abstract.AbstractParser):
    """ Read and remove the metadata of PDF files, using Poppler and cairo.
    """
    mimetypes = {'application/pdf', }
    # Poppler document properties queried by `get_meta`.
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """ Open `filename` once with Poppler to make sure it is a valid PDF.

        Raises ValueError if Poppler cannot load the file.
        """
        super().__init__(filename)
        # NOTE(review): the path is concatenated as-is, without any
        # percent-encoding; presumably `self.filename` is set by the parent
        # constructor -- confirm against `abstract.AbstractParser`.
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.__scale = 2  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError:  # Invalid PDF
            raise ValueError

    @staticmethod
    def __new_tmp_path() -> str:
        """ Create an empty temporary file and return its path.

        `tempfile.mkstemp` hands back an *open* file descriptor; it is closed
        right away because cairo and Poppler access the file by path, and
        leaving it open would leak one descriptor per call.
        """
        tmp_fd, tmp_path = tempfile.mkstemp()
        os.close(tmp_fd)
        return tmp_path

    def remove_all_lightweight(self):
        """
            Load the document into Poppler, render pages on a new PDFSurface.

            The result is written to `self.output_filename`; returns True.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__new_tmp_path()
        try:
            # The initial size is a placeholder: it is reset for every page.
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
            pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

            for pagenum in range(pages_count):
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                pdf_surface.set_size(page_width, page_height)
                pdf_context.save()
                page.render_for_printing(pdf_context)
                pdf_context.restore()
                pdf_context.show_page()  # draw pdf_context on pdf_surface
            pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Always drop the intermediate file, even if rendering failed.
            os.remove(tmp_path)

        return True

    def remove_all(self):
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.

            The result is written to `self.output_filename`; returns True.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__new_tmp_path()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            pdf_context = cairo.Context(pdf_surface)

            for pagenum in range(pages_count):
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                # Rasterise the page at `__scale` times its nominal size.
                img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                                 int(page_width) * self.__scale,
                                                 int(page_height) * self.__scale)
                img_context = cairo.Context(img_surface)

                img_context.scale(self.__scale, self.__scale)
                page.render_for_printing(img_context)
                img_context.show_page()

                # Round-trip through an in-memory PNG, then reload it as the
                # image that gets painted onto the output PDF page.
                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)

                img = cairo.ImageSurface.create_from_png(buf)
                pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
                pdf_context.show_page()

            pdf_surface.finish()

            # Removes metadata added by Poppler
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Always drop the intermediate file, even if rendering failed.
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """ Copy `in_file` to `out_file` with producer, creator and creation
        date reset.

        `in_file` is used verbatim after 'file://', so it has to be an
        absolute path; `out_file` is made absolute here.
        """
        # Pass the (absent) password explicitly, like every other call site.
        document = Poppler.Document.new_from_file('file://' + in_file, None)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save('file://' + os.path.abspath(out_file))
        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> dict:
        """ Extract `<prefix:key>value</prefix:key>` pairs from `data`.

        Only the xmp, pdfx, pdf and xmpMM prefixes are considered, matched
        case-insensitively. Returns a dict mapping each key to its value.
        """
        metadata = {}
        # The closing tag is anchored with back-references; without it the
        # value would run to the end of the line, closing tag included.
        pattern = r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>"
        for (_, key, value) in re.findall(pattern, data, re.I):
            metadata[key] = value
        return metadata

    def get_meta(self):
        """ Return a dict with all the meta of the file

        Properties listed in `meta_list` that have a truthy value are
        included; when a 'metadata' property is present, the fields parsed
        out of it are merged in and win over same-named properties.
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)  # query each property once
            if value:
                metadata[key] = value
        if 'metadata' in metadata:
            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
            return {**metadata, **parsed_meta}
        return metadata