From df3c27d79dec231809deb4e617070a16858c306d Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Sun, 18 Mar 2018 21:42:12 +0100
Subject: Improve the testsuite

---
Review notes (this section is ignored when the patch is applied):
 - The file contents below were edited by hand after export, so the
   commit id above and the "index" blob ids no longer match them.
 - PDFParser.remove_all() still writes its result to olol.pdf in the
   current directory; see the TODO in its docstring.

 libmat2/__init__.py         |   1 +
 libmat2/parsers/__init__.py |   0
 libmat2/parsers/abstract.py |  18 +++++++
 libmat2/parsers/pdf.py      | 113 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 132 insertions(+)
 create mode 100644 libmat2/__init__.py
 create mode 100644 libmat2/parsers/__init__.py
 create mode 100644 libmat2/parsers/abstract.py
 create mode 100644 libmat2/parsers/pdf.py

(limited to 'libmat2')

diff --git a/libmat2/__init__.py b/libmat2/__init__.py
new file mode 100644
index 0000000..3b3dacb
--- /dev/null
+++ b/libmat2/__init__.py
@@ -0,0 +1 @@
+__version__ = '2.0'
diff --git a/libmat2/parsers/__init__.py b/libmat2/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libmat2/parsers/abstract.py b/libmat2/parsers/abstract.py
new file mode 100644
index 0000000..a9129cc
--- /dev/null
+++ b/libmat2/parsers/abstract.py
@@ -0,0 +1,18 @@
+class AbstractParser(object):
+    """ Base class of the file-format parsers.
+
+    Subclasses are expected to override get_meta() and remove_all().
+    """
+
+    def __init__(self, filename: str):
+        self.filename = filename
+        # Names of the metadata fields the parser handles.
+        self.meta_list = set()
+
+    def get_meta(self):
+        """ Return the metadata of the file. """
+        raise NotImplementedError
+
+    def remove_all(self):
+        """ Remove the metadata of the file. """
+        raise NotImplementedError
diff --git a/libmat2/parsers/pdf.py b/libmat2/parsers/pdf.py
new file mode 100644
index 0000000..f6bc110
--- /dev/null
+++ b/libmat2/parsers/pdf.py
@@ -0,0 +1,113 @@
+""" Handle PDF
+
+"""
+
+import os
+import logging
+import tempfile
+import shutil
+import io
+
+import cairo
+import gi
+gi.require_version('Poppler', '0.18')
+from gi.repository import Poppler, Gio, GLib
+
+try:
+    from PIL import Image
+except ImportError:
+    Image = None
+
+from . import abstract
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+def _path_to_uri(path):
+    """ Return the escaped file:// URI of `path`. """
+    return GLib.filename_to_uri(os.path.abspath(path), None)
+
+
+class PDFParser(abstract.AbstractParser):
+    """ Read and strip the metadata of a PDF file. """
+
+    def __init__(self, filename):
+        super().__init__(filename)
+        self.meta_list = {'title', 'author', 'subject',
+                          'keywords', 'creator', 'producer', 'metadata'}
+        # Let GLib build the URI: plain 'file://' + path concatenation leaves
+        # characters such as '#' and '%' unescaped, which breaks loading.
+        self.uri = _path_to_uri(self.filename)
+        self.password = None
+
+    def _collect_meta(self, document):
+        """ Return the non-empty `meta_list` properties of `document`. """
+        metadata = {}
+        for key in self.meta_list:
+            value = document.get_property(key)
+            if value:
+                metadata[key] = str(value)
+        return metadata
+
+    def remove_all(self):
+        """
+        Load the document into Poppler, render pages on PNG,
+        and shove those PNG into a new PDF. Metadata from the new
+        PDF are removed via Poppler, because there is no way to tell
+        cairo to not add "created by cairo" during rendering.
+
+        The result is saved as olol.pdf in the current directory.
+
+        TODO: Improve the resolution
+        TODO: Don't use a temp file
+        TODO: Derive the output path from self.filename
+        """
+        document = Poppler.Document.new_from_file(self.uri, self.password)
+        page_count = document.get_n_pages()
+
+        pdf_out = io.BytesIO()
+        pdf_surface = cairo.PDFSurface(pdf_out, 128, 128)
+        pdf_context = cairo.Context(pdf_surface)
+
+        for pagenum in range(page_count):
+            page = document.get_page(pagenum)
+            page_width, page_height = page.get_size()
+            logging.info("Rendering page %d/%d", pagenum + 1, page_count)
+
+            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
+            img_context = cairo.Context(img_surface)
+
+            img_context.scale(2, 2)
+            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
+            img_context.show_page()
+
+            buf = io.BytesIO()
+            img_surface.write_to_png(buf)
+            img_surface.finish()
+            buf.seek(0)
+
+            img = cairo.ImageSurface.create_from_png(buf)
+            pdf_surface.set_size(page_width*2, page_height*2)
+            pdf_context.set_source_surface(img, 0, 0)
+            pdf_context.paint()
+            pdf_context.show_page()
+
+        pdf_surface.finish()
+
+        b = GLib.Bytes(pdf_out.getvalue())
+        input_stream = Gio.MemoryInputStream.new_from_bytes(b)
+        out_document = Poppler.Document.new_from_stream(input_stream, -1, self.password, None)
+        out_document.set_producer('totally not MAT2 ;)')
+        out_document.set_creator('')
+        out_document.save(_path_to_uri("olol.pdf"))
+
+        logging.debug("Metadata left in the output: %s", self._collect_meta(out_document))
+
+        return True
+
+    def get_meta(self):
+        """ Return a dict with all the meta of the file
+        """
+        logging.debug("URI: %s", self.uri)
+        document = Poppler.Document.new_from_file(self.uri, self.password)
+        return self._collect_meta(document)
-- 
cgit v1.3