diff options
Diffstat (limited to 'src/parsers/pdf.py')
| -rw-r--r-- | src/parsers/pdf.py | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py new file mode 100644 index 0000000..c25b324 --- /dev/null +++ b/src/parsers/pdf.py | |||
| @@ -0,0 +1,106 @@ | |||
| 1 | """ Handle PDF | ||
| 2 | |||
| 3 | """ | ||
| 4 | |||
| 5 | import os | ||
| 6 | import logging | ||
| 7 | import tempfile | ||
| 8 | import shutil | ||
| 9 | import io | ||
| 10 | |||
| 11 | import cairo | ||
| 12 | import gi | ||
| 13 | gi.require_version('Poppler', '0.18') | ||
| 14 | from gi.repository import Poppler | ||
| 15 | |||
| 16 | try: | ||
| 17 | from PIL import Image | ||
| 18 | except ImportError: | ||
| 19 | Image = None | ||
| 20 | |||
| 21 | from . import abstract | ||
| 22 | |||
| 23 | logging.basicConfig(level=logging.DEBUG) | ||
| 24 | |||
| 25 | |||
class PDFParser(abstract.AbstractParser):
    """Parser for PDF files, using Poppler to read and cairo to render.

    ``get_meta`` collects the document properties named in ``meta_list``;
    ``remove_all`` rebuilds the document from bitmap renderings of its
    pages so that the original metadata is not carried over.
    """

    def __init__(self, filename):
        """Record where the PDF lives and which properties to inspect.

        :param filename: path of the PDF file, forwarded to the parent
            constructor (presumably stored there as ``self.filename`` --
            TODO confirm against ``abstract.AbstractParser``).
        """
        super().__init__(filename)
        # Names of the Poppler document properties reported by get_meta().
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        # NOTE(review): the path is concatenated as-is, without
        # percent-encoding; confirm that file names containing characters
        # such as '%' or '#' are opened correctly.
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.password = None

    def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
        """Re-encode a PNG buffer with Pillow's ``optimize`` flag.

        Currently unused: the only call site in ``remove_all`` is
        commented out. The image is resized to its own dimensions, so
        only the PNG encoding can change.

        :param img: buffer holding the image to re-encode.
        :return: ``img`` itself when Pillow is not installed, otherwise a
            new buffer rewound to position 0.
        """
        if Image is None:
            return img
        ret = io.BytesIO()
        im = Image.open(img)
        w, h = im.size
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
        # filter under its current name.
        resized = im.resize((w, h), Image.LANCZOS)
        resized.save(ret, optimize=True, format="PNG")
        ret.seek(0)

        return ret

    def remove_all(self):
        """Rebuild the document from bitmap renderings of its pages.

        Load the document into Poppler, render each page to a PNG at
        twice its size, and paint those PNGs into a new PDF written to
        ``OUT.pdf`` in the current working directory. That file is then
        reopened with Poppler to overwrite the producer and creator
        fields, because there is no way to tell cairo not to add
        "created by cairo" during rendering. The result is saved as
        ``OUT_clean.pdf`` in the current working directory.

        TODO: Improve the resolution
        TODO: Don't use a temp file

        :return: always ``True``.
        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        n_pages = document.get_n_pages()  # constant for the whole loop

        pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(n_pages):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, n_pages)

            # Render the page on a bitmap twice as large as the page.
            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                             int(page_width)*2,
                                             int(page_height)*2)
            img_context = cairo.Context(img_surface)

            img_context.scale(2, 2)
            page.render_for_printing_with_options(
                img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through an in-memory PNG.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            #buf = self.__optimize_image_size(buf)

            # Each output page is sized to the doubled bitmap and filled
            # with it.
            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width*2, page_height*2)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Reopen the cairo output to overwrite the fields cairo filled in.
        document = Poppler.Document.new_from_file(
            'file://' + os.path.abspath('OUT.pdf'), self.password)
        document.set_producer('totally not MAT2 ;)')
        document.set_creator('')
        document.save('file://' + os.path.abspath("OUT_clean.pdf"))

        return True

    def get_meta(self):
        """Return a dict with all the meta of the file.

        Only the properties listed in ``self.meta_list`` whose value is
        truthy are included.

        :return: mapping of property name to its value.
        """
        # Was print("URI: %s", ...), which printed the literal "%s".
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)  # single lookup per key
            if value:
                metadata[key] = value
        return metadata
