diff options
| author | jvoisin | 2018-04-14 21:23:31 +0200 |
|---|---|---|
| committer | jvoisin | 2018-04-14 21:23:31 +0200 |
| commit | 96299c6a5350f59eab022a09400eddcc347daede (patch) | |
| tree | 492df3a7637b2d1cb45424615ab2777043043eab /src/pdf.py | |
| parent | 6f4ed2490fbcde0b74e7b8251ad71e29b430b8ef (diff) | |
Add lightweight processing for PDF
Diffstat (limited to 'src/pdf.py')
| -rw-r--r-- | src/pdf.py | 45 |
1 file changed, 37 insertions, 8 deletions
| @@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser): | |||
| 29 | self.uri = 'file://' + os.path.abspath(self.filename) | 29 | self.uri = 'file://' + os.path.abspath(self.filename) |
| 30 | self.__scale = 2 # how much precision do we want for the render | 30 | self.__scale = 2 # how much precision do we want for the render |
| 31 | 31 | ||
| 32 | def remove_all_lightweight(self): | ||
| 33 | """ | ||
| 34 | Load the document into Poppler, render pages on a new PDFSurface. | ||
| 35 | """ | ||
| 36 | document = Poppler.Document.new_from_file(self.uri, None) | ||
| 37 | pages_count = document.get_n_pages() | ||
| 38 | |||
| 39 | tmp_path = tempfile.mkstemp()[1] | ||
| 40 | pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) | ||
| 41 | pdf_context = cairo.Context(pdf_surface) # context draws on the surface | ||
| 42 | |||
| 43 | for pagenum in range(pages_count): | ||
| 44 | logging.info("Rendering page %d/%d", pagenum + 1, pages_count) | ||
| 45 | page = document.get_page(pagenum) | ||
| 46 | page_width, page_height = page.get_size() | ||
| 47 | pdf_surface.set_size(page_width, page_height) | ||
| 48 | pdf_context.save() | ||
| 49 | page.render_for_printing(pdf_context) | ||
| 50 | pdf_context.restore() | ||
| 51 | pdf_context.show_page() # draw pdf_context on pdf_surface | ||
| 52 | pdf_surface.finish() | ||
| 53 | |||
| 54 | self.__remove_superficial_meta(tmp_path, self.output_filename) | ||
| 55 | os.remove(tmp_path) | ||
| 56 | |||
| 57 | return True | ||
| 58 | |||
| 32 | def remove_all(self): | 59 | def remove_all(self): |
| 33 | """ | 60 | """ |
| 34 | Load the document into Poppler, render pages on PNG, | 61 | Load the document into Poppler, render pages on PNG, |
| 35 | and shove those PNG into a new PDF. Metadata from the new | 62 | and shove those PNG into a new PDF. |
| 36 | PDF are removed via Poppler, because there is no way to tell | ||
| 37 | cairo to not add "created by cairo" during rendering. | ||
| 38 | """ | 63 | """ |
| 39 | document = Poppler.Document.new_from_file(self.uri, None) | 64 | document = Poppler.Document.new_from_file(self.uri, None) |
| 40 | pages_count = document.get_n_pages() | 65 | pages_count = document.get_n_pages() |
| 41 | 66 | ||
| 42 | _, tmp_path = tempfile.mkstemp() | 67 | _, tmp_path = tempfile.mkstemp() |
| 43 | pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) | 68 | pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway |
| 44 | pdf_context = cairo.Context(pdf_surface) | 69 | pdf_context = cairo.Context(pdf_surface) |
| 45 | 70 | ||
| 46 | for pagenum in range(pages_count): | 71 | for pagenum in range(pages_count): |
| @@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser): | |||
| 69 | pdf_surface.finish() | 94 | pdf_surface.finish() |
| 70 | 95 | ||
| 71 | # Removes metadata added by Poppler | 96 | # Removes metadata added by Poppler |
| 72 | document = Poppler.Document.new_from_file('file://' + tmp_path) | 97 | self.__remove_superficial_meta(tmp_path, self.output_filename) |
| 73 | document.set_producer('') | ||
| 74 | document.set_creator('') | ||
| 75 | document.save('file://' + os.path.abspath(self.output_filename)) | ||
| 76 | os.remove(tmp_path) | 98 | os.remove(tmp_path) |
| 77 | 99 | ||
| 78 | return True | 100 | return True |
| 79 | 101 | ||
| 102 | def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool: | ||
| 103 | document = Poppler.Document.new_from_file('file://' + in_file) | ||
| 104 | document.set_producer('') | ||
| 105 | document.set_creator('') | ||
| 106 | document.save('file://' + os.path.abspath(out_file)) | ||
| 107 | return True | ||
| 108 | |||
| 80 | 109 | ||
| 81 | def __parse_metadata_field(self, data:str) -> dict: | 110 | def __parse_metadata_field(self, data:str) -> dict: |
| 82 | metadata = {} | 111 | metadata = {} |
