diff options
| -rw-r--r-- | libmat2/__init__.py | 1 | ||||
| -rw-r--r-- | libmat2/parsers/__init__.py | 0 | ||||
| -rw-r--r-- | libmat2/parsers/abstract.py | 10 | ||||
| -rw-r--r-- | libmat2/parsers/pdf.py | 105 | ||||
| -rw-r--r-- | src/parsers/abstract.py | 1 | ||||
| -rw-r--r-- | src/parsers/pdf.py | 25 | ||||
| -rw-r--r-- | tests/__init__.py | 0 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 21 |
8 files changed, 138 insertions, 25 deletions
diff --git a/libmat2/__init__.py b/libmat2/__init__.py new file mode 100644 index 0000000..3b3dacb --- /dev/null +++ b/libmat2/__init__.py | |||
| @@ -0,0 +1 @@ | |||
| __version__ = '2.0' | |||
diff --git a/libmat2/parsers/__init__.py b/libmat2/parsers/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/libmat2/parsers/__init__.py | |||
diff --git a/libmat2/parsers/abstract.py b/libmat2/parsers/abstract.py new file mode 100644 index 0000000..a9129cc --- /dev/null +++ b/libmat2/parsers/abstract.py | |||
| @@ -0,0 +1,10 @@ | |||
class AbstractParser:
    """Base class holding the state shared by the file-format parsers.

    It only stores the file name, a default output name and an empty set
    of metadata keys; get_meta() and remove_all() must be overridden.
    """

    def __init__(self, filename: str):
        """Store `filename` and initialise the shared attributes.

        :param filename: path of the file the parser works on.
        """
        self.filename = filename
        # Default destination for a cleaned copy of the file.
        self.output_filename = filename + '.cleaned'
        # Names of the metadata fields a parser handles; empty in the base.
        self.meta_list = set()

    def get_meta(self):
        """Not implemented in the base class.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError

    def remove_all(self):
        """Not implemented in the base class.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError
diff --git a/libmat2/parsers/pdf.py b/libmat2/parsers/pdf.py new file mode 100644 index 0000000..f6bc110 --- /dev/null +++ b/libmat2/parsers/pdf.py | |||
| @@ -0,0 +1,105 @@ | |||
| 1 | """ Handle PDF | ||
| 2 | |||
| 3 | """ | ||
| 4 | |||
| 5 | import os | ||
| 6 | import logging | ||
| 7 | import tempfile | ||
| 8 | import shutil | ||
| 9 | import io | ||
| 10 | |||
| 11 | import cairo | ||
| 12 | import gi | ||
| 13 | gi.require_version('Poppler', '0.18') | ||
| 14 | from gi.repository import Poppler, Gio, GLib | ||
| 15 | |||
| 16 | try: | ||
| 17 | from PIL import Image | ||
| 18 | except ImportError: | ||
| 19 | Image = None | ||
| 20 | |||
| 21 | from . import abstract | ||
| 22 | |||
| 23 | logging.basicConfig(level=logging.DEBUG) | ||
| 24 | |||
| 25 | |||
class PDFParser(abstract.AbstractParser):
    """Read and strip the metadata of a PDF file using Poppler and cairo."""

    def __init__(self, filename):
        """Remember `filename` and derive the URI handed to Poppler.

        :param filename: path (relative or absolute) of the PDF to handle.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        # Poppler is given a URI, not a path. filename_to_uri() escapes
        # characters such as ' ', '#' or '%', which a plain
        # 'file://' + path concatenation leaves untouched.
        self.uri = GLib.filename_to_uri(os.path.abspath(self.filename), None)
        self.password = None

    def _read_meta(self, document):
        """Collect the metadata of an already opened Poppler document.

        :param document: object exposing get_property(name).
        :return: dict mapping each key of `self.meta_list` whose property
                 is set (truthy) to its value converted with str().
        """
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = str(value)
        return metadata

    def remove_all(self):
        """
        Load the document into Poppler, render every page to a PNG,
        and paint those PNGs into a new PDF built in memory. The
        'producer' and 'creator' fields of the new PDF are then
        overwritten via Poppler, because there is no way to tell
        cairo to not add "created by cairo" during rendering.

        The result is saved to `self.output_filename` when that
        attribute exists, otherwise to `<filename>.cleaned`.

        TODO: Improve the resolution

        :return: True once the cleaned document has been saved.
        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        page_count = document.get_n_pages()

        pdf_out = io.BytesIO()
        # 128x128 is only the size given at creation; set_size() is
        # called with the real dimensions for every page below.
        pdf_surface = cairo.PDFSurface(pdf_out, 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(page_count):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, page_count)

            # Render onto an image surface twice as wide and high as the
            # page, with the drawing context scaled by the same factor.
            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
            img_context = cairo.Context(img_surface)

            img_context.scale(2, 2)
            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through an in-memory PNG before painting the
            # page into the output PDF.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width*2, page_height*2)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Re-open the freshly built PDF from memory so that Poppler can
        # overwrite the fields cairo filled in.
        raw = GLib.Bytes(pdf_out.getvalue())
        input_stream = Gio.MemoryInputStream.new_from_bytes(raw)
        out_document = Poppler.Document.new_from_stream(input_stream, -1, self.password, None)
        out_document.set_producer('totally not MAT2 ;)')
        out_document.set_creator('')
        logging.debug("Metadata left after cleaning: %s", self._read_meta(out_document))

        # The base class may not define output_filename; fall back to the
        # '<filename>.cleaned' convention instead of a fixed file name in
        # the current working directory.
        output_filename = getattr(self, 'output_filename', self.filename + '.cleaned')
        out_document.save(GLib.filename_to_uri(os.path.abspath(output_filename), None))

        return True

    def get_meta(self):
        """ Return a dict with all the meta of the file

        Only the keys of `self.meta_list` whose value is set are present;
        values are converted to str.
        """
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        return self._read_meta(document)
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py index a9129cc..d0e7108 100644 --- a/src/parsers/abstract.py +++ b/src/parsers/abstract.py | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | class AbstractParser(object): | 1 | class AbstractParser(object): |
| 2 | def __init__(self, filename: str): | 2 | def __init__(self, filename: str): |
| 3 | self.filename = filename | 3 | self.filename = filename |
| 4 | self.output_filename = filename + '.cleaned' | ||
| 4 | self.meta_list = set() | 5 | self.meta_list = set() |
| 5 | 6 | ||
| 6 | def get_meta(self): | 7 | def get_meta(self): |
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py index c25b324..a77eabd 100644 --- a/src/parsers/pdf.py +++ b/src/parsers/pdf.py | |||
| @@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser): | |||
| 31 | self.uri = 'file://' + os.path.abspath(self.filename) | 31 | self.uri = 'file://' + os.path.abspath(self.filename) |
| 32 | self.password = None | 32 | self.password = None |
| 33 | 33 | ||
| 34 | def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO: | ||
| 35 | """ This is useless as fuck. """ | ||
| 36 | if Image is None: | ||
| 37 | return img | ||
| 38 | ret = io.BytesIO() | ||
| 39 | im = Image.open(img) | ||
| 40 | w, h = im.size | ||
| 41 | resized = im.resize((w, h), Image.ANTIALIAS) | ||
| 42 | resized.save(ret, optimize=True, format="PNG") | ||
| 43 | ret.seek(0) | ||
| 44 | |||
| 45 | return ret | ||
| 46 | |||
| 47 | |||
| 48 | def remove_all(self): | 34 | def remove_all(self): |
| 49 | """ | 35 | """ |
| 50 | Load the document into Poppler, render pages on PNG, | 36 | Load the document into Poppler, render pages on PNG, |
| @@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser): | |||
| 57 | """ | 43 | """ |
| 58 | document = Poppler.Document.new_from_file(self.uri, self.password) | 44 | document = Poppler.Document.new_from_file(self.uri, self.password) |
| 59 | 45 | ||
| 60 | pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128) | 46 | pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) |
| 61 | pdf_context = cairo.Context(pdf_surface) | 47 | pdf_context = cairo.Context(pdf_surface) |
| 62 | 48 | ||
| 63 | for pagenum in range(document.get_n_pages()): | 49 | for pagenum in range(document.get_n_pages()): |
| @@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser): | |||
| 87 | 73 | ||
| 88 | pdf_surface.finish() | 74 | pdf_surface.finish() |
| 89 | 75 | ||
| 90 | document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) | 76 | # This is removing metadata |
| 91 | document.set_producer('totally not MAT2 ;)') | 77 | #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) |
| 92 | document.set_creator('') | 78 | #document.set_producer('totally not MAT2 ;)') |
| 93 | document.save('file://' + os.path.abspath("OUT_clean.pdf")) | 79 | #document.set_creator('') |
| 80 | #document.save('file://' + os.path.abspath("OUT_clean.pdf")) | ||
| 94 | 81 | ||
| 95 | return True | 82 | return True |
| 96 | 83 | ||
diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/__init__.py | |||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 56b960e..4751aa4 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -10,18 +10,27 @@ from src.parsers import pdf | |||
| 10 | class TestGetMeta(unittest.TestCase): | 10 | class TestGetMeta(unittest.TestCase): |
| 11 | def test_pdf(self): | 11 | def test_pdf(self): |
| 12 | p = pdf.PDFParser('./tests/data/dirty.pdf') | 12 | p = pdf.PDFParser('./tests/data/dirty.pdf') |
| 13 | meta = p.get_meta().items() | 13 | meta = p.get_meta() |
| 14 | 14 | self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') | |
| 15 | self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'") | ||
| 15 | 16 | ||
| 16 | class TestCleaning(unittest.TestCase): | 17 | class TestCleaning(unittest.TestCase): |
| 17 | def setUp(self): | 18 | def setUp(self): |
| 18 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') | 19 | shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') |
| 19 | 20 | ||
| 20 | def tearDown(self): | 21 | def tearDown(self): |
| 21 | #os.remove('./tests/data/clean.pdf') | 22 | os.remove('./tests/data/clean.pdf') |
| 22 | pass | ||
| 23 | 23 | ||
| 24 | def test_pdf(self): | 24 | def test_pdf(self): |
| 25 | p = pdf.PDFParser('./tests/data/clean.pdf') | 25 | p = pdf.PDFParser('./tests/data/clean.pdf') |
| 26 | p.remove_all() | 26 | |
| 27 | #self.assertEqual(p.get_meta(), {}) | 27 | meta = p.get_meta() |
| 28 | self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') | ||
| 29 | |||
| 30 | ret = p.remove_all() | ||
| 31 | self.assertTrue(ret) | ||
| 32 | |||
| 33 | p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') | ||
| 34 | remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', | ||
| 35 | 'producer': 'cairo 1.14.10 (http://cairographics.org)'} | ||
| 36 | self.assertEqual(p.get_meta(), remaining_meta) | ||
