summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libmat2/__init__.py1
-rw-r--r--libmat2/parsers/__init__.py0
-rw-r--r--libmat2/parsers/abstract.py10
-rw-r--r--libmat2/parsers/pdf.py105
-rw-r--r--src/parsers/abstract.py1
-rw-r--r--src/parsers/pdf.py25
-rw-r--r--tests/__init__.py0
-rw-r--r--tests/test_libmat2.py21
8 files changed, 138 insertions, 25 deletions
diff --git a/libmat2/__init__.py b/libmat2/__init__.py
new file mode 100644
index 0000000..3b3dacb
--- /dev/null
+++ b/libmat2/__init__.py
@@ -0,0 +1 @@
# Version string of the libmat2 package.
__version__ = '2.0'
diff --git a/libmat2/parsers/__init__.py b/libmat2/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libmat2/parsers/__init__.py
diff --git a/libmat2/parsers/abstract.py b/libmat2/parsers/abstract.py
new file mode 100644
index 0000000..a9129cc
--- /dev/null
+++ b/libmat2/parsers/abstract.py
@@ -0,0 +1,10 @@
class AbstractParser(object):
    """Base class for parsers that read and remove a file's metadata.

    It only stores the paths and the (empty) list of handled metadata
    fields; get_meta() and remove_all() raise NotImplementedError and are
    meant to be overridden by concrete parsers.
    """

    def __init__(self, filename: str):
        """Remember which file to work on.

        :param filename: path of the file to parse.
        """
        self.filename = filename
        # Path the cleaned copy is written to: the input path plus a
        # '.cleaned' suffix.
        self.output_filename = filename + '.cleaned'
        # Names of the metadata fields the parser handles; empty here.
        self.meta_list = set()

    def get_meta(self):
        """Return the metadata of the file; must be overridden."""
        raise NotImplementedError

    def remove_all(self):
        """Remove the metadata of the file; must be overridden."""
        raise NotImplementedError
diff --git a/libmat2/parsers/pdf.py b/libmat2/parsers/pdf.py
new file mode 100644
index 0000000..f6bc110
--- /dev/null
+++ b/libmat2/parsers/pdf.py
@@ -0,0 +1,105 @@
1""" Handle PDF
2
3"""
4
5import os
6import logging
7import tempfile
8import shutil
9import io
10
11import cairo
12import gi
13gi.require_version('Poppler', '0.18')
14from gi.repository import Poppler, Gio, GLib
15
16try:
17 from PIL import Image
18except ImportError:
19 Image = None
20
21from . import abstract
22
23logging.basicConfig(level=logging.DEBUG)
24
25
class PDFParser(abstract.AbstractParser):
    """Parser that reads and strips the metadata of a PDF through Poppler."""

    # Pages are rasterised at this multiple of their nominal size.
    _RENDER_SCALE = 2

    def __init__(self, filename):
        """Prepare the parser for ``filename``.

        :param filename: path of the PDF file to process.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        # Keep the output path if the base class already chose one;
        # otherwise fall back to the input path plus a '.cleaned' suffix.
        self.output_filename = getattr(self, 'output_filename',
                                       self.filename + '.cleaned')
        self.uri = self._path_to_uri(self.filename)
        self.password = None

    @staticmethod
    def _path_to_uri(path):
        """Return a ``file://`` URI for ``path``.

        GLib performs the conversion so that characters such as '#', '%'
        or spaces are percent-encoded rather than yielding an invalid URI.
        """
        return GLib.filename_to_uri(os.path.abspath(path), None)

    def _collect_meta(self, document):
        """Return a dict of the non-empty ``meta_list`` properties of ``document``."""
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = str(value)
        return metadata

    def remove_all(self):
        """
        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF. Metadata from the new
        PDF are removed via Poppler, because there is no way to tell
        cairo to not add "created by cairo" during rendering.

        The intermediate PDF is built in memory; the final document is
        saved to ``self.output_filename``.

        :return: True once the cleaned document has been saved.

        TODO: Improve the resolution
        """
        scale = self._RENDER_SCALE
        document = Poppler.Document.new_from_file(self.uri, self.password)
        page_count = document.get_n_pages()

        pdf_out = io.BytesIO()
        # The initial size is a placeholder; it is reset for every page.
        pdf_surface = cairo.PDFSurface(pdf_out, 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(page_count):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, page_count)

            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                             int(page_width) * scale,
                                             int(page_height) * scale)
            img_context = cairo.Context(img_surface)

            img_context.scale(scale, scale)
            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through PNG so only pixels reach the new PDF.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width * scale, page_height * scale)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Re-open the rendered PDF with Poppler to overwrite the
        # producer/creator fields that cairo filled in.
        rendered = GLib.Bytes(pdf_out.getvalue())
        input_stream = Gio.MemoryInputStream.new_from_bytes(rendered)
        out_document = Poppler.Document.new_from_stream(input_stream, -1, self.password, None)
        out_document.set_producer('totally not MAT2 ;)')
        out_document.set_creator('')
        logging.debug("Metadata left after cleaning: %s",
                      self._collect_meta(out_document))
        out_document.save(self._path_to_uri(self.output_filename))

        return True

    def get_meta(self):
        """ Return a dict with all the meta of the file
        """
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        return self._collect_meta(document)
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index a9129cc..d0e7108 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@@ -1,6 +1,7 @@
1class AbstractParser(object): 1class AbstractParser(object):
2 def __init__(self, filename: str): 2 def __init__(self, filename: str):
3 self.filename = filename 3 self.filename = filename
4 self.output_filename = filename + '.cleaned'
4 self.meta_list = set() 5 self.meta_list = set()
5 6
6 def get_meta(self): 7 def get_meta(self):
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index c25b324..a77eabd 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser):
31 self.uri = 'file://' + os.path.abspath(self.filename) 31 self.uri = 'file://' + os.path.abspath(self.filename)
32 self.password = None 32 self.password = None
33 33
34 def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
35 """ This is useless as fuck. """
36 if Image is None:
37 return img
38 ret = io.BytesIO()
39 im = Image.open(img)
40 w, h = im.size
41 resized = im.resize((w, h), Image.ANTIALIAS)
42 resized.save(ret, optimize=True, format="PNG")
43 ret.seek(0)
44
45 return ret
46
47
48 def remove_all(self): 34 def remove_all(self):
49 """ 35 """
50 Load the document into Poppler, render pages on PNG, 36 Load the document into Poppler, render pages on PNG,
@@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser):
57 """ 43 """
58 document = Poppler.Document.new_from_file(self.uri, self.password) 44 document = Poppler.Document.new_from_file(self.uri, self.password)
59 45
60 pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128) 46 pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128)
61 pdf_context = cairo.Context(pdf_surface) 47 pdf_context = cairo.Context(pdf_surface)
62 48
63 for pagenum in range(document.get_n_pages()): 49 for pagenum in range(document.get_n_pages()):
@@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser):
87 73
88 pdf_surface.finish() 74 pdf_surface.finish()
89 75
90 document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) 76 # This is removing metadata
91 document.set_producer('totally not MAT2 ;)') 77 #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
92 document.set_creator('') 78 #document.set_producer('totally not MAT2 ;)')
93 document.save('file://' + os.path.abspath("OUT_clean.pdf")) 79 #document.set_creator('')
80 #document.save('file://' + os.path.abspath("OUT_clean.pdf"))
94 81
95 return True 82 return True
96 83
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/__init__.py
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 56b960e..4751aa4 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -10,18 +10,27 @@ from src.parsers import pdf
10class TestGetMeta(unittest.TestCase): 10class TestGetMeta(unittest.TestCase):
11 def test_pdf(self): 11 def test_pdf(self):
12 p = pdf.PDFParser('./tests/data/dirty.pdf') 12 p = pdf.PDFParser('./tests/data/dirty.pdf')
13 meta = p.get_meta().items() 13 meta = p.get_meta()
14 14 self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
15 self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
15 16
16class TestCleaning(unittest.TestCase): 17class TestCleaning(unittest.TestCase):
17 def setUp(self): 18 def setUp(self):
18 shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') 19 shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
19 20
20 def tearDown(self): 21 def tearDown(self):
21 #os.remove('./tests/data/clean.pdf') 22 os.remove('./tests/data/clean.pdf')
22 pass
23 23
24 def test_pdf(self): 24 def test_pdf(self):
25 p = pdf.PDFParser('./tests/data/clean.pdf') 25 p = pdf.PDFParser('./tests/data/clean.pdf')
26 p.remove_all() 26
27 #self.assertEqual(p.get_meta(), {}) 27 meta = p.get_meta()
28 self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
29
30 ret = p.remove_all()
31 self.assertTrue(ret)
32
33 p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
34 remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)',
35 'producer': 'cairo 1.14.10 (http://cairographics.org)'}
36 self.assertEqual(p.get_meta(), remaining_meta)