summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/parsers/abstract.py 1
-rw-r--r-- src/parsers/pdf.py 25
2 files changed, 7 insertions, 19 deletions
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index a9129cc..d0e7108 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@@ -1,6 +1,7 @@
1class AbstractParser(object): 1class AbstractParser(object):
2 def __init__(self, filename: str): 2 def __init__(self, filename: str):
3 self.filename = filename 3 self.filename = filename
4 self.output_filename = filename + '.cleaned'
4 self.meta_list = set() 5 self.meta_list = set()
5 6
6 def get_meta(self): 7 def get_meta(self):
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index c25b324..a77eabd 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser):
31 self.uri = 'file://' + os.path.abspath(self.filename) 31 self.uri = 'file://' + os.path.abspath(self.filename)
32 self.password = None 32 self.password = None
33 33
34 def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
35 """ This is useless as fuck. """
36 if Image is None:
37 return img
38 ret = io.BytesIO()
39 im = Image.open(img)
40 w, h = im.size
41 resized = im.resize((w, h), Image.ANTIALIAS)
42 resized.save(ret, optimize=True, format="PNG")
43 ret.seek(0)
44
45 return ret
46
47
48 def remove_all(self): 34 def remove_all(self):
49 """ 35 """
50 Load the document into Poppler, render pages on PNG, 36 Load the document into Poppler, render pages on PNG,
@@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser):
57 """ 43 """
58 document = Poppler.Document.new_from_file(self.uri, self.password) 44 document = Poppler.Document.new_from_file(self.uri, self.password)
59 45
60 pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128) 46 pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128)
61 pdf_context = cairo.Context(pdf_surface) 47 pdf_context = cairo.Context(pdf_surface)
62 48
63 for pagenum in range(document.get_n_pages()): 49 for pagenum in range(document.get_n_pages()):
@@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser):
87 73
88 pdf_surface.finish() 74 pdf_surface.finish()
89 75
90 document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) 76 # This is removing metadata
91 document.set_producer('totally not MAT2 ;)') 77 #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
92 document.set_creator('') 78 #document.set_producer('totally not MAT2 ;)')
93 document.save('file://' + os.path.abspath("OUT_clean.pdf")) 79 #document.set_creator('')
80 #document.save('file://' + os.path.abspath("OUT_clean.pdf"))
94 81
95 return True 82 return True
96 83