From df3c27d79dec231809deb4e617070a16858c306d Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 18 Mar 2018 21:42:12 +0100 Subject: Improve the testsuite --- src/parsers/abstract.py | 1 + src/parsers/pdf.py | 25 ++++++------------------- 2 files changed, 7 insertions(+), 19 deletions(-) (limited to 'src/parsers') diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py index a9129cc..d0e7108 100644 --- a/src/parsers/abstract.py +++ b/src/parsers/abstract.py @@ -1,6 +1,7 @@ class AbstractParser(object): def __init__(self, filename: str): self.filename = filename + self.output_filename = filename + '.cleaned' self.meta_list = set() def get_meta(self): diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py index c25b324..a77eabd 100644 --- a/src/parsers/pdf.py +++ b/src/parsers/pdf.py @@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser): self.uri = 'file://' + os.path.abspath(self.filename) self.password = None - def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO: - """ This is useless as fuck. """ - if Image is None: - return img - ret = io.BytesIO() - im = Image.open(img) - w, h = im.size - resized = im.resize((w, h), Image.ANTIALIAS) - resized.save(ret, optimize=True, format="PNG") - ret.seek(0) - - return ret - - def remove_all(self): """ Load the document into Poppler, render pages on PNG, @@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser): """ document = Poppler.Document.new_from_file(self.uri, self.password) - pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128) + pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) pdf_context = cairo.Context(pdf_surface) for pagenum in range(document.get_n_pages()): @@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser): pdf_surface.finish() - document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) - document.set_producer('totally not MAT2 ;)') - document.set_creator('') - document.save('file://' + os.path.abspath("OUT_clean.pdf")) + # This is removing metadata + #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) + #document.set_producer('totally not MAT2 ;)') + #document.set_creator('') + #document.save('file://' + os.path.abspath("OUT_clean.pdf")) return True -- cgit v1.3