diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/parsers/abstract.py | 1 | ||||
| -rw-r--r-- | src/parsers/pdf.py | 25 |
2 files changed, 7 insertions, 19 deletions
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py index a9129cc..d0e7108 100644 --- a/src/parsers/abstract.py +++ b/src/parsers/abstract.py | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | class AbstractParser(object): | 1 | class AbstractParser(object): |
| 2 | def __init__(self, filename: str): | 2 | def __init__(self, filename: str): |
| 3 | self.filename = filename | 3 | self.filename = filename |
| 4 | self.output_filename = filename + '.cleaned' | ||
| 4 | self.meta_list = set() | 5 | self.meta_list = set() |
| 5 | 6 | ||
| 6 | def get_meta(self): | 7 | def get_meta(self): |
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py index c25b324..a77eabd 100644 --- a/src/parsers/pdf.py +++ b/src/parsers/pdf.py | |||
| @@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser): | |||
| 31 | self.uri = 'file://' + os.path.abspath(self.filename) | 31 | self.uri = 'file://' + os.path.abspath(self.filename) |
| 32 | self.password = None | 32 | self.password = None |
| 33 | 33 | ||
| 34 | def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO: | ||
| 35 | """ This is useless as fuck. """ | ||
| 36 | if Image is None: | ||
| 37 | return img | ||
| 38 | ret = io.BytesIO() | ||
| 39 | im = Image.open(img) | ||
| 40 | w, h = im.size | ||
| 41 | resized = im.resize((w, h), Image.ANTIALIAS) | ||
| 42 | resized.save(ret, optimize=True, format="PNG") | ||
| 43 | ret.seek(0) | ||
| 44 | |||
| 45 | return ret | ||
| 46 | |||
| 47 | |||
| 48 | def remove_all(self): | 34 | def remove_all(self): |
| 49 | """ | 35 | """ |
| 50 | Load the document into Poppler, render pages on PNG, | 36 | Load the document into Poppler, render pages on PNG, |
| @@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser): | |||
| 57 | """ | 43 | """ |
| 58 | document = Poppler.Document.new_from_file(self.uri, self.password) | 44 | document = Poppler.Document.new_from_file(self.uri, self.password) |
| 59 | 45 | ||
| 60 | pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128) | 46 | pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) |
| 61 | pdf_context = cairo.Context(pdf_surface) | 47 | pdf_context = cairo.Context(pdf_surface) |
| 62 | 48 | ||
| 63 | for pagenum in range(document.get_n_pages()): | 49 | for pagenum in range(document.get_n_pages()): |
| @@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser): | |||
| 87 | 73 | ||
| 88 | pdf_surface.finish() | 74 | pdf_surface.finish() |
| 89 | 75 | ||
| 90 | document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) | 76 | # This is removing metadata |
| 91 | document.set_producer('totally not MAT2 ;)') | 77 | #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) |
| 92 | document.set_creator('') | 78 | #document.set_producer('totally not MAT2 ;)') |
| 93 | document.save('file://' + os.path.abspath("OUT_clean.pdf")) | 79 | #document.set_creator('') |
| 80 | #document.save('file://' + os.path.abspath("OUT_clean.pdf")) | ||
| 94 | 81 | ||
| 95 | return True | 82 | return True |
| 96 | 83 | ||
