summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- main.py 9
-rw-r--r-- src/parser_factory.py 10
-rw-r--r-- src/parsers/abstract.py 1
-rw-r--r-- src/parsers/pdf.py 13
4 files changed, 23 insertions, 10 deletions
diff --git a/main.py b/main.py
index e4157e6..4b965b4 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@ from shutil import copyfile
3import argparse 3import argparse
4 4
5from src.parsers import pdf 5from src.parsers import pdf
6from src import parser_factory
6 7
7 8
8def create_arg_parser(): 9def create_arg_parser():
@@ -19,7 +20,7 @@ def create_arg_parser():
19 return parser 20 return parser
20 21
21def show_meta(file_name:str): 22def show_meta(file_name:str):
22 p = pdf.PDFParser(file_name) 23 p = parser_factory(file_name)
23 for k,v in p.get_meta().items(): 24 for k,v in p.get_meta().items():
24 print("%s: %s" % (k, v)) 25 print("%s: %s" % (k, v))
25 26
@@ -32,10 +33,10 @@ def main():
32 show_meta(f) 33 show_meta(f)
33 return 0 34 return 0
34 elif not args.files: 35 elif not args.files:
35 return parser.show_help() 36 return argparser.show_help()
36 37
37 copyfile(sys.argv[1] + '.bak', sys.argv[1]) 38 #p = pdf.PDFParser(sys.argv[1])
38 p = pdf.PDFParser(sys.argv[1]) 39 p = parser_factory.get_parser(sys.argv[1])
39 p.remove_all() 40 p.remove_all()
40 p = pdf.PDFParser('OUT_clean.pdf') 41 p = pdf.PDFParser('OUT_clean.pdf')
41 print("ok") 42 print("ok")
diff --git a/src/parser_factory.py b/src/parser_factory.py
new file mode 100644
index 0000000..a93595a
--- /dev/null
+++ b/src/parser_factory.py
@@ -0,0 +1,10 @@
1import mimetypes
2
3from .parsers import abstract
4from .parsers import *
5
6def get_parser(filename: str):
7 mtype, _ = mimetypes.guess_type(filename)
8 for c in abstract.AbstractParser.__subclasses__():
9 if mtype in c.mimetypes:
10 return c(filename)
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index d0e7108..80bb812 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@@ -3,6 +3,7 @@ class AbstractParser(object):
3 self.filename = filename 3 self.filename = filename
4 self.output_filename = filename + '.cleaned' 4 self.output_filename = filename + '.cleaned'
5 self.meta_list = set() 5 self.meta_list = set()
6 self.mimetypes = set()
6 7
7 def get_meta(self): 8 def get_meta(self):
8 raise NotImplementedError 9 raise NotImplementedError
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index 26985c6..e7bd00d 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
20 20
21 21
22class PDFParser(abstract.AbstractParser): 22class PDFParser(abstract.AbstractParser):
23 mimetypes = {'application/pdf', }
24 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
25 'metadata', 'mod-date', 'producer', 'subject', 'title',
26 'viewer-preferences'}
27
23 def __init__(self, filename): 28 def __init__(self, filename):
24 super().__init__(filename) 29 super().__init__(filename)
25 self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
26 'metadata', 'mod-date', 'producer', 'subject', 'title',
27 'viewer-preferences'}
28 self.uri = 'file://' + os.path.abspath(self.filename) 30 self.uri = 'file://' + os.path.abspath(self.filename)
29 self.password = None
30 31
31 def remove_all(self): 32 def remove_all(self):
32 """ 33 """
@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
35 PDF are removed via Poppler, because there is no way to tell 36 PDF are removed via Poppler, because there is no way to tell
36 cairo to not add "created by cairo" during rendering. 37 cairo to not add "created by cairo" during rendering.
37 """ 38 """
38 document = Poppler.Document.new_from_file(self.uri, self.password) 39 document = Poppler.Document.new_from_file(self.uri, None)
39 pages_count = document.get_n_pages() 40 pages_count = document.get_n_pages()
40 41
41 _, tmp_path = tempfile.mkstemp() 42 _, tmp_path = tempfile.mkstemp()
@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
80 """ Return a dict with all the meta of the file 81 """ Return a dict with all the meta of the file
81 """ 82 """
82 print("URI: %s", self.uri) 83 print("URI: %s", self.uri)
83 document = Poppler.Document.new_from_file(self.uri, self.password) 84 document = Poppler.Document.new_from_file(self.uri, None)
84 metadata = {} 85 metadata = {}
85 for key in self.meta_list: 86 for key in self.meta_list:
86 if document.get_property(key): 87 if document.get_property(key):