summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2018-03-19 23:43:49 +0100
committerjvoisin2018-03-19 23:43:49 +0100
commit8f44616366f9ca395314d59a98840e2912f488df (patch)
tree88d3d6fbdd45a6f9e35abf7646e9950933980e3a
parentd262f780f7653c7e6c5d3b30c5ceedbb25f41787 (diff)
Implement mimetype detection
-rw-r--r--main.py9
-rw-r--r--src/parser_factory.py10
-rw-r--r--src/parsers/abstract.py1
-rw-r--r--src/parsers/pdf.py13
4 files changed, 23 insertions, 10 deletions
diff --git a/main.py b/main.py
index e4157e6..4b965b4 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@ from shutil import copyfile
3import argparse 3import argparse
4 4
5from src.parsers import pdf 5from src.parsers import pdf
6from src import parser_factory
6 7
7 8
8def create_arg_parser(): 9def create_arg_parser():
@@ -19,7 +20,7 @@ def create_arg_parser():
19 return parser 20 return parser
20 21
21def show_meta(file_name:str): 22def show_meta(file_name:str):
22 p = pdf.PDFParser(file_name) 23 p = parser_factory(file_name)
23 for k,v in p.get_meta().items(): 24 for k,v in p.get_meta().items():
24 print("%s: %s" % (k, v)) 25 print("%s: %s" % (k, v))
25 26
@@ -32,10 +33,10 @@ def main():
32 show_meta(f) 33 show_meta(f)
33 return 0 34 return 0
34 elif not args.files: 35 elif not args.files:
35 return parser.show_help() 36 return argparser.show_help()
36 37
37 copyfile(sys.argv[1] + '.bak', sys.argv[1]) 38 #p = pdf.PDFParser(sys.argv[1])
38 p = pdf.PDFParser(sys.argv[1]) 39 p = parser_factory.get_parser(sys.argv[1])
39 p.remove_all() 40 p.remove_all()
40 p = pdf.PDFParser('OUT_clean.pdf') 41 p = pdf.PDFParser('OUT_clean.pdf')
41 print("ok") 42 print("ok")
diff --git a/src/parser_factory.py b/src/parser_factory.py
new file mode 100644
index 0000000..a93595a
--- /dev/null
+++ b/src/parser_factory.py
@@ -0,0 +1,10 @@
1import mimetypes
2
3from .parsers import abstract
4from .parsers import *
5
6def get_parser(filename: str):
7 mtype, _ = mimetypes.guess_type(filename)
8 for c in abstract.AbstractParser.__subclasses__():
9 if mtype in c.mimetypes:
10 return c(filename)
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index d0e7108..80bb812 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@@ -3,6 +3,7 @@ class AbstractParser(object):
3 self.filename = filename 3 self.filename = filename
4 self.output_filename = filename + '.cleaned' 4 self.output_filename = filename + '.cleaned'
5 self.meta_list = set() 5 self.meta_list = set()
6 self.mimetypes = set()
6 7
7 def get_meta(self): 8 def get_meta(self):
8 raise NotImplementedError 9 raise NotImplementedError
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index 26985c6..e7bd00d 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
20 20
21 21
22class PDFParser(abstract.AbstractParser): 22class PDFParser(abstract.AbstractParser):
23 mimetypes = {'application/pdf', }
24 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
25 'metadata', 'mod-date', 'producer', 'subject', 'title',
26 'viewer-preferences'}
27
23 def __init__(self, filename): 28 def __init__(self, filename):
24 super().__init__(filename) 29 super().__init__(filename)
25 self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
26 'metadata', 'mod-date', 'producer', 'subject', 'title',
27 'viewer-preferences'}
28 self.uri = 'file://' + os.path.abspath(self.filename) 30 self.uri = 'file://' + os.path.abspath(self.filename)
29 self.password = None
30 31
31 def remove_all(self): 32 def remove_all(self):
32 """ 33 """
@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
35 PDF are removed via Poppler, because there is no way to tell 36 PDF are removed via Poppler, because there is no way to tell
36 cairo to not add "created by cairo" during rendering. 37 cairo to not add "created by cairo" during rendering.
37 """ 38 """
38 document = Poppler.Document.new_from_file(self.uri, self.password) 39 document = Poppler.Document.new_from_file(self.uri, None)
39 pages_count = document.get_n_pages() 40 pages_count = document.get_n_pages()
40 41
41 _, tmp_path = tempfile.mkstemp() 42 _, tmp_path = tempfile.mkstemp()
@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
80 """ Return a dict with all the meta of the file 81 """ Return a dict with all the meta of the file
81 """ 82 """
82 print("URI: %s", self.uri) 83 print("URI: %s", self.uri)
83 document = Poppler.Document.new_from_file(self.uri, self.password) 84 document = Poppler.Document.new_from_file(self.uri, None)
84 metadata = {} 85 metadata = {}
85 for key in self.meta_list: 86 for key in self.meta_list:
86 if document.get_property(key): 87 if document.get_property(key):