summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author jvoisin 2018-03-19 23:43:49 +0100
committer jvoisin 2018-03-19 23:43:49 +0100
commit 8f44616366f9ca395314d59a98840e2912f488df (patch)
tree 88d3d6fbdd45a6f9e35abf7646e9950933980e3a /src
parent d262f780f7653c7e6c5d3b30c5ceedbb25f41787 (diff)
Implement mimetype detection
Diffstat (limited to 'src')
-rw-r--r-- src/parser_factory.py 10
-rw-r--r-- src/parsers/abstract.py 1
-rw-r--r-- src/parsers/pdf.py 13
3 files changed, 18 insertions, 6 deletions
diff --git a/src/parser_factory.py b/src/parser_factory.py
new file mode 100644
index 0000000..a93595a
--- /dev/null
+++ b/src/parser_factory.py
@@ -0,0 +1,10 @@
1import mimetypes
2
3from .parsers import abstract
4from .parsers import *
5
def get_parser(filename: str):
    """Return a parser instance suited to *filename*, or None.

    The mimetype is guessed from the file name alone, using
    ``mimetypes.guess_type``. The first direct subclass of
    ``abstract.AbstractParser`` whose ``mimetypes`` collection contains
    the guessed type is instantiated with *filename* and returned.

    Returns None when the type cannot be guessed from the name, or when
    no parser class claims the guessed type.
    """
    mtype, _ = mimetypes.guess_type(filename)
    if mtype is None:
        # Unknown type: no parser can claim it, so skip the lookup.
        return None

    for parser_class in abstract.AbstractParser.__subclasses__():
        # The lookup happens on the class, not on an instance. A subclass
        # that does not declare `mimetypes` at class level is treated as
        # handling nothing instead of raising AttributeError.
        if mtype in getattr(parser_class, 'mimetypes', ()):
            return parser_class(filename)
    return None
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index d0e7108..80bb812 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@@ -3,6 +3,7 @@ class AbstractParser(object):
3 self.filename = filename 3 self.filename = filename
4 self.output_filename = filename + '.cleaned' 4 self.output_filename = filename + '.cleaned'
5 self.meta_list = set() 5 self.meta_list = set()
6 self.mimetypes = set()
6 7
7 def get_meta(self): 8 def get_meta(self):
8 raise NotImplementedError 9 raise NotImplementedError
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index 26985c6..e7bd00d 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
20 20
21 21
22class PDFParser(abstract.AbstractParser): 22class PDFParser(abstract.AbstractParser):
23 mimetypes = {'application/pdf', }
24 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
25 'metadata', 'mod-date', 'producer', 'subject', 'title',
26 'viewer-preferences'}
27
23 def __init__(self, filename): 28 def __init__(self, filename):
24 super().__init__(filename) 29 super().__init__(filename)
25 self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
26 'metadata', 'mod-date', 'producer', 'subject', 'title',
27 'viewer-preferences'}
28 self.uri = 'file://' + os.path.abspath(self.filename) 30 self.uri = 'file://' + os.path.abspath(self.filename)
29 self.password = None
30 31
31 def remove_all(self): 32 def remove_all(self):
32 """ 33 """
@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
35 PDF are removed via Poppler, because there is no way to tell 36 PDF are removed via Poppler, because there is no way to tell
36 cairo to not add "created by cairo" during rendering. 37 cairo to not add "created by cairo" during rendering.
37 """ 38 """
38 document = Poppler.Document.new_from_file(self.uri, self.password) 39 document = Poppler.Document.new_from_file(self.uri, None)
39 pages_count = document.get_n_pages() 40 pages_count = document.get_n_pages()
40 41
41 _, tmp_path = tempfile.mkstemp() 42 _, tmp_path = tempfile.mkstemp()
@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
80 """ Return a dict with all the meta of the file 81 """ Return a dict with all the meta of the file
81 """ 82 """
82 print("URI: %s", self.uri) 83 print("URI: %s", self.uri)
83 document = Poppler.Document.new_from_file(self.uri, self.password) 84 document = Poppler.Document.new_from_file(self.uri, None)
84 metadata = {} 85 metadata = {}
85 for key in self.meta_list: 86 for key in self.meta_list:
86 if document.get_property(key): 87 if document.get_property(key):