diff options
| author | jvoisin | 2018-03-19 23:43:49 +0100 |
|---|---|---|
| committer | jvoisin | 2018-03-19 23:43:49 +0100 |
| commit | 8f44616366f9ca395314d59a98840e2912f488df (patch) | |
| tree | 88d3d6fbdd45a6f9e35abf7646e9950933980e3a /src | |
| parent | d262f780f7653c7e6c5d3b30c5ceedbb25f41787 (diff) | |
Implement mimetype detection
Diffstat (limited to 'src')
| -rw-r--r-- | src/parser_factory.py | 10 | ||||
| -rw-r--r-- | src/parsers/abstract.py | 1 | ||||
| -rw-r--r-- | src/parsers/pdf.py | 13 |
3 files changed, 18 insertions, 6 deletions
diff --git a/src/parser_factory.py b/src/parser_factory.py
new file mode 100644
index 0000000..a93595a
--- /dev/null
+++ b/src/parser_factory.py
| @@ -0,0 +1,10 @@ | |||
| 1 | import mimetypes | ||
| 2 | |||
| 3 | from .parsers import abstract | ||
| 4 | from .parsers import * | ||
| 5 | |||
| 6 | def get_parser(filename: str): | ||
| 7 | mtype, _ = mimetypes.guess_type(filename) | ||
| 8 | for c in abstract.AbstractParser.__subclasses__(): | ||
| 9 | if mtype in c.mimetypes: | ||
| 10 | return c(filename) | ||
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index d0e7108..80bb812 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
| @@ -3,6 +3,7 @@ class AbstractParser(object): | |||
| 3 | self.filename = filename | 3 | self.filename = filename |
| 4 | self.output_filename = filename + '.cleaned' | 4 | self.output_filename = filename + '.cleaned' |
| 5 | self.meta_list = set() | 5 | self.meta_list = set() |
| 6 | self.mimetypes = set() | ||
| 6 | 7 | ||
| 7 | def get_meta(self): | 8 | def get_meta(self): |
| 8 | raise NotImplementedError | 9 | raise NotImplementedError |
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index 26985c6..e7bd00d 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
| @@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG) | |||
| 20 | 20 | ||
| 21 | 21 | ||
| 22 | class PDFParser(abstract.AbstractParser): | 22 | class PDFParser(abstract.AbstractParser): |
| 23 | mimetypes = {'application/pdf', } | ||
| 24 | meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', | ||
| 25 | 'metadata', 'mod-date', 'producer', 'subject', 'title', | ||
| 26 | 'viewer-preferences'} | ||
| 27 | |||
| 23 | def __init__(self, filename): | 28 | def __init__(self, filename): |
| 24 | super().__init__(filename) | 29 | super().__init__(filename) |
| 25 | self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', | ||
| 26 | 'metadata', 'mod-date', 'producer', 'subject', 'title', | ||
| 27 | 'viewer-preferences'} | ||
| 28 | self.uri = 'file://' + os.path.abspath(self.filename) | 30 | self.uri = 'file://' + os.path.abspath(self.filename) |
| 29 | self.password = None | ||
| 30 | 31 | ||
| 31 | def remove_all(self): | 32 | def remove_all(self): |
| 32 | """ | 33 | """ |
| @@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser): | |||
| 35 | PDF are removed via Poppler, because there is no way to tell | 36 | PDF are removed via Poppler, because there is no way to tell |
| 36 | cairo to not add "created by cairo" during rendering. | 37 | cairo to not add "created by cairo" during rendering. |
| 37 | """ | 38 | """ |
| 38 | document = Poppler.Document.new_from_file(self.uri, self.password) | 39 | document = Poppler.Document.new_from_file(self.uri, None) |
| 39 | pages_count = document.get_n_pages() | 40 | pages_count = document.get_n_pages() |
| 40 | 41 | ||
| 41 | _, tmp_path = tempfile.mkstemp() | 42 | _, tmp_path = tempfile.mkstemp() |
| @@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser): | |||
| 80 | """ Return a dict with all the meta of the file | 81 | """ Return a dict with all the meta of the file |
| 81 | """ | 82 | """ |
| 82 | print("URI: %s", self.uri) | 83 | print("URI: %s", self.uri) |
| 83 | document = Poppler.Document.new_from_file(self.uri, self.password) | 84 | document = Poppler.Document.new_from_file(self.uri, None) |
| 84 | metadata = {} | 85 | metadata = {} |
| 85 | for key in self.meta_list: | 86 | for key in self.meta_list: |
| 86 | if document.get_property(key): | 87 | if document.get_property(key): |
