diff options
Diffstat (limited to 'src/pdf.py')
| -rw-r--r-- | src/pdf.py | 12 |
1 file changed, 7 insertions, 5 deletions
| @@ -21,8 +21,8 @@ logging.basicConfig(level=logging.DEBUG) | |||
| 21 | class PDFParser(abstract.AbstractParser): | 21 | class PDFParser(abstract.AbstractParser): |
| 22 | mimetypes = {'application/pdf', } | 22 | mimetypes = {'application/pdf', } |
| 23 | meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', | 23 | meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', |
| 24 | 'metadata', 'mod-date', 'producer', 'subject', 'title', | 24 | 'metadata', 'mod-date', 'producer', 'subject', 'title', |
| 25 | 'viewer-preferences'} | 25 | 'viewer-preferences'} |
| 26 | 26 | ||
| 27 | def __init__(self, filename): | 27 | def __init__(self, filename): |
| 28 | super().__init__(filename) | 28 | super().__init__(filename) |
| @@ -103,7 +103,8 @@ class PDFParser(abstract.AbstractParser): | |||
| 103 | 103 | ||
| 104 | return True | 104 | return True |
| 105 | 105 | ||
| 106 | def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool: | 106 | @staticmethod |
| 107 | def __remove_superficial_meta(in_file: str, out_file: str) -> bool: | ||
| 107 | document = Poppler.Document.new_from_file('file://' + in_file) | 108 | document = Poppler.Document.new_from_file('file://' + in_file) |
| 108 | document.set_producer('') | 109 | document.set_producer('') |
| 109 | document.set_creator('') | 110 | document.set_creator('') |
| @@ -112,7 +113,8 @@ class PDFParser(abstract.AbstractParser): | |||
| 112 | return True | 113 | return True |
| 113 | 114 | ||
| 114 | 115 | ||
| 115 | def __parse_metadata_field(self, data:str) -> dict: | 116 | @staticmethod |
| 117 | def __parse_metadata_field(data: str) -> dict: | ||
| 116 | metadata = {} | 118 | metadata = {} |
| 117 | for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I): | 119 | for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I): |
| 118 | metadata[key] = value | 120 | metadata[key] = value |
| @@ -128,6 +130,6 @@ class PDFParser(abstract.AbstractParser): | |||
| 128 | if document.get_property(key): | 130 | if document.get_property(key): |
| 129 | metadata[key] = document.get_property(key) | 131 | metadata[key] = document.get_property(key) |
| 130 | if 'metadata' in metadata: | 132 | if 'metadata' in metadata: |
| 131 | parsed_meta = self.__parse_metadata_field(metadata['metadata']) | 133 | parsed_meta = self.__parse_metadata_field(metadata['metadata']) |
| 132 | return {**metadata, **parsed_meta} | 134 | return {**metadata, **parsed_meta} |
| 133 | return metadata | 135 | return metadata |
