summaryrefslogtreecommitdiff
path: root/src/pdf.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/pdf.py')
-rw-r--r--src/pdf.py12
1 files changed, 7 insertions, 5 deletions
diff --git a/src/pdf.py b/src/pdf.py
index fbc5175..5b99192 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -21,8 +21,8 @@ logging.basicConfig(level=logging.DEBUG)
21class PDFParser(abstract.AbstractParser): 21class PDFParser(abstract.AbstractParser):
22 mimetypes = {'application/pdf', } 22 mimetypes = {'application/pdf', }
23 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', 23 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
24 'metadata', 'mod-date', 'producer', 'subject', 'title', 24 'metadata', 'mod-date', 'producer', 'subject', 'title',
25 'viewer-preferences'} 25 'viewer-preferences'}
26 26
27 def __init__(self, filename): 27 def __init__(self, filename):
28 super().__init__(filename) 28 super().__init__(filename)
@@ -103,7 +103,8 @@ class PDFParser(abstract.AbstractParser):
103 103
104 return True 104 return True
105 105
106 def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool: 106 @staticmethod
107 def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
107 document = Poppler.Document.new_from_file('file://' + in_file) 108 document = Poppler.Document.new_from_file('file://' + in_file)
108 document.set_producer('') 109 document.set_producer('')
109 document.set_creator('') 110 document.set_creator('')
@@ -112,7 +113,8 @@ class PDFParser(abstract.AbstractParser):
112 return True 113 return True
113 114
114 115
115 def __parse_metadata_field(self, data:str) -> dict: 116 @staticmethod
117 def __parse_metadata_field(data: str) -> dict:
116 metadata = {} 118 metadata = {}
117 for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I): 119 for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
118 metadata[key] = value 120 metadata[key] = value
@@ -128,6 +130,6 @@ class PDFParser(abstract.AbstractParser):
128 if document.get_property(key): 130 if document.get_property(key):
129 metadata[key] = document.get_property(key) 131 metadata[key] = document.get_property(key)
130 if 'metadata' in metadata: 132 if 'metadata' in metadata:
131 parsed_meta = self.__parse_metadata_field(metadata['metadata']) 133 parsed_meta = self.__parse_metadata_field(metadata['metadata'])
132 return {**metadata, **parsed_meta} 134 return {**metadata, **parsed_meta}
133 return metadata 135 return metadata