summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/pdf.py 11
1 files changed, 11 insertions, 0 deletions
diff --git a/src/pdf.py b/src/pdf.py
index 96eec13..c119449 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -3,6 +3,7 @@
3""" 3"""
4 4
5import os 5import os
6import re
6import logging 7import logging
7import tempfile 8import tempfile
8import io 9import io
@@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser):
76 77
77 return True 78 return True
78 79
80
def __parse_metadata_field(self, data: str) -> dict:
    """Extract simple ``<prefix:Name>value</prefix:Name>`` pairs from *data*.

    Only elements whose namespace prefix is ``xmp``, ``pdfx``, ``pdf`` or
    ``xmpMM`` (matched case-insensitively) are considered. The caller in
    this file passes the document's ``metadata`` property.

    :param data: text to scan for metadata elements.
    :return: dict mapping each element name (without its prefix) to its
        text content; when a name occurs more than once, the last
        occurrence wins. Empty if nothing matches.
    """
    # The element name must not contain whitespace, '<', '>' or '/', and the
    # value is matched lazily so that it stops at the *first* matching
    # closing tag. With greedy groups, two same-named elements on one line
    # were merged into a single value containing raw markup.
    pattern = r"<(xmp|pdfx|pdf|xmpMM):([^\s<>/]+)>(.+?)</\1:\2>"
    return {
        key: value
        for (_, key, value) in re.findall(pattern, data, re.I)
    }
86
79 def get_meta(self): 87 def get_meta(self):
80 """ Return a dict with all the meta of the file 88 """ Return a dict with all the meta of the file
81 """ 89 """
@@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser):
84 for key in self.meta_list: 92 for key in self.meta_list:
85 if document.get_property(key): 93 if document.get_property(key):
86 metadata[key] = document.get_property(key) 94 metadata[key] = document.get_property(key)
95 if 'metadata' in metadata:
96 parsed_meta = self.__parse_metadata_field(metadata['metadata'])
97 return {**metadata, **parsed_meta}
87 return metadata 98 return metadata