def __parse_metadata_field(self, data: str) -> dict:
    """ Extract simple XMP fields from a raw PDF `metadata` string.

    Only leaf elements of the form `<ns:Key>value</ns:Key>` are picked up,
    where `ns` is one of `xmp`, `pdfx`, `pdf` or `xmpMM` (matched
    case-insensitively). The closing tag must mirror the opening one, so
    the returned value is the element text only, never trailing markup.

    :param data: the XML blob stored in the document's `metadata` property.
    :return: a dict mapping each element name (without its namespace
             prefix) to its stripped text; empty if nothing matches.
    """
    # Function-scope import so this block does not depend on the
    # surrounding file's import section.
    import re

    if not data:
        return {}

    # - the key stops at the first whitespace, '/', '<' or '>' so that it
    #   can never run across several tags sharing a line;
    # - the value stops at the next '<' so that the closing tag is not
    #   captured as part of it;
    # - the backreferences require the matching closing tag.
    pattern = (r"<(?P<ns>xmp|pdfx|pdf|xmpMM):(?P<key>[^\s<>/]+)>"
               r"(?P<value>[^<]+)"
               r"</(?P=ns):(?P=key)>")

    metadata = {}
    for match in re.finditer(pattern, data, re.I):
        value = match.group('value').strip()
        if value:  # ignore elements holding only whitespace
            metadata[match.group('key')] = value
    return metadata