diff options
| author | jvoisin | 2018-04-11 23:20:59 +0200 |
|---|---|---|
| committer | jvoisin | 2018-04-11 23:20:59 +0200 |
| commit | 7ec1eff96e3125890b268dbafeebefe6fc923ef2 (patch) | |
| tree | 2779681f8ff6f66e3d0e7def11bbf433791c6ef7 | |
| parent | 0239ab3b6a6ae38dbf5ba439f91f0cee498711a9 (diff) | |
Improve the way we parse/display pdf metadata
| -rw-r--r-- | src/pdf.py | 11 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 4 |
2 files changed, 15 insertions, 0 deletions
| @@ -3,6 +3,7 @@ | |||
| 3 | """ | 3 | """ |
| 4 | 4 | ||
| 5 | import os | 5 | import os |
| 6 | import re | ||
| 6 | import logging | 7 | import logging |
| 7 | import tempfile | 8 | import tempfile |
| 8 | import io | 9 | import io |
| @@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser): | |||
| 76 | 77 | ||
| 77 | return True | 78 | return True |
| 78 | 79 | ||
| 80 | |||
| 81 | def __parse_metadata_field(self, data:str) -> dict: | ||
| 82 | metadata = {} | ||
| 83 | for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I): | ||
| 84 | metadata[key] = value | ||
| 85 | return metadata | ||
| 86 | |||
| 79 | def get_meta(self): | 87 | def get_meta(self): |
| 80 | """ Return a dict with all the meta of the file | 88 | """ Return a dict with all the meta of the file |
| 81 | """ | 89 | """ |
| @@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser): | |||
| 84 | for key in self.meta_list: | 92 | for key in self.meta_list: |
| 85 | if document.get_property(key): | 93 | if document.get_property(key): |
| 86 | metadata[key] = document.get_property(key) | 94 | metadata[key] = document.get_property(key) |
| 95 | if 'metadata' in metadata: | ||
| 96 | parsed_meta = self.__parse_metadata_field(metadata['metadata']) | ||
| 97 | return {**metadata, **parsed_meta} | ||
| 87 | return metadata | 98 | return metadata |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 4cfb80a..6141dbe 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase): | |||
| 23 | meta = p.get_meta() | 23 | meta = p.get_meta() |
| 24 | self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') | 24 | self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') |
| 25 | self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'") | 25 | self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'") |
| 26 | self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61") | ||
| 27 | self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \ | ||
| 28 | "3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \ | ||
| 29 | "version 6.1.1") | ||
| 26 | 30 | ||
| 27 | def test_png(self): | 31 | def test_png(self): |
| 28 | p = images.PNGParser('./tests/data/dirty.png') | 32 | p = images.PNGParser('./tests/data/dirty.png') |
