From 1ee936420ca1df1ebff14f19de28df5c41602b2b Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Sat, 31 Mar 2018 20:56:15 +0200
Subject: Display docx metadata

---
 src/office.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/office.py b/src/office.py
index 2bdeec7..5de0597 100644
--- a/src/office.py
+++ b/src/office.py
@@ -1,3 +1,4 @@
+import re
 import subprocess
 import json
 import zipfile
@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
     files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
 
     def get_meta(self):
+        """
+        Yes, I know that parsing xml with regexp ain't pretty,
+        be my guest and fix it if you want.
+        """
         metadata = {}
         zipin = zipfile.ZipFile(self.filename)
         for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                metadata[item] = 'harmful content'
+            if item.startswith('docProps/') and item.endswith('.xml'):
+                content = zipin.read(item).decode('utf-8')
+                for (key, value) in re.findall(r"<(.+)>(.+)", content, re.I):
+                    metadata[key] = value
+                if not metadata: # better safe than sorry
+                    metadata[item] = 'harmful content'
         zipin.close()
         return metadata
 
--
cgit v1.3