summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: jvoisin, 2018-03-31 20:56:15 +0200
committer: jvoisin, 2018-03-31 21:16:02 +0200
commit 1ee936420ca1df1ebff14f19de28df5c41602b2b (patch)
tree 8e90111424f9d61a298ecfdb2926d995247be821 /src
parent e4d2506d6a61ff3aa38f4ca37238a527a2d174ea (diff)
Display docx metadata
Diffstat (limited to 'src')
-rw-r--r-- src/office.py 13
1 files changed, 11 insertions, 2 deletions
diff --git a/src/office.py b/src/office.py
index 2bdeec7..5de0597 100644
--- a/src/office.py
+++ b/src/office.py
@@ -1,3 +1,4 @@
1import re
1import subprocess 2import subprocess
2import json 3import json
3import zipfile 4import zipfile
@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
16 files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} 17 files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
17 18
18 def get_meta(self): 19 def get_meta(self):
20 """
21 Yes, I know that parsing xml with regexp ain't pretty,
22 be my guest and fix it if you want.
23 """
19 metadata = {} 24 metadata = {}
20 zipin = zipfile.ZipFile(self.filename) 25 zipin = zipfile.ZipFile(self.filename)
21 for item in zipin.namelist(): 26 for item in zipin.namelist():
22 if item.startswith('docProps/'): 27 if item.startswith('docProps/') and item.endswith('.xml'):
23 metadata[item] = 'harmful content' 28 content = zipin.read(item).decode('utf-8')
29 for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
30 metadata[key] = value
31 if not metadata: # better safe than sorry
32 metadata[item] = 'harmful content'
24 zipin.close() 33 zipin.close()
25 return metadata 34 return metadata
26 35