diff options
| author | jvoisin | 2018-03-31 20:56:15 +0200 |
|---|---|---|
| committer | jvoisin | 2018-03-31 21:16:02 +0200 |
| commit | 1ee936420ca1df1ebff14f19de28df5c41602b2b (patch) | |
| tree | 8e90111424f9d61a298ecfdb2926d995247be821 /src | |
| parent | e4d2506d6a61ff3aa38f4ca37238a527a2d174ea (diff) | |
Display docx metadata
Diffstat (limited to 'src')
| -rw-r--r-- | src/office.py | 13 |
1 file changed, 11 insertions, 2 deletions
diff --git a/src/office.py b/src/office.py index 2bdeec7..5de0597 100644 --- a/src/office.py +++ b/src/office.py | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | import re | ||
| 1 | import subprocess | 2 | import subprocess |
| 2 | import json | 3 | import json |
| 3 | import zipfile | 4 | import zipfile |
| @@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser): | |||
| 16 | files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} | 17 | files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} |
| 17 | 18 | ||
| 18 | def get_meta(self): | 19 | def get_meta(self): |
| 20 | """ | ||
| 21 | Yes, I know that parsing xml with regexp ain't pretty, | ||
| 22 | be my guest and fix it if you want. | ||
| 23 | """ | ||
| 19 | metadata = {} | 24 | metadata = {} |
| 20 | zipin = zipfile.ZipFile(self.filename) | 25 | zipin = zipfile.ZipFile(self.filename) |
| 21 | for item in zipin.namelist(): | 26 | for item in zipin.namelist(): |
| 22 | if item.startswith('docProps/'): | 27 | if item.startswith('docProps/') and item.endswith('.xml'): |
| 23 | metadata[item] = 'harmful content' | 28 | content = zipin.read(item).decode('utf-8') |
| 29 | for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I): | ||
| 30 | metadata[key] = value | ||
| 31 | if not metadata: # better safe than sorry | ||
| 32 | metadata[item] = 'harmful content' | ||
| 24 | zipin.close() | 33 | zipin.close() |
| 25 | return metadata | 34 | return metadata |
| 26 | 35 | ||
