diff options
| -rw-r--r-- | src/office.py | 13 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 4 |
2 files changed, 14 insertions, 3 deletions
diff --git a/src/office.py b/src/office.py index 2bdeec7..5de0597 100644 --- a/src/office.py +++ b/src/office.py | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | import re | ||
| 1 | import subprocess | 2 | import subprocess |
| 2 | import json | 3 | import json |
| 3 | import zipfile | 4 | import zipfile |
| @@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser): | |||
| 16 | files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} | 17 | files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} |
| 17 | 18 | ||
| 18 | def get_meta(self): | 19 | def get_meta(self): |
| 20 | """ | ||
| 21 | Yes, I know that parsing xml with regexp ain't pretty, | ||
| 22 | be my guest and fix it if you want. | ||
| 23 | """ | ||
| 19 | metadata = {} | 24 | metadata = {} |
| 20 | zipin = zipfile.ZipFile(self.filename) | 25 | zipin = zipfile.ZipFile(self.filename) |
| 21 | for item in zipin.namelist(): | 26 | for item in zipin.namelist(): |
| 22 | if item.startswith('docProps/'): | 27 | if item.startswith('docProps/') and item.endswith('.xml'): |
| 23 | metadata[item] = 'harmful content' | 28 | content = zipin.read(item).decode('utf-8') |
| 29 | for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I): | ||
| 30 | metadata[key] = value | ||
| 31 | if not metadata: # better safe than sorry | ||
| 32 | metadata[item] = 'harmful content' | ||
| 24 | zipin.close() | 33 | zipin.close() |
| 25 | return metadata | 34 | return metadata |
| 26 | 35 | ||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 02579b0..717de3f 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase): | |||
| 42 | def test_docx(self): | 42 | def test_docx(self): |
| 43 | p = office.OfficeParser('./tests/data/dirty.docx') | 43 | p = office.OfficeParser('./tests/data/dirty.docx') |
| 44 | meta = p.get_meta() | 44 | meta = p.get_meta() |
| 45 | print(meta) | 45 | self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') |
| 46 | self.assertEqual(meta['dc:creator'], 'julien voisin') | ||
| 47 | self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') | ||
| 46 | 48 | ||
| 47 | 49 | ||
| 48 | class TestCleaning(unittest.TestCase): | 50 | class TestCleaning(unittest.TestCase): |
