summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/office.py 13
-rw-r--r-- tests/test_libmat2.py 4
2 files changed, 14 insertions, 3 deletions
diff --git a/src/office.py b/src/office.py
index 2bdeec7..5de0597 100644
--- a/src/office.py
+++ b/src/office.py
@@ -1,3 +1,4 @@
1 import re
1 import subprocess 2 import subprocess
2 import json 3 import json
3 import zipfile 4 import zipfile
@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
16 files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} 17 files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
17 18
18 def get_meta(self): 19 def get_meta(self):
20 """
21 Yes, I know that parsing xml with regexp ain't pretty,
22 be my guest and fix it if you want.
23 """
19 metadata = {} 24 metadata = {}
20 zipin = zipfile.ZipFile(self.filename) 25 zipin = zipfile.ZipFile(self.filename)
21 for item in zipin.namelist(): 26 for item in zipin.namelist():
22 if item.startswith('docProps/'): 27 if item.startswith('docProps/') and item.endswith('.xml'):
23 metadata[item] = 'harmful content' 28 content = zipin.read(item).decode('utf-8')
29 for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
30 metadata[key] = value
31 if not metadata: # better safe than sorry
32 metadata[item] = 'harmful content'
24 zipin.close() 33 zipin.close()
25 return metadata 34 return metadata
26 35
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 02579b0..717de3f 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase):
42 def test_docx(self): 42 def test_docx(self):
43 p = office.OfficeParser('./tests/data/dirty.docx') 43 p = office.OfficeParser('./tests/data/dirty.docx')
44 meta = p.get_meta() 44 meta = p.get_meta()
45 print(meta) 45 self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
46 self.assertEqual(meta['dc:creator'], 'julien voisin')
47 self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
46 48
47 49
48 class TestCleaning(unittest.TestCase): 50 class TestCleaning(unittest.TestCase):