summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2018-03-31 15:47:06 +0200
committerjvoisin2018-03-31 15:47:06 +0200
commit865ad181ae075ebc168e5dab5d00b7c99a0b7c9b (patch)
treec019e0b0d72587371fd56d6c05e705ec79d34628
parent302a5ea002478cac86ff7be03d2add46c81a96de (diff)
Add support for docx
-rw-r--r--src/office.py52
-rw-r--r--tests/data/dirty.docxbin0 -> 598120 bytes
-rw-r--r--tests/test_libmat2.py20
3 files changed, 72 insertions, 0 deletions
diff --git a/src/office.py b/src/office.py
new file mode 100644
index 0000000..d6728c8
--- /dev/null
+++ b/src/office.py
@@ -0,0 +1,52 @@
1import subprocess
2import json
3import zipfile
4import tempfile
5import shutil
6import os
7
8from . import abstract, parser_factory
9
class OfficeParser(abstract.AbstractParser):
    """Parser for Office Open XML documents (zip archives).

    Metadata is reported for every archive member stored under
    ``docProps/``.  Cleaning rewrites the archive into
    ``self.output_filename``, dropping those members and passing the
    remaining ones through the parser that ``parser_factory`` selects
    for them.
    """

    # MIME types handled by this parser.
    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Archive members copied verbatim into the output, bypassing sub-parsers.
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}

    def get_meta(self):
        """Return a dict flagging every ``docProps/`` member of the archive.

        Keys are the member names; every value is the fixed string
        ``'harmful content'``.  An archive without such members yields an
        empty dict.
        """
        # The context manager closes the archive even if reading it fails.
        with zipfile.ZipFile(self.filename) as zipin:
            return {
                name: 'harmful content'
                for name in zipin.namelist()
                if name.startswith('docProps/')
            }

    def remove_all(self):
        """Write a cleaned copy of the archive to ``self.output_filename``.

        Directories and ``docProps/`` members (other than ``.rels`` files)
        are dropped, members listed in ``files_to_keep`` are copied as-is,
        and every other member is extracted to a temporary folder and
        cleaned by the parser returned from ``parser_factory.get_parser``.
        Members for which no parser is available are reported on stdout
        and left out of the output.

        Returns:
            True once the output archive has been written.
        """
        # Both archives and the scratch folder are released on every exit
        # path, including exceptions raised by a sub-parser.
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout, \
                tempfile.TemporaryDirectory() as temp_folder:
            for item in zin.infolist():
                if item.is_dir():
                    continue
                elif item.filename.startswith('docProps/'):
                    if not item.filename.endswith('.rels'):
                        continue  # don't keep metadata files
                if item.filename in self.files_to_keep:
                    zout.writestr(item, zin.read(item))
                    continue

                # extract() may normalise the member name (e.g. strip '..'
                # components), so use the path it actually created rather
                # than re-joining the raw name onto the folder.
                extracted_path = zin.extract(member=item, path=temp_folder)
                tmp_parser = parser_factory.get_parser(extracted_path)
                if tmp_parser is None:
                    print("%s isn't supported" % item.filename)
                    continue
                tmp_parser.remove_all()
                # Store the cleaned file under the member's original name.
                zout.write(tmp_parser.output_filename, item.filename)
        return True
diff --git a/tests/data/dirty.docx b/tests/data/dirty.docx
new file mode 100644
index 0000000..97e2c21
--- /dev/null
+++ b/tests/data/dirty.docx
Binary files differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index c21185e..02579b0 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -39,6 +39,11 @@ class TestGetMeta(unittest.TestCase):
39 meta = p.get_meta() 39 meta = p.get_meta()
40 self.assertEqual(meta['TITLE'], ['I am so']) 40 self.assertEqual(meta['TITLE'], ['I am so'])
41 41
def test_docx(self):
    """Print the metadata reported for the sample dirty docx file."""
    parser = office.OfficeParser('./tests/data/dirty.docx')
    print(parser.get_meta())
46
42 47
43class TestCleaning(unittest.TestCase): 48class TestCleaning(unittest.TestCase):
44 def test_pdf(self): 49 def test_pdf(self):
@@ -131,3 +136,18 @@ class TestCleaning(unittest.TestCase):
131 self.assertEqual(p.get_meta(), {}) 136 self.assertEqual(p.get_meta(), {})
132 137
133 os.remove('./tests/data/clean.flac') 138 os.remove('./tests/data/clean.flac')
139
def test_office(self):
    """Clean a copy of the dirty docx and check no metadata is left.

    The working copy and the cleaned output are both removed afterwards,
    whether or not the assertions pass, so no artifacts stay in
    ``tests/data``.
    """
    working_copy = './tests/data/clean.docx'
    cleaned_copy = './tests/data/clean.docx.cleaned'
    shutil.copy('./tests/data/dirty.docx', working_copy)
    try:
        p = office.OfficeParser(working_copy)

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        ret = p.remove_all()
        self.assertTrue(ret)

        p = office.OfficeParser(cleaned_copy)
        self.assertEqual(p.get_meta(), {})
    finally:
        # The cleaned file may not exist if remove_all() failed early.
        for path in (working_copy, cleaned_copy):
            if os.path.exists(path):
                os.remove(path)