diff options
| -rw-r--r-- | src/office.py | 52 | ||||
| -rw-r--r-- | tests/data/dirty.docx | bin | 0 -> 598120 bytes | |||
| -rw-r--r-- | tests/test_libmat2.py | 20 |
3 files changed, 72 insertions, 0 deletions
diff --git a/src/office.py b/src/office.py new file mode 100644 index 0000000..d6728c8 --- /dev/null +++ b/src/office.py | |||
| @@ -0,0 +1,52 @@ | |||
| 1 | import subprocess | ||
| 2 | import json | ||
| 3 | import zipfile | ||
| 4 | import tempfile | ||
| 5 | import shutil | ||
| 6 | import os | ||
| 7 | |||
| 8 | from . import abstract, parser_factory | ||
| 9 | |||
class OfficeParser(abstract.AbstractParser):
    """Parser for Office Open XML archives (docx, xlsx, pptx).

    These documents are zip archives; members stored under ``docProps/``
    are treated as metadata.
    """

    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Archive members copied to the output verbatim, without being handed
    # to another parser.
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}

    def get_meta(self):
        """Return a dict flagging every archive member under ``docProps/``.

        Each key is the member name and each value is the fixed string
        ``'harmful content'``; the member contents are not inspected.
        """
        # The context manager closes the archive even if reading it fails.
        with zipfile.ZipFile(self.filename) as zipin:
            return {
                name: 'harmful content'
                for name in zipin.namelist()
                if name.startswith('docProps/')
            }

    def remove_all(self):
        """Write a cleaned copy of the archive to ``self.output_filename``.

        Directory entries are skipped, ``docProps/`` members are dropped
        unless they end in ``.rels``, members listed in ``files_to_keep``
        are copied unchanged, and every other member is extracted to a
        temporary directory and cleaned by the parser that
        ``parser_factory.get_parser`` returns for it. Members for which no
        parser is available are reported on stdout and left out of the
        output archive.

        Returns:
            True once the output archive has been written.
        """
        temp_folder = tempfile.mkdtemp()
        try:
            with zipfile.ZipFile(self.filename, 'r') as zin, \
                    zipfile.ZipFile(self.output_filename, 'w') as zout:
                for item in zin.infolist():
                    if item.is_dir():
                        continue
                    elif item.filename.startswith('docProps/'):
                        if not item.filename.endswith('.rels'):
                            continue  # don't keep metadata files
                    if item.filename in self.files_to_keep:
                        zout.writestr(item, zin.read(item))
                        continue

                    # extract() sanitises the member name and returns the
                    # path it really wrote, so use that instead of
                    # rebuilding the path from the raw member name.
                    extracted_path = zin.extract(member=item, path=temp_folder)
                    tmp_parser = parser_factory.get_parser(extracted_path)
                    if tmp_parser is None:
                        print("%s isn't supported" % item.filename)
                        continue
                    tmp_parser.remove_all()
                    zout.write(tmp_parser.output_filename, item.filename)
        finally:
            # Always drop the scratch directory, including on failure.
            shutil.rmtree(temp_folder)
        return True
diff --git a/tests/data/dirty.docx b/tests/data/dirty.docx new file mode 100644 index 0000000..97e2c21 --- /dev/null +++ b/tests/data/dirty.docx | |||
| Binary files differ | |||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index c21185e..02579b0 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -39,6 +39,11 @@ class TestGetMeta(unittest.TestCase): | |||
| 39 | meta = p.get_meta() | 39 | meta = p.get_meta() |
| 40 | self.assertEqual(meta['TITLE'], ['I am so']) | 40 | self.assertEqual(meta['TITLE'], ['I am so']) |
| 41 | 41 | ||
def test_docx(self):
    """Smoke test: read the metadata of the dirty docx fixture and print it."""
    parser = office.OfficeParser('./tests/data/dirty.docx')
    print(parser.get_meta())
| 46 | |||
| 42 | 47 | ||
| 43 | class TestCleaning(unittest.TestCase): | 48 | class TestCleaning(unittest.TestCase): |
| 44 | def test_pdf(self): | 49 | def test_pdf(self): |
| @@ -131,3 +136,18 @@ class TestCleaning(unittest.TestCase): | |||
| 131 | self.assertEqual(p.get_meta(), {}) | 136 | self.assertEqual(p.get_meta(), {}) |
| 132 | 137 | ||
| 133 | os.remove('./tests/data/clean.flac') | 138 | os.remove('./tests/data/clean.flac') |
| 139 | |||
def test_office(self):
    """Clean a copy of the dirty docx and check that no metadata remains."""
    working_copy = './tests/data/clean.docx'
    cleaned_copy = './tests/data/clean.docx.cleaned'
    shutil.copy('./tests/data/dirty.docx', working_copy)
    try:
        p = office.OfficeParser(working_copy)

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        ret = p.remove_all()
        self.assertTrue(ret)

        p = office.OfficeParser(cleaned_copy)
        self.assertEqual(p.get_meta(), {})
    finally:
        # Remove both generated files even when an assertion fails, so a
        # failed run does not leave artefacts in tests/data.
        for path in (working_copy, cleaned_copy):
            if os.path.exists(path):
                os.remove(path)
