summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: jvoisin, 2018-04-01 01:04:06 +0200
committer: jvoisin, 2018-04-01 01:04:06 +0200
commit: eac51dbc9964cac28bb83e7d12370cf87ff2b0c5 (patch)
tree: 6fba0d8323f3d27db72a68e96c656e51634ed164
parent: 2d7c703c52cae50034fc9618c72552365f7cc741 (diff)
Refactor office document handling
-rw-r--r-- src/abstract.py 4
-rw-r--r-- src/libreoffice.py 68
-rw-r--r-- src/office.py 95
-rw-r--r-- tests/test_libmat2.py 22
4 files changed, 86 insertions, 103 deletions
diff --git a/src/abstract.py b/src/abstract.py
index c2d282f..1f8ce6e 100644
--- a/src/abstract.py
+++ b/src/abstract.py
@@ -6,8 +6,8 @@ class AbstractParser(object):
6 self.filename = filename 6 self.filename = filename
7 self.output_filename = filename + '.cleaned' 7 self.output_filename = filename + '.cleaned'
8 8
9 def get_meta(self): 9 def get_meta(self) -> dict:
10 raise NotImplementedError 10 raise NotImplementedError
11 11
12 def remove_all(self): 12 def remove_all(self) -> bool:
13 raise NotImplementedError 13 raise NotImplementedError
diff --git a/src/libreoffice.py b/src/libreoffice.py
deleted file mode 100644
index 809ae3c..0000000
--- a/src/libreoffice.py
+++ /dev/null
@@ -1,68 +0,0 @@
1import re
2import subprocess
3import json
4import zipfile
5import tempfile
6import shutil
7import os
8
9from . import abstract, parser_factory
10
11class LibreOfficeParser(abstract.AbstractParser):
12 mimetypes = {
13 'application/vnd.oasis.opendocument.text',
14 'application/vnd.oasis.opendocument.spreadsheet',
15 'application/vnd.oasis.opendocument.presentation',
16 'application/vnd.oasis.opendocument.graphics',
17 'application/vnd.oasis.opendocument.chart'
18 }
19
20 def get_meta(self):
21 """
22 Yes, I know that parsing xml with regexp ain't pretty,
23 be my guest and fix it if you want.
24 """
25 metadata = {}
26 zipin = zipfile.ZipFile(self.filename)
27 for item in zipin.namelist():
28 if item == 'meta.xml':
29 content = zipin.read(item).decode('utf-8')
30 for (key, value) in re.findall(r"<((?:meta|dc).+?)>(.+)</\1>", content, re.I):
31 metadata[key] = value
32 if not metadata: # better safe than sorry
33 metadata[item] = 'harmful content'
34 zipin.close()
35 return metadata
36
37 def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
38 zipinfo.compress_type = zipfile.ZIP_DEFLATED
39 zipinfo.create_system = 3 # Linux
40 zipinfo.comment = b''
41 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
42 return zipinfo
43
44 def remove_all(self):
45 zin = zipfile.ZipFile(self.filename, 'r')
46 zout = zipfile.ZipFile(self.output_filename, 'w')
47 temp_folder = tempfile.mkdtemp()
48
49 for item in zin.infolist():
50 if item.filename[-1] == '/':
51 continue # `is_dir` is added in Python3.6
52 elif item.filename == 'meta.xml':
53 continue # don't keep metadata files
54
55 zin.extract(member=item, path=temp_folder)
56 tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
57 if tmp_parser is None:
58 print("%s isn't supported" % item.filename)
59 continue
60 tmp_parser.remove_all()
61 zinfo = zipfile.ZipInfo(item.filename)
62 item = self.__clean_zipinfo(item)
63 with open(tmp_parser.output_filename, 'rb') as f:
64 zout.writestr(zinfo, f.read())
65 shutil.rmtree(temp_folder)
66 zout.close()
67 zin.close()
68 return True
diff --git a/src/office.py b/src/office.py
index a729f2f..5083308 100644
--- a/src/office.py
+++ b/src/office.py
@@ -1,14 +1,34 @@
1import json
2import os
1import re 3import re
4import shutil
2import subprocess 5import subprocess
3import json
4import zipfile
5import tempfile 6import tempfile
6import shutil 7import zipfile
7import os
8 8
9from . import abstract, parser_factory 9from . import abstract, parser_factory
10 10
11class OfficeParser(abstract.AbstractParser): 11class ArchiveBasedAbstractParser(abstract.AbstractParser):
12 def _clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
13 zipinfo.compress_type = zipfile.ZIP_DEFLATED
14 zipinfo.create_system = 3 # Linux
15 zipinfo.comment = b''
16 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
17 return zipinfo
18
19 def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile):
20 zin.extract(member=item, path=temp_folder)
21 tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
22 if tmp_parser is None:
23 print("%s isn't supported" % item.filename)
24 return
25 tmp_parser.remove_all()
26 zinfo = zipfile.ZipInfo(item.filename)
27 item = self._clean_zipinfo(item)
28 with open(tmp_parser.output_filename, 'rb') as f:
29 zout.writestr(zinfo, f.read())
30
31class MSOfficeParser(ArchiveBasedAbstractParser):
12 mimetypes = { 32 mimetypes = {
13 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 33 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
14 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 34 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
@@ -33,12 +53,6 @@ class OfficeParser(abstract.AbstractParser):
33 zipin.close() 53 zipin.close()
34 return metadata 54 return metadata
35 55
36 def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
37 zipinfo.compress_type = zipfile.ZIP_DEFLATED
38 zipinfo.create_system = 3 # Linux
39 zipinfo.comment = b''
40 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
41 return zipinfo
42 56
43 def remove_all(self): 57 def remove_all(self):
44 zin = zipfile.ZipFile(self.filename, 'r') 58 zin = zipfile.ZipFile(self.filename, 'r')
@@ -52,20 +66,57 @@ class OfficeParser(abstract.AbstractParser):
52 if not item.filename.endswith('.rels'): 66 if not item.filename.endswith('.rels'):
53 continue # don't keep metadata files 67 continue # don't keep metadata files
54 if item.filename in self.files_to_keep: 68 if item.filename in self.files_to_keep:
55 item = self.__clean_zipinfo(item) 69 item = self._clean_zipinfo(item)
56 zout.writestr(item, zin.read(item)) 70 zout.writestr(item, zin.read(item))
57 continue 71 continue
58 72
59 zin.extract(member=item, path=temp_folder) 73 self._clean_internal_file(item, temp_folder, zin, zout)
60 tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) 74
61 if tmp_parser is None: 75 shutil.rmtree(temp_folder)
62 print("%s isn't supported" % item.filename) 76 zout.close()
63 continue 77 zin.close()
64 tmp_parser.remove_all() 78 return True
65 zinfo = zipfile.ZipInfo(item.filename) 79
66 item = self.__clean_zipinfo(item) 80
67 with open(tmp_parser.output_filename, 'rb') as f: 81
68 zout.writestr(zinfo, f.read()) 82class LibreOfficeParser(ArchiveBasedAbstractParser):
83 mimetypes = {
84 'application/vnd.oasis.opendocument.text',
85 'application/vnd.oasis.opendocument.spreadsheet',
86 'application/vnd.oasis.opendocument.presentation',
87 'application/vnd.oasis.opendocument.graphics',
88 'application/vnd.oasis.opendocument.chart'
89 }
90
91 def get_meta(self):
92 """
93 Yes, I know that parsing xml with regexp ain't pretty,
94 be my guest and fix it if you want.
95 """
96 metadata = {}
97 zipin = zipfile.ZipFile(self.filename)
98 for item in zipin.namelist():
99 if item == 'meta.xml':
100 content = zipin.read(item).decode('utf-8')
101 for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
102 metadata[key] = value
103 if not metadata: # better safe than sorry
104 metadata[item] = 'harmful content'
105 zipin.close()
106 return metadata
107
108 def remove_all(self):
109 zin = zipfile.ZipFile(self.filename, 'r')
110 zout = zipfile.ZipFile(self.output_filename, 'w')
111 temp_folder = tempfile.mkdtemp()
112
113 for item in zin.infolist():
114 if item.filename[-1] == '/':
115 continue # `is_dir` is added in Python3.6
116 elif item.filename == 'meta.xml':
117 continue # don't keep metadata files
118
119 self._clean_internal_file(item, temp_folder, zin, zout)
69 120
70 shutil.rmtree(temp_folder) 121 shutil.rmtree(temp_folder)
71 zout.close() 122 zout.close()
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 89e690e..5b7dfb1 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
6import zipfile 6import zipfile
7import tempfile 7import tempfile
8 8
9from src import pdf, png, images_pixbuf, audio, office, libreoffice, parser_factory 9from src import pdf, png, images_pixbuf, audio, office, parser_factory
10 10
11class TestGetMeta(unittest.TestCase): 11class TestGetMeta(unittest.TestCase):
12 def test_pdf(self): 12 def test_pdf(self):
@@ -49,14 +49,14 @@ class TestGetMeta(unittest.TestCase):
49 self.assertEqual(meta['TITLE'], ['I am so']) 49 self.assertEqual(meta['TITLE'], ['I am so'])
50 50
51 def test_docx(self): 51 def test_docx(self):
52 p = office.OfficeParser('./tests/data/dirty.docx') 52 p = office.MSOfficeParser('./tests/data/dirty.docx')
53 meta = p.get_meta() 53 meta = p.get_meta()
54 self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') 54 self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
55 self.assertEqual(meta['dc:creator'], 'julien voisin') 55 self.assertEqual(meta['dc:creator'], 'julien voisin')
56 self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') 56 self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
57 57
58 def test_libreoffice(self): 58 def test_libreoffice(self):
59 p = libreoffice.LibreOfficeParser('./tests/data/dirty.odt') 59 p = office.LibreOfficeParser('./tests/data/dirty.odt')
60 meta = p.get_meta() 60 meta = p.get_meta()
61 self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') 61 self.assertEqual(meta['meta:initial-creator'], 'jvoisin ')
62 self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') 62 self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48')
@@ -90,7 +90,7 @@ class TestDeepCleaning(unittest.TestCase):
90 90
91 def test_office(self): 91 def test_office(self):
92 shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') 92 shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
93 p = office.OfficeParser('./tests/data/clean.docx') 93 p = office.MSOfficeParser('./tests/data/clean.docx')
94 94
95 meta = p.get_meta() 95 meta = p.get_meta()
96 self.assertIsNotNone(meta) 96 self.assertIsNotNone(meta)
@@ -98,7 +98,7 @@ class TestDeepCleaning(unittest.TestCase):
98 ret = p.remove_all() 98 ret = p.remove_all()
99 self.assertTrue(ret) 99 self.assertTrue(ret)
100 100
101 p = office.OfficeParser('./tests/data/clean.docx.cleaned') 101 p = office.MSOfficeParser('./tests/data/clean.docx.cleaned')
102 self.assertEqual(p.get_meta(), {}) 102 self.assertEqual(p.get_meta(), {})
103 103
104 self.__check_zip_meta(p) 104 self.__check_zip_meta(p)
@@ -109,7 +109,7 @@ class TestDeepCleaning(unittest.TestCase):
109 109
110 def test_libreoffice(self): 110 def test_libreoffice(self):
111 shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt') 111 shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
112 p = libreoffice.LibreOfficeParser('./tests/data/clean.odt') 112 p = office.LibreOfficeParser('./tests/data/clean.odt')
113 113
114 meta = p.get_meta() 114 meta = p.get_meta()
115 self.assertIsNotNone(meta) 115 self.assertIsNotNone(meta)
@@ -117,7 +117,7 @@ class TestDeepCleaning(unittest.TestCase):
117 ret = p.remove_all() 117 ret = p.remove_all()
118 self.assertTrue(ret) 118 self.assertTrue(ret)
119 119
120 p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') 120 p = office.LibreOfficeParser('./tests/data/clean.odt.cleaned')
121 self.assertEqual(p.get_meta(), {}) 121 self.assertEqual(p.get_meta(), {})
122 122
123 self.__check_zip_meta(p) 123 self.__check_zip_meta(p)
@@ -219,7 +219,7 @@ class TestCleaning(unittest.TestCase):
219 219
220 def test_office(self): 220 def test_office(self):
221 shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') 221 shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
222 p = office.OfficeParser('./tests/data/clean.docx') 222 p = office.MSOfficeParser('./tests/data/clean.docx')
223 223
224 meta = p.get_meta() 224 meta = p.get_meta()
225 self.assertIsNotNone(meta) 225 self.assertIsNotNone(meta)
@@ -227,7 +227,7 @@ class TestCleaning(unittest.TestCase):
227 ret = p.remove_all() 227 ret = p.remove_all()
228 self.assertTrue(ret) 228 self.assertTrue(ret)
229 229
230 p = office.OfficeParser('./tests/data/clean.docx.cleaned') 230 p = office.MSOfficeParser('./tests/data/clean.docx.cleaned')
231 self.assertEqual(p.get_meta(), {}) 231 self.assertEqual(p.get_meta(), {})
232 232
233 os.remove('./tests/data/clean.docx') 233 os.remove('./tests/data/clean.docx')
@@ -235,7 +235,7 @@ class TestCleaning(unittest.TestCase):
235 235
236 def test_libreoffice(self): 236 def test_libreoffice(self):
237 shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt') 237 shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
238 p = libreoffice.LibreOfficeParser('./tests/data/clean.odt') 238 p = office.LibreOfficeParser('./tests/data/clean.odt')
239 239
240 meta = p.get_meta() 240 meta = p.get_meta()
241 self.assertIsNotNone(meta) 241 self.assertIsNotNone(meta)
@@ -243,7 +243,7 @@ class TestCleaning(unittest.TestCase):
243 ret = p.remove_all() 243 ret = p.remove_all()
244 self.assertTrue(ret) 244 self.assertTrue(ret)
245 245
246 p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') 246 p = office.LibreOfficeParser('./tests/data/clean.odt.cleaned')
247 self.assertEqual(p.get_meta(), {}) 247 self.assertEqual(p.get_meta(), {})
248 248
249 os.remove('./tests/data/clean.odt') 249 os.remove('./tests/data/clean.odt')