summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author jvoisin 2018-04-01 00:17:06 +0200
committer jvoisin 2018-04-01 00:17:06 +0200
commit c186fc42929b2660e5c507adeb8a8fb406593b11 (patch)
tree c82d071d61ceec3a20d48961390e73f6139f2136
parent 6d506b87575ded3a59c9fc4f7b28d4160d9e9c43 (diff)
Clean deep metadata for zip files
-rw-r--r-- src/libreoffice.py 12
-rw-r--r-- src/office.py 14
-rw-r--r-- tests/test_libmat2.py 17
3 files changed, 38 insertions, 5 deletions
diff --git a/src/libreoffice.py b/src/libreoffice.py
index a3481a1..809ae3c 100644
--- a/src/libreoffice.py
+++ b/src/libreoffice.py
@@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser):
34 zipin.close() 34 zipin.close()
35 return metadata 35 return metadata
36 36
37 def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
38 zipinfo.compress_type = zipfile.ZIP_DEFLATED
39 zipinfo.create_system = 3 # Linux
40 zipinfo.comment = b''
41 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
42 return zipinfo
43
37 def remove_all(self): 44 def remove_all(self):
38 zin = zipfile.ZipFile(self.filename, 'r') 45 zin = zipfile.ZipFile(self.filename, 'r')
39 zout = zipfile.ZipFile(self.output_filename, 'w') 46 zout = zipfile.ZipFile(self.output_filename, 'w')
@@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser):
51 print("%s isn't supported" % item.filename) 58 print("%s isn't supported" % item.filename)
52 continue 59 continue
53 tmp_parser.remove_all() 60 tmp_parser.remove_all()
54 zout.write(tmp_parser.output_filename, item.filename) 61 zinfo = zipfile.ZipInfo(item.filename)
62 item = self.__clean_zipinfo(item)
63 with open(tmp_parser.output_filename, 'rb') as f:
64 zout.writestr(zinfo, f.read())
55 shutil.rmtree(temp_folder) 65 shutil.rmtree(temp_folder)
56 zout.close() 66 zout.close()
57 zin.close() 67 zin.close()
diff --git a/src/office.py b/src/office.py
index 5de0597..a729f2f 100644
--- a/src/office.py
+++ b/src/office.py
@@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser):
33 zipin.close() 33 zipin.close()
34 return metadata 34 return metadata
35 35
36 def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
37 zipinfo.compress_type = zipfile.ZIP_DEFLATED
38 zipinfo.create_system = 3 # Linux
39 zipinfo.comment = b''
40 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
41 return zipinfo
42
36 def remove_all(self): 43 def remove_all(self):
37 zin = zipfile.ZipFile(self.filename, 'r') 44 zin = zipfile.ZipFile(self.filename, 'r')
38 zout = zipfile.ZipFile(self.output_filename, 'w') 45 zout = zipfile.ZipFile(self.output_filename, 'w')
@@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser):
45 if not item.filename.endswith('.rels'): 52 if not item.filename.endswith('.rels'):
46 continue # don't keep metadata files 53 continue # don't keep metadata files
47 if item.filename in self.files_to_keep: 54 if item.filename in self.files_to_keep:
55 item = self.__clean_zipinfo(item)
48 zout.writestr(item, zin.read(item)) 56 zout.writestr(item, zin.read(item))
49 continue 57 continue
50 58
@@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser):
54 print("%s isn't supported" % item.filename) 62 print("%s isn't supported" % item.filename)
55 continue 63 continue
56 tmp_parser.remove_all() 64 tmp_parser.remove_all()
57 zout.write(tmp_parser.output_filename, item.filename) 65 zinfo = zipfile.ZipInfo(item.filename)
66 item = self.__clean_zipinfo(item)
67 with open(tmp_parser.output_filename, 'rb') as f:
68 zout.writestr(zinfo, f.read())
69
58 shutil.rmtree(temp_folder) 70 shutil.rmtree(temp_folder)
59 zout.close() 71 zout.close()
60 zin.close() 72 zin.close()
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index c065237..888c782 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase):
57 57
58 58
59 class TestDeepCleaning(unittest.TestCase): 59 class TestDeepCleaning(unittest.TestCase):
60 def __check_zip_clean(self, p): 60 def __check_deep_meta(self, p):
61 tempdir = tempfile.mkdtemp() 61 tempdir = tempfile.mkdtemp()
62 zipin = zipfile.ZipFile(p.filename) 62 zipin = zipfile.ZipFile(p.filename)
63 zipin.extractall(tempdir) 63 zipin.extractall(tempdir)
@@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase):
72 self.assertEqual(inside_p.get_meta(), {}) 72 self.assertEqual(inside_p.get_meta(), {})
73 shutil.rmtree(tempdir) 73 shutil.rmtree(tempdir)
74 74
75
76 def __check_zip_meta(self, p):
77 zipin = zipfile.ZipFile(p.filename)
78 for item in zipin.infolist():
79 self.assertEqual(item.comment, b'')
80 self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
81 self.assertEqual(item.create_system, 3) # 3 is UNIX
82
83
75 def test_office(self): 84 def test_office(self):
76 shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') 85 shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
77 p = office.OfficeParser('./tests/data/clean.docx') 86 p = office.OfficeParser('./tests/data/clean.docx')
@@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase):
85 p = office.OfficeParser('./tests/data/clean.docx.cleaned') 94 p = office.OfficeParser('./tests/data/clean.docx.cleaned')
86 self.assertEqual(p.get_meta(), {}) 95 self.assertEqual(p.get_meta(), {})
87 96
88 self.__check_zip_clean(p) 97 self.__check_zip_meta(p)
98 self.__check_deep_meta(p)
89 99
90 os.remove('./tests/data/clean.docx') 100 os.remove('./tests/data/clean.docx')
91 101
@@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase):
103 p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') 113 p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
104 self.assertEqual(p.get_meta(), {}) 114 self.assertEqual(p.get_meta(), {})
105 115
106 self.__check_zip_clean(p) 116 self.__check_zip_meta(p)
117 self.__check_deep_meta(p)
107 118
108 os.remove('./tests/data/clean.odt') 119 os.remove('./tests/data/clean.odt')
109 120