diff options
| author | jvoisin | 2018-04-01 00:17:06 +0200 |
|---|---|---|
| committer | jvoisin | 2018-04-01 00:17:06 +0200 |
| commit | c186fc42929b2660e5c507adeb8a8fb406593b11 (patch) | |
| tree | c82d071d61ceec3a20d48961390e73f6139f2136 | |
| parent | 6d506b87575ded3a59c9fc4f7b28d4160d9e9c43 (diff) | |
Clean deep metadata for zip files
| -rw-r--r-- | src/libreoffice.py | 12 | ||||
| -rw-r--r-- | src/office.py | 14 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 17 |
3 files changed, 38 insertions, 5 deletions
diff --git a/src/libreoffice.py b/src/libreoffice.py index a3481a1..809ae3c 100644 --- a/src/libreoffice.py +++ b/src/libreoffice.py | |||
| @@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser): | |||
| 34 | zipin.close() | 34 | zipin.close() |
| 35 | return metadata | 35 | return metadata |
| 36 | 36 | ||
| 37 | def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo: | ||
| 38 | zipinfo.compress_type = zipfile.ZIP_DEFLATED | ||
| 39 | zipinfo.create_system = 3 # Linux | ||
| 40 | zipinfo.comment = b'' | ||
| 41 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) | ||
| 42 | return zipinfo | ||
| 43 | |||
| 37 | def remove_all(self): | 44 | def remove_all(self): |
| 38 | zin = zipfile.ZipFile(self.filename, 'r') | 45 | zin = zipfile.ZipFile(self.filename, 'r') |
| 39 | zout = zipfile.ZipFile(self.output_filename, 'w') | 46 | zout = zipfile.ZipFile(self.output_filename, 'w') |
| @@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser): | |||
| 51 | print("%s isn't supported" % item.filename) | 58 | print("%s isn't supported" % item.filename) |
| 52 | continue | 59 | continue |
| 53 | tmp_parser.remove_all() | 60 | tmp_parser.remove_all() |
| 54 | zout.write(tmp_parser.output_filename, item.filename) | 61 | zinfo = zipfile.ZipInfo(item.filename) |
| 62 | item = self.__clean_zipinfo(item) | ||
| 63 | with open(tmp_parser.output_filename, 'rb') as f: | ||
| 64 | zout.writestr(zinfo, f.read()) | ||
| 55 | shutil.rmtree(temp_folder) | 65 | shutil.rmtree(temp_folder) |
| 56 | zout.close() | 66 | zout.close() |
| 57 | zin.close() | 67 | zin.close() |
diff --git a/src/office.py b/src/office.py index 5de0597..a729f2f 100644 --- a/src/office.py +++ b/src/office.py | |||
| @@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser): | |||
| 33 | zipin.close() | 33 | zipin.close() |
| 34 | return metadata | 34 | return metadata |
| 35 | 35 | ||
| 36 | def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo: | ||
| 37 | zipinfo.compress_type = zipfile.ZIP_DEFLATED | ||
| 38 | zipinfo.create_system = 3 # Linux | ||
| 39 | zipinfo.comment = b'' | ||
| 40 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) | ||
| 41 | return zipinfo | ||
| 42 | |||
| 36 | def remove_all(self): | 43 | def remove_all(self): |
| 37 | zin = zipfile.ZipFile(self.filename, 'r') | 44 | zin = zipfile.ZipFile(self.filename, 'r') |
| 38 | zout = zipfile.ZipFile(self.output_filename, 'w') | 45 | zout = zipfile.ZipFile(self.output_filename, 'w') |
| @@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser): | |||
| 45 | if not item.filename.endswith('.rels'): | 52 | if not item.filename.endswith('.rels'): |
| 46 | continue # don't keep metadata files | 53 | continue # don't keep metadata files |
| 47 | if item.filename in self.files_to_keep: | 54 | if item.filename in self.files_to_keep: |
| 55 | item = self.__clean_zipinfo(item) | ||
| 48 | zout.writestr(item, zin.read(item)) | 56 | zout.writestr(item, zin.read(item)) |
| 49 | continue | 57 | continue |
| 50 | 58 | ||
| @@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser): | |||
| 54 | print("%s isn't supported" % item.filename) | 62 | print("%s isn't supported" % item.filename) |
| 55 | continue | 63 | continue |
| 56 | tmp_parser.remove_all() | 64 | tmp_parser.remove_all() |
| 57 | zout.write(tmp_parser.output_filename, item.filename) | 65 | zinfo = zipfile.ZipInfo(item.filename) |
| 66 | item = self.__clean_zipinfo(item) | ||
| 67 | with open(tmp_parser.output_filename, 'rb') as f: | ||
| 68 | zout.writestr(zinfo, f.read()) | ||
| 69 | |||
| 58 | shutil.rmtree(temp_folder) | 70 | shutil.rmtree(temp_folder) |
| 59 | zout.close() | 71 | zout.close() |
| 60 | zin.close() | 72 | zin.close() |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index c065237..888c782 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase): | |||
| 57 | 57 | ||
| 58 | 58 | ||
| 59 | class TestDeepCleaning(unittest.TestCase): | 59 | class TestDeepCleaning(unittest.TestCase): |
| 60 | def __check_zip_clean(self, p): | 60 | def __check_deep_meta(self, p): |
| 61 | tempdir = tempfile.mkdtemp() | 61 | tempdir = tempfile.mkdtemp() |
| 62 | zipin = zipfile.ZipFile(p.filename) | 62 | zipin = zipfile.ZipFile(p.filename) |
| 63 | zipin.extractall(tempdir) | 63 | zipin.extractall(tempdir) |
| @@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase): | |||
| 72 | self.assertEqual(inside_p.get_meta(), {}) | 72 | self.assertEqual(inside_p.get_meta(), {}) |
| 73 | shutil.rmtree(tempdir) | 73 | shutil.rmtree(tempdir) |
| 74 | 74 | ||
| 75 | |||
| 76 | def __check_zip_meta(self, p): | ||
| 77 | zipin = zipfile.ZipFile(p.filename) | ||
| 78 | for item in zipin.infolist(): | ||
| 79 | self.assertEqual(item.comment, b'') | ||
| 80 | self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0)) | ||
| 81 | self.assertEqual(item.create_system, 3) # 3 is UNIX | ||
| 82 | |||
| 83 | |||
| 75 | def test_office(self): | 84 | def test_office(self): |
| 76 | shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') | 85 | shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') |
| 77 | p = office.OfficeParser('./tests/data/clean.docx') | 86 | p = office.OfficeParser('./tests/data/clean.docx') |
| @@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase): | |||
| 85 | p = office.OfficeParser('./tests/data/clean.docx.cleaned') | 94 | p = office.OfficeParser('./tests/data/clean.docx.cleaned') |
| 86 | self.assertEqual(p.get_meta(), {}) | 95 | self.assertEqual(p.get_meta(), {}) |
| 87 | 96 | ||
| 88 | self.__check_zip_clean(p) | 97 | self.__check_zip_meta(p) |
| 98 | self.__check_deep_meta(p) | ||
| 89 | 99 | ||
| 90 | os.remove('./tests/data/clean.docx') | 100 | os.remove('./tests/data/clean.docx') |
| 91 | 101 | ||
| @@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase): | |||
| 103 | p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') | 113 | p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') |
| 104 | self.assertEqual(p.get_meta(), {}) | 114 | self.assertEqual(p.get_meta(), {}) |
| 105 | 115 | ||
| 106 | self.__check_zip_clean(p) | 116 | self.__check_zip_meta(p) |
| 117 | self.__check_deep_meta(p) | ||
| 107 | 118 | ||
| 108 | os.remove('./tests/data/clean.odt') | 119 | os.remove('./tests/data/clean.odt') |
| 109 | 120 | ||
