summary | refs | log | tree | commit | diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author jvoisin 2018-06-27 21:48:46 +0200
committer jvoisin 2018-06-27 21:48:46 +0200
commit 177184ac671eebd2285561a39777dca2c9f70b04 (patch)
tree 6305121db3bf47c31c58e2588d3d3fc9d4014747 /libmat2/office.py
parent f44769df4128239d34883bc225413654ff31dfc6 (diff)
Massively simplify how we're cleaning office files
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 59
1 files changed, 26 insertions, 33 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index aea56b9..e813fae 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -47,45 +47,38 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
47 47
48 return metadata 48 return metadata
49 49
50 def remove_all(self) -> bool:
51 with zipfile.ZipFile(self.filename) as zin,\
52 zipfile.ZipFile(self.output_filename, 'w') as zout:
50 53
51 def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, 54 temp_folder = tempfile.mkdtemp()
52 zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:
53 zin.extract(member=item, path=temp_folder)
54 full_path = os.path.join(temp_folder, item.filename)
55 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
56 if not tmp_parser:
57 zout.close()
58 os.remove(self.output_filename)
59 print("%s's format (%s) isn't supported" % (item.filename, mtype))
60 return False
61 tmp_parser.remove_all()
62 55
63 zinfo = zipfile.ZipInfo(item.filename) # type: ignore 56 for item in zin.infolist():
64 clean_zinfo = self._clean_zipinfo(zinfo) 57 if item.filename[-1] == '/': # `is_dir` is added in Python3.6
65 with open(tmp_parser.output_filename, 'rb') as f: 58 continue # don't keep empty folders
66 zout.writestr(clean_zinfo, f.read()) 59 elif item.filename in self.files_to_keep:
67 return True 60 item = self._clean_zipinfo(item)
61 zout.writestr(item, zin.read(item))
62 continue
63 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
64 continue
68 65
69 def remove_all(self) -> bool: 66 zin.extract(member=item, path=temp_folder)
70 zin = zipfile.ZipFile(self.filename, 'r') 67 full_path = os.path.join(temp_folder, item.filename)
71 zout = zipfile.ZipFile(self.output_filename, 'w') 68 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
72 temp_folder = tempfile.mkdtemp() 69 if not tmp_parser:
70 shutil.rmtree(temp_folder)
71 os.remove(self.output_filename)
72 print("%s's format (%s) isn't supported" % (item.filename, mtype))
73 return False
74 tmp_parser.remove_all()
73 75
74 for item in zin.infolist(): 76 zinfo = zipfile.ZipInfo(item.filename) # type: ignore
75 if item.filename[-1] == '/': # `is_dir` is added in Python3.6 77 clean_zinfo = self._clean_zipinfo(zinfo)
76 continue # don't keep empty folders 78 with open(tmp_parser.output_filename, 'rb') as f:
77 elif item.filename in self.files_to_keep: 79 zout.writestr(clean_zinfo, f.read())
78 item = self._clean_zipinfo(item)
79 zout.writestr(item, zin.read(item))
80 continue
81 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
82 continue
83 elif not self._clean_internal_file(item, temp_folder, zin, zout):
84 return False
85 80
86 shutil.rmtree(temp_folder) 81 shutil.rmtree(temp_folder)
87 zout.close()
88 zin.close()
89 return True 82 return True
90 83
91 84