summary | refs | log | tree | commit | diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author: jvoisin, 2018-06-04 22:54:01 +0200
committer: jvoisin, 2018-06-04 23:20:30 +0200
commit: 6a1b0b31f0fbfa59a78a8b9f4f07bf9ed3f91cdf (patch)
tree: fdb8e31a7ad5bf6982cb8c11a2012205a0cfe14f /libmat2/office.py
parent: 4ebf9754f84e28eb73a09df0f788b5be80c9c73e (diff)
Add more typing and use mypy in the CI
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 38
1 files changed, 26 insertions, 12 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 749fc7d..90f7c7a 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -4,11 +4,15 @@ import shutil
4import tempfile 4import tempfile
5import datetime 5import datetime
6import zipfile 6import zipfile
7from typing import Dict, Set
7 8
8from . import abstract, parser_factory 9from . import abstract, parser_factory
9 10
11assert Set # make pyflakes happy
10 12
11class ArchiveBasedAbstractParser(abstract.AbstractParser): 13class ArchiveBasedAbstractParser(abstract.AbstractParser):
14 whitelist = set() # type: Set[str]
15
12 def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: 16 def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
13 zipinfo.compress_type = zipfile.ZIP_DEFLATED 17 zipinfo.compress_type = zipfile.ZIP_DEFLATED
14 zipinfo.create_system = 3 # Linux 18 zipinfo.create_system = 3 # Linux
@@ -16,7 +20,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
16 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) 20 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
17 return zipinfo 21 return zipinfo
18 22
19 def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict: 23 def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
20 metadata = {} 24 metadata = {}
21 if zipinfo.create_system == 3: 25 if zipinfo.create_system == 3:
22 #metadata['create_system'] = 'Linux' 26 #metadata['create_system'] = 'Linux'
@@ -27,25 +31,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
27 metadata['create_system'] = 'Weird' 31 metadata['create_system'] = 'Weird'
28 32
29 if zipinfo.comment: 33 if zipinfo.comment:
30 metadata['comment'] = zipinfo.comment 34 metadata['comment'] = zipinfo.comment # type: ignore
31 35
32 if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): 36 if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
33 metadata['date_time'] = datetime.datetime(*zipinfo.date_time) 37 metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time))
34 38
35 return metadata 39 return metadata
36 40
37 41
38 def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, 42 def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
39 zin: zipfile.ZipFile, zout: zipfile.ZipFile): 43 zin: zipfile.ZipFile, zout: zipfile.ZipFile):
44 output = ''
40 zin.extract(member=item, path=temp_folder) 45 zin.extract(member=item, path=temp_folder)
41 tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) 46 if item.filename not in self.whitelist:
42 if not tmp_parser: 47 full_path = os.path.join(temp_folder, item.filename)
43 print("%s's format (%s) isn't supported" % (item.filename, mtype)) 48 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
44 return 49 if not tmp_parser:
45 tmp_parser.remove_all() 50 print("%s's format (%s) isn't supported" % (item.filename, mtype))
46 zinfo = zipfile.ZipInfo(item.filename) 51 return
52 tmp_parser.remove_all()
53 output = tmp_parser.output_filename
54 else:
55 output = os.path.join(temp_folder, item.filename)
56 zinfo = zipfile.ZipInfo(item.filename) # type: ignore
47 clean_zinfo = self._clean_zipinfo(zinfo) 57 clean_zinfo = self._clean_zipinfo(zinfo)
48 with open(tmp_parser.output_filename, 'rb') as f: 58 with open(output, 'rb') as f:
49 zout.writestr(clean_zinfo, f.read()) 59 zout.writestr(clean_zinfo, f.read())
50 60
51 61
@@ -72,7 +82,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
72 if not metadata: # better safe than sorry 82 if not metadata: # better safe than sorry
73 metadata[item] = 'harmful content' 83 metadata[item] = 'harmful content'
74 84
75 metadata = {**metadata, **self._get_zipinfo_meta(item)} 85 for key, value in self._get_zipinfo_meta(item).items():
86 metadata[key] = value
76 zipin.close() 87 zipin.close()
77 return metadata 88 return metadata
78 89
@@ -112,6 +123,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
112 'application/vnd.oasis.opendocument.formula', 123 'application/vnd.oasis.opendocument.formula',
113 'application/vnd.oasis.opendocument.image', 124 'application/vnd.oasis.opendocument.image',
114 } 125 }
126 whitelist = {'mimetype', 'manifest.rdf'}
127
115 128
116 def get_meta(self): 129 def get_meta(self):
117 """ 130 """
@@ -127,7 +140,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
127 metadata[key] = value 140 metadata[key] = value
128 if not metadata: # better safe than sorry 141 if not metadata: # better safe than sorry
129 metadata[item] = 'harmful content' 142 metadata[item] = 'harmful content'
130 metadata = {**metadata, **self._get_zipinfo_meta(item)} 143 for key, value in self._get_zipinfo_meta(item).items():
144 metadata[key] = value
131 zipin.close() 145 zipin.close()
132 return metadata 146 return metadata
133 147