summaryrefslogtreecommitdiff
path: root/libmat2
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2')
-rw-r--r-- libmat2/archive.py 256
-rw-r--r-- libmat2/epub.py 2
-rw-r--r-- libmat2/office.py 6
3 files changed, 197 insertions, 67 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
index aa1b24c..2936f39 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -1,5 +1,7 @@
1import abc
1import zipfile 2import zipfile
2import datetime 3import datetime
4import tarfile
3import tempfile 5import tempfile
4import os 6import os
5import logging 7import logging
@@ -11,14 +13,37 @@ from . import abstract, UnknownMemberPolicy, parser_factory
11# Make pyflakes happy 13# Make pyflakes happy
12assert Set 14assert Set
13assert Pattern 15assert Pattern
14assert List 16
15assert Union 17# pylint: disable=not-callable,assignment-from-no-return
18
19# An ArchiveClass is a class representing an archive,
20# while an ArchiveMember is a class representing an element
21# (usually a file) of an archive.
22ArchiveClass = Union[zipfile.ZipFile, tarfile.TarFile]
23ArchiveMember = Union[zipfile.ZipInfo, tarfile.TarInfo]
16 24
17 25
18class ArchiveBasedAbstractParser(abstract.AbstractParser): 26class ArchiveBasedAbstractParser(abstract.AbstractParser):
19 """ Office files (.docx, .odt, …) are zipped files. """ 27 """Base class for all archive-based formats.
28
29 Welcome to a world of frustrating complexity and tediousness:
30 - A lot of file formats (docx, odt, epubs, …) are archive-based,
31 so we need to add callbacks everywhere to allow their respective
32 parsers to apply specific cleanup to the required files.
33 - Python has two different modules to deal with .tar and .zip files,
34 with similar-but-yet-o-so-different API, so we need to write
35 a ghetto-wrapper to avoid duplicating everything
36 - The combination of @staticmethod and @abstractstaticmethod is
37 required because for now, mypy doesn't know that
38 @abstractstaticmethod is, indeed, a static method.
39 - Mypy is too dumb (yet) to realise that a type A is valid under
40 the Union[A, B] constraint, hence the weird `# type: ignore`
41 annotations.
42 """
20 def __init__(self, filename): 43 def __init__(self, filename):
21 super().__init__(filename) 44 super().__init__(filename)
45 self.archive_class = None # type: Optional[ArchiveClass]
46 self.member_class = None # type: Optional[ArchiveMember]
22 47
23 # Those are the files that have a format that _isn't_ 48 # Those are the files that have a format that _isn't_
24 # supported by MAT2, but that we want to keep anyway. 49 # supported by MAT2, but that we want to keep anyway.
@@ -32,10 +57,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
32 # the archive? 57 # the archive?
33 self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy 58 self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
34 59
35 try: # better fail here than later 60 self.is_archive_valid()
36 zipfile.ZipFile(self.filename) 61
37 except zipfile.BadZipFile: 62 def is_archive_valid(self):
38 raise ValueError 63 """Raise a ValueError if the current archive isn't a valid one."""
39 64
40 def _specific_cleanup(self, full_path: str) -> bool: 65 def _specific_cleanup(self, full_path: str) -> bool:
41 """ This method can be used to apply specific treatment 66 """ This method can be used to apply specific treatment
@@ -50,59 +75,57 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
50 return {} # pragma: no cover 75 return {} # pragma: no cover
51 76
52 @staticmethod 77 @staticmethod
53 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: 78 @abc.abstractstaticmethod
54 zipinfo.create_system = 3 # Linux 79 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
55 zipinfo.comment = b'' 80 """Return all the members of the archive."""
56 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
57 return zipinfo
58 81
59 @staticmethod 82 @staticmethod
60 def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: 83 @abc.abstractstaticmethod
61 metadata = {} 84 def _clean_member(member: ArchiveMember) -> ArchiveMember:
62 if zipinfo.create_system == 3: # this is Linux 85 """Remove all the metadata for a given member."""
63 pass
64 elif zipinfo.create_system == 2:
65 metadata['create_system'] = 'Windows'
66 else:
67 metadata['create_system'] = 'Weird'
68 86
69 if zipinfo.comment: 87 @staticmethod
70 metadata['comment'] = zipinfo.comment # type: ignore 88 @abc.abstractstaticmethod
89 def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
90 """Return all the metadata of a given member."""
71 91
72 if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): 92 @staticmethod
73 metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time)) 93 @abc.abstractstaticmethod
94 def _get_member_name(member: ArchiveMember) -> str:
95 """Return the name of the given member."""
74 96
75 return metadata 97 @staticmethod
98 @abc.abstractstaticmethod
99 def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
100 full_path: str):
101 """Add the file at full_path to the archive, via the given member."""
76 102
77 def get_meta(self) -> Dict[str, Union[str, dict]]: 103 def get_meta(self) -> Dict[str, Union[str, dict]]:
78 meta = dict() # type: Dict[str, Union[str, dict]] 104 meta = dict() # type: Dict[str, Union[str, dict]]
79 105
80 with zipfile.ZipFile(self.filename) as zin: 106 with self.archive_class(self.filename) as zin:
81 temp_folder = tempfile.mkdtemp() 107 temp_folder = tempfile.mkdtemp()
82 108
83 for item in zin.infolist(): 109 for item in self._get_all_members(zin):
84 local_meta = dict() # type: Dict[str, Union[str, Dict]] 110 local_meta = self._get_member_meta(item)
85 for k, v in self._get_zipinfo_meta(item).items(): 111 member_name = self._get_member_name(item)
86 local_meta[k] = v
87 112
88 if item.filename[-1] == '/': # pragma: no cover 113 if member_name[-1] == '/': # pragma: no cover
89 # `is_dir` is added in Python3.6 114 # `is_dir` is added in Python3.6
90 continue # don't keep empty folders 115 continue # don't keep empty folders
91 116
92 zin.extract(member=item, path=temp_folder) 117 zin.extract(member=item, path=temp_folder)
93 full_path = os.path.join(temp_folder, item.filename) 118 full_path = os.path.join(temp_folder, member_name)
94 119
95 specific_meta = self._specific_get_meta(full_path, item.filename) 120 specific_meta = self._specific_get_meta(full_path, member_name)
96 for (k, v) in specific_meta.items(): 121 local_meta = {**local_meta, **specific_meta}
97 local_meta[k] = v
98 122
99 tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore 123 member_parser, _ = parser_factory.get_parser(full_path) # type: ignore
100 if tmp_parser: 124 if member_parser:
101 for k, v in tmp_parser.get_meta().items(): 125 local_meta = {**local_meta, **member_parser.get_meta()}
102 local_meta[k] = v
103 126
104 if local_meta: 127 if local_meta:
105 meta[item.filename] = local_meta 128 meta[member_name] = local_meta
106 129
107 shutil.rmtree(temp_folder) 130 shutil.rmtree(temp_folder)
108 return meta 131 return meta
@@ -110,17 +133,19 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
110 def remove_all(self) -> bool: 133 def remove_all(self) -> bool:
111 # pylint: disable=too-many-branches 134 # pylint: disable=too-many-branches
112 135
113 with zipfile.ZipFile(self.filename) as zin,\ 136 with self.archive_class(self.filename) as zin,\
114 zipfile.ZipFile(self.output_filename, 'w') as zout: 137 self.archive_class(self.output_filename, 'w') as zout:
115 138
116 temp_folder = tempfile.mkdtemp() 139 temp_folder = tempfile.mkdtemp()
117 abort = False 140 abort = False
118 141
119 items = list() # type: List[zipfile.ZipInfo] 142 # Sort the items to process, to reduce fingerprinting,
120 for item in sorted(zin.infolist(), key=lambda z: z.filename): 143 # and keep them in the `items` variable.
144 items = list() # type: List[ArchiveMember]
145 for item in sorted(self._get_all_members(zin), key=self._get_member_name):
121 # Some fileformats do require to have the `mimetype` file 146 # Some fileformats do require to have the `mimetype` file
122 # as the first file in the archive. 147 # as the first file in the archive.
123 if item.filename == 'mimetype': 148 if self._get_member_name(item) == 'mimetype':
124 items = [item] + items 149 items = [item] + items
125 else: 150 else:
126 items.append(item) 151 items.append(item)
@@ -128,53 +153,53 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
128 # Since files order is a fingerprint factor, 153 # Since files order is a fingerprint factor,
129 # we're iterating (and thus inserting) them in lexicographic order. 154 # we're iterating (and thus inserting) them in lexicographic order.
130 for item in items: 155 for item in items:
131 if item.filename[-1] == '/': # `is_dir` is added in Python3.6 156 member_name = self._get_member_name(item)
157 if member_name[-1] == '/': # `is_dir` is added in Python3.6
132 continue # don't keep empty folders 158 continue # don't keep empty folders
133 159
134 zin.extract(member=item, path=temp_folder) 160 zin.extract(member=item, path=temp_folder)
135 full_path = os.path.join(temp_folder, item.filename) 161 full_path = os.path.join(temp_folder, member_name)
136 162
137 if self._specific_cleanup(full_path) is False: 163 if self._specific_cleanup(full_path) is False:
138 logging.warning("Something went wrong during deep cleaning of %s", 164 logging.warning("Something went wrong during deep cleaning of %s",
139 item.filename) 165 member_name)
140 abort = True 166 abort = True
141 continue 167 continue
142 168
143 if any(map(lambda r: r.search(item.filename), self.files_to_keep)): 169 if any(map(lambda r: r.search(member_name), self.files_to_keep)):
144 # those files aren't supported, but we want to add them anyway 170 # those files aren't supported, but we want to add them anyway
145 pass 171 pass
146 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): 172 elif any(map(lambda r: r.search(member_name), self.files_to_omit)):
147 continue 173 continue
148 else: # supported files that we want to first clean, then add 174 else: # supported files that we want to first clean, then add
149 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore 175 member_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
150 if not tmp_parser: 176 if not member_parser:
151 if self.unknown_member_policy == UnknownMemberPolicy.OMIT: 177 if self.unknown_member_policy == UnknownMemberPolicy.OMIT:
152 logging.warning("In file %s, omitting unknown element %s (format: %s)", 178 logging.warning("In file %s, omitting unknown element %s (format: %s)",
153 self.filename, item.filename, mtype) 179 self.filename, member_name, mtype)
154 continue 180 continue
155 elif self.unknown_member_policy == UnknownMemberPolicy.KEEP: 181 elif self.unknown_member_policy == UnknownMemberPolicy.KEEP:
156 logging.warning("In file %s, keeping unknown element %s (format: %s)", 182 logging.warning("In file %s, keeping unknown element %s (format: %s)",
157 self.filename, item.filename, mtype) 183 self.filename, member_name, mtype)
158 else: 184 else:
159 logging.error("In file %s, element %s's format (%s) " \ 185 logging.error("In file %s, element %s's format (%s) " \
160 "isn't supported", 186 "isn't supported",
161 self.filename, item.filename, mtype) 187 self.filename, member_name, mtype)
162 abort = True 188 abort = True
163 continue 189 continue
164 if tmp_parser: 190 else:
165 if tmp_parser.remove_all() is False: 191 if member_parser.remove_all() is False:
166 logging.warning("In file %s, something went wrong \ 192 logging.warning("In file %s, something went wrong \
167 with the cleaning of %s \ 193 with the cleaning of %s \
168 (format: %s)", 194 (format: %s)",
169 self.filename, item.filename, mtype) 195 self.filename, member_name, mtype)
170 abort = True 196 abort = True
171 continue 197 continue
172 os.rename(tmp_parser.output_filename, full_path) 198 os.rename(member_parser.output_filename, full_path)
173 199
174 zinfo = zipfile.ZipInfo(item.filename) # type: ignore 200 zinfo = self.member_class(member_name) # type: ignore
175 clean_zinfo = self._clean_zipinfo(zinfo) 201 clean_zinfo = self._clean_member(zinfo)
176 with open(full_path, 'rb') as f: 202 self._add_file_to_archive(zout, clean_zinfo, full_path)
177 zout.writestr(clean_zinfo, f.read())
178 203
179 shutil.rmtree(temp_folder) 204 shutil.rmtree(temp_folder)
180 if abort: 205 if abort:
@@ -183,6 +208,111 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
183 return True 208 return True
184 209
185 210
211class TarParser(ArchiveBasedAbstractParser):
212 mimetypes = {'application/x-tar'}
213 def __init__(self, filename):
214 super().__init__(filename)
215 self.archive_class = tarfile.TarFile
216 self.member_class = tarfile.TarInfo
217
218 def is_archive_valid(self):
219 if tarfile.is_tarfile(self.filename) is False:
220 raise ValueError
221
222 @staticmethod
223 def _clean_member(member: ArchiveMember) -> ArchiveMember:
224 assert isinstance(member, tarfile.TarInfo) # please mypy
225 member.mtime = member.uid = member.gid = 0
226 member.uname = member.gname = ''
227 return member
228
229 @staticmethod
230 def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
231 assert isinstance(member, tarfile.TarInfo) # please mypy
232 metadata = {}
233 if member.mtime != 0:
234 metadata['mtime'] = str(member.mtime)
235 if member.uid != 0:
236 metadata['uid'] = str(member.uid)
237 if member.gid != 0:
238 metadata['gid'] = str(member.gid)
239 if member.uname != '':
240 metadata['uname'] = member.uname
241 if member.gname != '':
242 metadata['gname'] = member.gname
243 return metadata
244
245 @staticmethod
246 def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
247 full_path: str):
248 assert isinstance(member, tarfile.TarInfo) # please mypy
249 assert isinstance(archive, tarfile.TarFile) # please mypy
250 archive.add(full_path, member.name, filter=TarParser._clean_member) # type: ignore
251
252 @staticmethod
253 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
254 assert isinstance(archive, tarfile.TarFile) # please mypy
255 return archive.getmembers() # type: ignore
256
257 @staticmethod
258 def _get_member_name(member: ArchiveMember) -> str:
259 assert isinstance(member, tarfile.TarInfo) # please mypy
260 return member.name
186 261
187class ZipParser(ArchiveBasedAbstractParser): 262class ZipParser(ArchiveBasedAbstractParser):
188 mimetypes = {'application/zip'} 263 mimetypes = {'application/zip'}
264 def __init__(self, filename):
265 super().__init__(filename)
266 self.archive_class = zipfile.ZipFile
267 self.member_class = zipfile.ZipInfo
268
269 def is_archive_valid(self):
270 try:
271 zipfile.ZipFile(self.filename)
272 except zipfile.BadZipFile:
273 raise ValueError
274
275 @staticmethod
276 def _clean_member(member: ArchiveMember) -> ArchiveMember:
277 assert isinstance(member, zipfile.ZipInfo) # please mypy
278 member.create_system = 3 # Linux
279 member.comment = b''
280 member.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
281 return member
282
283 @staticmethod
284 def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
285 assert isinstance(member, zipfile.ZipInfo) # please mypy
286 metadata = {}
287 if member.create_system == 3: # this is Linux
288 pass
289 elif member.create_system == 2:
290 metadata['create_system'] = 'Windows'
291 else:
292 metadata['create_system'] = 'Weird'
293
294 if member.comment:
295 metadata['comment'] = member.comment # type: ignore
296
297 if member.date_time != (1980, 1, 1, 0, 0, 0):
298 metadata['date_time'] = str(datetime.datetime(*member.date_time))
299
300 return metadata
301
302 @staticmethod
303 def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
304 full_path: str):
305 assert isinstance(archive, zipfile.ZipFile) # please mypy
306 assert isinstance(member, zipfile.ZipInfo) # please mypy
307 with open(full_path, 'rb') as f:
308 archive.writestr(member, f.read())
309
310 @staticmethod
311 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
312 assert isinstance(archive, zipfile.ZipFile) # please mypy
313 return archive.infolist() # type: ignore
314
315 @staticmethod
316 def _get_member_name(member: ArchiveMember) -> str:
317 assert isinstance(member, zipfile.ZipInfo) # please mypy
318 return member.filename
diff --git a/libmat2/epub.py b/libmat2/epub.py
index d385465..390ee63 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET # type: ignore
5 5
6from . import archive, office 6from . import archive, office
7 7
8class EPUBParser(archive.ArchiveBasedAbstractParser): 8class EPUBParser(archive.ZipParser):
9 mimetypes = {'application/epub+zip', } 9 mimetypes = {'application/epub+zip', }
10 metadata_namespace = '{http://purl.org/dc/elements/1.1/}' 10 metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
11 11
diff --git a/libmat2/office.py b/libmat2/office.py
index 2c9cbff..b769991 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -6,7 +6,7 @@ from typing import Dict, Set, Pattern, Tuple, Any
6 6
7import xml.etree.ElementTree as ET # type: ignore 7import xml.etree.ElementTree as ET # type: ignore
8 8
9from .archive import ArchiveBasedAbstractParser 9from .archive import ZipParser
10 10
11# pylint: disable=line-too-long 11# pylint: disable=line-too-long
12 12
@@ -43,7 +43,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
43 return True 43 return True
44 44
45 45
46class MSOfficeParser(ArchiveBasedAbstractParser): 46class MSOfficeParser(ZipParser):
47 mimetypes = { 47 mimetypes = {
48 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 48 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
49 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 49 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
@@ -312,7 +312,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
312 return {file_path: 'harmful content', } 312 return {file_path: 'harmful content', }
313 313
314 314
315class LibreOfficeParser(ArchiveBasedAbstractParser): 315class LibreOfficeParser(ZipParser):
316 mimetypes = { 316 mimetypes = {
317 'application/vnd.oasis.opendocument.text', 317 'application/vnd.oasis.opendocument.text',
318 'application/vnd.oasis.opendocument.spreadsheet', 318 'application/vnd.oasis.opendocument.spreadsheet',