diff options
Diffstat (limited to 'libmat2/archive.py')
| -rw-r--r-- | libmat2/archive.py | 256 |
1 files changed, 193 insertions, 63 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py index aa1b24c..2936f39 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py | |||
| @@ -1,5 +1,7 @@ | |||
| 1 | import abc | ||
| 1 | import zipfile | 2 | import zipfile |
| 2 | import datetime | 3 | import datetime |
| 4 | import tarfile | ||
| 3 | import tempfile | 5 | import tempfile |
| 4 | import os | 6 | import os |
| 5 | import logging | 7 | import logging |
| @@ -11,14 +13,37 @@ from . import abstract, UnknownMemberPolicy, parser_factory | |||
| 11 | # Make pyflakes happy | 13 | # Make pyflakes happy |
| 12 | assert Set | 14 | assert Set |
| 13 | assert Pattern | 15 | assert Pattern |
| 14 | assert List | 16 | |
| 15 | assert Union | 17 | # pylint: disable=not-callable,assignment-from-no-return |
| 18 | |||
| 19 | # An ArchiveClass is a class representing an archive, | ||
| 20 | # while an ArchiveMember is a class representing an element | ||
| 21 | # (usually a file) of an archive. | ||
| 22 | ArchiveClass = Union[zipfile.ZipFile, tarfile.TarFile] | ||
| 23 | ArchiveMember = Union[zipfile.ZipInfo, tarfile.TarInfo] | ||
| 16 | 24 | ||
| 17 | 25 | ||
| 18 | class ArchiveBasedAbstractParser(abstract.AbstractParser): | 26 | class ArchiveBasedAbstractParser(abstract.AbstractParser): |
| 19 | """ Office files (.docx, .odt, …) are zipped files. """ | 27 | """Base class for all archive-based formats. |
| 28 | |||
| 29 | Welcome to a world of frustrating complexity and tediousness: | ||
| 30 | - A lot of file formats (docx, odt, epubs, …) are archive-based, | ||
| 31 | so we need to add callbacks everywhere to allow their respective | ||
| 32 | parsers to apply specific cleanup to the required files. | ||
| 33 | - Python has two different modules to deal with .tar and .zip files, | ||
| 34 | with similar-but-yet-o-so-different API, so we need to write | ||
| 35 | a ghetto-wrapper to avoid duplicating everything | ||
| 36 | - The combination of @staticmethod and @abstractstaticmethod is | ||
| 37 | required because for now, mypy doesn't know that | ||
| 38 | @abstractstaticmethod is, indeed, a static method. | ||
| 39 | - Mypy is too dumb (yet) to realise that a type A is valid under | ||
| 40 | the Union[A, B] constraint, hence the weird `# type: ignore` | ||
| 41 | annotations. | ||
| 42 | """ | ||
| 20 | def __init__(self, filename): | 43 | def __init__(self, filename): |
| 21 | super().__init__(filename) | 44 | super().__init__(filename) |
| 45 | self.archive_class = None # type: Optional[ArchiveClass] | ||
| 46 | self.member_class = None # type: Optional[ArchiveMember] | ||
| 22 | 47 | ||
| 23 | # Those are the files that have a format that _isn't_ | 48 | # Those are the files that have a format that _isn't_ |
| 24 | # supported by MAT2, but that we want to keep anyway. | 49 | # supported by MAT2, but that we want to keep anyway. |
| @@ -32,10 +57,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 32 | # the archive? | 57 | # the archive? |
| 33 | self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy | 58 | self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy |
| 34 | 59 | ||
| 35 | try: # better fail here than later | 60 | self.is_archive_valid() |
| 36 | zipfile.ZipFile(self.filename) | 61 | |
| 37 | except zipfile.BadZipFile: | 62 | def is_archive_valid(self): |
| 38 | raise ValueError | 63 | """Raise a ValueError if the current archive isn't a valid one.""" |
| 39 | 64 | ||
| 40 | def _specific_cleanup(self, full_path: str) -> bool: | 65 | def _specific_cleanup(self, full_path: str) -> bool: |
| 41 | """ This method can be used to apply specific treatment | 66 | """ This method can be used to apply specific treatment |
| @@ -50,59 +75,57 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 50 | return {} # pragma: no cover | 75 | return {} # pragma: no cover |
| 51 | 76 | ||
| 52 | @staticmethod | 77 | @staticmethod |
| 53 | def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: | 78 | @abc.abstractstaticmethod |
| 54 | zipinfo.create_system = 3 # Linux | 79 | def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: |
| 55 | zipinfo.comment = b'' | 80 | """Return all the members of the archive.""" |
| 56 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be | ||
| 57 | return zipinfo | ||
| 58 | 81 | ||
| 59 | @staticmethod | 82 | @staticmethod |
| 60 | def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: | 83 | @abc.abstractstaticmethod |
| 61 | metadata = {} | 84 | def _clean_member(member: ArchiveMember) -> ArchiveMember: |
| 62 | if zipinfo.create_system == 3: # this is Linux | 85 | """Remove all the metadata for a given member.""" |
| 63 | pass | ||
| 64 | elif zipinfo.create_system == 2: | ||
| 65 | metadata['create_system'] = 'Windows' | ||
| 66 | else: | ||
| 67 | metadata['create_system'] = 'Weird' | ||
| 68 | 86 | ||
| 69 | if zipinfo.comment: | 87 | @staticmethod |
| 70 | metadata['comment'] = zipinfo.comment # type: ignore | 88 | @abc.abstractstaticmethod |
| 89 | def _get_member_meta(member: ArchiveMember) -> Dict[str, str]: | ||
| 90 | """Return all the metadata of a given member.""" | ||
| 71 | 91 | ||
| 72 | if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): | 92 | @staticmethod |
| 73 | metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time)) | 93 | @abc.abstractstaticmethod |
| 94 | def _get_member_name(member: ArchiveMember) -> str: | ||
| 95 | """Return the name of the given member.""" | ||
| 74 | 96 | ||
| 75 | return metadata | 97 | @staticmethod |
| 98 | @abc.abstractstaticmethod | ||
| 99 | def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember, | ||
| 100 | full_path: str): | ||
| 101 | """Add the file at full_path to the archive, via the given member.""" | ||
| 76 | 102 | ||
| 77 | def get_meta(self) -> Dict[str, Union[str, dict]]: | 103 | def get_meta(self) -> Dict[str, Union[str, dict]]: |
| 78 | meta = dict() # type: Dict[str, Union[str, dict]] | 104 | meta = dict() # type: Dict[str, Union[str, dict]] |
| 79 | 105 | ||
| 80 | with zipfile.ZipFile(self.filename) as zin: | 106 | with self.archive_class(self.filename) as zin: |
| 81 | temp_folder = tempfile.mkdtemp() | 107 | temp_folder = tempfile.mkdtemp() |
| 82 | 108 | ||
| 83 | for item in zin.infolist(): | 109 | for item in self._get_all_members(zin): |
| 84 | local_meta = dict() # type: Dict[str, Union[str, Dict]] | 110 | local_meta = self._get_member_meta(item) |
| 85 | for k, v in self._get_zipinfo_meta(item).items(): | 111 | member_name = self._get_member_name(item) |
| 86 | local_meta[k] = v | ||
| 87 | 112 | ||
| 88 | if item.filename[-1] == '/': # pragma: no cover | 113 | if member_name[-1] == '/': # pragma: no cover |
| 89 | # `is_dir` is added in Python3.6 | 114 | # `is_dir` is added in Python3.6 |
| 90 | continue # don't keep empty folders | 115 | continue # don't keep empty folders |
| 91 | 116 | ||
| 92 | zin.extract(member=item, path=temp_folder) | 117 | zin.extract(member=item, path=temp_folder) |
| 93 | full_path = os.path.join(temp_folder, item.filename) | 118 | full_path = os.path.join(temp_folder, member_name) |
| 94 | 119 | ||
| 95 | specific_meta = self._specific_get_meta(full_path, item.filename) | 120 | specific_meta = self._specific_get_meta(full_path, member_name) |
| 96 | for (k, v) in specific_meta.items(): | 121 | local_meta = {**local_meta, **specific_meta} |
| 97 | local_meta[k] = v | ||
| 98 | 122 | ||
| 99 | tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore | 123 | member_parser, _ = parser_factory.get_parser(full_path) # type: ignore |
| 100 | if tmp_parser: | 124 | if member_parser: |
| 101 | for k, v in tmp_parser.get_meta().items(): | 125 | local_meta = {**local_meta, **member_parser.get_meta()} |
| 102 | local_meta[k] = v | ||
| 103 | 126 | ||
| 104 | if local_meta: | 127 | if local_meta: |
| 105 | meta[item.filename] = local_meta | 128 | meta[member_name] = local_meta |
| 106 | 129 | ||
| 107 | shutil.rmtree(temp_folder) | 130 | shutil.rmtree(temp_folder) |
| 108 | return meta | 131 | return meta |
| @@ -110,17 +133,19 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 110 | def remove_all(self) -> bool: | 133 | def remove_all(self) -> bool: |
| 111 | # pylint: disable=too-many-branches | 134 | # pylint: disable=too-many-branches |
| 112 | 135 | ||
| 113 | with zipfile.ZipFile(self.filename) as zin,\ | 136 | with self.archive_class(self.filename) as zin,\ |
| 114 | zipfile.ZipFile(self.output_filename, 'w') as zout: | 137 | self.archive_class(self.output_filename, 'w') as zout: |
| 115 | 138 | ||
| 116 | temp_folder = tempfile.mkdtemp() | 139 | temp_folder = tempfile.mkdtemp() |
| 117 | abort = False | 140 | abort = False |
| 118 | 141 | ||
| 119 | items = list() # type: List[zipfile.ZipInfo] | 142 | # Sort the items to process, to reduce fingerprinting, |
| 120 | for item in sorted(zin.infolist(), key=lambda z: z.filename): | 143 | # and keep them in the `items` variable. |
| 144 | items = list() # type: List[ArchiveMember] | ||
| 145 | for item in sorted(self._get_all_members(zin), key=self._get_member_name): | ||
| 121 | # Some fileformats do require to have the `mimetype` file | 146 | # Some fileformats do require to have the `mimetype` file |
| 122 | # as the first file in the archive. | 147 | # as the first file in the archive. |
| 123 | if item.filename == 'mimetype': | 148 | if self._get_member_name(item) == 'mimetype': |
| 124 | items = [item] + items | 149 | items = [item] + items |
| 125 | else: | 150 | else: |
| 126 | items.append(item) | 151 | items.append(item) |
| @@ -128,53 +153,53 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 128 | # Since files order is a fingerprint factor, | 153 | # Since files order is a fingerprint factor, |
| 129 | # we're iterating (and thus inserting) them in lexicographic order. | 154 | # we're iterating (and thus inserting) them in lexicographic order. |
| 130 | for item in items: | 155 | for item in items: |
| 131 | if item.filename[-1] == '/': # `is_dir` is added in Python3.6 | 156 | member_name = self._get_member_name(item) |
| 157 | if member_name[-1] == '/': # `is_dir` is added in Python3.6 | ||
| 132 | continue # don't keep empty folders | 158 | continue # don't keep empty folders |
| 133 | 159 | ||
| 134 | zin.extract(member=item, path=temp_folder) | 160 | zin.extract(member=item, path=temp_folder) |
| 135 | full_path = os.path.join(temp_folder, item.filename) | 161 | full_path = os.path.join(temp_folder, member_name) |
| 136 | 162 | ||
| 137 | if self._specific_cleanup(full_path) is False: | 163 | if self._specific_cleanup(full_path) is False: |
| 138 | logging.warning("Something went wrong during deep cleaning of %s", | 164 | logging.warning("Something went wrong during deep cleaning of %s", |
| 139 | item.filename) | 165 | member_name) |
| 140 | abort = True | 166 | abort = True |
| 141 | continue | 167 | continue |
| 142 | 168 | ||
| 143 | if any(map(lambda r: r.search(item.filename), self.files_to_keep)): | 169 | if any(map(lambda r: r.search(member_name), self.files_to_keep)): |
| 144 | # those files aren't supported, but we want to add them anyway | 170 | # those files aren't supported, but we want to add them anyway |
| 145 | pass | 171 | pass |
| 146 | elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): | 172 | elif any(map(lambda r: r.search(member_name), self.files_to_omit)): |
| 147 | continue | 173 | continue |
| 148 | else: # supported files that we want to first clean, then add | 174 | else: # supported files that we want to first clean, then add |
| 149 | tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore | 175 | member_parser, mtype = parser_factory.get_parser(full_path) # type: ignore |
| 150 | if not tmp_parser: | 176 | if not member_parser: |
| 151 | if self.unknown_member_policy == UnknownMemberPolicy.OMIT: | 177 | if self.unknown_member_policy == UnknownMemberPolicy.OMIT: |
| 152 | logging.warning("In file %s, omitting unknown element %s (format: %s)", | 178 | logging.warning("In file %s, omitting unknown element %s (format: %s)", |
| 153 | self.filename, item.filename, mtype) | 179 | self.filename, member_name, mtype) |
| 154 | continue | 180 | continue |
| 155 | elif self.unknown_member_policy == UnknownMemberPolicy.KEEP: | 181 | elif self.unknown_member_policy == UnknownMemberPolicy.KEEP: |
| 156 | logging.warning("In file %s, keeping unknown element %s (format: %s)", | 182 | logging.warning("In file %s, keeping unknown element %s (format: %s)", |
| 157 | self.filename, item.filename, mtype) | 183 | self.filename, member_name, mtype) |
| 158 | else: | 184 | else: |
| 159 | logging.error("In file %s, element %s's format (%s) " \ | 185 | logging.error("In file %s, element %s's format (%s) " \ |
| 160 | "isn't supported", | 186 | "isn't supported", |
| 161 | self.filename, item.filename, mtype) | 187 | self.filename, member_name, mtype) |
| 162 | abort = True | 188 | abort = True |
| 163 | continue | 189 | continue |
| 164 | if tmp_parser: | 190 | else: |
| 165 | if tmp_parser.remove_all() is False: | 191 | if member_parser.remove_all() is False: |
| 166 | logging.warning("In file %s, something went wrong \ | 192 | logging.warning("In file %s, something went wrong \ |
| 167 | with the cleaning of %s \ | 193 | with the cleaning of %s \ |
| 168 | (format: %s)", | 194 | (format: %s)", |
| 169 | self.filename, item.filename, mtype) | 195 | self.filename, member_name, mtype) |
| 170 | abort = True | 196 | abort = True |
| 171 | continue | 197 | continue |
| 172 | os.rename(tmp_parser.output_filename, full_path) | 198 | os.rename(member_parser.output_filename, full_path) |
| 173 | 199 | ||
| 174 | zinfo = zipfile.ZipInfo(item.filename) # type: ignore | 200 | zinfo = self.member_class(member_name) # type: ignore |
| 175 | clean_zinfo = self._clean_zipinfo(zinfo) | 201 | clean_zinfo = self._clean_member(zinfo) |
| 176 | with open(full_path, 'rb') as f: | 202 | self._add_file_to_archive(zout, clean_zinfo, full_path) |
| 177 | zout.writestr(clean_zinfo, f.read()) | ||
| 178 | 203 | ||
| 179 | shutil.rmtree(temp_folder) | 204 | shutil.rmtree(temp_folder) |
| 180 | if abort: | 205 | if abort: |
| @@ -183,6 +208,111 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 183 | return True | 208 | return True |
| 184 | 209 | ||
| 185 | 210 | ||
| 211 | class TarParser(ArchiveBasedAbstractParser): | ||
| 212 | mimetypes = {'application/x-tar'} | ||
| 213 | def __init__(self, filename): | ||
| 214 | super().__init__(filename) | ||
| 215 | self.archive_class = tarfile.TarFile | ||
| 216 | self.member_class = tarfile.TarInfo | ||
| 217 | |||
| 218 | def is_archive_valid(self): | ||
| 219 | if tarfile.is_tarfile(self.filename) is False: | ||
| 220 | raise ValueError | ||
| 221 | |||
| 222 | @staticmethod | ||
| 223 | def _clean_member(member: ArchiveMember) -> ArchiveMember: | ||
| 224 | assert isinstance(member, tarfile.TarInfo) # please mypy | ||
| 225 | member.mtime = member.uid = member.gid = 0 | ||
| 226 | member.uname = member.gname = '' | ||
| 227 | return member | ||
| 228 | |||
| 229 | @staticmethod | ||
| 230 | def _get_member_meta(member: ArchiveMember) -> Dict[str, str]: | ||
| 231 | assert isinstance(member, tarfile.TarInfo) # please mypy | ||
| 232 | metadata = {} | ||
| 233 | if member.mtime != 0: | ||
| 234 | metadata['mtime'] = str(member.mtime) | ||
| 235 | if member.uid != 0: | ||
| 236 | metadata['uid'] = str(member.uid) | ||
| 237 | if member.gid != 0: | ||
| 238 | metadata['gid'] = str(member.gid) | ||
| 239 | if member.uname != '': | ||
| 240 | metadata['uname'] = member.uname | ||
| 241 | if member.gname != '': | ||
| 242 | metadata['gname'] = member.gname | ||
| 243 | return metadata | ||
| 244 | |||
| 245 | @staticmethod | ||
| 246 | def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember, | ||
| 247 | full_path: str): | ||
| 248 | assert isinstance(member, tarfile.TarInfo) # please mypy | ||
| 249 | assert isinstance(archive, tarfile.TarFile) # please mypy | ||
| 250 | archive.add(full_path, member.name, filter=TarParser._clean_member) # type: ignore | ||
| 251 | |||
| 252 | @staticmethod | ||
| 253 | def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: | ||
| 254 | assert isinstance(archive, tarfile.TarFile) # please mypy | ||
| 255 | return archive.getmembers() # type: ignore | ||
| 256 | |||
| 257 | @staticmethod | ||
| 258 | def _get_member_name(member: ArchiveMember) -> str: | ||
| 259 | assert isinstance(member, tarfile.TarInfo) # please mypy | ||
| 260 | return member.name | ||
| 186 | 261 | ||
| 187 | class ZipParser(ArchiveBasedAbstractParser): | 262 | class ZipParser(ArchiveBasedAbstractParser): |
| 188 | mimetypes = {'application/zip'} | 263 | mimetypes = {'application/zip'} |
| 264 | def __init__(self, filename): | ||
| 265 | super().__init__(filename) | ||
| 266 | self.archive_class = zipfile.ZipFile | ||
| 267 | self.member_class = zipfile.ZipInfo | ||
| 268 | |||
| 269 | def is_archive_valid(self): | ||
| 270 | try: | ||
| 271 | zipfile.ZipFile(self.filename) | ||
| 272 | except zipfile.BadZipFile: | ||
| 273 | raise ValueError | ||
| 274 | |||
| 275 | @staticmethod | ||
| 276 | def _clean_member(member: ArchiveMember) -> ArchiveMember: | ||
| 277 | assert isinstance(member, zipfile.ZipInfo) # please mypy | ||
| 278 | member.create_system = 3 # Linux | ||
| 279 | member.comment = b'' | ||
| 280 | member.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be | ||
| 281 | return member | ||
| 282 | |||
| 283 | @staticmethod | ||
| 284 | def _get_member_meta(member: ArchiveMember) -> Dict[str, str]: | ||
| 285 | assert isinstance(member, zipfile.ZipInfo) # please mypy | ||
| 286 | metadata = {} | ||
| 287 | if member.create_system == 3: # this is Linux | ||
| 288 | pass | ||
| 289 | elif member.create_system == 2: | ||
| 290 | metadata['create_system'] = 'Windows' | ||
| 291 | else: | ||
| 292 | metadata['create_system'] = 'Weird' | ||
| 293 | |||
| 294 | if member.comment: | ||
| 295 | metadata['comment'] = member.comment # type: ignore | ||
| 296 | |||
| 297 | if member.date_time != (1980, 1, 1, 0, 0, 0): | ||
| 298 | metadata['date_time'] = str(datetime.datetime(*member.date_time)) | ||
| 299 | |||
| 300 | return metadata | ||
| 301 | |||
| 302 | @staticmethod | ||
| 303 | def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember, | ||
| 304 | full_path: str): | ||
| 305 | assert isinstance(archive, zipfile.ZipFile) # please mypy | ||
| 306 | assert isinstance(member, zipfile.ZipInfo) # please mypy | ||
| 307 | with open(full_path, 'rb') as f: | ||
| 308 | archive.writestr(member, f.read()) | ||
| 309 | |||
| 310 | @staticmethod | ||
| 311 | def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: | ||
| 312 | assert isinstance(archive, zipfile.ZipFile) # please mypy | ||
| 313 | return archive.infolist() # type: ignore | ||
| 314 | |||
| 315 | @staticmethod | ||
| 316 | def _get_member_name(member: ArchiveMember) -> str: | ||
| 317 | assert isinstance(member, zipfile.ZipInfo) # please mypy | ||
| 318 | return member.filename | ||
