diff options
| author | jvoisin | 2019-04-27 06:03:09 -0700 |
|---|---|---|
| committer | jvoisin | 2019-04-27 06:03:09 -0700 |
| commit | 8e41b098d6a8eb8da5687824a59c3af07b18725b (patch) | |
| tree | 0b2e74eaa9d4c77cb3d93965897978c995f4c3b5 /libmat2 | |
| parent | 82cc822a1dc7090f7a6af977ed6d4b7b945d038a (diff) | |
Add support for compressed tar files
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/abstract.py | 5 | ||||
| -rw-r--r-- | libmat2/archive.py | 28 | ||||
| -rw-r--r-- | libmat2/parser_factory.py | 4 |
3 files changed, 35 insertions, 2 deletions
diff --git a/libmat2/abstract.py b/libmat2/abstract.py index aaf00d7..a7c5fa5 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py | |||
| @@ -25,6 +25,11 @@ class AbstractParser(abc.ABC): | |||
| 25 | 25 | ||
| 26 | self.filename = filename | 26 | self.filename = filename |
| 27 | fname, extension = os.path.splitext(filename) | 27 | fname, extension = os.path.splitext(filename) |
| 28 | |||
| 29 | # Special case for tar.gz, tar.bz2, … files | ||
| 30 | if fname.endswith('.tar') and len(fname) > 4: | ||
| 31 | fname, extension = fname[:-4], '.tar' + extension | ||
| 32 | |||
| 28 | self.output_filename = fname + '.cleaned' + extension | 33 | self.output_filename = fname + '.cleaned' + extension |
| 29 | self.lightweight_cleaning = False | 34 | self.lightweight_cleaning = False |
| 30 | 35 | ||
diff --git a/libmat2/archive.py b/libmat2/archive.py index 2936f39..d295afe 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py | |||
| @@ -40,6 +40,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 40 | the Union[A, B] constraint, hence the weird `# type: ignore` | 40 | the Union[A, B] constraint, hence the weird `# type: ignore` |
| 41 | annotations. | 41 | annotations. |
| 42 | """ | 42 | """ |
| 43 | # Tarfiles can optionally support compression | ||
| 44 | # https://docs.python.org/3/library/tarfile.html#tarfile.open | ||
| 45 | compression = '' | ||
| 46 | |||
| 43 | def __init__(self, filename): | 47 | def __init__(self, filename): |
| 44 | super().__init__(filename) | 48 | super().__init__(filename) |
| 45 | self.archive_class = None # type: Optional[ArchiveClass] | 49 | self.archive_class = None # type: Optional[ArchiveClass] |
| @@ -134,7 +138,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 134 | # pylint: disable=too-many-branches | 138 | # pylint: disable=too-many-branches |
| 135 | 139 | ||
| 136 | with self.archive_class(self.filename) as zin,\ | 140 | with self.archive_class(self.filename) as zin,\ |
| 137 | self.archive_class(self.output_filename, 'w') as zout: | 141 | self.archive_class(self.output_filename, 'w' + self.compression) as zout: |
| 138 | 142 | ||
| 139 | temp_folder = tempfile.mkdtemp() | 143 | temp_folder = tempfile.mkdtemp() |
| 140 | abort = False | 144 | abort = False |
| @@ -212,7 +216,11 @@ class TarParser(ArchiveBasedAbstractParser): | |||
| 212 | mimetypes = {'application/x-tar'} | 216 | mimetypes = {'application/x-tar'} |
| 213 | def __init__(self, filename): | 217 | def __init__(self, filename): |
| 214 | super().__init__(filename) | 218 | super().__init__(filename) |
| 215 | self.archive_class = tarfile.TarFile | 219 | # yes, it's tarfile.TarFile.open and not tarfile.TarFile, |
| 220 | # as stated in the documentation: | ||
| 221 | # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile | ||
| 222 | # This is required to support compressed archives. | ||
| 223 | self.archive_class = tarfile.TarFile.open | ||
| 216 | self.member_class = tarfile.TarInfo | 224 | self.member_class = tarfile.TarInfo |
| 217 | 225 | ||
| 218 | def is_archive_valid(self): | 226 | def is_archive_valid(self): |
| @@ -259,6 +267,22 @@ class TarParser(ArchiveBasedAbstractParser): | |||
| 259 | assert isinstance(member, tarfile.TarInfo) # please mypy | 267 | assert isinstance(member, tarfile.TarInfo) # please mypy |
| 260 | return member.name | 268 | return member.name |
| 261 | 269 | ||
| 270 | |||
| 271 | class TarGzParser(TarParser): | ||
| 272 | compression = ':gz' | ||
| 273 | mimetypes = {'application/x-tar+gz'} | ||
| 274 | |||
| 275 | |||
| 276 | class TarBz2Parser(TarParser): | ||
| 277 | compression = ':bz2' | ||
| 278 | mimetypes = {'application/x-tar+bz2'} | ||
| 279 | |||
| 280 | |||
| 281 | class TarXzParser(TarParser): | ||
| 282 | compression = ':xz' | ||
| 283 | mimetypes = {'application/x-tar+xz'} | ||
| 284 | |||
| 285 | |||
| 262 | class ZipParser(ArchiveBasedAbstractParser): | 286 | class ZipParser(ArchiveBasedAbstractParser): |
| 263 | mimetypes = {'application/zip'} | 287 | mimetypes = {'application/zip'} |
| 264 | def __init__(self, filename): | 288 | def __init__(self, filename): |
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index e93ee4f..3931903 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py | |||
| @@ -50,6 +50,10 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: | |||
| 50 | if extension.lower() in UNSUPPORTED_EXTENSIONS: | 50 | if extension.lower() in UNSUPPORTED_EXTENSIONS: |
| 51 | return None, mtype | 51 | return None, mtype |
| 52 | 52 | ||
| 53 | if mtype == 'application/x-tar': | ||
| 54 | if extension[1:] in ('bz2', 'gz', 'xz'): | ||
| 55 | mtype = mtype + '+' + extension[1:] | ||
| 56 | |||
| 53 | for parser_class in _get_parsers(): # type: ignore | 57 | for parser_class in _get_parsers(): # type: ignore |
| 54 | if mtype in parser_class.mimetypes: | 58 | if mtype in parser_class.mimetypes: |
| 55 | try: | 59 | try: |
