diff options
| author | jvoisin | 2019-04-27 06:03:09 -0700 |
|---|---|---|
| committer | jvoisin | 2019-04-27 06:03:09 -0700 |
| commit | 8e41b098d6a8eb8da5687824a59c3af07b18725b (patch) | |
| tree | 0b2e74eaa9d4c77cb3d93965897978c995f4c3b5 | |
| parent | 82cc822a1dc7090f7a6af977ed6d4b7b945d038a (diff) | |
Add support for compressed tar files
| -rw-r--r-- | libmat2/abstract.py | 5 | ||||
| -rw-r--r-- | libmat2/archive.py | 28 | ||||
| -rw-r--r-- | libmat2/parser_factory.py | 4 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 115 |
4 files changed, 149 insertions, 3 deletions
diff --git a/libmat2/abstract.py b/libmat2/abstract.py index aaf00d7..a7c5fa5 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py | |||
| @@ -25,6 +25,11 @@ class AbstractParser(abc.ABC): | |||
| 25 | 25 | ||
| 26 | self.filename = filename | 26 | self.filename = filename |
| 27 | fname, extension = os.path.splitext(filename) | 27 | fname, extension = os.path.splitext(filename) |
| 28 | |||
| 29 | # Special case for tar.gz, tar.bz2, … files | ||
| 30 | if fname.endswith('.tar') and len(fname) > 4: | ||
| 31 | fname, extension = fname[:-4], '.tar' + extension | ||
| 32 | |||
| 28 | self.output_filename = fname + '.cleaned' + extension | 33 | self.output_filename = fname + '.cleaned' + extension |
| 29 | self.lightweight_cleaning = False | 34 | self.lightweight_cleaning = False |
| 30 | 35 | ||
diff --git a/libmat2/archive.py b/libmat2/archive.py index 2936f39..d295afe 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py | |||
| @@ -40,6 +40,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 40 | the Union[A, B] constraint, hence the weird `# type: ignore` | 40 | the Union[A, B] constraint, hence the weird `# type: ignore` |
| 41 | annotations. | 41 | annotations. |
| 42 | """ | 42 | """ |
| 43 | # Tarfiles can optionally support compression | ||
| 44 | # https://docs.python.org/3/library/tarfile.html#tarfile.open | ||
| 45 | compression = '' | ||
| 46 | |||
| 43 | def __init__(self, filename): | 47 | def __init__(self, filename): |
| 44 | super().__init__(filename) | 48 | super().__init__(filename) |
| 45 | self.archive_class = None # type: Optional[ArchiveClass] | 49 | self.archive_class = None # type: Optional[ArchiveClass] |
| @@ -134,7 +138,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 134 | # pylint: disable=too-many-branches | 138 | # pylint: disable=too-many-branches |
| 135 | 139 | ||
| 136 | with self.archive_class(self.filename) as zin,\ | 140 | with self.archive_class(self.filename) as zin,\ |
| 137 | self.archive_class(self.output_filename, 'w') as zout: | 141 | self.archive_class(self.output_filename, 'w' + self.compression) as zout: |
| 138 | 142 | ||
| 139 | temp_folder = tempfile.mkdtemp() | 143 | temp_folder = tempfile.mkdtemp() |
| 140 | abort = False | 144 | abort = False |
| @@ -212,7 +216,11 @@ class TarParser(ArchiveBasedAbstractParser): | |||
| 212 | mimetypes = {'application/x-tar'} | 216 | mimetypes = {'application/x-tar'} |
| 213 | def __init__(self, filename): | 217 | def __init__(self, filename): |
| 214 | super().__init__(filename) | 218 | super().__init__(filename) |
| 215 | self.archive_class = tarfile.TarFile | 219 | # yes, it's tarfile.TarFile.open and not tarfile.TarFile, |
| 220 | # as stated in the documentation: | ||
| 221 | # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile | ||
| 222 | # This is required to support compressed archives. | ||
| 223 | self.archive_class = tarfile.TarFile.open | ||
| 216 | self.member_class = tarfile.TarInfo | 224 | self.member_class = tarfile.TarInfo |
| 217 | 225 | ||
| 218 | def is_archive_valid(self): | 226 | def is_archive_valid(self): |
| @@ -259,6 +267,22 @@ class TarParser(ArchiveBasedAbstractParser): | |||
| 259 | assert isinstance(member, tarfile.TarInfo) # please mypy | 267 | assert isinstance(member, tarfile.TarInfo) # please mypy |
| 260 | return member.name | 268 | return member.name |
| 261 | 269 | ||
| 270 | |||
| 271 | class TarGzParser(TarParser): | ||
| 272 | compression = ':gz' | ||
| 273 | mimetypes = {'application/x-tar+gz'} | ||
| 274 | |||
| 275 | |||
| 276 | class TarBz2Parser(TarParser): | ||
| 277 | compression = ':bz2' | ||
| 278 | mimetypes = {'application/x-tar+bz2'} | ||
| 279 | |||
| 280 | |||
| 281 | class TarXzParser(TarParser): | ||
| 282 | compression = ':xz' | ||
| 283 | mimetypes = {'application/x-tar+xz'} | ||
| 284 | |||
| 285 | |||
| 262 | class ZipParser(ArchiveBasedAbstractParser): | 286 | class ZipParser(ArchiveBasedAbstractParser): |
| 263 | mimetypes = {'application/zip'} | 287 | mimetypes = {'application/zip'} |
| 264 | def __init__(self, filename): | 288 | def __init__(self, filename): |
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index e93ee4f..3931903 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py | |||
| @@ -50,6 +50,10 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: | |||
| 50 | if extension.lower() in UNSUPPORTED_EXTENSIONS: | 50 | if extension.lower() in UNSUPPORTED_EXTENSIONS: |
| 51 | return None, mtype | 51 | return None, mtype |
| 52 | 52 | ||
| 53 | if mtype == 'application/x-tar': | ||
| 54 | if extension[1:] in ('bz2', 'gz', 'xz'): | ||
| 55 | mtype = mtype + '+' + extension[1:] | ||
| 56 | |||
| 53 | for parser_class in _get_parsers(): # type: ignore | 57 | for parser_class in _get_parsers(): # type: ignore |
| 54 | if mtype in parser_class.mimetypes: | 58 | if mtype in parser_class.mimetypes: |
| 55 | try: | 59 | try: |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 1d2a22a..4f562e6 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -30,6 +30,14 @@ class TestParserFactory(unittest.TestCase): | |||
| 30 | self.assertEqual(mimetype, 'audio/mpeg') | 30 | self.assertEqual(mimetype, 'audio/mpeg') |
| 31 | self.assertEqual(parser.__class__, audio.MP3Parser) | 31 | self.assertEqual(parser.__class__, audio.MP3Parser) |
| 32 | 32 | ||
| 33 | def test_tarfile_double_extension_handling(self): | ||
| 34 | """ Test that our module auto-detection is handling sub-sub-classes """ | ||
| 35 | with tarfile.TarFile.open('./tests/data/dirty.tar.bz2', 'w:bz2') as zout: | ||
| 36 | zout.add('./tests/data/dirty.jpg') | ||
| 37 | parser, mimetype = parser_factory.get_parser('./tests/data/dirty.tar.bz2') | ||
| 38 | self.assertEqual(mimetype, 'application/x-tar+bz2') | ||
| 39 | os.remove('./tests/data/dirty.tar.bz2') | ||
| 40 | |||
| 33 | 41 | ||
| 34 | class TestParameterInjection(unittest.TestCase): | 42 | class TestParameterInjection(unittest.TestCase): |
| 35 | def test_ver_injection(self): | 43 | def test_ver_injection(self): |
| @@ -719,7 +727,7 @@ class TestCleaning(unittest.TestCase): | |||
| 719 | os.remove('./tests/data/clean.cleaned.cleaned.css') | 727 | os.remove('./tests/data/clean.cleaned.cleaned.css') |
| 720 | 728 | ||
| 721 | def test_tar(self): | 729 | def test_tar(self): |
| 722 | with tarfile.TarFile('./tests/data/dirty.tar', 'w') as zout: | 730 | with tarfile.TarFile.open('./tests/data/dirty.tar', 'w') as zout: |
| 723 | zout.add('./tests/data/dirty.flac') | 731 | zout.add('./tests/data/dirty.flac') |
| 724 | zout.add('./tests/data/dirty.docx') | 732 | zout.add('./tests/data/dirty.docx') |
| 725 | zout.add('./tests/data/dirty.jpg') | 733 | zout.add('./tests/data/dirty.jpg') |
| @@ -752,3 +760,108 @@ class TestCleaning(unittest.TestCase): | |||
| 752 | os.remove('./tests/data/dirty.tar') | 760 | os.remove('./tests/data/dirty.tar') |
| 753 | os.remove('./tests/data/dirty.cleaned.tar') | 761 | os.remove('./tests/data/dirty.cleaned.tar') |
| 754 | os.remove('./tests/data/dirty.cleaned.cleaned.tar') | 762 | os.remove('./tests/data/dirty.cleaned.cleaned.tar') |
| 763 | |||
| 764 | def test_targz(self): | ||
| 765 | with tarfile.TarFile.open('./tests/data/dirty.tar.gz', 'w:gz') as zout: | ||
| 766 | zout.add('./tests/data/dirty.flac') | ||
| 767 | zout.add('./tests/data/dirty.docx') | ||
| 768 | zout.add('./tests/data/dirty.jpg') | ||
| 769 | p = archive.TarParser('./tests/data/dirty.tar.gz') | ||
| 770 | meta = p.get_meta() | ||
| 771 | self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') | ||
| 772 | |||
| 773 | ret = p.remove_all() | ||
| 774 | self.assertTrue(ret) | ||
| 775 | |||
| 776 | p = archive.TarParser('./tests/data/dirty.cleaned.tar.gz') | ||
| 777 | self.assertEqual(p.get_meta(), {}) | ||
| 778 | self.assertTrue(p.remove_all()) | ||
| 779 | |||
| 780 | tmp_dir = tempfile.mkdtemp() | ||
| 781 | with tarfile.open('./tests/data/dirty.cleaned.tar.gz') as zout: | ||
| 782 | zout.extractall(path=tmp_dir) | ||
| 783 | zout.close() | ||
| 784 | |||
| 785 | number_of_files = 0 | ||
| 786 | for root, _, fnames in os.walk(tmp_dir): | ||
| 787 | for f in fnames: | ||
| 788 | complete_path = os.path.join(root, f) | ||
| 789 | p, _ = parser_factory.get_parser(complete_path) | ||
| 790 | self.assertIsNotNone(p) | ||
| 791 | self.assertEqual(p.get_meta(), {}) | ||
| 792 | number_of_files += 1 | ||
| 793 | self.assertEqual(number_of_files, 3) | ||
| 794 | |||
| 795 | os.remove('./tests/data/dirty.tar.gz') | ||
| 796 | os.remove('./tests/data/dirty.cleaned.tar.gz') | ||
| 797 | os.remove('./tests/data/dirty.cleaned.cleaned.tar.gz') | ||
| 798 | |||
| 799 | def test_tarbz2(self): | ||
| 800 | with tarfile.TarFile.open('./tests/data/dirty.tar.bz2', 'w:bz2') as zout: | ||
| 801 | zout.add('./tests/data/dirty.flac') | ||
| 802 | zout.add('./tests/data/dirty.docx') | ||
| 803 | zout.add('./tests/data/dirty.jpg') | ||
| 804 | p = archive.TarParser('./tests/data/dirty.tar.bz2') | ||
| 805 | meta = p.get_meta() | ||
| 806 | self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') | ||
| 807 | |||
| 808 | ret = p.remove_all() | ||
| 809 | self.assertTrue(ret) | ||
| 810 | |||
| 811 | p = archive.TarParser('./tests/data/dirty.cleaned.tar.bz2') | ||
| 812 | self.assertEqual(p.get_meta(), {}) | ||
| 813 | self.assertTrue(p.remove_all()) | ||
| 814 | |||
| 815 | tmp_dir = tempfile.mkdtemp() | ||
| 816 | with tarfile.open('./tests/data/dirty.cleaned.tar.bz2') as zout: | ||
| 817 | zout.extractall(path=tmp_dir) | ||
| 818 | zout.close() | ||
| 819 | |||
| 820 | number_of_files = 0 | ||
| 821 | for root, _, fnames in os.walk(tmp_dir): | ||
| 822 | for f in fnames: | ||
| 823 | complete_path = os.path.join(root, f) | ||
| 824 | p, _ = parser_factory.get_parser(complete_path) | ||
| 825 | self.assertIsNotNone(p) | ||
| 826 | self.assertEqual(p.get_meta(), {}) | ||
| 827 | number_of_files += 1 | ||
| 828 | self.assertEqual(number_of_files, 3) | ||
| 829 | |||
| 830 | os.remove('./tests/data/dirty.tar.bz2') | ||
| 831 | os.remove('./tests/data/dirty.cleaned.tar.bz2') | ||
| 832 | os.remove('./tests/data/dirty.cleaned.cleaned.tar.bz2') | ||
| 833 | |||
| 834 | def test_tarxz(self): | ||
| 835 | with tarfile.TarFile.open('./tests/data/dirty.tar.xz', 'w:xz') as zout: | ||
| 836 | zout.add('./tests/data/dirty.flac') | ||
| 837 | zout.add('./tests/data/dirty.docx') | ||
| 838 | zout.add('./tests/data/dirty.jpg') | ||
| 839 | p = archive.TarParser('./tests/data/dirty.tar.xz') | ||
| 840 | meta = p.get_meta() | ||
| 841 | self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') | ||
| 842 | |||
| 843 | ret = p.remove_all() | ||
| 844 | self.assertTrue(ret) | ||
| 845 | |||
| 846 | p = archive.TarParser('./tests/data/dirty.cleaned.tar.xz') | ||
| 847 | self.assertEqual(p.get_meta(), {}) | ||
| 848 | self.assertTrue(p.remove_all()) | ||
| 849 | |||
| 850 | tmp_dir = tempfile.mkdtemp() | ||
| 851 | with tarfile.open('./tests/data/dirty.cleaned.tar.xz') as zout: | ||
| 852 | zout.extractall(path=tmp_dir) | ||
| 853 | zout.close() | ||
| 854 | |||
| 855 | number_of_files = 0 | ||
| 856 | for root, _, fnames in os.walk(tmp_dir): | ||
| 857 | for f in fnames: | ||
| 858 | complete_path = os.path.join(root, f) | ||
| 859 | p, _ = parser_factory.get_parser(complete_path) | ||
| 860 | self.assertIsNotNone(p) | ||
| 861 | self.assertEqual(p.get_meta(), {}) | ||
| 862 | number_of_files += 1 | ||
| 863 | self.assertEqual(number_of_files, 3) | ||
| 864 | |||
| 865 | os.remove('./tests/data/dirty.tar.xz') | ||
| 866 | os.remove('./tests/data/dirty.cleaned.tar.xz') | ||
| 867 | os.remove('./tests/data/dirty.cleaned.cleaned.tar.xz') | ||
