summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: jvoisin 2019-04-27 06:03:09 -0700
committer: jvoisin 2019-04-27 06:03:09 -0700
commit: 8e41b098d6a8eb8da5687824a59c3af07b18725b (patch)
tree: 0b2e74eaa9d4c77cb3d93965897978c995f4c3b5
parent: 82cc822a1dc7090f7a6af977ed6d4b7b945d038a (diff)
Add support for compressed tar files
-rw-r--r-- libmat2/abstract.py 5
-rw-r--r-- libmat2/archive.py 28
-rw-r--r-- libmat2/parser_factory.py 4
-rw-r--r-- tests/test_libmat2.py 115
4 files changed, 149 insertions, 3 deletions
diff --git a/libmat2/abstract.py b/libmat2/abstract.py
index aaf00d7..a7c5fa5 100644
--- a/libmat2/abstract.py
+++ b/libmat2/abstract.py
@@ -25,6 +25,11 @@ class AbstractParser(abc.ABC):
25 25
26 self.filename = filename 26 self.filename = filename
27 fname, extension = os.path.splitext(filename) 27 fname, extension = os.path.splitext(filename)
28
29 # Special case for tar.gz, tar.bz2, … files
30 if fname.endswith('.tar') and len(fname) > 4:
31 fname, extension = fname[:-4], '.tar' + extension
32
28 self.output_filename = fname + '.cleaned' + extension 33 self.output_filename = fname + '.cleaned' + extension
29 self.lightweight_cleaning = False 34 self.lightweight_cleaning = False
30 35
diff --git a/libmat2/archive.py b/libmat2/archive.py
index 2936f39..d295afe 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -40,6 +40,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
40 the Union[A, B] constrain, hence the weird `# type: ignore` 40 the Union[A, B] constrain, hence the weird `# type: ignore`
41 annotations. 41 annotations.
42 """ 42 """
43 # Tarfiles can optionally support compression
44 # https://docs.python.org/3/library/tarfile.html#tarfile.open
45 compression = ''
46
43 def __init__(self, filename): 47 def __init__(self, filename):
44 super().__init__(filename) 48 super().__init__(filename)
45 self.archive_class = None # type: Optional[ArchiveClass] 49 self.archive_class = None # type: Optional[ArchiveClass]
@@ -134,7 +138,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
134 # pylint: disable=too-many-branches 138 # pylint: disable=too-many-branches
135 139
136 with self.archive_class(self.filename) as zin,\ 140 with self.archive_class(self.filename) as zin,\
137 self.archive_class(self.output_filename, 'w') as zout: 141 self.archive_class(self.output_filename, 'w' + self.compression) as zout:
138 142
139 temp_folder = tempfile.mkdtemp() 143 temp_folder = tempfile.mkdtemp()
140 abort = False 144 abort = False
@@ -212,7 +216,11 @@ class TarParser(ArchiveBasedAbstractParser):
212 mimetypes = {'application/x-tar'} 216 mimetypes = {'application/x-tar'}
213 def __init__(self, filename): 217 def __init__(self, filename):
214 super().__init__(filename) 218 super().__init__(filename)
215 self.archive_class = tarfile.TarFile 219 # yes, it's tarfile.TarFile.open and not tarfile.TarFile,
220 # as stated in the documentation:
221 # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile
222 # This is required to support compressed archives.
223 self.archive_class = tarfile.TarFile.open
216 self.member_class = tarfile.TarInfo 224 self.member_class = tarfile.TarInfo
217 225
218 def is_archive_valid(self): 226 def is_archive_valid(self):
@@ -259,6 +267,22 @@ class TarParser(ArchiveBasedAbstractParser):
259 assert isinstance(member, tarfile.TarInfo) # please mypy 267 assert isinstance(member, tarfile.TarInfo) # please mypy
260 return member.name 268 return member.name
261 269
270
271class TarGzParser(TarParser):
272 compression = ':gz'
273 mimetypes = {'application/x-tar+gz'}
274
275
276class TarBz2Parser(TarParser):
277 compression = ':bz2'
278 mimetypes = {'application/x-tar+bz2'}
279
280
281class TarXzParser(TarParser):
282 compression = ':xz'
283 mimetypes = {'application/x-tar+xz'}
284
285
262class ZipParser(ArchiveBasedAbstractParser): 286class ZipParser(ArchiveBasedAbstractParser):
263 mimetypes = {'application/zip'} 287 mimetypes = {'application/zip'}
264 def __init__(self, filename): 288 def __init__(self, filename):
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py
index e93ee4f..3931903 100644
--- a/libmat2/parser_factory.py
+++ b/libmat2/parser_factory.py
@@ -50,6 +50,10 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
50 if extension.lower() in UNSUPPORTED_EXTENSIONS: 50 if extension.lower() in UNSUPPORTED_EXTENSIONS:
51 return None, mtype 51 return None, mtype
52 52
53 if mtype == 'application/x-tar':
54 if extension[1:] in ('bz2', 'gz', 'xz'):
55 mtype = mtype + '+' + extension[1:]
56
53 for parser_class in _get_parsers(): # type: ignore 57 for parser_class in _get_parsers(): # type: ignore
54 if mtype in parser_class.mimetypes: 58 if mtype in parser_class.mimetypes:
55 try: 59 try:
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 1d2a22a..4f562e6 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -30,6 +30,14 @@ class TestParserFactory(unittest.TestCase):
30 self.assertEqual(mimetype, 'audio/mpeg') 30 self.assertEqual(mimetype, 'audio/mpeg')
31 self.assertEqual(parser.__class__, audio.MP3Parser) 31 self.assertEqual(parser.__class__, audio.MP3Parser)
32 32
33 def test_tarfile_double_extension_handling(self):
34 """ Test that our module auto-detection is handling sub-sub-classes """
35 with tarfile.TarFile.open('./tests/data/dirty.tar.bz2', 'w:bz2') as zout:
36 zout.add('./tests/data/dirty.jpg')
37 parser, mimetype = parser_factory.get_parser('./tests/data/dirty.tar.bz2')
38 self.assertEqual(mimetype, 'application/x-tar+bz2')
39 os.remove('./tests/data/dirty.tar.bz2')
40
33 41
34class TestParameterInjection(unittest.TestCase): 42class TestParameterInjection(unittest.TestCase):
35 def test_ver_injection(self): 43 def test_ver_injection(self):
@@ -719,7 +727,7 @@ class TestCleaning(unittest.TestCase):
719 os.remove('./tests/data/clean.cleaned.cleaned.css') 727 os.remove('./tests/data/clean.cleaned.cleaned.css')
720 728
721 def test_tar(self): 729 def test_tar(self):
722 with tarfile.TarFile('./tests/data/dirty.tar', 'w') as zout: 730 with tarfile.TarFile.open('./tests/data/dirty.tar', 'w') as zout:
723 zout.add('./tests/data/dirty.flac') 731 zout.add('./tests/data/dirty.flac')
724 zout.add('./tests/data/dirty.docx') 732 zout.add('./tests/data/dirty.docx')
725 zout.add('./tests/data/dirty.jpg') 733 zout.add('./tests/data/dirty.jpg')
@@ -752,3 +760,108 @@ class TestCleaning(unittest.TestCase):
752 os.remove('./tests/data/dirty.tar') 760 os.remove('./tests/data/dirty.tar')
753 os.remove('./tests/data/dirty.cleaned.tar') 761 os.remove('./tests/data/dirty.cleaned.tar')
754 os.remove('./tests/data/dirty.cleaned.cleaned.tar') 762 os.remove('./tests/data/dirty.cleaned.cleaned.tar')
763
764 def test_targz(self):
765 with tarfile.TarFile.open('./tests/data/dirty.tar.gz', 'w:gz') as zout:
766 zout.add('./tests/data/dirty.flac')
767 zout.add('./tests/data/dirty.docx')
768 zout.add('./tests/data/dirty.jpg')
769 p = archive.TarParser('./tests/data/dirty.tar.gz')
770 meta = p.get_meta()
771 self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
772
773 ret = p.remove_all()
774 self.assertTrue(ret)
775
776 p = archive.TarParser('./tests/data/dirty.cleaned.tar.gz')
777 self.assertEqual(p.get_meta(), {})
778 self.assertTrue(p.remove_all())
779
780 tmp_dir = tempfile.mkdtemp()
781 with tarfile.open('./tests/data/dirty.cleaned.tar.gz') as zout:
782 zout.extractall(path=tmp_dir)
783 zout.close()
784
785 number_of_files = 0
786 for root, _, fnames in os.walk(tmp_dir):
787 for f in fnames:
788 complete_path = os.path.join(root, f)
789 p, _ = parser_factory.get_parser(complete_path)
790 self.assertIsNotNone(p)
791 self.assertEqual(p.get_meta(), {})
792 number_of_files += 1
793 self.assertEqual(number_of_files, 3)
794
795 os.remove('./tests/data/dirty.tar.gz')
796 os.remove('./tests/data/dirty.cleaned.tar.gz')
797 os.remove('./tests/data/dirty.cleaned.cleaned.tar.gz')
798
799 def test_tarbz2(self):
800 with tarfile.TarFile.open('./tests/data/dirty.tar.bz2', 'w:bz2') as zout:
801 zout.add('./tests/data/dirty.flac')
802 zout.add('./tests/data/dirty.docx')
803 zout.add('./tests/data/dirty.jpg')
804 p = archive.TarParser('./tests/data/dirty.tar.bz2')
805 meta = p.get_meta()
806 self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
807
808 ret = p.remove_all()
809 self.assertTrue(ret)
810
811 p = archive.TarParser('./tests/data/dirty.cleaned.tar.bz2')
812 self.assertEqual(p.get_meta(), {})
813 self.assertTrue(p.remove_all())
814
815 tmp_dir = tempfile.mkdtemp()
816 with tarfile.open('./tests/data/dirty.cleaned.tar.bz2') as zout:
817 zout.extractall(path=tmp_dir)
818 zout.close()
819
820 number_of_files = 0
821 for root, _, fnames in os.walk(tmp_dir):
822 for f in fnames:
823 complete_path = os.path.join(root, f)
824 p, _ = parser_factory.get_parser(complete_path)
825 self.assertIsNotNone(p)
826 self.assertEqual(p.get_meta(), {})
827 number_of_files += 1
828 self.assertEqual(number_of_files, 3)
829
830 os.remove('./tests/data/dirty.tar.bz2')
831 os.remove('./tests/data/dirty.cleaned.tar.bz2')
832 os.remove('./tests/data/dirty.cleaned.cleaned.tar.bz2')
833
834 def test_tarxz(self):
835 with tarfile.TarFile.open('./tests/data/dirty.tar.xz', 'w:xz') as zout:
836 zout.add('./tests/data/dirty.flac')
837 zout.add('./tests/data/dirty.docx')
838 zout.add('./tests/data/dirty.jpg')
839 p = archive.TarParser('./tests/data/dirty.tar.xz')
840 meta = p.get_meta()
841 self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
842
843 ret = p.remove_all()
844 self.assertTrue(ret)
845
846 p = archive.TarParser('./tests/data/dirty.cleaned.tar.xz')
847 self.assertEqual(p.get_meta(), {})
848 self.assertTrue(p.remove_all())
849
850 tmp_dir = tempfile.mkdtemp()
851 with tarfile.open('./tests/data/dirty.cleaned.tar.xz') as zout:
852 zout.extractall(path=tmp_dir)
853 zout.close()
854
855 number_of_files = 0
856 for root, _, fnames in os.walk(tmp_dir):
857 for f in fnames:
858 complete_path = os.path.join(root, f)
859 p, _ = parser_factory.get_parser(complete_path)
860 self.assertIsNotNone(p)
861 self.assertEqual(p.get_meta(), {})
862 number_of_files += 1
863 self.assertEqual(number_of_files, 3)
864
865 os.remove('./tests/data/dirty.tar.xz')
866 os.remove('./tests/data/dirty.cleaned.tar.xz')
867 os.remove('./tests/data/dirty.cleaned.cleaned.tar.xz')