summaryrefslogtreecommitdiff
path: root/libmat2
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2')
-rw-r--r--libmat2/__init__.py7
-rw-r--r--libmat2/abstract.py8
-rw-r--r--libmat2/archive.py32
-rw-r--r--libmat2/audio.py15
-rw-r--r--libmat2/bubblewrap.py4
-rw-r--r--libmat2/epub.py7
-rw-r--r--libmat2/exiftool.py6
-rw-r--r--libmat2/harmless.py4
-rw-r--r--libmat2/images.py13
-rw-r--r--libmat2/office.py15
-rw-r--r--libmat2/pdf.py7
-rw-r--r--libmat2/torrent.py16
-rw-r--r--libmat2/video.py8
-rw-r--r--libmat2/web.py16
14 files changed, 81 insertions, 77 deletions
diff --git a/libmat2/__init__.py b/libmat2/__init__.py
index 762686f..2f20265 100644
--- a/libmat2/__init__.py
+++ b/libmat2/__init__.py
@@ -2,7 +2,7 @@
2 2
3import enum 3import enum
4import importlib 4import importlib
5from typing import Optional, Union 5from typing import Optional, Union, Dict
6 6
7from . import exiftool, video 7from . import exiftool, video
8 8
@@ -66,8 +66,9 @@ CMD_DEPENDENCIES = {
66 }, 66 },
67} 67}
68 68
69def check_dependencies() -> dict[str, dict[str, bool]]: 69
70 ret = dict() # type: dict[str, dict] 70def check_dependencies() -> Dict[str, Dict[str, bool]]:
71 ret = dict() # type: Dict[str, Dict]
71 72
72 for key, value in DEPENDENCIES.items(): 73 for key, value in DEPENDENCIES.items():
73 ret[key] = { 74 ret[key] = {
diff --git a/libmat2/abstract.py b/libmat2/abstract.py
index 426ccfc..1aff630 100644
--- a/libmat2/abstract.py
+++ b/libmat2/abstract.py
@@ -1,7 +1,7 @@
1import abc 1import abc
2import os 2import os
3import re 3import re
4from typing import Union 4from typing import Union, Set, Dict
5 5
6 6
7class AbstractParser(abc.ABC): 7class AbstractParser(abc.ABC):
@@ -9,8 +9,8 @@ class AbstractParser(abc.ABC):
9 It might yield `ValueError` on instantiation on invalid files, 9 It might yield `ValueError` on instantiation on invalid files,
10 and `RuntimeError` when something went wrong in `remove_all`. 10 and `RuntimeError` when something went wrong in `remove_all`.
11 """ 11 """
12 meta_list = set() # type: set[str] 12 meta_list = set() # type: Set[str]
13 mimetypes = set() # type: set[str] 13 mimetypes = set() # type: Set[str]
14 14
15 def __init__(self, filename: str) -> None: 15 def __init__(self, filename: str) -> None:
16 """ 16 """
@@ -33,7 +33,7 @@ class AbstractParser(abc.ABC):
33 self.sandbox = True 33 self.sandbox = True
34 34
35 @abc.abstractmethod 35 @abc.abstractmethod
36 def get_meta(self) -> dict[str, Union[str, dict]]: 36 def get_meta(self) -> Dict[str, Union[str, Dict]]:
37 """Return all the metadata of the current file""" 37 """Return all the metadata of the current file"""
38 38
39 @abc.abstractmethod 39 @abc.abstractmethod
diff --git a/libmat2/archive.py b/libmat2/archive.py
index 25ff7f9..cbedcd2 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -7,7 +7,7 @@ import tempfile
7import os 7import os
8import logging 8import logging
9import shutil 9import shutil
10from typing import Pattern, Union, Any 10from typing import Pattern, Union, Any, Set, Dict, List
11 11
12from . import abstract, UnknownMemberPolicy, parser_factory 12from . import abstract, UnknownMemberPolicy, parser_factory
13 13
@@ -44,16 +44,16 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
44 def __init__(self, filename): 44 def __init__(self, filename):
45 super().__init__(filename) 45 super().__init__(filename)
46 # We ignore typing here because mypy is too stupid 46 # We ignore typing here because mypy is too stupid
47 self.archive_class = None # type: ignore 47 self.archive_class = None # type: ignore
48 self.member_class = None # type: ignore 48 self.member_class = None # type: ignore
49 49
50 # Those are the files that have a format that _isn't_ 50 # Those are the files that have a format that _isn't_
51 # supported by mat2, but that we want to keep anyway. 51 # supported by mat2, but that we want to keep anyway.
52 self.files_to_keep = set() # type: set[Pattern] 52 self.files_to_keep = set() # type: Set[Pattern]
53 53
54 # Those are the files that we _do not_ want to keep, 54 # Those are the files that we _do not_ want to keep,
55 # no matter if they are supported or not. 55 # no matter if they are supported or not.
56 self.files_to_omit = set() # type: set[Pattern] 56 self.files_to_omit = set() # type: Set[Pattern]
57 57
58 # what should the parser do if it encounters an unknown file in 58 # what should the parser do if it encounters an unknown file in
59 # the archive? 59 # the archive?
@@ -72,7 +72,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
72 # pylint: disable=unused-argument 72 # pylint: disable=unused-argument
73 return True # pragma: no cover 73 return True # pragma: no cover
74 74
75 def _specific_get_meta(self, full_path: str, file_path: str) -> dict[str, Any]: 75 def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
76 """ This method can be used to extract specific metadata 76 """ This method can be used to extract specific metadata
77 from files present in the archive.""" 77 from files present in the archive."""
78 # pylint: disable=unused-argument 78 # pylint: disable=unused-argument
@@ -87,7 +87,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
87 87
88 @staticmethod 88 @staticmethod
89 @abc.abstractmethod 89 @abc.abstractmethod
90 def _get_all_members(archive: ArchiveClass) -> list[ArchiveMember]: 90 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
91 """Return all the members of the archive.""" 91 """Return all the members of the archive."""
92 92
93 @staticmethod 93 @staticmethod
@@ -97,7 +97,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
97 97
98 @staticmethod 98 @staticmethod
99 @abc.abstractmethod 99 @abc.abstractmethod
100 def _get_member_meta(member: ArchiveMember) -> dict[str, str]: 100 def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
101 """Return all the metadata of a given member.""" 101 """Return all the metadata of a given member."""
102 102
103 @staticmethod 103 @staticmethod
@@ -128,8 +128,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
128 # pylint: disable=unused-argument 128 # pylint: disable=unused-argument
129 return member 129 return member
130 130
131 def get_meta(self) -> dict[str, Union[str, dict]]: 131 def get_meta(self) -> dict[str, Union[str, Dict]]:
132 meta = dict() # type: dict[str, Union[str, dict]] 132 meta = dict() # type: Dict[str, Union[str, Dict]]
133 133
134 with self.archive_class(self.filename) as zin: 134 with self.archive_class(self.filename) as zin:
135 temp_folder = tempfile.mkdtemp() 135 temp_folder = tempfile.mkdtemp()
@@ -264,6 +264,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
264 264
265class TarParser(ArchiveBasedAbstractParser): 265class TarParser(ArchiveBasedAbstractParser):
266 mimetypes = {'application/x-tar'} 266 mimetypes = {'application/x-tar'}
267
267 def __init__(self, filename): 268 def __init__(self, filename):
268 super().__init__(filename) 269 super().__init__(filename)
269 # yes, it's tarfile.open and not tarfile.TarFile, 270 # yes, it's tarfile.open and not tarfile.TarFile,
@@ -336,7 +337,7 @@ class TarParser(ArchiveBasedAbstractParser):
336 return member 337 return member
337 338
338 @staticmethod 339 @staticmethod
339 def _get_member_meta(member: ArchiveMember) -> dict[str, str]: 340 def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
340 assert isinstance(member, tarfile.TarInfo) # please mypy 341 assert isinstance(member, tarfile.TarInfo) # please mypy
341 metadata = {} 342 metadata = {}
342 if member.mtime != 0: 343 if member.mtime != 0:
@@ -358,7 +359,7 @@ class TarParser(ArchiveBasedAbstractParser):
358 archive.add(full_path, member.name, filter=TarParser._clean_member) # type: ignore 359 archive.add(full_path, member.name, filter=TarParser._clean_member) # type: ignore
359 360
360 @staticmethod 361 @staticmethod
361 def _get_all_members(archive: ArchiveClass) -> list[ArchiveMember]: 362 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
362 assert isinstance(archive, tarfile.TarFile) # please mypy 363 assert isinstance(archive, tarfile.TarFile) # please mypy
363 return archive.getmembers() # type: ignore 364 return archive.getmembers() # type: ignore
364 365
@@ -391,7 +392,8 @@ class TarXzParser(TarParser):
391 392
392class ZipParser(ArchiveBasedAbstractParser): 393class ZipParser(ArchiveBasedAbstractParser):
393 mimetypes = {'application/zip'} 394 mimetypes = {'application/zip'}
394 def __init__(self, filename): 395
396 def __init__(self, filename: str):
395 super().__init__(filename) 397 super().__init__(filename)
396 self.archive_class = zipfile.ZipFile 398 self.archive_class = zipfile.ZipFile
397 self.member_class = zipfile.ZipInfo 399 self.member_class = zipfile.ZipInfo
@@ -412,7 +414,7 @@ class ZipParser(ArchiveBasedAbstractParser):
412 return member 414 return member
413 415
414 @staticmethod 416 @staticmethod
415 def _get_member_meta(member: ArchiveMember) -> dict[str, str]: 417 def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
416 assert isinstance(member, zipfile.ZipInfo) # please mypy 418 assert isinstance(member, zipfile.ZipInfo) # please mypy
417 metadata = {} 419 metadata = {}
418 if member.create_system == 3: # this is Linux 420 if member.create_system == 3: # this is Linux
@@ -439,7 +441,7 @@ class ZipParser(ArchiveBasedAbstractParser):
439 compress_type=member.compress_type) 441 compress_type=member.compress_type)
440 442
441 @staticmethod 443 @staticmethod
442 def _get_all_members(archive: ArchiveClass) -> list[ArchiveMember]: 444 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
443 assert isinstance(archive, zipfile.ZipFile) # please mypy 445 assert isinstance(archive, zipfile.ZipFile) # please mypy
444 return archive.infolist() # type: ignore 446 return archive.infolist() # type: ignore
445 447
diff --git a/libmat2/audio.py b/libmat2/audio.py
index 366d451..aa4afdb 100644
--- a/libmat2/audio.py
+++ b/libmat2/audio.py
@@ -2,7 +2,7 @@ import mimetypes
2import os 2import os
3import shutil 3import shutil
4import tempfile 4import tempfile
5from typing import Union 5from typing import Union, Dict
6 6
7import mutagen 7import mutagen
8 8
@@ -18,10 +18,10 @@ class MutagenParser(abstract.AbstractParser):
18 except mutagen.MutagenError: 18 except mutagen.MutagenError:
19 raise ValueError 19 raise ValueError
20 20
21 def get_meta(self) -> dict[str, Union[str, dict]]: 21 def get_meta(self) -> Dict[str, Union[str, Dict]]:
22 f = mutagen.File(self.filename) 22 f = mutagen.File(self.filename)
23 if f.tags: 23 if f.tags:
24 return {k:', '.join(map(str, v)) for k, v in f.tags.items()} 24 return {k: ', '.join(map(str, v)) for k, v in f.tags.items()}
25 return {} 25 return {}
26 26
27 def remove_all(self) -> bool: 27 def remove_all(self) -> bool:
@@ -38,8 +38,8 @@ class MutagenParser(abstract.AbstractParser):
38class MP3Parser(MutagenParser): 38class MP3Parser(MutagenParser):
39 mimetypes = {'audio/mpeg', } 39 mimetypes = {'audio/mpeg', }
40 40
41 def get_meta(self) -> dict[str, Union[str, dict]]: 41 def get_meta(self) -> Dict[str, Union[str, Dict]]:
42 metadata = {} # type: dict[str, Union[str, dict]] 42 metadata = {} # type: Dict[str, Union[str, Dict]]
43 meta = mutagen.File(self.filename).tags 43 meta = mutagen.File(self.filename).tags
44 if not meta: 44 if not meta:
45 return metadata 45 return metadata
@@ -68,12 +68,12 @@ class FLACParser(MutagenParser):
68 f.save(deleteid3=True) 68 f.save(deleteid3=True)
69 return True 69 return True
70 70
71 def get_meta(self) -> dict[str, Union[str, dict]]: 71 def get_meta(self) -> Dict[str, Union[str, Dict]]:
72 meta = super().get_meta() 72 meta = super().get_meta()
73 for num, picture in enumerate(mutagen.File(self.filename).pictures): 73 for num, picture in enumerate(mutagen.File(self.filename).pictures):
74 name = picture.desc if picture.desc else 'Cover %d' % num 74 name = picture.desc if picture.desc else 'Cover %d' % num
75 extension = mimetypes.guess_extension(picture.mime) 75 extension = mimetypes.guess_extension(picture.mime)
76 if extension is None: # pragma: no cover 76 if extension is None: # pragma: no cover
77 meta[name] = 'harmful data' 77 meta[name] = 'harmful data'
78 continue 78 continue
79 79
@@ -98,6 +98,7 @@ class WAVParser(video.AbstractFFmpegParser):
98 'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile', 98 'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
99 } 99 }
100 100
101
101class AIFFParser(video.AbstractFFmpegParser): 102class AIFFParser(video.AbstractFFmpegParser):
102 mimetypes = {'audio/aiff', 'audio/x-aiff'} 103 mimetypes = {'audio/aiff', 'audio/x-aiff'}
103 meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory', 104 meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
diff --git a/libmat2/bubblewrap.py b/libmat2/bubblewrap.py
index 0e202b9..e59f111 100644
--- a/libmat2/bubblewrap.py
+++ b/libmat2/bubblewrap.py
@@ -12,7 +12,7 @@ import shutil
12import subprocess 12import subprocess
13import tempfile 13import tempfile
14import functools 14import functools
15from typing import Optional 15from typing import Optional, List
16 16
17 17
18__all__ = ['PIPE', 'run', 'CalledProcessError'] 18__all__ = ['PIPE', 'run', 'CalledProcessError']
@@ -33,7 +33,7 @@ def _get_bwrap_path() -> str:
33 33
34def _get_bwrap_args(tempdir: str, 34def _get_bwrap_args(tempdir: str,
35 input_filename: str, 35 input_filename: str,
36 output_filename: Optional[str] = None) -> list[str]: 36 output_filename: Optional[str] = None) -> List[str]:
37 ro_bind_args = [] 37 ro_bind_args = []
38 cwd = os.getcwd() 38 cwd = os.getcwd()
39 39
diff --git a/libmat2/epub.py b/libmat2/epub.py
index 7613d35..3c5046a 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -3,10 +3,11 @@ import re
3import uuid 3import uuid
4import zipfile 4import zipfile
5import xml.etree.ElementTree as ET # type: ignore 5import xml.etree.ElementTree as ET # type: ignore
6from typing import Any 6from typing import Any, Dict
7 7
8from . import archive, office 8from . import archive, office
9 9
10
10class EPUBParser(archive.ZipParser): 11class EPUBParser(archive.ZipParser):
11 mimetypes = {'application/epub+zip', } 12 mimetypes = {'application/epub+zip', }
12 metadata_namespace = '{http://purl.org/dc/elements/1.1/}' 13 metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
@@ -28,7 +29,6 @@ class EPUBParser(archive.ZipParser):
28 })) 29 }))
29 self.uniqid = uuid.uuid4() 30 self.uniqid = uuid.uuid4()
30 31
31
32 def is_archive_valid(self): 32 def is_archive_valid(self):
33 super().is_archive_valid() 33 super().is_archive_valid()
34 with zipfile.ZipFile(self.filename) as zin: 34 with zipfile.ZipFile(self.filename) as zin:
@@ -37,7 +37,7 @@ class EPUBParser(archive.ZipParser):
37 if member_name.endswith('META-INF/encryption.xml'): 37 if member_name.endswith('META-INF/encryption.xml'):
38 raise ValueError('the file contains encrypted fonts') 38 raise ValueError('the file contains encrypted fonts')
39 39
40 def _specific_get_meta(self, full_path, file_path) -> dict[str, Any]: 40 def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
41 if not file_path.endswith('.opf'): 41 if not file_path.endswith('.opf'):
42 return {} 42 return {}
43 43
@@ -73,7 +73,6 @@ class EPUBParser(archive.ZipParser):
73 short_empty_elements=False) 73 short_empty_elements=False)
74 return True 74 return True
75 75
76
77 def __handle_tocncx(self, full_path: str) -> bool: 76 def __handle_tocncx(self, full_path: str) -> bool:
78 try: 77 try:
79 tree, namespace = office._parse_xml(full_path) 78 tree, namespace = office._parse_xml(full_path)
diff --git a/libmat2/exiftool.py b/libmat2/exiftool.py
index cdfce3d..5979a64 100644
--- a/libmat2/exiftool.py
+++ b/libmat2/exiftool.py
@@ -4,7 +4,7 @@ import logging
4import os 4import os
5import shutil 5import shutil
6import subprocess 6import subprocess
7from typing import Union 7from typing import Union, Set, Dict
8 8
9from . import abstract 9from . import abstract
10from . import bubblewrap 10from . import bubblewrap
@@ -15,9 +15,9 @@ class ExiftoolParser(abstract.AbstractParser):
15 from a import file, hence why several parsers are re-using its `get_meta` 15 from a import file, hence why several parsers are re-using its `get_meta`
16 method. 16 method.
17 """ 17 """
18 meta_allowlist = set() # type: set[str] 18 meta_allowlist = set() # type: Set[str]
19 19
20 def get_meta(self) -> dict[str, Union[str, dict]]: 20 def get_meta(self) -> Dict[str, Union[str, Dict]]:
21 try: 21 try:
22 if self.sandbox: 22 if self.sandbox:
23 out = bubblewrap.run([_get_exiftool_path(), '-json', 23 out = bubblewrap.run([_get_exiftool_path(), '-json',
diff --git a/libmat2/harmless.py b/libmat2/harmless.py
index 8688a9d..42b6eda 100644
--- a/libmat2/harmless.py
+++ b/libmat2/harmless.py
@@ -1,5 +1,5 @@
1import shutil 1import shutil
2from typing import Union 2from typing import Union, Dict
3from . import abstract 3from . import abstract
4 4
5 5
@@ -7,7 +7,7 @@ class HarmlessParser(abstract.AbstractParser):
7 """ This is the parser for filetypes that can not contain metadata. """ 7 """ This is the parser for filetypes that can not contain metadata. """
8 mimetypes = {'text/plain', 'image/x-ms-bmp'} 8 mimetypes = {'text/plain', 'image/x-ms-bmp'}
9 9
10 def get_meta(self) -> dict[str, Union[str, dict]]: 10 def get_meta(self) -> Dict[str, Union[str, Dict]]:
11 return dict() 11 return dict()
12 12
13 def remove_all(self) -> bool: 13 def remove_all(self) -> bool:
diff --git a/libmat2/images.py b/libmat2/images.py
index 083ff64..e7cdf5a 100644
--- a/libmat2/images.py
+++ b/libmat2/images.py
@@ -1,6 +1,6 @@
1import os 1import os
2import re 2import re
3from typing import Union, Any 3from typing import Union, Any, Dict
4 4
5import cairo 5import cairo
6 6
@@ -48,7 +48,7 @@ class SVGParser(exiftool.ExiftoolParser):
48 surface.finish() 48 surface.finish()
49 return True 49 return True
50 50
51 def get_meta(self) -> dict[str, Union[str, dict]]: 51 def get_meta(self) -> Dict[str, Union[str, Dict]]:
52 meta = super().get_meta() 52 meta = super().get_meta()
53 53
54 # The namespace is mandatory, but only the …/2000/svg is valid. 54 # The namespace is mandatory, but only the …/2000/svg is valid.
@@ -57,6 +57,7 @@ class SVGParser(exiftool.ExiftoolParser):
57 meta.pop('Xmlns') 57 meta.pop('Xmlns')
58 return meta 58 return meta
59 59
60
60class PNGParser(exiftool.ExiftoolParser): 61class PNGParser(exiftool.ExiftoolParser):
61 mimetypes = {'image/png', } 62 mimetypes = {'image/png', }
62 meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 63 meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
@@ -156,11 +157,12 @@ class TiffParser(GdkPixbufAbstractParser):
156 'FileTypeExtension', 'ImageHeight', 'ImageSize', 157 'FileTypeExtension', 'ImageHeight', 'ImageSize',
157 'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'} 158 'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
158 159
160
159class PPMParser(abstract.AbstractParser): 161class PPMParser(abstract.AbstractParser):
160 mimetypes = {'image/x-portable-pixmap'} 162 mimetypes = {'image/x-portable-pixmap'}
161 163
162 def get_meta(self) -> dict[str, Union[str, dict]]: 164 def get_meta(self) -> Dict[str, Union[str, Dict]]:
163 meta = {} # type: dict[str, Union[str, dict[Any, Any]]] 165 meta = {} # type: Dict[str, Union[str, Dict[Any, Any]]]
164 with open(self.filename) as f: 166 with open(self.filename) as f:
165 for idx, line in enumerate(f): 167 for idx, line in enumerate(f):
166 if line.lstrip().startswith('#'): 168 if line.lstrip().startswith('#'):
@@ -176,9 +178,10 @@ class PPMParser(abstract.AbstractParser):
176 fout.write(line) 178 fout.write(line)
177 return True 179 return True
178 180
181
179class HEICParser(exiftool.ExiftoolParser): 182class HEICParser(exiftool.ExiftoolParser):
180 mimetypes = {'image/heic'} 183 mimetypes = {'image/heic'}
181 meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName','Directory', 184 meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
182 'FileSize', 'FileModifyDate', 'FileAccessDate', 185 'FileSize', 'FileModifyDate', 'FileAccessDate',
183 'FileInodeChangeDate', 'FilePermissions', 'FileType', 186 'FileInodeChangeDate', 'FilePermissions', 'FileType',
184 'FileTypeExtension', 'MIMEType', 'MajorBrand', 'MinorVersion', 187 'FileTypeExtension', 'MIMEType', 'MajorBrand', 'MinorVersion',
diff --git a/libmat2/office.py b/libmat2/office.py
index 8ccaa02..87a0b7e 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -4,7 +4,7 @@ import logging
4import os 4import os
5import re 5import re
6import zipfile 6import zipfile
7from typing import Pattern, Any 7from typing import Pattern, Any, Tuple, Dict
8 8
9import xml.etree.ElementTree as ET # type: ignore 9import xml.etree.ElementTree as ET # type: ignore
10 10
@@ -12,7 +12,8 @@ from .archive import ZipParser
12 12
13# pylint: disable=line-too-long 13# pylint: disable=line-too-long
14 14
15def _parse_xml(full_path: str) -> tuple[ET.ElementTree, dict[str, str]]: 15
16def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
16 """ This function parses XML, with namespace support. """ 17 """ This function parses XML, with namespace support. """
17 namespace_map = dict() 18 namespace_map = dict()
18 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): 19 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
@@ -68,7 +69,6 @@ class MSOfficeParser(ZipParser):
68 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', 69 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
69 } 70 }
70 71
71
72 def __init__(self, filename): 72 def __init__(self, filename):
73 super().__init__(filename) 73 super().__init__(filename)
74 74
@@ -218,7 +218,7 @@ class MSOfficeParser(ZipParser):
218 if 'w' not in namespace: 218 if 'w' not in namespace:
219 return True 219 return True
220 220
221 parent_map = {c:p for p in tree.iter() for c in p} 221 parent_map = {c: p for p in tree.iter() for c in p}
222 222
223 elements_to_remove = list() 223 elements_to_remove = list()
224 for element in tree.iterfind('.//w:nsid', namespace): 224 for element in tree.iterfind('.//w:nsid', namespace):
@@ -229,7 +229,6 @@ class MSOfficeParser(ZipParser):
229 tree.write(full_path, xml_declaration=True) 229 tree.write(full_path, xml_declaration=True)
230 return True 230 return True
231 231
232
233 @staticmethod 232 @staticmethod
234 def __remove_revisions(full_path: str) -> bool: 233 def __remove_revisions(full_path: str) -> bool:
235 try: 234 try:
@@ -319,7 +318,6 @@ class MSOfficeParser(ZipParser):
319 for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content): 318 for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
320 self.__counters['cNvPr'].add(int(i)) 319 self.__counters['cNvPr'].add(int(i))
321 320
322
323 @staticmethod 321 @staticmethod
324 def __randomize_creationId(full_path: str) -> bool: 322 def __randomize_creationId(full_path: str) -> bool:
325 try: 323 try:
@@ -441,8 +439,8 @@ class MSOfficeParser(ZipParser):
441 439
442 with open(full_path, encoding='utf-8') as f: 440 with open(full_path, encoding='utf-8') as f:
443 try: 441 try:
444 results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M) 442 results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I | re.M)
445 return {k:v for (k, v) in results} 443 return {k: v for (k, v) in results}
446 except (TypeError, UnicodeDecodeError): 444 except (TypeError, UnicodeDecodeError):
447 # We didn't manage to parse the xml file 445 # We didn't manage to parse the xml file
448 return {file_path: 'harmful content', } 446 return {file_path: 'harmful content', }
@@ -459,7 +457,6 @@ class LibreOfficeParser(ZipParser):
459 'application/vnd.oasis.opendocument.image', 457 'application/vnd.oasis.opendocument.image',
460 } 458 }
461 459
462
463 def __init__(self, filename): 460 def __init__(self, filename):
464 super().__init__(filename) 461 super().__init__(filename)
465 462
diff --git a/libmat2/pdf.py b/libmat2/pdf.py
index 63ed9c1..8c3055f 100644
--- a/libmat2/pdf.py
+++ b/libmat2/pdf.py
@@ -7,7 +7,7 @@ import re
7import logging 7import logging
8import tempfile 8import tempfile
9import io 9import io
10from typing import Union 10from typing import Union, Dict
11 11
12import cairo 12import cairo
13import gi 13import gi
@@ -18,6 +18,7 @@ from . import abstract
18 18
19FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5 19FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5
20 20
21
21class PDFParser(abstract.AbstractParser): 22class PDFParser(abstract.AbstractParser):
22 mimetypes = {'application/pdf', } 23 mimetypes = {'application/pdf', }
23 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', 24 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
@@ -140,13 +141,13 @@ class PDFParser(abstract.AbstractParser):
140 return True 141 return True
141 142
142 @staticmethod 143 @staticmethod
143 def __parse_metadata_field(data: str) -> dict[str, str]: 144 def __parse_metadata_field(data: str) -> Dict[str, str]:
144 metadata = {} 145 metadata = {}
145 for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I): 146 for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
146 metadata[key] = value 147 metadata[key] = value
147 return metadata 148 return metadata
148 149
149 def get_meta(self) -> dict[str, Union[str, dict]]: 150 def get_meta(self) -> Dict[str, Union[str, Dict]]:
150 """ Return a dict with all the meta of the file 151 """ Return a dict with all the meta of the file
151 """ 152 """
152 metadata = {} 153 metadata = {}
diff --git a/libmat2/torrent.py b/libmat2/torrent.py
index c547a20..e6407ff 100644
--- a/libmat2/torrent.py
+++ b/libmat2/torrent.py
@@ -1,5 +1,5 @@
1import logging 1import logging
2from typing import Union 2from typing import Union, Dict, List, Tuple
3 3
4from . import abstract 4from . import abstract
5 5
@@ -15,7 +15,7 @@ class TorrentParser(abstract.AbstractParser):
15 if self.dict_repr is None: 15 if self.dict_repr is None:
16 raise ValueError 16 raise ValueError
17 17
18 def get_meta(self) -> dict[str, Union[str, dict]]: 18 def get_meta(self) -> Dict[str, Union[str, Dict]]:
19 metadata = {} 19 metadata = {}
20 for key, value in self.dict_repr.items(): 20 for key, value in self.dict_repr.items():
21 if key not in self.allowlist: 21 if key not in self.allowlist:
@@ -56,7 +56,7 @@ class _BencodeHandler:
56 } 56 }
57 57
58 @staticmethod 58 @staticmethod
59 def __decode_int(s: bytes) -> tuple[int, bytes]: 59 def __decode_int(s: bytes) -> Tuple[int, bytes]:
60 s = s[1:] 60 s = s[1:]
61 next_idx = s.index(b'e') 61 next_idx = s.index(b'e')
62 if s.startswith(b'-0'): 62 if s.startswith(b'-0'):
@@ -66,7 +66,7 @@ class _BencodeHandler:
66 return int(s[:next_idx]), s[next_idx+1:] 66 return int(s[:next_idx]), s[next_idx+1:]
67 67
68 @staticmethod 68 @staticmethod
69 def __decode_string(s: bytes) -> tuple[bytes, bytes]: 69 def __decode_string(s: bytes) -> Tuple[bytes, bytes]:
70 colon = s.index(b':') 70 colon = s.index(b':')
71 # FIXME Python3 is broken here, the call to `ord` shouldn't be needed, 71 # FIXME Python3 is broken here, the call to `ord` shouldn't be needed,
72 # but apparently it is. This is utterly idiotic. 72 # but apparently it is. This is utterly idiotic.
@@ -76,7 +76,7 @@ class _BencodeHandler:
76 s = s[1:] 76 s = s[1:]
77 return s[colon:colon+str_len], s[colon+str_len:] 77 return s[colon:colon+str_len], s[colon+str_len:]
78 78
79 def __decode_list(self, s: bytes) -> tuple[list, bytes]: 79 def __decode_list(self, s: bytes) -> Tuple[List, bytes]:
80 ret = list() 80 ret = list()
81 s = s[1:] # skip leading `l` 81 s = s[1:] # skip leading `l`
82 while s[0] != ord('e'): 82 while s[0] != ord('e'):
@@ -84,7 +84,7 @@ class _BencodeHandler:
84 ret.append(value) 84 ret.append(value)
85 return ret, s[1:] 85 return ret, s[1:]
86 86
87 def __decode_dict(self, s: bytes) -> tuple[dict, bytes]: 87 def __decode_dict(self, s: bytes) -> Tuple[Dict, bytes]:
88 ret = dict() 88 ret = dict()
89 s = s[1:] # skip leading `d` 89 s = s[1:] # skip leading `d`
90 while s[0] != ord(b'e'): 90 while s[0] != ord(b'e'):
@@ -113,10 +113,10 @@ class _BencodeHandler:
113 ret += self.__encode_func[type(value)](value) 113 ret += self.__encode_func[type(value)](value)
114 return b'd' + ret + b'e' 114 return b'd' + ret + b'e'
115 115
116 def bencode(self, s: Union[dict, list, bytes, int]) -> bytes: 116 def bencode(self, s: Union[Dict, List, bytes, int]) -> bytes:
117 return self.__encode_func[type(s)](s) 117 return self.__encode_func[type(s)](s)
118 118
119 def bdecode(self, s: bytes) -> Union[dict, None]: 119 def bdecode(self, s: bytes) -> Union[Dict, None]:
120 try: 120 try:
121 ret, trail = self.__decode_func[s[0]](s) 121 ret, trail = self.__decode_func[s[0]](s)
122 except (IndexError, KeyError, ValueError) as e: 122 except (IndexError, KeyError, ValueError) as e:
diff --git a/libmat2/video.py b/libmat2/video.py
index 4d33aa4..772a89e 100644
--- a/libmat2/video.py
+++ b/libmat2/video.py
@@ -3,7 +3,7 @@ import functools
3import shutil 3import shutil
4import logging 4import logging
5 5
6from typing import Union 6from typing import Union, Dict
7 7
8from . import exiftool 8from . import exiftool
9from . import bubblewrap 9from . import bubblewrap
@@ -12,7 +12,7 @@ from . import bubblewrap
12class AbstractFFmpegParser(exiftool.ExiftoolParser): 12class AbstractFFmpegParser(exiftool.ExiftoolParser):
13 """ Abstract parser for all FFmpeg-based ones, mainly for video. """ 13 """ Abstract parser for all FFmpeg-based ones, mainly for video. """
14 # Some fileformats have mandatory metadata fields 14 # Some fileformats have mandatory metadata fields
15 meta_key_value_allowlist = {} # type: dict[str, Union[str, int]] 15 meta_key_value_allowlist = {} # type: Dict[str, Union[str, int]]
16 16
17 def remove_all(self) -> bool: 17 def remove_all(self) -> bool:
18 if self.meta_key_value_allowlist: 18 if self.meta_key_value_allowlist:
@@ -45,10 +45,10 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
45 return False 45 return False
46 return True 46 return True
47 47
48 def get_meta(self) -> dict[str, Union[str, dict]]: 48 def get_meta(self) -> Dict[str, Union[str, Dict]]:
49 meta = super().get_meta() 49 meta = super().get_meta()
50 50
51 ret = dict() # type: dict[str, Union[str, dict]] 51 ret = dict() # type: Dict[str, Union[str, Dict]]
52 for key, value in meta.items(): 52 for key, value in meta.items():
53 if key in self.meta_key_value_allowlist: 53 if key in self.meta_key_value_allowlist:
54 if value == self.meta_key_value_allowlist[key]: 54 if value == self.meta_key_value_allowlist[key]:
diff --git a/libmat2/web.py b/libmat2/web.py
index 574bdd7..f2938e2 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -1,5 +1,5 @@
1from html import parser, escape 1from html import parser, escape
2from typing import Any, Optional 2from typing import Any, Optional, Dict, List, Tuple, Set
3import re 3import re
4import string 4import string
5 5
@@ -25,7 +25,7 @@ class CSSParser(abstract.AbstractParser):
25 f.write(cleaned) 25 f.write(cleaned)
26 return True 26 return True
27 27
28 def get_meta(self) -> dict[str, Any]: 28 def get_meta(self) -> Dict[str, Any]:
29 metadata = {} 29 metadata = {}
30 with open(self.filename, encoding='utf-8') as f: 30 with open(self.filename, encoding='utf-8') as f:
31 try: 31 try:
@@ -44,10 +44,10 @@ class CSSParser(abstract.AbstractParser):
44 44
45 45
46class AbstractHTMLParser(abstract.AbstractParser): 46class AbstractHTMLParser(abstract.AbstractParser):
47 tags_blocklist = set() # type: set[str] 47 tags_blocklist = set() # type: Set[str]
48 # In some html/xml-based formats some tags are mandatory, 48 # In some html/xml-based formats some tags are mandatory,
49 # so we're keeping them, but are discarding their content 49 # so we're keeping them, but are discarding their content
50 tags_required_blocklist = set() # type: set[str] 50 tags_required_blocklist = set() # type: Set[str]
51 51
52 def __init__(self, filename): 52 def __init__(self, filename):
53 super().__init__(filename) 53 super().__init__(filename)
@@ -57,7 +57,7 @@ class AbstractHTMLParser(abstract.AbstractParser):
57 self.__parser.feed(f.read()) 57 self.__parser.feed(f.read())
58 self.__parser.close() 58 self.__parser.close()
59 59
60 def get_meta(self) -> dict[str, Any]: 60 def get_meta(self) -> Dict[str, Any]:
61 return self.__parser.get_meta() 61 return self.__parser.get_meta()
62 62
63 def remove_all(self) -> bool: 63 def remove_all(self) -> bool:
@@ -112,7 +112,7 @@ class _HTMLParser(parser.HTMLParser):
112 """ 112 """
113 raise ValueError(message) 113 raise ValueError(message)
114 114
115 def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]): 115 def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
116 # Ignore the type, because mypy is too stupid to infer 116 # Ignore the type, because mypy is too stupid to infer
117 # that get_starttag_text() can't return None. 117 # that get_starttag_text() can't return None.
118 original_tag = self.get_starttag_text() # type: ignore 118 original_tag = self.get_starttag_text() # type: ignore
@@ -159,7 +159,7 @@ class _HTMLParser(parser.HTMLParser):
159 self.__textrepr += escape(data) 159 self.__textrepr += escape(data)
160 160
161 def handle_startendtag(self, tag: str, 161 def handle_startendtag(self, tag: str,
162 attrs: list[tuple[str, Optional[str]]]): 162 attrs: List[Tuple[str, Optional[str]]]):
163 if tag in self.tag_required_blocklist | self.tag_blocklist: 163 if tag in self.tag_required_blocklist | self.tag_blocklist:
164 meta = {k:v for k, v in attrs} 164 meta = {k:v for k, v in attrs}
165 name = meta.get('name', 'harmful metadata') 165 name = meta.get('name', 'harmful metadata')
@@ -184,7 +184,7 @@ class _HTMLParser(parser.HTMLParser):
184 f.write(self.__textrepr) 184 f.write(self.__textrepr)
185 return True 185 return True
186 186
187 def get_meta(self) -> dict[str, Any]: 187 def get_meta(self) -> Dict[str, Any]:
188 if self.__validation_queue: 188 if self.__validation_queue:
189 raise ValueError("Some tags (%s) were left unclosed in %s" % ( 189 raise ValueError("Some tags (%s) were left unclosed in %s" % (
190 ', '.join(self.__validation_queue), 190 ', '.join(self.__validation_queue),