8 files changed, 79 insertions, 69 deletions
diff --git a/src/__init__.py b/src/__init__.py
index 3f5c478..07d3036 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -2,4 +2,5 @@
 # A set of extension that aren't supported, despite matching a supported mimetype
 unsupported_extensions = set(['bat', 'c', 'h', 'ksh', 'pl', 'txt', 'asc',
-    'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl', 'xpdl', 'xsl', 'xsd'])
+                              'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl',
+                              'xpdl', 'xsl', 'xsd'])
diff --git a/src/audio.py b/src/audio.py
index 4a385b2..3a6aa79 100644
--- a/src/audio.py
+++ b/src/audio.py
@@ -9,7 +9,7 @@ class MutagenParser(abstract.AbstractParser):
    def get_meta(self):
        f = mutagen.File(self.filename)
        if f.tags:
-            return {k:', '.join(v) for k,v in f.tags.items()}
+            return {k:', '.join(v) for k, v in f.tags.items()}
        return {}
    def remove_all(self):
diff --git a/src/harmless.py b/src/harmless.py
index fbc2897..aa00582 100644
--- a/src/harmless.py
+++ b/src/harmless.py
@@ -6,6 +6,7 @@ class HarmlessParser(abstract.AbstractParser):
    mimetypes = {'application/xml', 'text/plain'}
    def __init__(self, filename: str):
+        super().__init__(filename)
        self.filename = filename
        self.output_filename = filename
diff --git a/src/images.py b/src/images.py
index 6cc3dfe..c84952a 100644
--- a/src/images.py
+++ b/src/images.py
@@ -14,11 +14,12 @@ from . import abstract
 class PNGParser(abstract.AbstractParser):
    mimetypes = {'image/png', }
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
-            'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate',
+                      'Directory', 'FileSize', 'FileModifyDate',
-            "FileInodeChangeDate", 'FilePermissions', 'FileType',
+                      'FileAccessDate', 'FileInodeChangeDate',
-            'FileTypeExtension', 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType',
+                      'FilePermissions', 'FileType', 'FileTypeExtension',
-            'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize',
+                      'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType',
-            'Megapixels', 'ImageHeight'}
+                      'Compression', 'Filter', 'Interlace', 'BackgroundColor',
+                      'ImageSize', 'Megapixels', 'ImageHeight'}
    def __init__(self, filename):
        super().__init__(filename)
@@ -63,36 +64,38 @@ class GdkPixbufAbstractParser(abstract.AbstractParser):
 class JPGParser(GdkPixbufAbstractParser):
    mimetypes = {'image/jpeg'}
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
-            'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate',
+                      'Directory', 'FileSize', 'FileModifyDate',
-            "FileInodeChangeDate", 'FilePermissions', 'FileType',
+                      'FileAccessDate', "FileInodeChangeDate",
-            'FileTypeExtension', 'MIMEType', 'ImageWidth',
+                      'FilePermissions', 'FileType', 'FileTypeExtension',
-            'ImageSize', 'BitsPerSample', 'ColorComponents', 'EncodingProcess',
+                      'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
-            'JFIFVersion', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
+                      'ColorComponents', 'EncodingProcess', 'JFIFVersion',
-            'YResolution', 'Megapixels', 'ImageHeight'}
+                      'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
+                      'YResolution', 'Megapixels', 'ImageHeight'}
 class TiffParser(GdkPixbufAbstractParser):
    mimetypes = {'image/tiff'}
    meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
-            'FillOrder', 'PhotometricInterpretation', 'PlanarConfiguration',
+                      'FillOrder', 'PhotometricInterpretation',
-            'RowsPerStrip', 'SamplesPerPixel', 'StripByteCounts',
+                      'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel',
-            'StripOffsets', 'BitsPerSample', 'Directory', 'ExifToolVersion',
+                      'StripByteCounts', 'StripOffsets', 'BitsPerSample',
-            'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate',
+                      'Directory', 'ExifToolVersion', 'FileAccessDate',
-            'FileName', 'FilePermissions', 'FileSize', 'FileType',
+                      'FileInodeChangeDate', 'FileModifyDate', 'FileName',
-            'FileTypeExtension', 'ImageHeight', 'ImageSize', 'ImageWidth',
+                      'FilePermissions', 'FileSize', 'FileType',
-            'MIMEType', 'Megapixels', 'SourceFile'}
+                      'FileTypeExtension', 'ImageHeight', 'ImageSize',
+                      'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
 class BMPParser(GdkPixbufAbstractParser):
    mimetypes = {'image/x-ms-bmp'}
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
-            'FileSize', 'FileModifyDate', 'FileAccessDate',
+                      'FileSize', 'FileModifyDate', 'FileAccessDate',
-            'FileInodeChangeDate', 'FilePermissions', 'FileType',
+                      'FileInodeChangeDate', 'FilePermissions', 'FileType',
-            'FileTypeExtension', 'MIMEType', 'BMPVersion', 'ImageWidth',
+                      'FileTypeExtension', 'MIMEType', 'BMPVersion',
-            'ImageHeight', 'Planes', 'BitDepth', 'Compression', 'ImageLength',
+                      'ImageWidth', 'ImageHeight', 'Planes', 'BitDepth',
-            'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
+                      'Compression', 'ImageLength', 'PixelsPerMeterX',
-            'NumImportantColors', 'RedMask', 'GreenMask', 'BlueMask',
+                      'PixelsPerMeterY', 'NumColors', 'NumImportantColors',
-            'AlphaMask', 'ColorSpace', 'RedEndpoint', 'GreenEndpoint',
+                      'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
-            'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue', 'ImageSize',
+                      'ColorSpace', 'RedEndpoint', 'GreenEndpoint',
-            'Megapixels'}
+                      'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue',
+                      'ImageSize', 'Megapixels'}
diff --git a/src/office.py b/src/office.py
index da6168e..749fc7d 100644
--- a/src/office.py
+++ b/src/office.py
@@ -9,14 +9,14 @@ from . import abstract, parser_factory
 class ArchiveBasedAbstractParser(abstract.AbstractParser):
-    def _clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
+    def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
        zipinfo.compress_type = zipfile.ZIP_DEFLATED
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
        return zipinfo
-    def _get_zipinfo_meta(self, zipinfo:zipfile.ZipInfo) -> dict:
+    def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
        metadata = {}
        if zipinfo.create_system == 3:
            #metadata['create_system'] = 'Linux'
@@ -35,7 +35,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
        return metadata
-    def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile):
+    def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
+                             zin: zipfile.ZipFile, zout: zipfile.ZipFile):
        zin.extract(member=item, path=temp_folder)
        tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
        if not tmp_parser:
@@ -50,9 +51,9 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
 class MSOfficeParser(ArchiveBasedAbstractParser):
    mimetypes = {
-            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-            'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
@@ -103,13 +104,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
 class LibreOfficeParser(ArchiveBasedAbstractParser):
    mimetypes = {
-            'application/vnd.oasis.opendocument.text',
+        'application/vnd.oasis.opendocument.text',
-            'application/vnd.oasis.opendocument.spreadsheet',
+        'application/vnd.oasis.opendocument.spreadsheet',
-            'application/vnd.oasis.opendocument.presentation',
+        'application/vnd.oasis.opendocument.presentation',
-            'application/vnd.oasis.opendocument.graphics',
+        'application/vnd.oasis.opendocument.graphics',
-            'application/vnd.oasis.opendocument.chart',
+        'application/vnd.oasis.opendocument.chart',
-            'application/vnd.oasis.opendocument.formula',
+        'application/vnd.oasis.opendocument.formula',
-            'application/vnd.oasis.opendocument.image',
+        'application/vnd.oasis.opendocument.image',
    }
    def get_meta(self):
diff --git a/src/parser_factory.py b/src/parser_factory.py
index 2c30659..48616b0 100644
--- a/src/parser_factory.py
+++ b/src/parser_factory.py
@@ -2,10 +2,10 @@ import os
 import mimetypes
 import importlib
 import pkgutil
+from typing import TypeVar
 from . import abstract, unsupported_extensions
-from typing import TypeVar
 T = TypeVar('T', bound='abstract.AbstractParser')
diff --git a/src/pdf.py b/src/pdf.py
index fbc5175..5b99192 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -21,8 +21,8 @@ logging.basicConfig(level=logging.DEBUG)
 class PDFParser(abstract.AbstractParser):
    mimetypes = {'application/pdf', }
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
-            'metadata', 'mod-date', 'producer', 'subject', 'title',
+                 'metadata', 'mod-date', 'producer', 'subject', 'title',
-            'viewer-preferences'}
+                 'viewer-preferences'}
    def __init__(self, filename):
        super().__init__(filename)
@@ -103,7 +103,8 @@ class PDFParser(abstract.AbstractParser):
        return True
-    def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
+    @staticmethod
+    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        document = Poppler.Document.new_from_file('file://' + in_file)
        document.set_producer('')
        document.set_creator('')
@@ -112,7 +113,8 @@ class PDFParser(abstract.AbstractParser):
        return True
-    def __parse_metadata_field(self, data:str) -> dict:
+    @staticmethod
+    def __parse_metadata_field(data: str) -> dict:
        metadata = {}
        for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
            metadata[key] = value
@@ -128,6 +130,6 @@ class PDFParser(abstract.AbstractParser):
            if document.get_property(key):
                metadata[key] = document.get_property(key)
        if 'metadata' in metadata:
-            parsed_meta =  self.__parse_metadata_field(metadata['metadata'])
+            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
            return {**metadata, **parsed_meta}
        return metadata
diff --git a/src/torrent.py b/src/torrent.py
index bdf83ce..cb4b5e3 100644
--- a/src/torrent.py
+++ b/src/torrent.py
@@ -11,7 +11,7 @@ class TorrentParser(abstract.AbstractParser):
            d = _BencodeHandler().bdecode(f.read())
        if d is None:
            return {'Unknown meta': 'Unable to parse torrent file "%s".' % self.filename}
-        for k,v in d.items():
+        for k, v in d.items():
            if k not in self.whitelist:
                metadata[k.decode('utf-8')] = v
        return metadata
@@ -23,7 +23,7 @@ class TorrentParser(abstract.AbstractParser):
            d = _BencodeHandler().bdecode(f.read())
        if d is None:
            return False
-        for k,v in d.items():
+        for k, v in d.items():
            if k in self.whitelist:
                cleaned[k] = v
        with open(self.output_filename, 'wb') as f:
@@ -39,21 +39,22 @@ class _BencodeHandler(object):
    """
    def __init__(self):
        self.__decode_func = {
-                    ord('d'): self.__decode_dict,
+            ord('d'): self.__decode_dict,
-                    ord('i'): self.__decode_int,
+            ord('i'): self.__decode_int,
-                    ord('l'): self.__decode_list,
+            ord('l'): self.__decode_list,
-            }
+        }
        for i in range(0, 10):
            self.__decode_func[ord(str(i))] = self.__decode_string
        self.__encode_func = {
-                bytes: self.__encode_string,
+            bytes: self.__encode_string,
-                dict: self.__encode_dict,
+            dict: self.__encode_dict,
-                int: self.__encode_int,
+            int: self.__encode_int,
-                list: self.__encode_list,
+            list: self.__encode_list,
        }
-    def __decode_int(self, s:str) -> (int, str):
+    @staticmethod
+    def __decode_int(s: str) -> (int, str):
        s = s[1:]
        next_idx = s.index(b'e')
        if s.startswith(b'-0'):
@@ -62,7 +63,8 @@ class _BencodeHandler(object):
            raise ValueError  # no leading zero except for zero itself
        return int(s[:next_idx]), s[next_idx+1:]
-    def __decode_string(self, s:str) -> (str, str):
+    @staticmethod
+    def __decode_string(s: str) -> (str, str):
        sep = s.index(b':')
        str_len = int(s[:sep])
        if str_len < 0:
@@ -72,7 +74,7 @@ class _BencodeHandler(object):
        s = s[1:]
        return s[sep:sep+str_len], s[sep+str_len:]
-    def __decode_list(self, s:str) -> (list, str):
+    def __decode_list(self, s: str) -> (list, str):
        r = list()
        s = s[1:]  # skip leading `l`
        while s[0] != ord('e'):
@@ -80,7 +82,7 @@ class _BencodeHandler(object):
            r.append(v)
        return r, s[1:]
-    def __decode_dict(self, s:str) -> (dict, str):
+    def __decode_dict(self, s: str) -> (dict, str):
        r = dict()
        s = s[1:]  # skip leading `d`
        while s[0] != ord(b'e'):
@@ -89,30 +91,30 @@ class _BencodeHandler(object):
        return r, s[1:]
    @staticmethod
-    def __encode_int(x:str) -> bytes:
+    def __encode_int(x: str) -> bytes:
        return b'i' + bytes(str(x), 'utf-8') + b'e'
    @staticmethod
-    def __encode_string(x:str) -> bytes:
+    def __encode_string(x: str) -> bytes:
        return bytes((str(len(x))), 'utf-8') + b':' + x
-    def __encode_list(self, x:str) -> bytes:
+    def __encode_list(self, x: str) -> bytes:
        ret = b''
        for i in x:
            ret += self.__encode_func[type(i)](i)
        return b'l' + ret + b'e'
-    def __encode_dict(self, x:str) -> bytes:
+    def __encode_dict(self, x: str) -> bytes:
        ret = b''
        for k, v in sorted(x.items()):
            ret += self.__encode_func[type(k)](k)
            ret += self.__encode_func[type(v)](v)
        return b'd' + ret + b'e'
-    def bencode(self, s:str) -> bytes:
+    def bencode(self, s: str) -> bytes:
        return self.__encode_func[type(s)](s)
-    def bdecode(self, s:str):
+    def bdecode(self, s: str):
        try:
            r, l = self.__decode_func[s[0]](s)
        except (IndexError, KeyError, ValueError) as e:

diff --git a/src/__init__.py b/src/__init__.py index 3f5c478..07d3036 100644 --- a/src/__init__.py +++ b/src/__init__.py
@@ -2,4 +2,5 @@
2		2
3	# A set of extension that aren't supported, despite matching a supported mimetype	3	# A set of extension that aren't supported, despite matching a supported mimetype
4	unsupported_extensions = set(['bat', 'c', 'h', 'ksh', 'pl', 'txt', 'asc',	4	unsupported_extensions = set(['bat', 'c', 'h', 'ksh', 'pl', 'txt', 'asc',
5	'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl', 'xpdl', 'xsl', 'xsd'])	5	'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl',
		6	'xpdl', 'xsl', 'xsd'])


diff --git a/src/audio.py b/src/audio.py index 4a385b2..3a6aa79 100644 --- a/src/audio.py +++ b/src/audio.py
@@ -9,7 +9,7 @@ class MutagenParser(abstract.AbstractParser):
9	def get_meta(self):	9	def get_meta(self):
10	f = mutagen.File(self.filename)	10	f = mutagen.File(self.filename)
11	if f.tags:	11	if f.tags:
12	return {k:', '.join(v) for k,v in f.tags.items()}	12	return {k:', '.join(v) for k, v in f.tags.items()}
13	return {}	13	return {}
14		14
15	def remove_all(self):	15	def remove_all(self):


diff --git a/src/harmless.py b/src/harmless.py index fbc2897..aa00582 100644 --- a/src/harmless.py +++ b/src/harmless.py
@@ -6,6 +6,7 @@ class HarmlessParser(abstract.AbstractParser):
6	mimetypes = {'application/xml', 'text/plain'}	6	mimetypes = {'application/xml', 'text/plain'}
7		7
8	def __init__(self, filename: str):	8	def __init__(self, filename: str):
		9	super().__init__(filename)
9	self.filename = filename	10	self.filename = filename
10	self.output_filename = filename	11	self.output_filename = filename
11		12


diff --git a/src/images.py b/src/images.py index 6cc3dfe..c84952a 100644 --- a/src/images.py +++ b/src/images.py
@@ -14,11 +14,12 @@ from . import abstract
14	class PNGParser(abstract.AbstractParser):	14	class PNGParser(abstract.AbstractParser):
15	mimetypes = {'image/png', }	15	mimetypes = {'image/png', }
16	meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',	16	meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
17	'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate',	17	'Directory', 'FileSize', 'FileModifyDate',
18	"FileInodeChangeDate", 'FilePermissions', 'FileType',	18	'FileAccessDate', 'FileInodeChangeDate',
19	'FileTypeExtension', 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType',	19	'FilePermissions', 'FileType', 'FileTypeExtension',
20	'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize',	20	'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType',
21	'Megapixels', 'ImageHeight'}	21	'Compression', 'Filter', 'Interlace', 'BackgroundColor',
		22	'ImageSize', 'Megapixels', 'ImageHeight'}
22		23
23	def __init__(self, filename):	24	def __init__(self, filename):
24	super().__init__(filename)	25	super().__init__(filename)
@@ -63,36 +64,38 @@ class GdkPixbufAbstractParser(abstract.AbstractParser):
63	class JPGParser(GdkPixbufAbstractParser):	64	class JPGParser(GdkPixbufAbstractParser):
64	mimetypes = {'image/jpeg'}	65	mimetypes = {'image/jpeg'}
65	meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',	66	meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
66	'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate',	67	'Directory', 'FileSize', 'FileModifyDate',
67	"FileInodeChangeDate", 'FilePermissions', 'FileType',	68	'FileAccessDate', "FileInodeChangeDate",
68	'FileTypeExtension', 'MIMEType', 'ImageWidth',	69	'FilePermissions', 'FileType', 'FileTypeExtension',
69	'ImageSize', 'BitsPerSample', 'ColorComponents', 'EncodingProcess',	70	'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
70	'JFIFVersion', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',	71	'ColorComponents', 'EncodingProcess', 'JFIFVersion',
71	'YResolution', 'Megapixels', 'ImageHeight'}	72	'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
		73	'YResolution', 'Megapixels', 'ImageHeight'}
72		74
73		75
74	class TiffParser(GdkPixbufAbstractParser):	76	class TiffParser(GdkPixbufAbstractParser):
75	mimetypes = {'image/tiff'}	77	mimetypes = {'image/tiff'}
76	meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',	78	meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
77	'FillOrder', 'PhotometricInterpretation', 'PlanarConfiguration',	79	'FillOrder', 'PhotometricInterpretation',
78	'RowsPerStrip', 'SamplesPerPixel', 'StripByteCounts',	80	'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel',
79	'StripOffsets', 'BitsPerSample', 'Directory', 'ExifToolVersion',	81	'StripByteCounts', 'StripOffsets', 'BitsPerSample',
80	'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate',	82	'Directory', 'ExifToolVersion', 'FileAccessDate',
81	'FileName', 'FilePermissions', 'FileSize', 'FileType',	83	'FileInodeChangeDate', 'FileModifyDate', 'FileName',
82	'FileTypeExtension', 'ImageHeight', 'ImageSize', 'ImageWidth',	84	'FilePermissions', 'FileSize', 'FileType',
83	'MIMEType', 'Megapixels', 'SourceFile'}	85	'FileTypeExtension', 'ImageHeight', 'ImageSize',
		86	'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
84		87
85		88
86	class BMPParser(GdkPixbufAbstractParser):	89	class BMPParser(GdkPixbufAbstractParser):
87	mimetypes = {'image/x-ms-bmp'}	90	mimetypes = {'image/x-ms-bmp'}
88	meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',	91	meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
89	'FileSize', 'FileModifyDate', 'FileAccessDate',	92	'FileSize', 'FileModifyDate', 'FileAccessDate',
90	'FileInodeChangeDate', 'FilePermissions', 'FileType',	93	'FileInodeChangeDate', 'FilePermissions', 'FileType',
91	'FileTypeExtension', 'MIMEType', 'BMPVersion', 'ImageWidth',	94	'FileTypeExtension', 'MIMEType', 'BMPVersion',
92	'ImageHeight', 'Planes', 'BitDepth', 'Compression', 'ImageLength',	95	'ImageWidth', 'ImageHeight', 'Planes', 'BitDepth',
93	'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',	96	'Compression', 'ImageLength', 'PixelsPerMeterX',
94	'NumImportantColors', 'RedMask', 'GreenMask', 'BlueMask',	97	'PixelsPerMeterY', 'NumColors', 'NumImportantColors',
95	'AlphaMask', 'ColorSpace', 'RedEndpoint', 'GreenEndpoint',	98	'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
96	'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue', 'ImageSize',	99	'ColorSpace', 'RedEndpoint', 'GreenEndpoint',
97	'Megapixels'}	100	'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue',
98		101	'ImageSize', 'Megapixels'}


diff --git a/src/office.py b/src/office.py index da6168e..749fc7d 100644 --- a/src/office.py +++ b/src/office.py
@@ -9,14 +9,14 @@ from . import abstract, parser_factory
9		9
10		10
11	class ArchiveBasedAbstractParser(abstract.AbstractParser):	11	class ArchiveBasedAbstractParser(abstract.AbstractParser):
12	def _clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:	12	def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
13	zipinfo.compress_type = zipfile.ZIP_DEFLATED	13	zipinfo.compress_type = zipfile.ZIP_DEFLATED
14	zipinfo.create_system = 3 # Linux	14	zipinfo.create_system = 3 # Linux
15	zipinfo.comment = b''	15	zipinfo.comment = b''
16	zipinfo.date_time = (1980, 1, 1, 0, 0, 0)	16	zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
17	return zipinfo	17	return zipinfo
18		18
19	def _get_zipinfo_meta(self, zipinfo:zipfile.ZipInfo) -> dict:	19	def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
20	metadata = {}	20	metadata = {}
21	if zipinfo.create_system == 3:	21	if zipinfo.create_system == 3:
22	#metadata['create_system'] = 'Linux'	22	#metadata['create_system'] = 'Linux'
@@ -35,7 +35,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
35	return metadata	35	return metadata
36		36
37		37
38	def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile):	38	def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
		39	zin: zipfile.ZipFile, zout: zipfile.ZipFile):
39	zin.extract(member=item, path=temp_folder)	40	zin.extract(member=item, path=temp_folder)
40	tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename))	41	tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
41	if not tmp_parser:	42	if not tmp_parser:
@@ -50,9 +51,9 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
50		51
51	class MSOfficeParser(ArchiveBasedAbstractParser):	52	class MSOfficeParser(ArchiveBasedAbstractParser):
52	mimetypes = {	53	mimetypes = {
53	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',	54	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
54	'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',	55	'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
55	'application/vnd.openxmlformats-officedocument.presentationml.presentation'	56	'application/vnd.openxmlformats-officedocument.presentationml.presentation'
56	}	57	}
57	files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}	58	files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
58		59
@@ -103,13 +104,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
103		104
104	class LibreOfficeParser(ArchiveBasedAbstractParser):	105	class LibreOfficeParser(ArchiveBasedAbstractParser):
105	mimetypes = {	106	mimetypes = {
106	'application/vnd.oasis.opendocument.text',	107	'application/vnd.oasis.opendocument.text',
107	'application/vnd.oasis.opendocument.spreadsheet',	108	'application/vnd.oasis.opendocument.spreadsheet',
108	'application/vnd.oasis.opendocument.presentation',	109	'application/vnd.oasis.opendocument.presentation',
109	'application/vnd.oasis.opendocument.graphics',	110	'application/vnd.oasis.opendocument.graphics',
110	'application/vnd.oasis.opendocument.chart',	111	'application/vnd.oasis.opendocument.chart',
111	'application/vnd.oasis.opendocument.formula',	112	'application/vnd.oasis.opendocument.formula',
112	'application/vnd.oasis.opendocument.image',	113	'application/vnd.oasis.opendocument.image',
113	}	114	}
114		115
115	def get_meta(self):	116	def get_meta(self):


diff --git a/src/parser_factory.py b/src/parser_factory.py index 2c30659..48616b0 100644 --- a/src/parser_factory.py +++ b/src/parser_factory.py
@@ -2,10 +2,10 @@ import os
2	import mimetypes	2	import mimetypes
3	import importlib	3	import importlib
4	import pkgutil	4	import pkgutil
		5	from typing import TypeVar
5		6
6	from . import abstract, unsupported_extensions	7	from . import abstract, unsupported_extensions
7		8
8	from typing import TypeVar
9		9
10	T = TypeVar('T', bound='abstract.AbstractParser')	10	T = TypeVar('T', bound='abstract.AbstractParser')
11		11


diff --git a/src/pdf.py b/src/pdf.py index fbc5175..5b99192 100644 --- a/src/pdf.py +++ b/src/pdf.py
@@ -21,8 +21,8 @@ logging.basicConfig(level=logging.DEBUG)
21	class PDFParser(abstract.AbstractParser):	21	class PDFParser(abstract.AbstractParser):
22	mimetypes = {'application/pdf', }	22	mimetypes = {'application/pdf', }
23	meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',	23	meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
24	'metadata', 'mod-date', 'producer', 'subject', 'title',	24	'metadata', 'mod-date', 'producer', 'subject', 'title',
25	'viewer-preferences'}	25	'viewer-preferences'}
26		26
27	def __init__(self, filename):	27	def __init__(self, filename):
28	super().__init__(filename)	28	super().__init__(filename)
@@ -103,7 +103,8 @@ class PDFParser(abstract.AbstractParser):
103		103
104	return True	104	return True
105		105
106	def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:	106	@staticmethod
		107	def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
107	document = Poppler.Document.new_from_file('file://' + in_file)	108	document = Poppler.Document.new_from_file('file://' + in_file)
108	document.set_producer('')	109	document.set_producer('')
109	document.set_creator('')	110	document.set_creator('')
@@ -112,7 +113,8 @@ class PDFParser(abstract.AbstractParser):
112	return True	113	return True
113		114
114		115
115	def __parse_metadata_field(self, data:str) -> dict:	116	@staticmethod
		117	def __parse_metadata_field(data: str) -> dict:
116	metadata = {}	118	metadata = {}
117	for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):	119	for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
118	metadata[key] = value	120	metadata[key] = value
@@ -128,6 +130,6 @@ class PDFParser(abstract.AbstractParser):
128	if document.get_property(key):	130	if document.get_property(key):
129	metadata[key] = document.get_property(key)	131	metadata[key] = document.get_property(key)
130	if 'metadata' in metadata:	132	if 'metadata' in metadata:
131	parsed_meta = self.__parse_metadata_field(metadata['metadata'])	133	parsed_meta = self.__parse_metadata_field(metadata['metadata'])
132	return {**metadata, **parsed_meta}	134	return {**metadata, **parsed_meta}
133	return metadata	135	return metadata


diff --git a/src/torrent.py b/src/torrent.py index bdf83ce..cb4b5e3 100644 --- a/src/torrent.py +++ b/src/torrent.py
@@ -11,7 +11,7 @@ class TorrentParser(abstract.AbstractParser):
11	d = _BencodeHandler().bdecode(f.read())	11	d = _BencodeHandler().bdecode(f.read())
12	if d is None:	12	if d is None:
13	return {'Unknown meta': 'Unable to parse torrent file "%s".' % self.filename}	13	return {'Unknown meta': 'Unable to parse torrent file "%s".' % self.filename}
14	for k,v in d.items():	14	for k, v in d.items():
15	if k not in self.whitelist:	15	if k not in self.whitelist:
16	metadata[k.decode('utf-8')] = v	16	metadata[k.decode('utf-8')] = v
17	return metadata	17	return metadata
@@ -23,7 +23,7 @@ class TorrentParser(abstract.AbstractParser):
23	d = _BencodeHandler().bdecode(f.read())	23	d = _BencodeHandler().bdecode(f.read())
24	if d is None:	24	if d is None:
25	return False	25	return False
26	for k,v in d.items():	26	for k, v in d.items():
27	if k in self.whitelist:	27	if k in self.whitelist:
28	cleaned[k] = v	28	cleaned[k] = v
29	with open(self.output_filename, 'wb') as f:	29	with open(self.output_filename, 'wb') as f:
@@ -39,21 +39,22 @@ class _BencodeHandler(object):
39	"""	39	"""
40	def __init__(self):	40	def __init__(self):
41	self.__decode_func = {	41	self.__decode_func = {
42	ord('d'): self.__decode_dict,	42	ord('d'): self.__decode_dict,
43	ord('i'): self.__decode_int,	43	ord('i'): self.__decode_int,
44	ord('l'): self.__decode_list,	44	ord('l'): self.__decode_list,
45	}	45	}
46	for i in range(0, 10):	46	for i in range(0, 10):
47	self.__decode_func[ord(str(i))] = self.__decode_string	47	self.__decode_func[ord(str(i))] = self.__decode_string
48		48
49	self.__encode_func = {	49	self.__encode_func = {
50	bytes: self.__encode_string,	50	bytes: self.__encode_string,
51	dict: self.__encode_dict,	51	dict: self.__encode_dict,
52	int: self.__encode_int,	52	int: self.__encode_int,
53	list: self.__encode_list,	53	list: self.__encode_list,
54	}	54	}
55		55
56	def __decode_int(self, s:str) -> (int, str):	56	@staticmethod
		57	def __decode_int(s: str) -> (int, str):
57	s = s[1:]	58	s = s[1:]
58	next_idx = s.index(b'e')	59	next_idx = s.index(b'e')
59	if s.startswith(b'-0'):	60	if s.startswith(b'-0'):
@@ -62,7 +63,8 @@ class _BencodeHandler(object):
62	raise ValueError # no leading zero except for zero itself	63	raise ValueError # no leading zero except for zero itself
63	return int(s[:next_idx]), s[next_idx+1:]	64	return int(s[:next_idx]), s[next_idx+1:]
64		65
65	def __decode_string(self, s:str) -> (str, str):	66	@staticmethod
		67	def __decode_string(s: str) -> (str, str):
66	sep = s.index(b':')	68	sep = s.index(b':')
67	str_len = int(s[:sep])	69	str_len = int(s[:sep])
68	if str_len < 0:	70	if str_len < 0:
@@ -72,7 +74,7 @@ class _BencodeHandler(object):
72	s = s[1:]	74	s = s[1:]
73	return s[sep:sep+str_len], s[sep+str_len:]	75	return s[sep:sep+str_len], s[sep+str_len:]
74		76
75	def __decode_list(self, s:str) -> (list, str):	77	def __decode_list(self, s: str) -> (list, str):
76	r = list()	78	r = list()
77	s = s[1:] # skip leading `l`	79	s = s[1:] # skip leading `l`
78	while s[0] != ord('e'):	80	while s[0] != ord('e'):
@@ -80,7 +82,7 @@ class _BencodeHandler(object):
80	r.append(v)	82	r.append(v)
81	return r, s[1:]	83	return r, s[1:]
82		84
83	def __decode_dict(self, s:str) -> (dict, str):	85	def __decode_dict(self, s: str) -> (dict, str):
84	r = dict()	86	r = dict()
85	s = s[1:] # skip leading `d`	87	s = s[1:] # skip leading `d`
86	while s[0] != ord(b'e'):	88	while s[0] != ord(b'e'):
@@ -89,30 +91,30 @@ class _BencodeHandler(object):
89	return r, s[1:]	91	return r, s[1:]
90		92
91	@staticmethod	93	@staticmethod
92	def __encode_int(x:str) -> bytes:	94	def __encode_int(x: str) -> bytes:
93	return b'i' + bytes(str(x), 'utf-8') + b'e'	95	return b'i' + bytes(str(x), 'utf-8') + b'e'
94		96
95	@staticmethod	97	@staticmethod
96	def __encode_string(x:str) -> bytes:	98	def __encode_string(x: str) -> bytes:
97	return bytes((str(len(x))), 'utf-8') + b':' + x	99	return bytes((str(len(x))), 'utf-8') + b':' + x
98		100
99	def __encode_list(self, x:str) -> bytes:	101	def __encode_list(self, x: str) -> bytes:
100	ret = b''	102	ret = b''
101	for i in x:	103	for i in x:
102	ret += self.__encode_func[type(i)](i)	104	ret += self.__encode_func[type(i)](i)
103	return b'l' + ret + b'e'	105	return b'l' + ret + b'e'
104		106
105	def __encode_dict(self, x:str) -> bytes:	107	def __encode_dict(self, x: str) -> bytes:
106	ret = b''	108	ret = b''
107	for k, v in sorted(x.items()):	109	for k, v in sorted(x.items()):
108	ret += self.__encode_func[type(k)](k)	110	ret += self.__encode_func[type(k)](k)
109	ret += self.__encode_func[type(v)](v)	111	ret += self.__encode_func[type(v)](v)
110	return b'd' + ret + b'e'	112	return b'd' + ret + b'e'
111		113
112	def bencode(self, s:str) -> bytes:	114	def bencode(self, s: str) -> bytes:
113	return self.__encode_func[type(s)](s)	115	return self.__encode_func[type(s)](s)
114		116
115	def bdecode(self, s:str):	117	def bdecode(self, s: str):
116	try:	118	try:
117	r, l = self.__decode_func[s[0]](s)	119	r, l = self.__decode_func[s[0]](s)
118	except (IndexError, KeyError, ValueError) as e:	120	except (IndexError, KeyError, ValueError) as e: