summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/__init__.py3
-rw-r--r--src/audio.py2
-rw-r--r--src/harmless.py1
-rw-r--r--src/images.py59
-rw-r--r--src/office.py27
-rw-r--r--src/parser_factory.py2
-rw-r--r--src/pdf.py12
-rw-r--r--src/torrent.py42
8 files changed, 79 insertions, 69 deletions
diff --git a/src/__init__.py b/src/__init__.py
index 3f5c478..07d3036 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -2,4 +2,5 @@
2 2
3# A set of extension that aren't supported, despite matching a supported mimetype 3# A set of extension that aren't supported, despite matching a supported mimetype
4unsupported_extensions = set(['bat', 'c', 'h', 'ksh', 'pl', 'txt', 'asc', 4unsupported_extensions = set(['bat', 'c', 'h', 'ksh', 'pl', 'txt', 'asc',
5 'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl', 'xpdl', 'xsl', 'xsd']) 5 'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl',
6 'xpdl', 'xsl', 'xsd'])
diff --git a/src/audio.py b/src/audio.py
index 4a385b2..3a6aa79 100644
--- a/src/audio.py
+++ b/src/audio.py
@@ -9,7 +9,7 @@ class MutagenParser(abstract.AbstractParser):
9 def get_meta(self): 9 def get_meta(self):
10 f = mutagen.File(self.filename) 10 f = mutagen.File(self.filename)
11 if f.tags: 11 if f.tags:
12 return {k:', '.join(v) for k,v in f.tags.items()} 12 return {k:', '.join(v) for k, v in f.tags.items()}
13 return {} 13 return {}
14 14
15 def remove_all(self): 15 def remove_all(self):
diff --git a/src/harmless.py b/src/harmless.py
index fbc2897..aa00582 100644
--- a/src/harmless.py
+++ b/src/harmless.py
@@ -6,6 +6,7 @@ class HarmlessParser(abstract.AbstractParser):
6 mimetypes = {'application/xml', 'text/plain'} 6 mimetypes = {'application/xml', 'text/plain'}
7 7
8 def __init__(self, filename: str): 8 def __init__(self, filename: str):
9 super().__init__(filename)
9 self.filename = filename 10 self.filename = filename
10 self.output_filename = filename 11 self.output_filename = filename
11 12
diff --git a/src/images.py b/src/images.py
index 6cc3dfe..c84952a 100644
--- a/src/images.py
+++ b/src/images.py
@@ -14,11 +14,12 @@ from . import abstract
14class PNGParser(abstract.AbstractParser): 14class PNGParser(abstract.AbstractParser):
15 mimetypes = {'image/png', } 15 mimetypes = {'image/png', }
16 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 16 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
17 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', 17 'Directory', 'FileSize', 'FileModifyDate',
18 "FileInodeChangeDate", 'FilePermissions', 'FileType', 18 'FileAccessDate', 'FileInodeChangeDate',
19 'FileTypeExtension', 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType', 19 'FilePermissions', 'FileType', 'FileTypeExtension',
20 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', 20 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType',
21 'Megapixels', 'ImageHeight'} 21 'Compression', 'Filter', 'Interlace', 'BackgroundColor',
22 'ImageSize', 'Megapixels', 'ImageHeight'}
22 23
23 def __init__(self, filename): 24 def __init__(self, filename):
24 super().__init__(filename) 25 super().__init__(filename)
@@ -63,36 +64,38 @@ class GdkPixbufAbstractParser(abstract.AbstractParser):
63class JPGParser(GdkPixbufAbstractParser): 64class JPGParser(GdkPixbufAbstractParser):
64 mimetypes = {'image/jpeg'} 65 mimetypes = {'image/jpeg'}
65 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 66 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
66 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', 67 'Directory', 'FileSize', 'FileModifyDate',
67 "FileInodeChangeDate", 'FilePermissions', 'FileType', 68 'FileAccessDate', "FileInodeChangeDate",
68 'FileTypeExtension', 'MIMEType', 'ImageWidth', 69 'FilePermissions', 'FileType', 'FileTypeExtension',
69 'ImageSize', 'BitsPerSample', 'ColorComponents', 'EncodingProcess', 70 'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
70 'JFIFVersion', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling', 71 'ColorComponents', 'EncodingProcess', 'JFIFVersion',
71 'YResolution', 'Megapixels', 'ImageHeight'} 72 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
73 'YResolution', 'Megapixels', 'ImageHeight'}
72 74
73 75
74class TiffParser(GdkPixbufAbstractParser): 76class TiffParser(GdkPixbufAbstractParser):
75 mimetypes = {'image/tiff'} 77 mimetypes = {'image/tiff'}
76 meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples', 78 meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
77 'FillOrder', 'PhotometricInterpretation', 'PlanarConfiguration', 79 'FillOrder', 'PhotometricInterpretation',
78 'RowsPerStrip', 'SamplesPerPixel', 'StripByteCounts', 80 'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel',
79 'StripOffsets', 'BitsPerSample', 'Directory', 'ExifToolVersion', 81 'StripByteCounts', 'StripOffsets', 'BitsPerSample',
80 'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate', 82 'Directory', 'ExifToolVersion', 'FileAccessDate',
81 'FileName', 'FilePermissions', 'FileSize', 'FileType', 83 'FileInodeChangeDate', 'FileModifyDate', 'FileName',
82 'FileTypeExtension', 'ImageHeight', 'ImageSize', 'ImageWidth', 84 'FilePermissions', 'FileSize', 'FileType',
83 'MIMEType', 'Megapixels', 'SourceFile'} 85 'FileTypeExtension', 'ImageHeight', 'ImageSize',
86 'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
84 87
85 88
86class BMPParser(GdkPixbufAbstractParser): 89class BMPParser(GdkPixbufAbstractParser):
87 mimetypes = {'image/x-ms-bmp'} 90 mimetypes = {'image/x-ms-bmp'}
88 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', 91 meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
89 'FileSize', 'FileModifyDate', 'FileAccessDate', 92 'FileSize', 'FileModifyDate', 'FileAccessDate',
90 'FileInodeChangeDate', 'FilePermissions', 'FileType', 93 'FileInodeChangeDate', 'FilePermissions', 'FileType',
91 'FileTypeExtension', 'MIMEType', 'BMPVersion', 'ImageWidth', 94 'FileTypeExtension', 'MIMEType', 'BMPVersion',
92 'ImageHeight', 'Planes', 'BitDepth', 'Compression', 'ImageLength', 95 'ImageWidth', 'ImageHeight', 'Planes', 'BitDepth',
93 'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors', 96 'Compression', 'ImageLength', 'PixelsPerMeterX',
94 'NumImportantColors', 'RedMask', 'GreenMask', 'BlueMask', 97 'PixelsPerMeterY', 'NumColors', 'NumImportantColors',
95 'AlphaMask', 'ColorSpace', 'RedEndpoint', 'GreenEndpoint', 98 'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
96 'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue', 'ImageSize', 99 'ColorSpace', 'RedEndpoint', 'GreenEndpoint',
97 'Megapixels'} 100 'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue',
98 101 'ImageSize', 'Megapixels'}
diff --git a/src/office.py b/src/office.py
index da6168e..749fc7d 100644
--- a/src/office.py
+++ b/src/office.py
@@ -9,14 +9,14 @@ from . import abstract, parser_factory
9 9
10 10
11class ArchiveBasedAbstractParser(abstract.AbstractParser): 11class ArchiveBasedAbstractParser(abstract.AbstractParser):
12 def _clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo: 12 def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
13 zipinfo.compress_type = zipfile.ZIP_DEFLATED 13 zipinfo.compress_type = zipfile.ZIP_DEFLATED
14 zipinfo.create_system = 3 # Linux 14 zipinfo.create_system = 3 # Linux
15 zipinfo.comment = b'' 15 zipinfo.comment = b''
16 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) 16 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
17 return zipinfo 17 return zipinfo
18 18
19 def _get_zipinfo_meta(self, zipinfo:zipfile.ZipInfo) -> dict: 19 def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
20 metadata = {} 20 metadata = {}
21 if zipinfo.create_system == 3: 21 if zipinfo.create_system == 3:
22 #metadata['create_system'] = 'Linux' 22 #metadata['create_system'] = 'Linux'
@@ -35,7 +35,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
35 return metadata 35 return metadata
36 36
37 37
38 def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile): 38 def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
39 zin: zipfile.ZipFile, zout: zipfile.ZipFile):
39 zin.extract(member=item, path=temp_folder) 40 zin.extract(member=item, path=temp_folder)
40 tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) 41 tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
41 if not tmp_parser: 42 if not tmp_parser:
@@ -50,9 +51,9 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
50 51
51class MSOfficeParser(ArchiveBasedAbstractParser): 52class MSOfficeParser(ArchiveBasedAbstractParser):
52 mimetypes = { 53 mimetypes = {
53 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 54 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
54 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 55 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
55 'application/vnd.openxmlformats-officedocument.presentationml.presentation' 56 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
56 } 57 }
57 files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} 58 files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
58 59
@@ -103,13 +104,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
103 104
104class LibreOfficeParser(ArchiveBasedAbstractParser): 105class LibreOfficeParser(ArchiveBasedAbstractParser):
105 mimetypes = { 106 mimetypes = {
106 'application/vnd.oasis.opendocument.text', 107 'application/vnd.oasis.opendocument.text',
107 'application/vnd.oasis.opendocument.spreadsheet', 108 'application/vnd.oasis.opendocument.spreadsheet',
108 'application/vnd.oasis.opendocument.presentation', 109 'application/vnd.oasis.opendocument.presentation',
109 'application/vnd.oasis.opendocument.graphics', 110 'application/vnd.oasis.opendocument.graphics',
110 'application/vnd.oasis.opendocument.chart', 111 'application/vnd.oasis.opendocument.chart',
111 'application/vnd.oasis.opendocument.formula', 112 'application/vnd.oasis.opendocument.formula',
112 'application/vnd.oasis.opendocument.image', 113 'application/vnd.oasis.opendocument.image',
113 } 114 }
114 115
115 def get_meta(self): 116 def get_meta(self):
diff --git a/src/parser_factory.py b/src/parser_factory.py
index 2c30659..48616b0 100644
--- a/src/parser_factory.py
+++ b/src/parser_factory.py
@@ -2,10 +2,10 @@ import os
2import mimetypes 2import mimetypes
3import importlib 3import importlib
4import pkgutil 4import pkgutil
5from typing import TypeVar
5 6
6from . import abstract, unsupported_extensions 7from . import abstract, unsupported_extensions
7 8
8from typing import TypeVar
9 9
10T = TypeVar('T', bound='abstract.AbstractParser') 10T = TypeVar('T', bound='abstract.AbstractParser')
11 11
diff --git a/src/pdf.py b/src/pdf.py
index fbc5175..5b99192 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -21,8 +21,8 @@ logging.basicConfig(level=logging.DEBUG)
21class PDFParser(abstract.AbstractParser): 21class PDFParser(abstract.AbstractParser):
22 mimetypes = {'application/pdf', } 22 mimetypes = {'application/pdf', }
23 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', 23 meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
24 'metadata', 'mod-date', 'producer', 'subject', 'title', 24 'metadata', 'mod-date', 'producer', 'subject', 'title',
25 'viewer-preferences'} 25 'viewer-preferences'}
26 26
27 def __init__(self, filename): 27 def __init__(self, filename):
28 super().__init__(filename) 28 super().__init__(filename)
@@ -103,7 +103,8 @@ class PDFParser(abstract.AbstractParser):
103 103
104 return True 104 return True
105 105
106 def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool: 106 @staticmethod
107 def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
107 document = Poppler.Document.new_from_file('file://' + in_file) 108 document = Poppler.Document.new_from_file('file://' + in_file)
108 document.set_producer('') 109 document.set_producer('')
109 document.set_creator('') 110 document.set_creator('')
@@ -112,7 +113,8 @@ class PDFParser(abstract.AbstractParser):
112 return True 113 return True
113 114
114 115
115 def __parse_metadata_field(self, data:str) -> dict: 116 @staticmethod
117 def __parse_metadata_field(data: str) -> dict:
116 metadata = {} 118 metadata = {}
117 for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I): 119 for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
118 metadata[key] = value 120 metadata[key] = value
@@ -128,6 +130,6 @@ class PDFParser(abstract.AbstractParser):
128 if document.get_property(key): 130 if document.get_property(key):
129 metadata[key] = document.get_property(key) 131 metadata[key] = document.get_property(key)
130 if 'metadata' in metadata: 132 if 'metadata' in metadata:
131 parsed_meta = self.__parse_metadata_field(metadata['metadata']) 133 parsed_meta = self.__parse_metadata_field(metadata['metadata'])
132 return {**metadata, **parsed_meta} 134 return {**metadata, **parsed_meta}
133 return metadata 135 return metadata
diff --git a/src/torrent.py b/src/torrent.py
index bdf83ce..cb4b5e3 100644
--- a/src/torrent.py
+++ b/src/torrent.py
@@ -11,7 +11,7 @@ class TorrentParser(abstract.AbstractParser):
11 d = _BencodeHandler().bdecode(f.read()) 11 d = _BencodeHandler().bdecode(f.read())
12 if d is None: 12 if d is None:
13 return {'Unknown meta': 'Unable to parse torrent file "%s".' % self.filename} 13 return {'Unknown meta': 'Unable to parse torrent file "%s".' % self.filename}
14 for k,v in d.items(): 14 for k, v in d.items():
15 if k not in self.whitelist: 15 if k not in self.whitelist:
16 metadata[k.decode('utf-8')] = v 16 metadata[k.decode('utf-8')] = v
17 return metadata 17 return metadata
@@ -23,7 +23,7 @@ class TorrentParser(abstract.AbstractParser):
23 d = _BencodeHandler().bdecode(f.read()) 23 d = _BencodeHandler().bdecode(f.read())
24 if d is None: 24 if d is None:
25 return False 25 return False
26 for k,v in d.items(): 26 for k, v in d.items():
27 if k in self.whitelist: 27 if k in self.whitelist:
28 cleaned[k] = v 28 cleaned[k] = v
29 with open(self.output_filename, 'wb') as f: 29 with open(self.output_filename, 'wb') as f:
@@ -39,21 +39,22 @@ class _BencodeHandler(object):
39 """ 39 """
40 def __init__(self): 40 def __init__(self):
41 self.__decode_func = { 41 self.__decode_func = {
42 ord('d'): self.__decode_dict, 42 ord('d'): self.__decode_dict,
43 ord('i'): self.__decode_int, 43 ord('i'): self.__decode_int,
44 ord('l'): self.__decode_list, 44 ord('l'): self.__decode_list,
45 } 45 }
46 for i in range(0, 10): 46 for i in range(0, 10):
47 self.__decode_func[ord(str(i))] = self.__decode_string 47 self.__decode_func[ord(str(i))] = self.__decode_string
48 48
49 self.__encode_func = { 49 self.__encode_func = {
50 bytes: self.__encode_string, 50 bytes: self.__encode_string,
51 dict: self.__encode_dict, 51 dict: self.__encode_dict,
52 int: self.__encode_int, 52 int: self.__encode_int,
53 list: self.__encode_list, 53 list: self.__encode_list,
54 } 54 }
55 55
56 def __decode_int(self, s:str) -> (int, str): 56 @staticmethod
57 def __decode_int(s: str) -> (int, str):
57 s = s[1:] 58 s = s[1:]
58 next_idx = s.index(b'e') 59 next_idx = s.index(b'e')
59 if s.startswith(b'-0'): 60 if s.startswith(b'-0'):
@@ -62,7 +63,8 @@ class _BencodeHandler(object):
62 raise ValueError # no leading zero except for zero itself 63 raise ValueError # no leading zero except for zero itself
63 return int(s[:next_idx]), s[next_idx+1:] 64 return int(s[:next_idx]), s[next_idx+1:]
64 65
65 def __decode_string(self, s:str) -> (str, str): 66 @staticmethod
67 def __decode_string(s: str) -> (str, str):
66 sep = s.index(b':') 68 sep = s.index(b':')
67 str_len = int(s[:sep]) 69 str_len = int(s[:sep])
68 if str_len < 0: 70 if str_len < 0:
@@ -72,7 +74,7 @@ class _BencodeHandler(object):
72 s = s[1:] 74 s = s[1:]
73 return s[sep:sep+str_len], s[sep+str_len:] 75 return s[sep:sep+str_len], s[sep+str_len:]
74 76
75 def __decode_list(self, s:str) -> (list, str): 77 def __decode_list(self, s: str) -> (list, str):
76 r = list() 78 r = list()
77 s = s[1:] # skip leading `l` 79 s = s[1:] # skip leading `l`
78 while s[0] != ord('e'): 80 while s[0] != ord('e'):
@@ -80,7 +82,7 @@ class _BencodeHandler(object):
80 r.append(v) 82 r.append(v)
81 return r, s[1:] 83 return r, s[1:]
82 84
83 def __decode_dict(self, s:str) -> (dict, str): 85 def __decode_dict(self, s: str) -> (dict, str):
84 r = dict() 86 r = dict()
85 s = s[1:] # skip leading `d` 87 s = s[1:] # skip leading `d`
86 while s[0] != ord(b'e'): 88 while s[0] != ord(b'e'):
@@ -89,30 +91,30 @@ class _BencodeHandler(object):
89 return r, s[1:] 91 return r, s[1:]
90 92
91 @staticmethod 93 @staticmethod
92 def __encode_int(x:str) -> bytes: 94 def __encode_int(x: str) -> bytes:
93 return b'i' + bytes(str(x), 'utf-8') + b'e' 95 return b'i' + bytes(str(x), 'utf-8') + b'e'
94 96
95 @staticmethod 97 @staticmethod
96 def __encode_string(x:str) -> bytes: 98 def __encode_string(x: str) -> bytes:
97 return bytes((str(len(x))), 'utf-8') + b':' + x 99 return bytes((str(len(x))), 'utf-8') + b':' + x
98 100
99 def __encode_list(self, x:str) -> bytes: 101 def __encode_list(self, x: str) -> bytes:
100 ret = b'' 102 ret = b''
101 for i in x: 103 for i in x:
102 ret += self.__encode_func[type(i)](i) 104 ret += self.__encode_func[type(i)](i)
103 return b'l' + ret + b'e' 105 return b'l' + ret + b'e'
104 106
105 def __encode_dict(self, x:str) -> bytes: 107 def __encode_dict(self, x: str) -> bytes:
106 ret = b'' 108 ret = b''
107 for k, v in sorted(x.items()): 109 for k, v in sorted(x.items()):
108 ret += self.__encode_func[type(k)](k) 110 ret += self.__encode_func[type(k)](k)
109 ret += self.__encode_func[type(v)](v) 111 ret += self.__encode_func[type(v)](v)
110 return b'd' + ret + b'e' 112 return b'd' + ret + b'e'
111 113
112 def bencode(self, s:str) -> bytes: 114 def bencode(self, s: str) -> bytes:
113 return self.__encode_func[type(s)](s) 115 return self.__encode_func[type(s)](s)
114 116
115 def bdecode(self, s:str): 117 def bdecode(self, s: str):
116 try: 118 try:
117 r, l = self.__decode_func[s[0]](s) 119 r, l = self.__decode_func[s[0]](s)
118 except (IndexError, KeyError, ValueError) as e: 120 except (IndexError, KeyError, ValueError) as e: