diff options
| author | jvoisin | 2018-05-18 23:52:40 +0200 |
|---|---|---|
| committer | jvoisin | 2018-05-18 23:52:40 +0200 |
| commit | 38fae60b8beaf9c7b37c65325d2d285e62b6cb85 (patch) | |
| tree | e6bd4f699d6190dfada7618ebd04455eb7de9660 /libmat2 | |
| parent | 57d5cd04284276c49899034a9ad321b680624d8f (diff) | |
Rename some files to simplify packaging
- the `src` folder is now `libmat2`
- the `main.py` script is now `mat2.py`
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/__init__.py | 6 | ||||
| -rw-r--r-- | libmat2/abstract.py | 24 | ||||
| -rw-r--r-- | libmat2/audio.py | 39 | ||||
| -rw-r--r-- | libmat2/harmless.py | 17 | ||||
| -rw-r--r-- | libmat2/images.py | 101 | ||||
| -rw-r--r-- | libmat2/office.py | 150 | ||||
| -rw-r--r-- | libmat2/parser_factory.py | 42 | ||||
| -rw-r--r-- | libmat2/pdf.py | 135 | ||||
| -rw-r--r-- | libmat2/torrent.py | 126 |
9 files changed, 640 insertions, 0 deletions
diff --git a/libmat2/__init__.py b/libmat2/__init__.py new file mode 100644 index 0000000..07d3036 --- /dev/null +++ b/libmat2/__init__.py | |||
| @@ -0,0 +1,6 @@ | |||
#!/bin/env python3

# Extensions that aren't supported, despite matching a supported mimetype.
# NOTE: entries are stored *without* the leading dot ('txt', not '.txt'),
# so callers comparing against os.path.splitext output must strip the dot.
unsupported_extensions = set(['bat', 'c', 'h', 'ksh', 'pl', 'txt', 'asc',
                              'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl',
                              'xpdl', 'xsl', 'xsd'])
diff --git a/libmat2/abstract.py b/libmat2/abstract.py new file mode 100644 index 0000000..e4838a9 --- /dev/null +++ b/libmat2/abstract.py | |||
| @@ -0,0 +1,24 @@ | |||
| 1 | import abc | ||
| 2 | import os | ||
| 3 | |||
| 4 | |||
class AbstractParser(abc.ABC):
    """Common interface of every metadata parser.

    Holds the path of the file to clean and derives the path of the
    cleaned copy (``name.cleaned.ext``) from it.
    """
    meta_list = set()   # names of the metadata fields the parser knows about
    mimetypes = set()   # mimetypes the parser can handle

    def __init__(self, filename: str):
        self.filename = filename
        stem, extension = os.path.splitext(filename)
        self.output_filename = ''.join((stem, '.cleaned', extension))

    @abc.abstractmethod
    def get_meta(self) -> dict:
        """Return a dict describing the metadata found in the file."""
        pass

    @abc.abstractmethod
    def remove_all(self) -> bool:
        """Write a metadata-free copy to ``output_filename``."""
        pass

    def remove_all_lightweight(self) -> bool:
        """Remove _SOME_ metadata; by default simply delegates to remove_all()."""
        return self.remove_all()
diff --git a/libmat2/audio.py b/libmat2/audio.py new file mode 100644 index 0000000..3a6aa79 --- /dev/null +++ b/libmat2/audio.py | |||
| @@ -0,0 +1,39 @@ | |||
| 1 | import shutil | ||
| 2 | |||
| 3 | import mutagen | ||
| 4 | |||
| 5 | from . import abstract | ||
| 6 | |||
| 7 | |||
class MutagenParser(abstract.AbstractParser):
    """Generic parser for tag-based audio formats handled by mutagen."""

    def get_meta(self):
        """Return every tag as a {name: comma-joined values} dict."""
        tags = mutagen.File(self.filename).tags
        if not tags:
            return {}
        return {name: ', '.join(values) for name, values in tags.items()}

    def remove_all(self):
        """Copy the file, then strip every tag from the copy."""
        shutil.copy(self.filename, self.output_filename)
        cleaned = mutagen.File(self.output_filename)
        cleaned.delete()
        cleaned.save()
        return True
| 21 | |||
| 22 | |||
class MP3Parser(MutagenParser):
    mimetypes = {'audio/mpeg', }

    def get_meta(self):
        """ID3 frames hold their values in `.text`, so join those instead
        of the frame objects themselves; frame names are stripped of
        trailing whitespace/NULs."""
        tags = mutagen.File(self.filename).tags
        return {
            name.rstrip(' \t\r\n\0'): ', '.join(str(part) for part in tags[name].text)
            for name in tags
        }
| 32 | |||
| 33 | |||
class OGGParser(MutagenParser):
    """audio/ogg files; all tag handling is inherited from MutagenParser."""
    mimetypes = {'audio/ogg', }
| 36 | |||
| 37 | |||
class FLACParser(MutagenParser):
    """audio/flac files; all tag handling is inherited from MutagenParser."""
    mimetypes = {'audio/flac', }
diff --git a/libmat2/harmless.py b/libmat2/harmless.py new file mode 100644 index 0000000..aa00582 --- /dev/null +++ b/libmat2/harmless.py | |||
| @@ -0,0 +1,17 @@ | |||
| 1 | from . import abstract | ||
| 2 | |||
| 3 | |||
| 4 | class HarmlessParser(abstract.AbstractParser): | ||
| 5 | """ This is the parser for filetypes that do not contain metadata. """ | ||
| 6 | mimetypes = {'application/xml', 'text/plain'} | ||
| 7 | |||
| 8 | def __init__(self, filename: str): | ||
| 9 | super().__init__(filename) | ||
| 10 | self.filename = filename | ||
| 11 | self.output_filename = filename | ||
| 12 | |||
| 13 | def get_meta(self): | ||
| 14 | return dict() | ||
| 15 | |||
| 16 | def remove_all(self): | ||
| 17 | return True | ||
diff --git a/libmat2/images.py b/libmat2/images.py new file mode 100644 index 0000000..c84952a --- /dev/null +++ b/libmat2/images.py | |||
| @@ -0,0 +1,101 @@ | |||
| 1 | import subprocess | ||
| 2 | import json | ||
| 3 | import os | ||
| 4 | |||
| 5 | import cairo | ||
| 6 | |||
| 7 | import gi | ||
| 8 | gi.require_version('GdkPixbuf', '2.0') | ||
| 9 | from gi.repository import GdkPixbuf | ||
| 10 | |||
| 11 | from . import abstract | ||
| 12 | |||
| 13 | |||
class PNGParser(abstract.AbstractParser):
    """PNG handler: metadata is listed via exiftool, and removed by
    re-rendering the picture through a cairo surface."""
    mimetypes = {'image/png', }
    # exiftool fields that describe the file/format itself and are
    # therefore not reported as metadata by get_meta().
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
                      'Directory', 'FileSize', 'FileModifyDate',
                      'FileAccessDate', 'FileInodeChangeDate',
                      'FilePermissions', 'FileType', 'FileTypeExtension',
                      'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType',
                      'Compression', 'Filter', 'Interlace', 'BackgroundColor',
                      'ImageSize', 'Megapixels', 'ImageHeight'}

    def __init__(self, filename):
        super().__init__(filename)
        try:  # better fail here than later
            cairo.ImageSurface.create_from_png(self.filename)
        except MemoryError:  # cairo reports unparsable PNGs this way
            raise ValueError

    def get_meta(self):
        raw = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename])
        meta = json.loads(raw.decode('utf-8'))[0]
        for harmless in self.meta_whitelist:
            meta.pop(harmless, None)
        return meta

    def remove_all(self):
        # Re-rendering the surface drops every ancillary chunk.
        cairo.ImageSurface.create_from_png(self.filename).write_to_png(self.output_filename)
        return True
| 42 | |||
| 43 | |||
class GdkPixbufAbstractParser(abstract.AbstractParser):
    """GdkPixbuf can load many image formats; re-rendering the picture
    through it has the side-effect of removing metadata completely."""

    def get_meta(self):
        raw = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename])
        meta = json.loads(raw.decode('utf-8'))[0]
        for harmless in self.meta_whitelist:
            meta.pop(harmless, None)
        return meta

    def remove_all(self):
        extension = os.path.splitext(self.filename)[1]
        if extension == '.jpg':  # gdk-pixbuf only knows the 'jpeg' type name
            extension = '.jpeg'
        pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
        pixbuf.savev(self.output_filename, extension[1:], [], [])
        return True
| 62 | |||
| 63 | |||
class JPGParser(GdkPixbufAbstractParser):
    """JPEG pictures, cleaned by re-rendering through GdkPixbuf."""
    mimetypes = {'image/jpeg'}
    # exiftool fields that get_meta() does not report as metadata.
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
                      'Directory', 'FileSize', 'FileModifyDate',
                      'FileAccessDate', "FileInodeChangeDate",
                      'FilePermissions', 'FileType', 'FileTypeExtension',
                      'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
                      'ColorComponents', 'EncodingProcess', 'JFIFVersion',
                      'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
                      'YResolution', 'Megapixels', 'ImageHeight'}
| 74 | |||
| 75 | |||
class TiffParser(GdkPixbufAbstractParser):
    """TIFF pictures, cleaned by re-rendering through GdkPixbuf."""
    mimetypes = {'image/tiff'}
    # exiftool fields that get_meta() does not report as metadata.
    meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
                      'FillOrder', 'PhotometricInterpretation',
                      'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel',
                      'StripByteCounts', 'StripOffsets', 'BitsPerSample',
                      'Directory', 'ExifToolVersion', 'FileAccessDate',
                      'FileInodeChangeDate', 'FileModifyDate', 'FileName',
                      'FilePermissions', 'FileSize', 'FileType',
                      'FileTypeExtension', 'ImageHeight', 'ImageSize',
                      'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
| 87 | |||
| 88 | |||
class BMPParser(GdkPixbufAbstractParser):
    """BMP pictures, cleaned by re-rendering through GdkPixbuf."""
    mimetypes = {'image/x-ms-bmp'}
    # exiftool fields that get_meta() does not report as metadata.
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
                      'FileSize', 'FileModifyDate', 'FileAccessDate',
                      'FileInodeChangeDate', 'FilePermissions', 'FileType',
                      'FileTypeExtension', 'MIMEType', 'BMPVersion',
                      'ImageWidth', 'ImageHeight', 'Planes', 'BitDepth',
                      'Compression', 'ImageLength', 'PixelsPerMeterX',
                      'PixelsPerMeterY', 'NumColors', 'NumImportantColors',
                      'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
                      'ColorSpace', 'RedEndpoint', 'GreenEndpoint',
                      'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue',
                      'ImageSize', 'Megapixels'}
diff --git a/libmat2/office.py b/libmat2/office.py new file mode 100644 index 0000000..749fc7d --- /dev/null +++ b/libmat2/office.py | |||
| @@ -0,0 +1,150 @@ | |||
| 1 | import os | ||
| 2 | import re | ||
| 3 | import shutil | ||
| 4 | import tempfile | ||
| 5 | import datetime | ||
| 6 | import zipfile | ||
| 7 | |||
| 8 | from . import abstract, parser_factory | ||
| 9 | |||
| 10 | |||
class ArchiveBasedAbstractParser(abstract.AbstractParser):
    """Shared helpers for zip-based document formats (OOXML, ODF)."""

    def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
        """Normalise a ZipInfo (in place) so it carries no distinguishing
        metadata; returns the same object for convenience."""
        zipinfo.compress_type = zipfile.ZIP_DEFLATED
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # the oldest date zip supports
        return zipinfo

    def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
        """Report the metadata carried by a single zip member's header."""
        metadata = {}
        if zipinfo.create_system == 2:
            metadata['create_system'] = 'Windows'
        elif zipinfo.create_system != 3:  # 3 == Linux: nothing to report
            metadata['create_system'] = 'Weird'

        if zipinfo.comment:
            metadata['comment'] = zipinfo.comment

        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
            metadata['date_time'] = datetime.datetime(*zipinfo.date_time)

        return metadata

    def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
                             zin: zipfile.ZipFile, zout: zipfile.ZipFile):
        """Extract one member, clean it with a dedicated parser, and write
        the cleaned bytes back to the output archive under a fresh,
        metadata-free ZipInfo. Unsupported members are skipped."""
        zin.extract(member=item, path=temp_folder)
        extracted_path = os.path.join(temp_folder, item.filename)
        tmp_parser, mtype = parser_factory.get_parser(extracted_path)
        if not tmp_parser:
            print("%s's format (%s) isn't supported" % (item.filename, mtype))
            return
        tmp_parser.remove_all()
        clean_zinfo = self._clean_zipinfo(zipfile.ZipInfo(item.filename))
        with open(tmp_parser.output_filename, 'rb') as f:
            zout.writestr(clean_zinfo, f.read())
| 50 | |||
| 51 | |||
class MSOfficeParser(ArchiveBasedAbstractParser):
    """Parser for OOXML documents (docx, xlsx, pptx)."""
    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Structural members that must survive cleaning for the document to open.
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}

    def get_meta(self):
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
        metadata = {}
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
                    content = zipin.read(item).decode('utf-8')
                    for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
                        metadata[key] = value
                    if not metadata:  # better safe than sorry
                        # Key on the member's *name*, not the ZipInfo object,
                        # so the returned dict always has string keys.
                        metadata[item.filename] = 'harmful content'

                metadata = {**metadata, **self._get_zipinfo_meta(item)}
        return metadata

    def remove_all(self):
        """Rebuild the archive: drop docProps/ metadata members, keep the
        structural files (cleaned), and deep-clean everything else."""
        temp_folder = tempfile.mkdtemp()
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout:
            for item in zin.infolist():
                if item.filename[-1] == '/':
                    continue  # `is_dir` is added in Python3.6
                elif item.filename.startswith('docProps/'):
                    if not item.filename.endswith('.rels'):
                        continue  # don't keep metadata files
                if item.filename in self.files_to_keep:
                    # Read *before* cleaning: _clean_zipinfo flips
                    # compress_type to DEFLATED, which would make
                    # ZipFile.read misdecode a STORED member.
                    content = zin.read(item)
                    zout.writestr(self._clean_zipinfo(item), content)
                    continue

                self._clean_internal_file(item, temp_folder, zin, zout)

        shutil.rmtree(temp_folder)
        return True
| 102 | |||
| 103 | |||
| 104 | |||
class LibreOfficeParser(ArchiveBasedAbstractParser):
    """Parser for OpenDocument (ODF) files."""
    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart',
        'application/vnd.oasis.opendocument.formula',
        'application/vnd.oasis.opendocument.image',
    }

    def get_meta(self):
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
        metadata = {}
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename == 'meta.xml':
                    content = zipin.read(item).decode('utf-8')
                    for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
                        metadata[key] = value
                    if not metadata:  # better safe than sorry
                        # Key on the member's *name*, not the ZipInfo object,
                        # so the returned dict always has string keys.
                        metadata[item.filename] = 'harmful content'
                metadata = {**metadata, **self._get_zipinfo_meta(item)}
        return metadata

    def remove_all(self):
        """Rebuild the archive without meta.xml, deep-cleaning every member."""
        temp_folder = tempfile.mkdtemp()
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout:
            for item in zin.infolist():
                if item.filename[-1] == '/':
                    continue  # `is_dir` is added in Python3.6
                elif item.filename == 'meta.xml':
                    continue  # don't keep metadata files

                self._clean_internal_file(item, temp_folder, zin, zout)

        shutil.rmtree(temp_folder)
        return True
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py new file mode 100644 index 0000000..dbe68b9 --- /dev/null +++ b/libmat2/parser_factory.py | |||
| @@ -0,0 +1,42 @@ | |||
| 1 | import os | ||
| 2 | import mimetypes | ||
| 3 | import importlib | ||
| 4 | import pkgutil | ||
| 5 | from typing import TypeVar | ||
| 6 | |||
| 7 | from . import abstract, unsupported_extensions | ||
| 8 | |||
| 9 | |||
# Type variable used in get_parser's return annotation.
T = TypeVar('T', bound='abstract.AbstractParser')

# This loads every parser in a dynamic way
# NOTE(review): pkgutil.walk_packages expects an *iterable of paths*;
# passing the string '.libmat2' makes it iterate over the string's
# characters (notably '.', the current directory), which appears to work
# only when running from the repository root — TODO confirm and consider
# passing the package's __path__ instead.
for module_loader, name, ispkg in pkgutil.walk_packages('.libmat2'):
    if not name.startswith('libmat2.'):
        continue
    elif name == 'libmat2.abstract':
        # the abstract base module defines no concrete parser
        continue
    importlib.import_module(name)
| 19 | |||
| 20 | |||
def _get_parsers() -> list:
    """Return every (transitive) subclass of AbstractParser."""
    def _walk(cls):
        direct = cls.__subclasses__()
        found = list(direct)
        for subclass in direct:
            found.extend(_walk(subclass))
        return found
    return _walk(abstract.AbstractParser)
| 27 | |||
| 28 | |||
def get_parser(filename: str) -> (T, str):
    """Return (parser, mimetype) for the given file, or (None, mimetype)
    when the file is unsupported or rejected by its candidate parser."""
    mtype, _ = mimetypes.guess_type(filename)

    _, extension = os.path.splitext(filename)
    # splitext keeps the leading dot ('.txt') while unsupported_extensions
    # stores bare names ('txt'): strip it before comparing, otherwise the
    # blacklist never matches anything.
    if extension.lstrip('.') in unsupported_extensions:
        return None, mtype

    for parser_class in _get_parsers():
        if mtype in parser_class.mimetypes:
            try:
                return parser_class(filename), mtype
            except ValueError:  # the parser deemed the file invalid
                return None, mtype
    return None, mtype
diff --git a/libmat2/pdf.py b/libmat2/pdf.py new file mode 100644 index 0000000..5b99192 --- /dev/null +++ b/libmat2/pdf.py | |||
| @@ -0,0 +1,135 @@ | |||
| 1 | """ Handle PDF | ||
| 2 | |||
| 3 | """ | ||
| 4 | |||
| 5 | import os | ||
| 6 | import re | ||
| 7 | import logging | ||
| 8 | import tempfile | ||
| 9 | import io | ||
| 10 | |||
| 11 | import cairo | ||
| 12 | import gi | ||
| 13 | gi.require_version('Poppler', '0.18') | ||
| 14 | from gi.repository import Poppler, GLib | ||
| 15 | |||
| 16 | from . import abstract | ||
| 17 | |||
# NOTE(review): this configures the root logger at import time — a module
# side effect; consider leaving logging configuration to the application.
logging.basicConfig(level=logging.DEBUG)
| 19 | |||
| 20 | |||
class PDFParser(abstract.AbstractParser):
    """Clean PDFs by re-rendering them with Poppler + cairo:
    `remove_all` rasterises every page to PNG (destroying everything but
    pixels), while `remove_all_lightweight` re-renders them as vectors.
    """
    mimetypes = {'application/pdf', }
    # Poppler.Document GObject property names that may carry metadata.
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        super().__init__(filename)
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.__scale = 2  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError:  # Invalid PDF
            raise ValueError

    def remove_all_lightweight(self):
        """
        Load the document into Poppler, render pages on a new PDFSurface.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        # mkstemp also returns an open file descriptor: close it, since
        # cairo opens the path itself (the previous mkstemp()[1] form
        # leaked one fd per call).
        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)
        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

        for pagenum in range(pages_count):
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            pdf_surface.set_size(page_width, page_height)
            pdf_context.save()
            page.render_for_printing(pdf_context)
            pdf_context.restore()
            pdf_context.show_page()  # draw pdf_context on pdf_surface
        pdf_surface.finish()

        self.__remove_superficial_meta(tmp_path, self.output_filename)
        os.remove(tmp_path)

        return True

    def remove_all(self):
        """
        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)  # see remove_all_lightweight: avoid leaking the fd
        pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(pages_count):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

            # Rasterise the page at __scale× resolution...
            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                             int(page_width) * self.__scale,
                                             int(page_height) * self.__scale)
            img_context = cairo.Context(img_surface)

            img_context.scale(self.__scale, self.__scale)
            page.render_for_printing(img_context)
            img_context.show_page()

            # ...round-trip it through an in-memory PNG...
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            # ...and paint the bitmap onto a page of the output PDF.
            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Removes metadata added by Poppler
        self.__remove_superficial_meta(tmp_path, self.output_filename)
        os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """Blank the easy-to-set document properties and save a copy."""
        # NOTE(review): unlike the other call sites, no password argument
        # (None) is passed here — confirm PyGObject accepts the omission.
        document = Poppler.Document.new_from_file('file://' + in_file)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save('file://' + os.path.abspath(out_file))
        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> dict:
        """Extract key/value pairs from an XMP metadata blob."""
        metadata = {}
        for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
            metadata[key] = value
        return metadata

    def get_meta(self):
        """ Return a dict with all the meta of the file
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            if document.get_property(key):
                metadata[key] = document.get_property(key)
        if 'metadata' in metadata:
            # The raw XMP blob is kept *and* exploded into its fields.
            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
            return {**metadata, **parsed_meta}
        return metadata
diff --git a/libmat2/torrent.py b/libmat2/torrent.py new file mode 100644 index 0000000..cb4b5e3 --- /dev/null +++ b/libmat2/torrent.py | |||
| @@ -0,0 +1,126 @@ | |||
| 1 | from . import abstract | ||
| 2 | |||
| 3 | |||
class TorrentParser(abstract.AbstractParser):
    """Torrent files are bencoded dictionaries; every top-level key
    outside the whitelist is treated as metadata."""
    mimetypes = {'application/x-bittorrent', }
    whitelist = {b'announce', b'announce-list', b'info'}

    def get_meta(self) -> dict:
        with open(self.filename, 'rb') as f:
            decoded = _BencodeHandler().bdecode(f.read())
        if decoded is None:
            return {'Unknown meta': 'Unable to parse torrent file "%s".' % self.filename}
        return {key.decode('utf-8'): value
                for key, value in decoded.items()
                if key not in self.whitelist}

    def remove_all(self) -> bool:
        with open(self.filename, 'rb') as f:
            decoded = _BencodeHandler().bdecode(f.read())
        if decoded is None:
            return False
        cleaned = {key: value for key, value in decoded.items()
                   if key in self.whitelist}
        with open(self.output_filename, 'wb') as f:
            f.write(_BencodeHandler().bencode(cleaned))
        return True
| 32 | |||
| 33 | |||
| 34 | class _BencodeHandler(object): | ||
| 35 | """ | ||
| 36 | Since bencode isn't that hard to parse, | ||
| 37 | MAT2 comes with its own parser, based on the spec | ||
| 38 | https://wiki.theory.org/index.php/BitTorrentSpecification#Bencoding | ||
| 39 | """ | ||
| 40 | def __init__(self): | ||
| 41 | self.__decode_func = { | ||
| 42 | ord('d'): self.__decode_dict, | ||
| 43 | ord('i'): self.__decode_int, | ||
| 44 | ord('l'): self.__decode_list, | ||
| 45 | } | ||
| 46 | for i in range(0, 10): | ||
| 47 | self.__decode_func[ord(str(i))] = self.__decode_string | ||
| 48 | |||
| 49 | self.__encode_func = { | ||
| 50 | bytes: self.__encode_string, | ||
| 51 | dict: self.__encode_dict, | ||
| 52 | int: self.__encode_int, | ||
| 53 | list: self.__encode_list, | ||
| 54 | } | ||
| 55 | |||
| 56 | @staticmethod | ||
| 57 | def __decode_int(s: str) -> (int, str): | ||
| 58 | s = s[1:] | ||
| 59 | next_idx = s.index(b'e') | ||
| 60 | if s.startswith(b'-0'): | ||
| 61 | raise ValueError # negative zero doesn't exist | ||
| 62 | elif s.startswith(b'0') and next_idx != 1: | ||
| 63 | raise ValueError # no leading zero except for zero itself | ||
| 64 | return int(s[:next_idx]), s[next_idx+1:] | ||
| 65 | |||
| 66 | @staticmethod | ||
| 67 | def __decode_string(s: str) -> (str, str): | ||
| 68 | sep = s.index(b':') | ||
| 69 | str_len = int(s[:sep]) | ||
| 70 | if str_len < 0: | ||
| 71 | raise ValueError | ||
| 72 | elif s[0] == b'0' and sep != 1: | ||
| 73 | raise ValueError | ||
| 74 | s = s[1:] | ||
| 75 | return s[sep:sep+str_len], s[sep+str_len:] | ||
| 76 | |||
| 77 | def __decode_list(self, s: str) -> (list, str): | ||
| 78 | r = list() | ||
| 79 | s = s[1:] # skip leading `l` | ||
| 80 | while s[0] != ord('e'): | ||
| 81 | v, s = self.__decode_func[s[0]](s) | ||
| 82 | r.append(v) | ||
| 83 | return r, s[1:] | ||
| 84 | |||
| 85 | def __decode_dict(self, s: str) -> (dict, str): | ||
| 86 | r = dict() | ||
| 87 | s = s[1:] # skip leading `d` | ||
| 88 | while s[0] != ord(b'e'): | ||
| 89 | k, s = self.__decode_string(s) | ||
| 90 | r[k], s = self.__decode_func[s[0]](s) | ||
| 91 | return r, s[1:] | ||
| 92 | |||
| 93 | @staticmethod | ||
| 94 | def __encode_int(x: str) -> bytes: | ||
| 95 | return b'i' + bytes(str(x), 'utf-8') + b'e' | ||
| 96 | |||
| 97 | @staticmethod | ||
| 98 | def __encode_string(x: str) -> bytes: | ||
| 99 | return bytes((str(len(x))), 'utf-8') + b':' + x | ||
| 100 | |||
| 101 | def __encode_list(self, x: str) -> bytes: | ||
| 102 | ret = b'' | ||
| 103 | for i in x: | ||
| 104 | ret += self.__encode_func[type(i)](i) | ||
| 105 | return b'l' + ret + b'e' | ||
| 106 | |||
| 107 | def __encode_dict(self, x: str) -> bytes: | ||
| 108 | ret = b'' | ||
| 109 | for k, v in sorted(x.items()): | ||
| 110 | ret += self.__encode_func[type(k)](k) | ||
| 111 | ret += self.__encode_func[type(v)](v) | ||
| 112 | return b'd' + ret + b'e' | ||
| 113 | |||
| 114 | def bencode(self, s: str) -> bytes: | ||
| 115 | return self.__encode_func[type(s)](s) | ||
| 116 | |||
| 117 | def bdecode(self, s: str): | ||
| 118 | try: | ||
| 119 | r, l = self.__decode_func[s[0]](s) | ||
| 120 | except (IndexError, KeyError, ValueError) as e: | ||
| 121 | print("not a valid bencoded string: %s" % e) | ||
| 122 | return None | ||
| 123 | if l != b'': | ||
| 124 | print("invalid bencoded value (data after valid prefix)") | ||
| 125 | return None | ||
| 126 | return r | ||
