From f391c9603c36a8ec80942c23ac6ba39fca5df72a Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sat, 31 Mar 2018 15:46:17 +0200 Subject: Change a bit the source code organisation --- src/__init__.py | 1 + src/abstract.py | 13 ++++++++ src/audio.py | 37 ++++++++++++++++++++ src/jpg.py | 30 +++++++++++++++++ src/parser_factory.py | 11 +++--- src/parsers/__init__.py | 0 src/parsers/abstract.py | 13 -------- src/parsers/audio.py | 37 -------------------- src/parsers/jpg.py | 30 ----------------- src/parsers/pdf.py | 89 ------------------------------------------------- src/parsers/png.py | 27 --------------- src/pdf.py | 89 +++++++++++++++++++++++++++++++++++++++++++++++++ src/png.py | 27 +++++++++++++++ tests/test_libmat2.py | 3 +- 14 files changed, 204 insertions(+), 203 deletions(-) create mode 100644 src/abstract.py create mode 100644 src/audio.py create mode 100644 src/jpg.py delete mode 100644 src/parsers/__init__.py delete mode 100644 src/parsers/abstract.py delete mode 100644 src/parsers/audio.py delete mode 100644 src/parsers/jpg.py delete mode 100644 src/parsers/pdf.py delete mode 100644 src/parsers/png.py create mode 100644 src/pdf.py create mode 100644 src/png.py diff --git a/src/__init__.py b/src/__init__.py index e69de29..7557381 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -0,0 +1 @@ +#!/bin/env python3 \ No newline at end of file diff --git a/src/abstract.py b/src/abstract.py new file mode 100644 index 0000000..c2d282f --- /dev/null +++ b/src/abstract.py @@ -0,0 +1,13 @@ +class AbstractParser(object): + meta_list = set() + mimetypes = set() + + def __init__(self, filename: str): + self.filename = filename + self.output_filename = filename + '.cleaned' + + def get_meta(self): + raise NotImplementedError + + def remove_all(self): + raise NotImplementedError diff --git a/src/audio.py b/src/audio.py new file mode 100644 index 0000000..4da298c --- /dev/null +++ b/src/audio.py @@ -0,0 +1,37 @@ +import subprocess +import shutil +import json + +import mutagen + +from . import abstract + +class MutagenParser(abstract.AbstractParser): + def get_meta(self): + f = mutagen.File(self.filename) + if f.tags: + return f.tags + return {} + + def remove_all(self): + shutil.copy(self.filename, self.output_filename) + f = mutagen.File(self.output_filename) + f.delete() + f.save() + return True + +class MP3Parser(MutagenParser): + mimetypes = {'audio/mpeg', } + + def get_meta(self): + meta = super().get_meta() + metadata = {} + for key in meta: + metadata[key] = meta[key].text + return metadata + +class OGGParser(MutagenParser): + mimetypes = {'audio/ogg', } + +class FLACParser(MutagenParser): + mimetypes = {'audio/flac', } diff --git a/src/jpg.py b/src/jpg.py new file mode 100644 index 0000000..34fc04c --- /dev/null +++ b/src/jpg.py @@ -0,0 +1,30 @@ +import subprocess +import json + +import gi +gi.require_version('GdkPixbuf', '2.0') +from gi.repository import GdkPixbuf + +from . import abstract + +class JPGParser(abstract.AbstractParser): + mimetypes = {'image/jpg', } + meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', + 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', + "FileInodeChangeDate", 'FilePermissions', 'FileType', + 'FileTypeExtension', 'MIMEType', 'ImageWidth', + 'ImageSize', 'BitsPerSample', 'ColorComponents', 'EncodingProcess', + 'JFIFVersion', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling', + 'YResolution', 'Megapixels', 'ImageHeight'} + + def get_meta(self): + out = subprocess.check_output(['exiftool', '-json', self.filename]) + meta = json.loads(out.decode('utf-8'))[0] + for key in self.meta_whitelist: + meta.pop(key, None) + return meta + + def remove_all(self): + pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename) + pixbuf.savev(self.output_filename, "jpeg", ["quality"], ["100"]) + return True diff --git a/src/parser_factory.py b/src/parser_factory.py index f4cf07b..176ff2b 100644 --- a/src/parser_factory.py +++ b/src/parser_factory.py @@ -2,12 +2,12 @@ import mimetypes import importlib import pkgutil -from .parsers import abstract +from . import abstract -for module_loader, name, ispkg in pkgutil.walk_packages('.src.parsers'): - if not name.startswith('src.parsers.'): +for module_loader, name, ispkg in pkgutil.walk_packages('.src'): + if not name.startswith('src.'): continue - elif name == 'src.parsers.abstract': + elif name == 'src.abstract': continue importlib.import_module(name) @@ -16,4 +16,5 @@ def get_parser(filename: str): for c in abstract.AbstractParser.__subclasses__(): if mtype in c.mimetypes: return c(filename) - print('Nope') + print('factory: %s is not supported' % mtype) + return None diff --git a/src/parsers/__init__.py b/src/parsers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py deleted file mode 100644 index c2d282f..0000000 --- a/src/parsers/abstract.py +++ /dev/null @@ -1,13 +0,0 @@ -class AbstractParser(object): - meta_list = set() - mimetypes = set() - - def __init__(self, filename: str): - self.filename = filename - self.output_filename = filename + '.cleaned' - - def get_meta(self): - raise NotImplementedError - - def remove_all(self): - raise NotImplementedError diff --git a/src/parsers/audio.py b/src/parsers/audio.py deleted file mode 100644 index 4da298c..0000000 --- a/src/parsers/audio.py +++ /dev/null @@ -1,37 +0,0 @@ -import subprocess -import shutil -import json - -import mutagen - -from . import abstract - -class MutagenParser(abstract.AbstractParser): - def get_meta(self): - f = mutagen.File(self.filename) - if f.tags: - return f.tags - return {} - - def remove_all(self): - shutil.copy(self.filename, self.output_filename) - f = mutagen.File(self.output_filename) - f.delete() - f.save() - return True - -class MP3Parser(MutagenParser): - mimetypes = {'audio/mpeg', } - - def get_meta(self): - meta = super().get_meta() - metadata = {} - for key in meta: - metadata[key] = meta[key].text - return metadata - -class OGGParser(MutagenParser): - mimetypes = {'audio/ogg', } - -class FLACParser(MutagenParser): - mimetypes = {'audio/flac', } diff --git a/src/parsers/jpg.py b/src/parsers/jpg.py deleted file mode 100644 index 34fc04c..0000000 --- a/src/parsers/jpg.py +++ /dev/null @@ -1,30 +0,0 @@ -import subprocess -import json - -import gi -gi.require_version('GdkPixbuf', '2.0') -from gi.repository import GdkPixbuf - -from . import abstract - -class JPGParser(abstract.AbstractParser): - mimetypes = {'image/jpg', } - meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', - 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', - "FileInodeChangeDate", 'FilePermissions', 'FileType', - 'FileTypeExtension', 'MIMEType', 'ImageWidth', - 'ImageSize', 'BitsPerSample', 'ColorComponents', 'EncodingProcess', - 'JFIFVersion', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling', - 'YResolution', 'Megapixels', 'ImageHeight'} - - def get_meta(self): - out = subprocess.check_output(['exiftool', '-json', self.filename]) - meta = json.loads(out.decode('utf-8'))[0] - for key in self.meta_whitelist: - meta.pop(key, None) - return meta - - def remove_all(self): - pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename) - pixbuf.savev(self.output_filename, "jpeg", ["quality"], ["100"]) - return True diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py deleted file mode 100644 index 90f05e1..0000000 --- a/src/parsers/pdf.py +++ /dev/null @@ -1,89 +0,0 @@ -""" Handle PDF - -""" - -import os -import logging -import tempfile -import shutil -import io -import tempfile - -import cairo -import gi -gi.require_version('Poppler', '0.18') -from gi.repository import Poppler - -from . import abstract - -logging.basicConfig(level=logging.DEBUG) - - -class PDFParser(abstract.AbstractParser): - mimetypes = {'application/pdf', } - meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', - 'metadata', 'mod-date', 'producer', 'subject', 'title', - 'viewer-preferences'} - - def __init__(self, filename): - super().__init__(filename) - self.uri = 'file://' + os.path.abspath(self.filename) - self.__scale = 2 - - def remove_all(self): - """ - Load the document into Poppler, render pages on PNG, - and shove those PNG into a new PDF. Metadata from the new - PDF are removed via Poppler, because there is no way to tell - cairo to not add "created by cairo" during rendering. - """ - document = Poppler.Document.new_from_file(self.uri, None) - pages_count = document.get_n_pages() - - _, tmp_path = tempfile.mkstemp() - pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) - pdf_context = cairo.Context(pdf_surface) - - for pagenum in range(pages_count): - page = document.get_page(pagenum) - page_width, page_height = page.get_size() - logging.info("Rendering page %d/%d", pagenum + 1, pages_count) - - img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width) * self.__scale, int(page_height) * self.__scale) - img_context = cairo.Context(img_surface) - - img_context.scale(self.__scale, self.__scale) - page.render_for_printing(img_context) - img_context.show_page() - - buf = io.BytesIO() - img_surface.write_to_png(buf) - img_surface.finish() - buf.seek(0) - - img = cairo.ImageSurface.create_from_png(buf) - pdf_surface.set_size(page_width*2, page_height*2) - pdf_context.set_source_surface(img, 0, 0) - pdf_context.paint() - pdf_context.show_page() - - pdf_surface.finish() - - # This is removing metadata added by Poppler - document = Poppler.Document.new_from_file('file://' + tmp_path) - document.set_producer('') - document.set_creator('') - document.save('file://' + os.path.abspath(self.output_filename)) - os.remove(tmp_path) - - return True - - def get_meta(self): - """ Return a dict with all the meta of the file - """ - document = Poppler.Document.new_from_file(self.uri, None) - metadata = {} - for key in self.meta_list: - if document.get_property(key): - metadata[key] = document.get_property(key) - return metadata diff --git a/src/parsers/png.py b/src/parsers/png.py deleted file mode 100644 index 377682e..0000000 --- a/src/parsers/png.py +++ /dev/null @@ -1,27 +0,0 @@ -import subprocess -import json - -import cairo - -from . import abstract - -class PNGParser(abstract.AbstractParser): - mimetypes = {'image/png', } - meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', - 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', - "FileInodeChangeDate", 'FilePermissions', 'FileType', - 'FileTypeExtension', 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType', - 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', - 'Megapixels', 'ImageHeight'} - - def get_meta(self): - out = subprocess.check_output(['exiftool', '-json', self.filename]) - meta = json.loads(out.decode('utf-8'))[0] - for key in self.meta_whitelist: - meta.pop(key, None) - return meta - - def remove_all(self): - surface = cairo.ImageSurface.create_from_png(self.filename) - surface.write_to_png(self.output_filename) - return True diff --git a/src/pdf.py b/src/pdf.py new file mode 100644 index 0000000..90f05e1 --- /dev/null +++ b/src/pdf.py @@ -0,0 +1,89 @@ +""" Handle PDF + +""" + +import os +import logging +import tempfile +import shutil +import io +import tempfile + +import cairo +import gi +gi.require_version('Poppler', '0.18') +from gi.repository import Poppler + +from . import abstract + +logging.basicConfig(level=logging.DEBUG) + + +class PDFParser(abstract.AbstractParser): + mimetypes = {'application/pdf', } + meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', + 'metadata', 'mod-date', 'producer', 'subject', 'title', + 'viewer-preferences'} + + def __init__(self, filename): + super().__init__(filename) + self.uri = 'file://' + os.path.abspath(self.filename) + self.__scale = 2 + + def remove_all(self): + """ + Load the document into Poppler, render pages on PNG, + and shove those PNG into a new PDF. Metadata from the new + PDF are removed via Poppler, because there is no way to tell + cairo to not add "created by cairo" during rendering. + """ + document = Poppler.Document.new_from_file(self.uri, None) + pages_count = document.get_n_pages() + + _, tmp_path = tempfile.mkstemp() + pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) + pdf_context = cairo.Context(pdf_surface) + + for pagenum in range(pages_count): + page = document.get_page(pagenum) + page_width, page_height = page.get_size() + logging.info("Rendering page %d/%d", pagenum + 1, pages_count) + + img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width) * self.__scale, int(page_height) * self.__scale) + img_context = cairo.Context(img_surface) + + img_context.scale(self.__scale, self.__scale) + page.render_for_printing(img_context) + img_context.show_page() + + buf = io.BytesIO() + img_surface.write_to_png(buf) + img_surface.finish() + buf.seek(0) + + img = cairo.ImageSurface.create_from_png(buf) + pdf_surface.set_size(page_width*2, page_height*2) + pdf_context.set_source_surface(img, 0, 0) + pdf_context.paint() + pdf_context.show_page() + + pdf_surface.finish() + + # This is removing metadata added by Poppler + document = Poppler.Document.new_from_file('file://' + tmp_path) + document.set_producer('') + document.set_creator('') + document.save('file://' + os.path.abspath(self.output_filename)) + os.remove(tmp_path) + + return True + + def get_meta(self): + """ Return a dict with all the meta of the file + """ + document = Poppler.Document.new_from_file(self.uri, None) + metadata = {} + for key in self.meta_list: + if document.get_property(key): + metadata[key] = document.get_property(key) + return metadata diff --git a/src/png.py b/src/png.py new file mode 100644 index 0000000..377682e --- /dev/null +++ b/src/png.py @@ -0,0 +1,27 @@ +import subprocess +import json + +import cairo + +from . import abstract + +class PNGParser(abstract.AbstractParser): + mimetypes = {'image/png', } + meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', + 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', + "FileInodeChangeDate", 'FilePermissions', 'FileType', + 'FileTypeExtension', 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType', + 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', + 'Megapixels', 'ImageHeight'} + + def get_meta(self): + out = subprocess.check_output(['exiftool', '-json', self.filename]) + meta = json.loads(out.decode('utf-8'))[0] + for key in self.meta_whitelist: + meta.pop(key, None) + return meta + + def remove_all(self): + surface = cairo.ImageSurface.create_from_png(self.filename) + surface.write_to_png(self.output_filename) + return True diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 27bb8d1..c21185e 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -4,8 +4,7 @@ import unittest import shutil import os -from src import parsers -from src.parsers import pdf, png, jpg, audio +from src import pdf, png, jpg, audio, office class TestGetMeta(unittest.TestCase): def test_pdf(self): -- cgit v1.3