diff options
| -rw-r--r-- | libmat2/abstract.py | 11 | ||||
| -rw-r--r-- | libmat2/harmless.py | 2 | ||||
| -rw-r--r-- | libmat2/images.py | 5 | ||||
| -rw-r--r-- | libmat2/office.py | 24 | ||||
| -rw-r--r-- | libmat2/parser_factory.py | 8 | ||||
| -rw-r--r-- | libmat2/pdf.py | 4 |
6 files changed, 36 insertions, 18 deletions
diff --git a/libmat2/abstract.py b/libmat2/abstract.py index 41a720a..701ab60 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py | |||
| @@ -6,10 +6,16 @@ assert Set # make pyflakes happy | |||
| 6 | 6 | ||
| 7 | 7 | ||
| 8 | class AbstractParser(abc.ABC): | 8 | class AbstractParser(abc.ABC): |
| 9 | """ This is the base class of every parser. | ||
| 10 | It might raise `ValueError` on instantiation on invalid files. | ||
| 11 | """ | ||
| 9 | meta_list = set() # type: Set[str] | 12 | meta_list = set() # type: Set[str] |
| 10 | mimetypes = set() # type: Set[str] | 13 | mimetypes = set() # type: Set[str] |
| 11 | 14 | ||
| 12 | def __init__(self, filename: str) -> None: | 15 | def __init__(self, filename: str) -> None: |
| 16 | """ | ||
| 17 | :raises ValueError: Raised upon an invalid file | ||
| 18 | """ | ||
| 13 | self.filename = filename | 19 | self.filename = filename |
| 14 | fname, extension = os.path.splitext(filename) | 20 | fname, extension = os.path.splitext(filename) |
| 15 | self.output_filename = fname + '.cleaned' + extension | 21 | self.output_filename = fname + '.cleaned' + extension |
| @@ -23,5 +29,8 @@ class AbstractParser(abc.ABC): | |||
| 23 | pass # pragma: no cover | 29 | pass # pragma: no cover |
| 24 | 30 | ||
| 25 | def remove_all_lightweight(self) -> bool: | 31 | def remove_all_lightweight(self) -> bool: |
| 26 | """ Remove _SOME_ metadata. """ | 32 | """ This method removes _SOME_ metadata. |
| 33 | It might be useful to implement it for file formats that do | ||
| 34 | not support non-destructive cleaning. | ||
| 35 | """ | ||
| 27 | return self.remove_all() | 36 | return self.remove_all() |
diff --git a/libmat2/harmless.py b/libmat2/harmless.py index 336873c..f646099 100644 --- a/libmat2/harmless.py +++ b/libmat2/harmless.py | |||
| @@ -4,7 +4,7 @@ from . import abstract | |||
| 4 | 4 | ||
| 5 | 5 | ||
| 6 | class HarmlessParser(abstract.AbstractParser): | 6 | class HarmlessParser(abstract.AbstractParser): |
| 7 | """ This is the parser for filetypes that do not contain metadata. """ | 7 | """ This is the parser for filetypes that can not contain metadata. """ |
| 8 | mimetypes = {'text/plain', 'image/x-ms-bmp'} | 8 | mimetypes = {'text/plain', 'image/x-ms-bmp'} |
| 9 | 9 | ||
| 10 | def get_meta(self) -> Dict[str, str]: | 10 | def get_meta(self) -> Dict[str, str]: |
diff --git a/libmat2/images.py b/libmat2/images.py index f9171e5..d47536b 100644 --- a/libmat2/images.py +++ b/libmat2/images.py | |||
| @@ -19,6 +19,9 @@ from . import abstract | |||
| 19 | assert Set | 19 | assert Set |
| 20 | 20 | ||
| 21 | class _ImageParser(abstract.AbstractParser): | 21 | class _ImageParser(abstract.AbstractParser): |
| 22 | """ Since we use `exiftool` to get metadata from | ||
| 23 | all image file formats, `get_meta` is implemented in this class, | ||
| 24 | and all the image-handling ones are inheriting from it.""" | ||
| 22 | meta_whitelist = set() # type: Set[str] | 25 | meta_whitelist = set() # type: Set[str] |
| 23 | 26 | ||
| 24 | @staticmethod | 27 | @staticmethod |
| @@ -72,7 +75,7 @@ class PNGParser(_ImageParser): | |||
| 72 | 75 | ||
| 73 | class GdkPixbufAbstractParser(_ImageParser): | 76 | class GdkPixbufAbstractParser(_ImageParser): |
| 74 | """ GdkPixbuf can handle a lot of surfaces, so we're rendering images on it, | 77 | """ GdkPixbuf can handle a lot of surfaces, so we're rendering images on it, |
| 75 | this has the side-effect of removing metadata completely. | 78 | this has the side-effect of completely removing metadata. |
| 76 | """ | 79 | """ |
| 77 | _type = '' | 80 | _type = '' |
| 78 | 81 | ||
diff --git a/libmat2/office.py b/libmat2/office.py index c6c4688..62d0395 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -33,6 +33,7 @@ def _parse_xml(full_path: str): | |||
| 33 | 33 | ||
| 34 | 34 | ||
| 35 | class ArchiveBasedAbstractParser(abstract.AbstractParser): | 35 | class ArchiveBasedAbstractParser(abstract.AbstractParser): |
| 36 | """ Office files (.docx, .odt, …) are zipped files. """ | ||
| 36 | # Those are the files that have a format that _isn't_ | 37 | # Those are the files that have a format that _isn't_ |
| 37 | # supported by MAT2, but that we want to keep anyway. | 38 | # supported by MAT2, but that we want to keep anyway. |
| 38 | files_to_keep = set() # type: Set[str] | 39 | files_to_keep = set() # type: Set[str] |
| @@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 58 | def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: | 59 | def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: |
| 59 | zipinfo.create_system = 3 # Linux | 60 | zipinfo.create_system = 3 # Linux |
| 60 | zipinfo.comment = b'' | 61 | zipinfo.comment = b'' |
| 61 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) | 62 | zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be |
| 62 | return zipinfo | 63 | return zipinfo |
| 63 | 64 | ||
| 64 | @staticmethod | 65 | @staticmethod |
| 65 | def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: | 66 | def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: |
| 66 | metadata = {} | 67 | metadata = {} |
| 67 | if zipinfo.create_system == 3: | 68 | if zipinfo.create_system == 3: # this is Linux |
| 68 | #metadata['create_system'] = 'Linux' | ||
| 69 | pass | 69 | pass |
| 70 | elif zipinfo.create_system == 2: | 70 | elif zipinfo.create_system == 2: |
| 71 | metadata['create_system'] = 'Windows' | 71 | metadata['create_system'] = 'Windows' |
| @@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 145 | 145 | ||
| 146 | @staticmethod | 146 | @staticmethod |
| 147 | def __remove_revisions(full_path: str) -> bool: | 147 | def __remove_revisions(full_path: str) -> bool: |
| 148 | """ In this function, we're changing the XML | 148 | """ In this function, we're changing the XML document in several |
| 149 | document in two times, since we don't want | 149 | separate passes, since we don't want to change the tree we're currently |
| 150 | to change the tree we're iterating on.""" | 150 | iterating on. |
| 151 | """ | ||
| 151 | try: | 152 | try: |
| 152 | tree, namespace = _parse_xml(full_path) | 153 | tree, namespace = _parse_xml(full_path) |
| 153 | except ET.ParseError: | 154 | except ET.ParseError: |
| 154 | return False | 155 | return False |
| 155 | 156 | ||
| 156 | # No revisions are present | 157 | # Revisions are either deletions (`w:del`) or |
| 158 | # insertions (`w:ins`) | ||
| 157 | del_presence = tree.find('.//w:del', namespace) | 159 | del_presence = tree.find('.//w:del', namespace) |
| 158 | ins_presence = tree.find('.//w:ins', namespace) | 160 | ins_presence = tree.find('.//w:ins', namespace) |
| 159 | if del_presence is None and ins_presence is None: | 161 | if del_presence is None and ins_presence is None: |
| 160 | return True | 162 | return True # No revisions are present |
| 161 | 163 | ||
| 162 | parent_map = {c:p for p in tree.iter() for c in p} | 164 | parent_map = {c:p for p in tree.iter() for c in p} |
| 163 | 165 | ||
| 164 | elements = list([element for element in tree.iterfind('.//w:del', namespace)]) | 166 | elements = list() |
| 167 | for element in tree.iterfind('.//w:del', namespace): | ||
| 168 | elements.append(element) | ||
| 165 | for element in elements: | 169 | for element in elements: |
| 166 | parent_map[element].remove(element) | 170 | parent_map[element].remove(element) |
| 167 | 171 | ||
| @@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 172 | for children in element.iterfind('./*'): | 176 | for children in element.iterfind('./*'): |
| 173 | elements.append((element, position, children)) | 177 | elements.append((element, position, children)) |
| 174 | break | 178 | break |
| 175 | |||
| 176 | for (element, position, children) in elements: | 179 | for (element, position, children) in elements: |
| 177 | parent_map[element].insert(position, children) | 180 | parent_map[element].insert(position, children) |
| 178 | parent_map[element].remove(element) | 181 | parent_map[element].remove(element) |
| @@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 183 | 186 | ||
| 184 | def _specific_cleanup(self, full_path: str) -> bool: | 187 | def _specific_cleanup(self, full_path: str) -> bool: |
| 185 | if full_path.endswith('/word/document.xml'): | 188 | if full_path.endswith('/word/document.xml'): |
| 189 | # this file contains the revisions | ||
| 186 | return self.__remove_revisions(full_path) | 190 | return self.__remove_revisions(full_path) |
| 187 | return True | 191 | return True |
| 188 | 192 | ||
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index bd442b8..f42acfb 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py | |||
| @@ -13,10 +13,12 @@ T = TypeVar('T', bound='abstract.AbstractParser') | |||
| 13 | def __load_all_parsers(): | 13 | def __load_all_parsers(): |
| 14 | """ Loads every parser in a dynamic way """ | 14 | """ Loads every parser in a dynamic way """ |
| 15 | current_dir = os.path.dirname(__file__) | 15 | current_dir = os.path.dirname(__file__) |
| 16 | for name in glob.glob(os.path.join(current_dir, '*.py')): | 16 | for fname in glob.glob(os.path.join(current_dir, '*.py')): |
| 17 | if name.endswith('abstract.py') or name.endswith('__init__.py'): | 17 | if fname.endswith('abstract.py'): |
| 18 | continue | 18 | continue |
| 19 | basename = os.path.basename(name) | 19 | elif fname.endswith('__init__.py'): |
| 20 | continue | ||
| 21 | basename = os.path.basename(fname) | ||
| 20 | name, _ = os.path.splitext(basename) | 22 | name, _ = os.path.splitext(basename) |
| 21 | importlib.import_module('.' + name, package='libmat2') | 23 | importlib.import_module('.' + name, package='libmat2') |
| 22 | 24 | ||
diff --git a/libmat2/pdf.py b/libmat2/pdf.py index 053a768..d3c4698 100644 --- a/libmat2/pdf.py +++ b/libmat2/pdf.py | |||
| @@ -47,7 +47,7 @@ class PDFParser(abstract.AbstractParser): | |||
| 47 | pages_count = document.get_n_pages() | 47 | pages_count = document.get_n_pages() |
| 48 | 48 | ||
| 49 | tmp_path = tempfile.mkstemp()[1] | 49 | tmp_path = tempfile.mkstemp()[1] |
| 50 | pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) | 50 | pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway |
| 51 | pdf_context = cairo.Context(pdf_surface) # context draws on the surface | 51 | pdf_context = cairo.Context(pdf_surface) # context draws on the surface |
| 52 | 52 | ||
| 53 | for pagenum in range(pages_count): | 53 | for pagenum in range(pages_count): |
| @@ -101,7 +101,7 @@ class PDFParser(abstract.AbstractParser): | |||
| 101 | pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale) | 101 | pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale) |
| 102 | pdf_context.set_source_surface(img, 0, 0) | 102 | pdf_context.set_source_surface(img, 0, 0) |
| 103 | pdf_context.paint() | 103 | pdf_context.paint() |
| 104 | pdf_context.show_page() | 104 | pdf_context.show_page() # draw pdf_context on pdf_surface |
| 105 | 105 | ||
| 106 | pdf_surface.finish() | 106 | pdf_surface.finish() |
| 107 | 107 | ||
