summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libmat2/abstract.py11
-rw-r--r--libmat2/harmless.py2
-rw-r--r--libmat2/images.py5
-rw-r--r--libmat2/office.py24
-rw-r--r--libmat2/parser_factory.py8
-rw-r--r--libmat2/pdf.py4
6 files changed, 36 insertions, 18 deletions
diff --git a/libmat2/abstract.py b/libmat2/abstract.py
index 41a720a..701ab60 100644
--- a/libmat2/abstract.py
+++ b/libmat2/abstract.py
@@ -6,10 +6,16 @@ assert Set # make pyflakes happy
6 6
7 7
8class AbstractParser(abc.ABC): 8class AbstractParser(abc.ABC):
9 """ This is the base class of every parser.
10 It might raise `ValueError` on instantiation on invalid files.
11 """
9 meta_list = set() # type: Set[str] 12 meta_list = set() # type: Set[str]
10 mimetypes = set() # type: Set[str] 13 mimetypes = set() # type: Set[str]
11 14
12 def __init__(self, filename: str) -> None: 15 def __init__(self, filename: str) -> None:
16 """
17 :raises ValueError: Raised upon an invalid file
18 """
13 self.filename = filename 19 self.filename = filename
14 fname, extension = os.path.splitext(filename) 20 fname, extension = os.path.splitext(filename)
15 self.output_filename = fname + '.cleaned' + extension 21 self.output_filename = fname + '.cleaned' + extension
@@ -23,5 +29,8 @@ class AbstractParser(abc.ABC):
23 pass # pragma: no cover 29 pass # pragma: no cover
24 30
25 def remove_all_lightweight(self) -> bool: 31 def remove_all_lightweight(self) -> bool:
26 """ Remove _SOME_ metadata. """ 32 """ This method removes _SOME_ metadata.
33 It might be useful to implement it for fileformats that do
34 not support non-destructive cleaning.
35 """
27 return self.remove_all() 36 return self.remove_all()
diff --git a/libmat2/harmless.py b/libmat2/harmless.py
index 336873c..f646099 100644
--- a/libmat2/harmless.py
+++ b/libmat2/harmless.py
@@ -4,7 +4,7 @@ from . import abstract
4 4
5 5
6class HarmlessParser(abstract.AbstractParser): 6class HarmlessParser(abstract.AbstractParser):
7 """ This is the parser for filetypes that do not contain metadata. """ 7 """ This is the parser for filetypes that can not contain metadata. """
8 mimetypes = {'text/plain', 'image/x-ms-bmp'} 8 mimetypes = {'text/plain', 'image/x-ms-bmp'}
9 9
10 def get_meta(self) -> Dict[str, str]: 10 def get_meta(self) -> Dict[str, str]:
diff --git a/libmat2/images.py b/libmat2/images.py
index f9171e5..d47536b 100644
--- a/libmat2/images.py
+++ b/libmat2/images.py
@@ -19,6 +19,9 @@ from . import abstract
19assert Set 19assert Set
20 20
21class _ImageParser(abstract.AbstractParser): 21class _ImageParser(abstract.AbstractParser):
22 """ Since we use `exiftool` to get metadata from
23 all image file formats, `get_meta` is implemented in this class,
24 and all the image-handling ones are inheriting from it."""
22 meta_whitelist = set() # type: Set[str] 25 meta_whitelist = set() # type: Set[str]
23 26
24 @staticmethod 27 @staticmethod
@@ -72,7 +75,7 @@ class PNGParser(_ImageParser):
72 75
73class GdkPixbufAbstractParser(_ImageParser): 76class GdkPixbufAbstractParser(_ImageParser):
74 """ GdkPixbuf can handle a lot of surfaces, so we're rendering images on it, 77 """ GdkPixbuf can handle a lot of surfaces, so we're rendering images on it,
75 this has the side-effect of removing metadata completely. 78 this has the side-effect of completely removing metadata.
76 """ 79 """
77 _type = '' 80 _type = ''
78 81
diff --git a/libmat2/office.py b/libmat2/office.py
index c6c4688..62d0395 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -33,6 +33,7 @@ def _parse_xml(full_path: str):
33 33
34 34
35class ArchiveBasedAbstractParser(abstract.AbstractParser): 35class ArchiveBasedAbstractParser(abstract.AbstractParser):
36 """ Office files (.docx, .odt, …) are zipped files. """
36 # Those are the files that have a format that _isn't_ 37 # Those are the files that have a format that _isn't_
37 # supported by MAT2, but that we want to keep anyway. 38 # supported by MAT2, but that we want to keep anyway.
38 files_to_keep = set() # type: Set[str] 39 files_to_keep = set() # type: Set[str]
@@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
58 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: 59 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
59 zipinfo.create_system = 3 # Linux 60 zipinfo.create_system = 3 # Linux
60 zipinfo.comment = b'' 61 zipinfo.comment = b''
61 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) 62 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
62 return zipinfo 63 return zipinfo
63 64
64 @staticmethod 65 @staticmethod
65 def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: 66 def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
66 metadata = {} 67 metadata = {}
67 if zipinfo.create_system == 3: 68 if zipinfo.create_system == 3: # this is Linux
68 #metadata['create_system'] = 'Linux'
69 pass 69 pass
70 elif zipinfo.create_system == 2: 70 elif zipinfo.create_system == 2:
71 metadata['create_system'] = 'Windows' 71 metadata['create_system'] = 'Windows'
@@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
145 145
146 @staticmethod 146 @staticmethod
147 def __remove_revisions(full_path: str) -> bool: 147 def __remove_revisions(full_path: str) -> bool:
148 """ In this function, we're changing the XML 148 """ In this function, we're changing the XML document in several
149 document in two times, since we don't want 149 separate passes, since we don't want to change the tree we're currently
150 to change the tree we're iterating on.""" 150 iterating on.
151 """
151 try: 152 try:
152 tree, namespace = _parse_xml(full_path) 153 tree, namespace = _parse_xml(full_path)
153 except ET.ParseError: 154 except ET.ParseError:
154 return False 155 return False
155 156
156 # No revisions are present 157 # Revisions are either deletions (`w:del`) or
158 # insertions (`w:ins`)
157 del_presence = tree.find('.//w:del', namespace) 159 del_presence = tree.find('.//w:del', namespace)
158 ins_presence = tree.find('.//w:ins', namespace) 160 ins_presence = tree.find('.//w:ins', namespace)
159 if del_presence is None and ins_presence is None: 161 if del_presence is None and ins_presence is None:
160 return True 162 return True # No revisions are present
161 163
162 parent_map = {c:p for p in tree.iter() for c in p} 164 parent_map = {c:p for p in tree.iter() for c in p}
163 165
164 elements = list([element for element in tree.iterfind('.//w:del', namespace)]) 166 elements = list()
167 for element in tree.iterfind('.//w:del', namespace):
168 elements.append(element)
165 for element in elements: 169 for element in elements:
166 parent_map[element].remove(element) 170 parent_map[element].remove(element)
167 171
@@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
172 for children in element.iterfind('./*'): 176 for children in element.iterfind('./*'):
173 elements.append((element, position, children)) 177 elements.append((element, position, children))
174 break 178 break
175
176 for (element, position, children) in elements: 179 for (element, position, children) in elements:
177 parent_map[element].insert(position, children) 180 parent_map[element].insert(position, children)
178 parent_map[element].remove(element) 181 parent_map[element].remove(element)
@@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
183 186
184 def _specific_cleanup(self, full_path: str) -> bool: 187 def _specific_cleanup(self, full_path: str) -> bool:
185 if full_path.endswith('/word/document.xml'): 188 if full_path.endswith('/word/document.xml'):
189 # this file contains the revisions
186 return self.__remove_revisions(full_path) 190 return self.__remove_revisions(full_path)
187 return True 191 return True
188 192
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py
index bd442b8..f42acfb 100644
--- a/libmat2/parser_factory.py
+++ b/libmat2/parser_factory.py
@@ -13,10 +13,12 @@ T = TypeVar('T', bound='abstract.AbstractParser')
13def __load_all_parsers(): 13def __load_all_parsers():
14 """ Loads every parser in a dynamic way """ 14 """ Loads every parser in a dynamic way """
15 current_dir = os.path.dirname(__file__) 15 current_dir = os.path.dirname(__file__)
16 for name in glob.glob(os.path.join(current_dir, '*.py')): 16 for fname in glob.glob(os.path.join(current_dir, '*.py')):
17 if name.endswith('abstract.py') or name.endswith('__init__.py'): 17 if fname.endswith('abstract.py'):
18 continue 18 continue
19 basename = os.path.basename(name) 19 elif fname.endswith('__init__.py'):
20 continue
21 basename = os.path.basename(fname)
20 name, _ = os.path.splitext(basename) 22 name, _ = os.path.splitext(basename)
21 importlib.import_module('.' + name, package='libmat2') 23 importlib.import_module('.' + name, package='libmat2')
22 24
diff --git a/libmat2/pdf.py b/libmat2/pdf.py
index 053a768..d3c4698 100644
--- a/libmat2/pdf.py
+++ b/libmat2/pdf.py
@@ -47,7 +47,7 @@ class PDFParser(abstract.AbstractParser):
47 pages_count = document.get_n_pages() 47 pages_count = document.get_n_pages()
48 48
49 tmp_path = tempfile.mkstemp()[1] 49 tmp_path = tempfile.mkstemp()[1]
50 pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) 50 pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
51 pdf_context = cairo.Context(pdf_surface) # context draws on the surface 51 pdf_context = cairo.Context(pdf_surface) # context draws on the surface
52 52
53 for pagenum in range(pages_count): 53 for pagenum in range(pages_count):
@@ -101,7 +101,7 @@ class PDFParser(abstract.AbstractParser):
101 pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale) 101 pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
102 pdf_context.set_source_surface(img, 0, 0) 102 pdf_context.set_source_surface(img, 0, 0)
103 pdf_context.paint() 103 pdf_context.paint()
104 pdf_context.show_page() 104 pdf_context.show_page() # draw pdf_context on pdf_surface
105 105
106 pdf_surface.finish() 106 pdf_surface.finish()
107 107