summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- .gitlab-ci.yml 2
-rw-r--r-- libmat2/office.py 17
-rwxr-xr-x mat2 31
3 files changed, 26 insertions, 24 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b31d088..29e3553 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,7 +9,7 @@ bandit:
9 script: # TODO: remove B405 and B314 9 script: # TODO: remove B405 and B314
10 - apt-get -qqy update 10 - apt-get -qqy update
11 - apt-get -qqy install --no-install-recommends python3-bandit 11 - apt-get -qqy install --no-install-recommends python3-bandit
12 - bandit ./mat2 --format txt 12 - bandit ./mat2 --format txt --skip B101
13 - bandit -r ./nautilus/ --format txt --skip B101 13 - bandit -r ./nautilus/ --format txt --skip B101
14 - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314 14 - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314
15 15
diff --git a/libmat2/office.py b/libmat2/office.py
index 54347ea..32e7b75 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2import os 2import os
3import re 3import re
4import zipfile 4import zipfile
5from typing import Dict, Set, Pattern 5from typing import Dict, Set, Pattern, Tuple
6 6
7import xml.etree.ElementTree as ET # type: ignore 7import xml.etree.ElementTree as ET # type: ignore
8 8
@@ -14,9 +14,8 @@ from .archive import ArchiveBasedAbstractParser
14assert Set 14assert Set
15assert Pattern 15assert Pattern
16 16
17def _parse_xml(full_path: str): 17def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
18 """ This function parses XML, with namespace support. """ 18 """ This function parses XML, with namespace support. """
19
20 namespace_map = dict() 19 namespace_map = dict()
21 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): 20 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
22 # The ns[0-9]+ namespaces are reserved for internal usage, so 21 # The ns[0-9]+ namespaces are reserved for internal usage, so
@@ -183,20 +182,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
183 182
184 parent_map = {c:p for p in tree.iter() for c in p} 183 parent_map = {c:p for p in tree.iter() for c in p}
185 184
186 elements = list() 185 elements_del = list()
187 for element in tree.iterfind('.//w:del', namespace): 186 for element in tree.iterfind('.//w:del', namespace):
188 elements.append(element) 187 elements_del.append(element)
189 for element in elements: 188 for element in elements_del:
190 parent_map[element].remove(element) 189 parent_map[element].remove(element)
191 190
192 elements = list() 191 elements_ins = list()
193 for element in tree.iterfind('.//w:ins', namespace): 192 for element in tree.iterfind('.//w:ins', namespace):
194 for position, item in enumerate(tree.iter()): # pragma: no cover 193 for position, item in enumerate(tree.iter()): # pragma: no cover
195 if item == element: 194 if item == element:
196 for children in element.iterfind('./*'): 195 for children in element.iterfind('./*'):
197 elements.append((element, position, children)) 196 elements_ins.append((element, position, children))
198 break 197 break
199 for (element, position, children) in elements: 198 for (element, position, children) in elements_ins:
200 parent_map[element].insert(position, children) 199 parent_map[element].insert(position, children)
201 parent_map[element].remove(element) 200 parent_map[element].remove(element)
202 201
diff --git a/mat2 b/mat2
index 6c23836..987e439 100755
--- a/mat2
+++ b/mat2
@@ -1,7 +1,7 @@
1#!/usr/bin/env python3 1#!/usr/bin/env python3
2 2
3import os 3import os
4from typing import Tuple 4from typing import Tuple, Generator, List
5import sys 5import sys
6import mimetypes 6import mimetypes
7import argparse 7import argparse
@@ -16,6 +16,10 @@ except ValueError as e:
16 16
17__version__ = '0.4.0' 17__version__ = '0.4.0'
18 18
19# Make pyflakes happy
20assert Tuple
21
22
19def __check_file(filename: str, mode: int=os.R_OK) -> bool: 23def __check_file(filename: str, mode: int=os.R_OK) -> bool:
20 if not os.path.exists(filename): 24 if not os.path.exists(filename):
21 print("[-] %s is doesn't exist." % filename) 25 print("[-] %s is doesn't exist." % filename)
@@ -29,7 +33,7 @@ def __check_file(filename: str, mode: int=os.R_OK) -> bool:
29 return True 33 return True
30 34
31 35
32def create_arg_parser(): 36def create_arg_parser() -> argparse.ArgumentParser:
33 parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2') 37 parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')
34 parser.add_argument('files', nargs='*', help='the files to process') 38 parser.add_argument('files', nargs='*', help='the files to process')
35 parser.add_argument('-v', '--version', action='version', 39 parser.add_argument('-v', '--version', action='version',
@@ -63,19 +67,18 @@ def show_meta(filename: str):
63 return 67 return
64 68
65 print("[+] Metadata for %s:" % filename) 69 print("[+] Metadata for %s:" % filename)
66 meta = p.get_meta().items() 70 metadata = p.get_meta().items()
67 if not meta: 71 if not metadata:
68 print(" No metadata found") 72 print(" No metadata found")
69 return 73 return
70 74
71 for k, v in meta: 75 for k, v in metadata:
72 try: # FIXME this is ugly. 76 try: # FIXME this is ugly.
73 print(" %s: %s" % (k, v)) 77 print(" %s: %s" % (k, v))
74 except UnicodeEncodeError: 78 except UnicodeEncodeError:
75 print(" %s: harmful content" % k) 79 print(" %s: harmful content" % k)
76 80
77def clean_meta(params: Tuple[str, bool, UnknownMemberPolicy]) -> bool: 81def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) -> bool:
78 filename, is_lightweight, unknown_member_policy = params
79 if not __check_file(filename, os.R_OK|os.W_OK): 82 if not __check_file(filename, os.R_OK|os.W_OK):
80 return False 83 return False
81 84
@@ -83,7 +86,7 @@ def clean_meta(params: Tuple[str, bool, UnknownMemberPolicy]) -> bool:
83 if p is None: 86 if p is None:
84 print("[-] %s's format (%s) is not supported" % (filename, mtype)) 87 print("[-] %s's format (%s) is not supported" % (filename, mtype))
85 return False 88 return False
86 p.unknown_member_policy = unknown_member_policy 89 p.unknown_member_policy = policy
87 if is_lightweight: 90 if is_lightweight:
88 return p.remove_all_lightweight() 91 return p.remove_all_lightweight()
89 return p.remove_all() 92 return p.remove_all()
@@ -91,7 +94,7 @@ def clean_meta(params: Tuple[str, bool, UnknownMemberPolicy]) -> bool:
91 94
92def show_parsers(): 95def show_parsers():
93 print('[+] Supported formats:') 96 print('[+] Supported formats:')
94 formats = list() 97 formats = set()
95 for parser in parser_factory._get_parsers(): 98 for parser in parser_factory._get_parsers():
96 for mtype in parser.mimetypes: 99 for mtype in parser.mimetypes:
97 extensions = set() 100 extensions = set()
@@ -102,11 +105,11 @@ def show_parsers():
102 # we're not supporting a single extension in the current 105 # we're not supporting a single extension in the current
103 # mimetype, so there is not point in showing the mimetype at all 106 # mimetype, so there is not point in showing the mimetype at all
104 continue 107 continue
105 formats.append(' - %s (%s)' % (mtype, ', '.join(extensions))) 108 formats.add(' - %s (%s)' % (mtype, ', '.join(extensions)))
106 print('\n'.join(sorted(formats))) 109 print('\n'.join(sorted(formats)))
107 110
108 111
109def __get_files_recursively(files): 112def __get_files_recursively(files: List[str]) -> Generator[str, None, None]:
110 for f in files: 113 for f in files:
111 if os.path.isdir(f): 114 if os.path.isdir(f):
112 for path, _, _files in os.walk(f): 115 for path, _, _files in os.walk(f):
@@ -141,13 +144,13 @@ def main():
141 return 0 144 return 0
142 145
143 else: 146 else:
144 unknown_member_policy = UnknownMemberPolicy(args.unknown_members) 147 policy = UnknownMemberPolicy(args.unknown_members)
145 if unknown_member_policy == UnknownMemberPolicy.KEEP: 148 if policy == UnknownMemberPolicy.KEEP:
146 logging.warning('Keeping unknown member files may leak metadata in the resulting file!') 149 logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
147 150
148 no_failure = True 151 no_failure = True
149 for f in __get_files_recursively(args.files): 152 for f in __get_files_recursively(args.files):
150 if clean_meta([f, args.lightweight, unknown_member_policy]) is False: 153 if clean_meta(f, args.lightweight, policy) is False:
151 no_failure = False 154 no_failure = False
152 return 0 if no_failure is True else -1 155 return 0 if no_failure is True else -1
153 156