summaryrefslogtreecommitdiff
path: root/libmat2
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2')
-rw-r--r-- libmat2/archive.py 127
-rw-r--r-- libmat2/office.py 118
2 files changed, 128 insertions, 117 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
new file mode 100644
index 0000000..d8f9007
--- /dev/null
+++ b/libmat2/archive.py
@@ -0,0 +1,127 @@
1import zipfile
2import datetime
3import tempfile
4import os
5import logging
6import shutil
7from typing import Dict, Set, Pattern
8
9from . import abstract, UnknownMemberPolicy, parser_factory
10
11# Make pyflakes happy
12assert Set
13assert Pattern
14
15
class ArchiveBasedAbstractParser(abstract.AbstractParser):
    """ Office files (.docx, .odt, …) are zipped files.

    Base class for parsers of zip-based formats. `remove_all` rebuilds the
    archive member by member: each member is extracted to a temporary
    folder, optionally post-processed by `_specific_cleanup`, cleaned by
    whatever parser `parser_factory` finds for it, and written to the output
    archive with neutral zip metadata.
    """
    # Those are the files that have a format that _isn't_
    # supported by MAT2, but that we want to keep anyway.
    files_to_keep = set()  # type: Set[str]

    # Those are the files that we _do not_ want to keep,
    # no matter if they are supported or not.
    files_to_omit = set()  # type: Set[Pattern]

    # what should the parser do if it encounters an unknown file in
    # the archive?
    unknown_member_policy = UnknownMemberPolicy.ABORT  # type: UnknownMemberPolicy

    def __init__(self, filename):
        """ Check early that `filename` is a readable zip archive.

        Raises:
            ValueError: if the file is not a valid zip archive.
        """
        super().__init__(filename)
        try:  # better fail here than later
            # Only the validation done on open is wanted; the context
            # manager closes the handle right away instead of leaking it.
            with zipfile.ZipFile(self.filename):
                pass
        except zipfile.BadZipFile as e:
            raise ValueError("%s is not a valid zip archive" % self.filename) from e

    def _specific_cleanup(self, full_path: str) -> bool:
        """ This method can be used to apply specific treatment
        to files present in the archive.

        `full_path` is the location of the extracted member on disk.
        Returning `False` makes `remove_all` abort; this default
        implementation does nothing and returns `True`.
        """
        # pylint: disable=unused-argument,no-self-use
        return True  # pragma: no cover

    @staticmethod
    def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
        """ Overwrite the identifying fields of `zipinfo` with neutral
        values, in place, and return the same object. """
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # this is as early as a zipfile can be
        return zipinfo

    @staticmethod
    def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
        """ Return the metadata carried by `zipinfo` itself.

        Only values that differ from what `_clean_zipinfo` writes are
        reported, under the keys `create_system`, `comment` and `date_time`.
        """
        metadata = {}
        if zipinfo.create_system == 3:  # this is Linux
            pass
        elif zipinfo.create_system == 2:
            # NOTE(review): the ZIP specification lists host system 0 as
            # MS-DOS/FAT and 2 as OpenVMS -- confirm which value should be
            # reported as 'Windows'. Behaviour is left unchanged here.
            metadata['create_system'] = 'Windows'
        else:
            metadata['create_system'] = 'Weird'

        if zipinfo.comment:
            # NOTE(review): this stores the raw bytes comment, hence the
            # `type: ignore` against the Dict[str, str] return annotation.
            metadata['comment'] = zipinfo.comment  # type: ignore

        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
            metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))

        return metadata

    def remove_all(self) -> bool:
        """ Write a cleaned copy of the archive to `self.output_filename`.

        Returns:
            `True` on success. `False` if a member could not be cleaned or
            has an unsupported format under the `ABORT` policy; in that
            case the partially written output file is removed.
        """
        # pylint: disable=too-many-branches
        abort = False
        temp_folder = tempfile.mkdtemp()

        try:
            with zipfile.ZipFile(self.filename) as zin,\
                 zipfile.ZipFile(self.output_filename, 'w') as zout:

                for item in zin.infolist():
                    # `is_dir` is added in Python3.6; `endswith` also copes
                    # with an empty member name, unlike indexing with [-1].
                    if item.filename.endswith('/'):
                        continue  # don't keep empty folders

                    # `extract` sanitises the member name, so use the path
                    # it actually wrote instead of rebuilding it by hand.
                    full_path = zin.extract(member=item, path=temp_folder)

                    if self._specific_cleanup(full_path) is False:
                        logging.warning("Something went wrong during deep cleaning of %s",
                                        item.filename)
                        abort = True
                        continue

                    if item.filename in self.files_to_keep:
                        # those files aren't supported, but we want to add them anyway
                        pass
                    elif any(r.search(item.filename) for r in self.files_to_omit):
                        continue
                    else:
                        # supported files that we want to clean then add
                        tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
                        if not tmp_parser:
                            if self.unknown_member_policy == UnknownMemberPolicy.OMIT:
                                logging.warning("In file %s, omitting unknown element %s (format: %s)",
                                                self.filename, item.filename, mtype)
                                continue
                            elif self.unknown_member_policy == UnknownMemberPolicy.KEEP:
                                logging.warning("In file %s, keeping unknown element %s (format: %s)",
                                                self.filename, item.filename, mtype)
                            else:
                                logging.error("In file %s, element %s's format (%s) "
                                              "isn't supported",
                                              self.filename, item.filename, mtype)
                                abort = True
                                continue
                        if tmp_parser:
                            # A failed clean may leave no output file to
                            # rename, so treat it as an abort instead.
                            if tmp_parser.remove_all() is False:
                                logging.warning("In file %s, something went wrong "
                                                "with the cleaning of %s (format: %s)",
                                                self.filename, item.filename, mtype)
                                abort = True
                                continue
                            os.rename(tmp_parser.output_filename, full_path)

                    # A fresh ZipInfo carries only the member name, so
                    # nothing from the original entry's metadata leaks.
                    zinfo = zipfile.ZipInfo(item.filename)  # type: ignore
                    clean_zinfo = self._clean_zipinfo(zinfo)
                    with open(full_path, 'rb') as f:
                        zout.writestr(clean_zinfo, f.read())
        finally:
            # Extracted members may hold the very metadata being removed:
            # never leave them behind, even when something raised.
            shutil.rmtree(temp_folder)

        if abort:
            os.remove(self.output_filename)
            return False
        return True
126
127
diff --git a/libmat2/office.py b/libmat2/office.py
index 60c5478..50b776e 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,15 +1,11 @@
1import os 1import os
2import re 2import re
3import shutil
4import tempfile
5import datetime
6import zipfile 3import zipfile
7import logging
8from typing import Dict, Set, Pattern 4from typing import Dict, Set, Pattern
9 5
10import xml.etree.ElementTree as ET # type: ignore 6import xml.etree.ElementTree as ET # type: ignore
11 7
12from . import abstract, parser_factory, UnknownMemberPolicy 8from .archive import ArchiveBasedAbstractParser
13 9
14# Make pyflakes happy 10# Make pyflakes happy
15assert Set 11assert Set
@@ -26,118 +22,6 @@ def _parse_xml(full_path: str):
26 return ET.parse(full_path), namespace_map 22 return ET.parse(full_path), namespace_map
27 23
28 24
class ArchiveBasedAbstractParser(abstract.AbstractParser):
    """ Office files (.docx, .odt, …) are zipped files.

    Base class for parsers of zip-based formats. `remove_all` rebuilds the
    archive member by member: each member is extracted to a temporary
    folder, optionally post-processed by `_specific_cleanup`, cleaned by
    whatever parser `parser_factory` finds for it, and written to the output
    archive with neutral zip metadata.
    """
    # Those are the files that have a format that _isn't_
    # supported by MAT2, but that we want to keep anyway.
    files_to_keep = set()  # type: Set[str]

    # Those are the files that we _do not_ want to keep,
    # no matter if they are supported or not.
    files_to_omit = set()  # type: Set[Pattern]

    # what should the parser do if it encounters an unknown file in
    # the archive?
    unknown_member_policy = UnknownMemberPolicy.ABORT  # type: UnknownMemberPolicy

    def __init__(self, filename):
        """ Check early that `filename` is a readable zip archive.

        Raises:
            ValueError: if the file is not a valid zip archive.
        """
        super().__init__(filename)
        try:  # better fail here than later
            # Only the validation done on open is wanted; the context
            # manager closes the handle right away instead of leaking it.
            with zipfile.ZipFile(self.filename):
                pass
        except zipfile.BadZipFile as e:
            raise ValueError("%s is not a valid zip archive" % self.filename) from e

    def _specific_cleanup(self, full_path: str) -> bool:
        """ This method can be used to apply specific treatment
        to files present in the archive.

        `full_path` is the location of the extracted member on disk.
        Returning `False` makes `remove_all` abort; this default
        implementation does nothing and returns `True`.
        """
        # pylint: disable=unused-argument,no-self-use
        return True  # pragma: no cover

    @staticmethod
    def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
        """ Overwrite the identifying fields of `zipinfo` with neutral
        values, in place, and return the same object. """
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # this is as early as a zipfile can be
        return zipinfo

    @staticmethod
    def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
        """ Return the metadata carried by `zipinfo` itself.

        Only values that differ from what `_clean_zipinfo` writes are
        reported, under the keys `create_system`, `comment` and `date_time`.
        """
        metadata = {}
        if zipinfo.create_system == 3:  # this is Linux
            pass
        elif zipinfo.create_system == 2:
            # NOTE(review): the ZIP specification lists host system 0 as
            # MS-DOS/FAT and 2 as OpenVMS -- confirm which value should be
            # reported as 'Windows'. Behaviour is left unchanged here.
            metadata['create_system'] = 'Windows'
        else:
            metadata['create_system'] = 'Weird'

        if zipinfo.comment:
            # NOTE(review): this stores the raw bytes comment, hence the
            # `type: ignore` against the Dict[str, str] return annotation.
            metadata['comment'] = zipinfo.comment  # type: ignore

        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
            metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))

        return metadata

    def remove_all(self) -> bool:
        """ Write a cleaned copy of the archive to `self.output_filename`.

        Returns:
            `True` on success. `False` if a member could not be cleaned or
            has an unsupported format under the `ABORT` policy; in that
            case the partially written output file is removed.
        """
        # pylint: disable=too-many-branches
        abort = False
        temp_folder = tempfile.mkdtemp()

        try:
            with zipfile.ZipFile(self.filename) as zin,\
                 zipfile.ZipFile(self.output_filename, 'w') as zout:

                for item in zin.infolist():
                    # `is_dir` is added in Python3.6; `endswith` also copes
                    # with an empty member name, unlike indexing with [-1].
                    if item.filename.endswith('/'):
                        continue  # don't keep empty folders

                    # `extract` sanitises the member name, so use the path
                    # it actually wrote instead of rebuilding it by hand.
                    full_path = zin.extract(member=item, path=temp_folder)

                    if self._specific_cleanup(full_path) is False:
                        logging.warning("Something went wrong during deep cleaning of %s",
                                        item.filename)
                        abort = True
                        continue

                    if item.filename in self.files_to_keep:
                        # those files aren't supported, but we want to add them anyway
                        pass
                    elif any(r.search(item.filename) for r in self.files_to_omit):
                        continue
                    else:
                        # supported files that we want to clean then add
                        tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
                        if not tmp_parser:
                            if self.unknown_member_policy == UnknownMemberPolicy.OMIT:
                                logging.warning("In file %s, omitting unknown element %s (format: %s)",
                                                self.filename, item.filename, mtype)
                                continue
                            elif self.unknown_member_policy == UnknownMemberPolicy.KEEP:
                                logging.warning("In file %s, keeping unknown element %s (format: %s)",
                                                self.filename, item.filename, mtype)
                            else:
                                logging.error("In file %s, element %s's format (%s) "
                                              "isn't supported",
                                              self.filename, item.filename, mtype)
                                abort = True
                                continue
                        if tmp_parser:
                            # A failed clean may leave no output file to
                            # rename, so treat it as an abort instead.
                            if tmp_parser.remove_all() is False:
                                logging.warning("In file %s, something went wrong "
                                                "with the cleaning of %s (format: %s)",
                                                self.filename, item.filename, mtype)
                                abort = True
                                continue
                            os.rename(tmp_parser.output_filename, full_path)

                    # A fresh ZipInfo carries only the member name, so
                    # nothing from the original entry's metadata leaks.
                    zinfo = zipfile.ZipInfo(item.filename)  # type: ignore
                    clean_zinfo = self._clean_zipinfo(zinfo)
                    with open(full_path, 'rb') as f:
                        zout.writestr(clean_zinfo, f.read())
        finally:
            # Extracted members may hold the very metadata being removed:
            # never leave them behind, even when something raised.
            shutil.rmtree(temp_folder)

        if abort:
            os.remove(self.output_filename)
            return False
        return True
139
140
141class MSOfficeParser(ArchiveBasedAbstractParser): 25class MSOfficeParser(ArchiveBasedAbstractParser):
142 mimetypes = { 26 mimetypes = {
143 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 27 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',