summary | refs | log | tree | commit | diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author: jvoisin, 2018-09-06 11:32:45 +0200
committer: jvoisin, 2018-09-06 11:34:14 +0200
commit: e3d817f57e77676c30fbfa05ed08deee7918b238 (patch)
tree: 419e1be09305b7f164b6d5354b6c39aa2ba4008e /libmat2/office.py
parent: 2e9adab86aeeb9b2d9dfbd65b7bf3fc0010364bd (diff)
Split office and archives
Diffstat (limited to 'libmat2/office.py')
-rw-r--r-- libmat2/office.py 118
1 files changed, 1 insertions, 117 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 60c5478..50b776e 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,15 +1,11 @@
1import os 1import os
2import re 2import re
3import shutil
4import tempfile
5import datetime
6import zipfile 3import zipfile
7import logging
8from typing import Dict, Set, Pattern 4from typing import Dict, Set, Pattern
9 5
10import xml.etree.ElementTree as ET # type: ignore 6import xml.etree.ElementTree as ET # type: ignore
11 7
12from . import abstract, parser_factory, UnknownMemberPolicy 8from .archive import ArchiveBasedAbstractParser
13 9
14# Make pyflakes happy 10# Make pyflakes happy
15assert Set 11assert Set
@@ -26,118 +22,6 @@ def _parse_xml(full_path: str):
26 return ET.parse(full_path), namespace_map 22 return ET.parse(full_path), namespace_map
27 23
28 24
29class ArchiveBasedAbstractParser(abstract.AbstractParser):
30 """ Office files (.docx, .odt, …) are zipped files. """
31 # Those are the files that have a format that _isn't_
32 # supported by MAT2, but that we want to keep anyway.
33 files_to_keep = set() # type: Set[str]
34
35 # Those are the files that we _do not_ want to keep,
36 # no matter if they are supported or not.
37 files_to_omit = set() # type: Set[Pattern]
38
39 # what should the parser do if it encounters an unknown file in
40 # the archive?
41 unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
42
43 def __init__(self, filename):
44 super().__init__(filename)
45 try: # better fail here than later
46 zipfile.ZipFile(self.filename)
47 except zipfile.BadZipFile:
48 raise ValueError
49
50 def _specific_cleanup(self, full_path: str) -> bool:
51 """ This method can be used to apply specific treatment
52 to files present in the archive."""
53 # pylint: disable=unused-argument,no-self-use
54 return True # pragma: no cover
55
56 @staticmethod
57 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
58 zipinfo.create_system = 3 # Linux
59 zipinfo.comment = b''
60 zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
61 return zipinfo
62
63 @staticmethod
64 def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
65 metadata = {}
66 if zipinfo.create_system == 3: # this is Linux
67 pass
68 elif zipinfo.create_system == 2:
69 metadata['create_system'] = 'Windows'
70 else:
71 metadata['create_system'] = 'Weird'
72
73 if zipinfo.comment:
74 metadata['comment'] = zipinfo.comment # type: ignore
75
76 if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
77 metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
78
79 return metadata
80
81 def remove_all(self) -> bool:
82 # pylint: disable=too-many-branches
83
84 with zipfile.ZipFile(self.filename) as zin,\
85 zipfile.ZipFile(self.output_filename, 'w') as zout:
86
87 temp_folder = tempfile.mkdtemp()
88 abort = False
89
90 for item in zin.infolist():
91 if item.filename[-1] == '/': # `is_dir` is added in Python3.6
92 continue # don't keep empty folders
93
94 zin.extract(member=item, path=temp_folder)
95 full_path = os.path.join(temp_folder, item.filename)
96
97 if self._specific_cleanup(full_path) is False:
98 logging.warning("Something went wrong during deep cleaning of %s",
99 item.filename)
100 abort = True
101 continue
102
103 if item.filename in self.files_to_keep:
104 # those files aren't supported, but we want to add them anyway
105 pass
106 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
107 continue
108 else:
109 # supported files that we want to clean then add
110 tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
111 if not tmp_parser:
112 if self.unknown_member_policy == UnknownMemberPolicy.OMIT:
113 logging.warning("In file %s, omitting unknown element %s (format: %s)",
114 self.filename, item.filename, mtype)
115 continue
116 elif self.unknown_member_policy == UnknownMemberPolicy.KEEP:
117 logging.warning("In file %s, keeping unknown element %s (format: %s)",
118 self.filename, item.filename, mtype)
119 else:
120 logging.error("In file %s, element %s's format (%s) " +
121 "isn't supported",
122 self.filename, item.filename, mtype)
123 abort = True
124 continue
125 if tmp_parser:
126 tmp_parser.remove_all()
127 os.rename(tmp_parser.output_filename, full_path)
128
129 zinfo = zipfile.ZipInfo(item.filename) # type: ignore
130 clean_zinfo = self._clean_zipinfo(zinfo)
131 with open(full_path, 'rb') as f:
132 zout.writestr(clean_zinfo, f.read())
133
134 shutil.rmtree(temp_folder)
135 if abort:
136 os.remove(self.output_filename)
137 return False
138 return True
139
140
141class MSOfficeParser(ArchiveBasedAbstractParser): 25class MSOfficeParser(ArchiveBasedAbstractParser):
142 mimetypes = { 26 mimetypes = {
143 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 27 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',