summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: jvoisin, 2018-04-01 01:04:06 +0200
committer: jvoisin, 2018-04-01 01:04:06 +0200
commit: eac51dbc9964cac28bb83e7d12370cf87ff2b0c5 (patch)
tree: 6fba0d8323f3d27db72a68e96c656e51634ed164 /src
parent: 2d7c703c52cae50034fc9618c72552365f7cc741 (diff)
Refactor office document handling
Diffstat (limited to 'src')
-rw-r--r-- src/abstract.py 4
-rw-r--r-- src/libreoffice.py 68
-rw-r--r-- src/office.py 95
3 files changed, 75 insertions, 92 deletions
diff --git a/src/abstract.py b/src/abstract.py
index c2d282f..1f8ce6e 100644
--- a/src/abstract.py
+++ b/src/abstract.py
@@ -6,8 +6,8 @@ class AbstractParser(object):
6 self.filename = filename 6 self.filename = filename
7 self.output_filename = filename + '.cleaned' 7 self.output_filename = filename + '.cleaned'
8 8
9 def get_meta(self): 9 def get_meta(self) -> dict:
10 raise NotImplementedError 10 raise NotImplementedError
11 11
12 def remove_all(self): 12 def remove_all(self) -> bool:
13 raise NotImplementedError 13 raise NotImplementedError
diff --git a/src/libreoffice.py b/src/libreoffice.py
deleted file mode 100644
index 809ae3c..0000000
--- a/src/libreoffice.py
+++ /dev/null
@@ -1,68 +0,0 @@
1import re
2import subprocess
3import json
4import zipfile
5import tempfile
6import shutil
7import os
8
9from . import abstract, parser_factory
10
11class LibreOfficeParser(abstract.AbstractParser):
12 mimetypes = {
13 'application/vnd.oasis.opendocument.text',
14 'application/vnd.oasis.opendocument.spreadsheet',
15 'application/vnd.oasis.opendocument.presentation',
16 'application/vnd.oasis.opendocument.graphics',
17 'application/vnd.oasis.opendocument.chart'
18 }
19
20 def get_meta(self):
21 """
22 Yes, I know that parsing xml with regexp ain't pretty,
23 be my guest and fix it if you want.
24 """
25 metadata = {}
26 zipin = zipfile.ZipFile(self.filename)
27 for item in zipin.namelist():
28 if item == 'meta.xml':
29 content = zipin.read(item).decode('utf-8')
30 for (key, value) in re.findall(r"<((?:meta|dc).+?)>(.+)</\1>", content, re.I):
31 metadata[key] = value
32 if not metadata: # better safe than sorry
33 metadata[item] = 'harmful content'
34 zipin.close()
35 return metadata
36
37 def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
38 zipinfo.compress_type = zipfile.ZIP_DEFLATED
39 zipinfo.create_system = 3 # Linux
40 zipinfo.comment = b''
41 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
42 return zipinfo
43
44 def remove_all(self):
45 zin = zipfile.ZipFile(self.filename, 'r')
46 zout = zipfile.ZipFile(self.output_filename, 'w')
47 temp_folder = tempfile.mkdtemp()
48
49 for item in zin.infolist():
50 if item.filename[-1] == '/':
51 continue # `is_dir` is added in Python3.6
52 elif item.filename == 'meta.xml':
53 continue # don't keep metadata files
54
55 zin.extract(member=item, path=temp_folder)
56 tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
57 if tmp_parser is None:
58 print("%s isn't supported" % item.filename)
59 continue
60 tmp_parser.remove_all()
61 zinfo = zipfile.ZipInfo(item.filename)
62 item = self.__clean_zipinfo(item)
63 with open(tmp_parser.output_filename, 'rb') as f:
64 zout.writestr(zinfo, f.read())
65 shutil.rmtree(temp_folder)
66 zout.close()
67 zin.close()
68 return True
diff --git a/src/office.py b/src/office.py
index a729f2f..5083308 100644
--- a/src/office.py
+++ b/src/office.py
@@ -1,14 +1,34 @@
1import json
2import os
1import re 3import re
4import shutil
2import subprocess 5import subprocess
3import json
4import zipfile
5import tempfile 6import tempfile
6import shutil 7import zipfile
7import os
8 8
9from . import abstract, parser_factory 9from . import abstract, parser_factory
10 10
11class OfficeParser(abstract.AbstractParser): 11class ArchiveBasedAbstractParser(abstract.AbstractParser):
12 def _clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
13 zipinfo.compress_type = zipfile.ZIP_DEFLATED
14 zipinfo.create_system = 3 # Linux
15 zipinfo.comment = b''
16 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
17 return zipinfo
18
19 def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile):
20 zin.extract(member=item, path=temp_folder)
21 tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
22 if tmp_parser is None:
23 print("%s isn't supported" % item.filename)
24 return
25 tmp_parser.remove_all()
26 zinfo = zipfile.ZipInfo(item.filename)
27 item = self._clean_zipinfo(item)
28 with open(tmp_parser.output_filename, 'rb') as f:
29 zout.writestr(zinfo, f.read())
30
31class MSOfficeParser(ArchiveBasedAbstractParser):
12 mimetypes = { 32 mimetypes = {
13 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 33 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
14 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 34 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
@@ -33,12 +53,6 @@ class OfficeParser(abstract.AbstractParser):
33 zipin.close() 53 zipin.close()
34 return metadata 54 return metadata
35 55
36 def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
37 zipinfo.compress_type = zipfile.ZIP_DEFLATED
38 zipinfo.create_system = 3 # Linux
39 zipinfo.comment = b''
40 zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
41 return zipinfo
42 56
43 def remove_all(self): 57 def remove_all(self):
44 zin = zipfile.ZipFile(self.filename, 'r') 58 zin = zipfile.ZipFile(self.filename, 'r')
@@ -52,20 +66,57 @@ class OfficeParser(abstract.AbstractParser):
52 if not item.filename.endswith('.rels'): 66 if not item.filename.endswith('.rels'):
53 continue # don't keep metadata files 67 continue # don't keep metadata files
54 if item.filename in self.files_to_keep: 68 if item.filename in self.files_to_keep:
55 item = self.__clean_zipinfo(item) 69 item = self._clean_zipinfo(item)
56 zout.writestr(item, zin.read(item)) 70 zout.writestr(item, zin.read(item))
57 continue 71 continue
58 72
59 zin.extract(member=item, path=temp_folder) 73 self._clean_internal_file(item, temp_folder, zin, zout)
60 tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) 74
61 if tmp_parser is None: 75 shutil.rmtree(temp_folder)
62 print("%s isn't supported" % item.filename) 76 zout.close()
63 continue 77 zin.close()
64 tmp_parser.remove_all() 78 return True
65 zinfo = zipfile.ZipInfo(item.filename) 79
66 item = self.__clean_zipinfo(item) 80
67 with open(tmp_parser.output_filename, 'rb') as f: 81
68 zout.writestr(zinfo, f.read()) 82class LibreOfficeParser(ArchiveBasedAbstractParser):
83 mimetypes = {
84 'application/vnd.oasis.opendocument.text',
85 'application/vnd.oasis.opendocument.spreadsheet',
86 'application/vnd.oasis.opendocument.presentation',
87 'application/vnd.oasis.opendocument.graphics',
88 'application/vnd.oasis.opendocument.chart'
89 }
90
91 def get_meta(self):
92 """
93 Yes, I know that parsing xml with regexp ain't pretty,
94 be my guest and fix it if you want.
95 """
96 metadata = {}
97 zipin = zipfile.ZipFile(self.filename)
98 for item in zipin.namelist():
99 if item == 'meta.xml':
100 content = zipin.read(item).decode('utf-8')
101 for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
102 metadata[key] = value
103 if not metadata: # better safe than sorry
104 metadata[item] = 'harmful content'
105 zipin.close()
106 return metadata
107
108 def remove_all(self):
109 zin = zipfile.ZipFile(self.filename, 'r')
110 zout = zipfile.ZipFile(self.output_filename, 'w')
111 temp_folder = tempfile.mkdtemp()
112
113 for item in zin.infolist():
114 if item.filename[-1] == '/':
115 continue # `is_dir` is added in Python3.6
116 elif item.filename == 'meta.xml':
117 continue # don't keep metadata files
118
119 self._clean_internal_file(item, temp_folder, zin, zout)
69 120
70 shutil.rmtree(temp_folder) 121 shutil.rmtree(temp_folder)
71 zout.close() 122 zout.close()