summaryrefslogtreecommitdiff
path: root/src/libreoffice.py
diff options
context:
space:
mode:
authorjvoisin2018-04-01 01:04:06 +0200
committerjvoisin2018-04-01 01:04:06 +0200
commiteac51dbc9964cac28bb83e7d12370cf87ff2b0c5 (patch)
tree6fba0d8323f3d27db72a68e96c656e51634ed164 /src/libreoffice.py
parent2d7c703c52cae50034fc9618c72552365f7cc741 (diff)
Refactor office document handling
Diffstat (limited to 'src/libreoffice.py')
-rw-r--r--src/libreoffice.py68
1 files changed, 0 insertions, 68 deletions
diff --git a/src/libreoffice.py b/src/libreoffice.py
deleted file mode 100644
index 809ae3c..0000000
--- a/src/libreoffice.py
+++ /dev/null
@@ -1,68 +0,0 @@
1import re
2import subprocess
3import json
4import zipfile
5import tempfile
6import shutil
7import os
8
9from . import abstract, parser_factory
10
class LibreOfficeParser(abstract.AbstractParser):
    """Parser for OpenDocument files, handled as zip archives.

    Metadata is read from the archive's `meta.xml` member. Cleaning writes a
    new archive to `self.output_filename` that omits `meta.xml` and in which
    every remaining member has been cleaned by its own parser.
    """

    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart',
    }

    def get_meta(self):
        """Return the metadata found in the archive's `meta.xml`.

        The XML is scanned with a regular expression rather than a real XML
        parser: every `<meta...>` / `<dc...>` element whose opening and
        closing tags sit on the same line yields one `tag -> text` entry.

        Returns:
            A dict mapping tag names to their text. If `meta.xml` exists but
            nothing could be extracted from it, the dict is
            `{'meta.xml': 'harmful content'}` so the file is still reported.
            If the archive has no `meta.xml`, the dict is empty.
        """
        metadata = {}
        # The context manager closes the archive even if reading/decoding fails.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.namelist():
                if item != 'meta.xml':
                    continue
                content = zipin.read(item).decode('utf-8')
                for (key, value) in re.findall(r"<((?:meta|dc).+?)>(.+)</\1>", content, re.I):
                    metadata[key] = value
                if not metadata:  # better safe than sorry
                    metadata[item] = 'harmful content'
        return metadata

    def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
        """Normalise the per-member zip header fields of `zipinfo`.

        The object is modified in place and also returned, so that the
        written archive carries fixed values instead of details about the
        system or moment that produced it.
        """
        zipinfo.compress_type = zipfile.ZIP_DEFLATED
        zipinfo.create_system = 3  # 3 is the zip "Unix" host-system code
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # earliest date zip can store
        return zipinfo

    def remove_all(self):
        """Write a cleaned copy of the archive to `self.output_filename`.

        Directory entries and `meta.xml` are dropped. Every other member is
        extracted to a temporary folder and cleaned by the parser that
        `parser_factory` returns for it, then stored under its original name
        with normalised zip header fields. Members for which no parser is
        available are reported on stdout and left out of the output.

        Returns:
            True once the output archive has been written.
        """
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout:
            temp_folder = tempfile.mkdtemp()
            try:
                for item in zin.infolist():
                    # `is_dir` is added in Python3.6; `endswith` also copes
                    # with an empty member name, unlike indexing `[-1]`.
                    if item.filename.endswith('/'):
                        continue
                    if item.filename == 'meta.xml':
                        continue  # don't keep metadata files

                    # `extract` sanitises member names, so use the path it
                    # actually wrote instead of re-joining the raw name.
                    extracted_path = zin.extract(member=item, path=temp_folder)
                    tmp_parser = parser_factory.get_parser(extracted_path)
                    if tmp_parser is None:
                        print("%s isn't supported" % item.filename)
                        continue
                    tmp_parser.remove_all()

                    # Start from a fresh ZipInfo so nothing from the source
                    # header is carried over, and normalise *that* object:
                    # it is the one handed to `writestr`.
                    # NOTE(review): this deflates every member; the ODF
                    # package format wants `mimetype` stored uncompressed --
                    # confirm whether that member ever gets this far.
                    zinfo = self.__clean_zipinfo(zipfile.ZipInfo(item.filename))
                    with open(tmp_parser.output_filename, 'rb') as f:
                        zout.writestr(zinfo, f.read())
            finally:
                # Always remove the extracted copies, even after a failure.
                shutil.rmtree(temp_folder)
        return True