# Recovered from the deletion diff of src/libreoffice.py
# (commit eac51dbc9964cac28bb83e7d12370cf87ff2b0c5, "Refactor office document
# handling", jvoisin, 2018-04-01).

import re
import subprocess
import json
import zipfile
import tempfile
import shutil
import os

from . import abstract, parser_factory

# Matches `<meta:xxx>value</meta:xxx>` and `<dc:xxx>value</dc:xxx>` elements.
# The value group is non-greedy and anchored on the matching closing tag so
# that it stops at the end of its own element instead of swallowing the rest
# of the line (meta.xml is commonly written on a single line).
_META_ELEMENT = re.compile(r"<((?:meta|dc).+?)>(.+?)</\1>", re.I)


class LibreOfficeParser(abstract.AbstractParser):
    """Metadata reader/cleaner for OpenDocument archives.

    The input path is read from `self.filename` and the cleaned copy is
    written to `self.output_filename`.
    NOTE(review): both attributes are presumably set by
    `abstract.AbstractParser` -- confirm against the base class.
    """

    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart'
    }

    def get_meta(self):
        """Return the metadata found in the archive's `meta.xml`.

        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.

        Returns:
            A dict mapping each matched `meta:`/`dc:` tag name to its text
            content. If `meta.xml` exists but nothing could be extracted
            from it, the dict is `{'meta.xml': 'harmful content'}`. If the
            archive has no `meta.xml`, the dict is empty.
        """
        metadata = {}
        # The context manager closes the archive even if reading or
        # decoding raises.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.namelist():
                if item != 'meta.xml':
                    continue
                content = zipin.read(item).decode('utf-8')
                for key, value in _META_ELEMENT.findall(content):
                    metadata[key] = value
                if not metadata:  # better safe than sorry
                    metadata[item] = 'harmful content'
        return metadata

    def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
        """Normalise the per-member zip header fields of `zipinfo`.

        The object is modified in place and also returned.
        """
        zipinfo.compress_type = zipfile.ZIP_DEFLATED
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
        return zipinfo

    def remove_all(self):
        """Write a cleaned copy of the archive to `self.output_filename`.

        Directory entries and `meta.xml` are dropped. Every other member is
        extracted to a temporary directory, cleaned by the parser that
        `parser_factory.get_parser` returns for it, and the cleaned file is
        stored in the output archive under the member's original name with
        normalised zip headers. Members for which no parser is available
        are reported on stdout and left out of the output.

        Returns:
            True once the output archive has been written.
        """
        # Context managers guarantee that both archives are closed and the
        # temporary directory is removed even when a member fails to be
        # processed. They unwind in the order tempdir, output, input.
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout, \
                tempfile.TemporaryDirectory() as temp_folder:
            for item in zin.infolist():
                if item.filename.endswith('/'):
                    continue  # `is_dir` is added in Python3.6
                if item.filename == 'meta.xml':
                    continue  # don't keep metadata files

                # `extract` sanitises member names, so use the path it
                # reports rather than re-joining the raw name ourselves.
                extracted = zin.extract(member=item, path=temp_folder)
                tmp_parser = parser_factory.get_parser(extracted)
                if tmp_parser is None:
                    print("%s isn't supported" % item.filename)
                    continue
                tmp_parser.remove_all()

                # Clean the header that is actually written; a fresh
                # ZipInfo carries none of the source member's metadata.
                zinfo = self.__clean_zipinfo(zipfile.ZipInfo(item.filename))
                with open(tmp_parser.output_filename, 'rb') as f:
                    zout.writestr(zinfo, f.read())
        return True