From e1dd439fc86ba15816e2331e8bed67dd7147e368 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 7 Feb 2019 21:58:10 +0100 Subject: Use of the archive refactoring for the office documents too --- libmat2/office.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'libmat2/office.py') diff --git a/libmat2/office.py b/libmat2/office.py index dfad3b3..0c9caa8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -2,7 +2,7 @@ import logging import os import re import zipfile -from typing import Dict, Set, Pattern, Tuple, Union, Any +from typing import Dict, Set, Pattern, Tuple, Any import xml.etree.ElementTree as ET # type: ignore @@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): return False return True - def get_meta(self) -> Dict[str, Union[str, dict]]: + def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: """ Yes, I know that parsing xml with regexp ain't pretty, be my guest and fix it if you want. """ - metadata = {} - zipin = zipfile.ZipFile(self.filename) - for item in zipin.infolist(): - if item.filename == 'meta.xml': - try: - content = zipin.read(item).decode('utf-8') - results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)", content, re.I|re.M) - for (key, value) in results: - metadata[key] = value - except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file - metadata[item.filename] = 'harmful content' - for key, value in self._get_zipinfo_meta(item).items(): - metadata[key] = value - zipin.close() - return metadata + if file_path != 'meta.xml': + return {} + with open(full_path, encoding='utf-8') as f: + try: + results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)", f.read(), re.I|re.M) + return {k:v for (k, v) in results} + except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file + # We didn't manage to parse the xml file + return {file_path: 'harmful content', } -- cgit v1.3