From b9a62d798af14ea799ae5fceab1ed7a537d1cbdd Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 3 Feb 2019 22:55:15 +0100 Subject: Refactor a bit office get_meta handling This should make it easier to get more metadata from archive-based file formats. --- libmat2/office.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'libmat2/office.py') diff --git a/libmat2/office.py b/libmat2/office.py index 365c230..dfad3b3 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -2,7 +2,7 @@ import logging import os import re import zipfile -from typing import Dict, Set, Pattern, Tuple, Union +from typing import Dict, Set, Pattern, Tuple, Union, Any import xml.etree.ElementTree as ET # type: ignore @@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser): return True - def get_meta(self) -> Dict[str, Union[str, dict]]: + def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: """ Yes, I know that parsing xml with regexp ain't pretty, be my guest and fix it if you want. 
""" - metadata = super().get_meta() - zipin = zipfile.ZipFile(self.filename) - for item in zipin.infolist(): - if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): - try: - content = zipin.read(item).decode('utf-8') - results = re.findall(r"<(.+)>(.+)", content, re.I|re.M) - for (key, value) in results: - metadata[key] = value - except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file - metadata[item.filename] = 'harmful content' - for key, value in self._get_zipinfo_meta(item).items(): - metadata[key] = value - zipin.close() - return metadata + if not file_path.startswith('docProps/') or not file_path.endswith('.xml'): + return {} + + with open(full_path, encoding='utf-8') as f: + try: + results = re.findall(r"<(.+)>(.+)", f.read(), re.I|re.M) + return {k:v for (k, v) in results} + except (TypeError, UnicodeDecodeError): + # We didn't manage to parse the xml file + return {file_path: 'harmful content', } class LibreOfficeParser(ArchiveBasedAbstractParser): -- cgit v1.3