Refactor the office get_meta handling a bit

This should make it easier to get more metadata from archive-based file formats.
author: jvoisin 2019-02-03 22:55:15 +0100
committer: jvoisin 2019-02-04 00:31:26 +0100
commit: b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (patch)
tree: a50622baf990acface31398adaef395bb398ed5d /libmat2/office.py
parent: 54e50450ad9f8657ed7c60d5c0f9ab5c648d08ee (diff)
1 files changed, 12 insertions, 17 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 365c230..dfad3b3 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
 import os
 import re
 import zipfile
-from typing import Dict, Set, Pattern, Tuple, Union
+from typing import Dict, Set, Pattern, Tuple, Union, Any
 import xml.etree.ElementTree as ET  # type: ignore
@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        return True
-    def get_meta(self) -> Dict[str, Union[str, dict]]:
+    def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
-        metadata = super().get_meta()
+        if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
-        zipin = zipfile.ZipFile(self.filename)
+            return {}
-        for item in zipin.infolist():
-            if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
+        with open(full_path, encoding='utf-8') as f:
-                try:
+            try:
-                    content = zipin.read(item).decode('utf-8')
+                results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
-                    results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
+                return {k:v for (k, v) in results}
-                    for (key, value) in results:
+            except (TypeError, UnicodeDecodeError):
-                        metadata[key] = value
+                # We didn't manage to parse the xml file
-                except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
+                return {file_path: 'harmful content', }
-                    metadata[item.filename] = 'harmful content'
-            for key, value in self._get_zipinfo_meta(item).items():
-                metadata[key] = value
-        zipin.close()
-        return metadata
 class LibreOfficeParser(ArchiveBasedAbstractParser):
author	jvoisin	2019-02-03 22:55:15 +0100
committer	jvoisin	2019-02-04 00:31:26 +0100
commit	b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (patch)
tree	a50622baf990acface31398adaef395bb398ed5d /libmat2/office.py
parent	54e50450ad9f8657ed7c60d5c0f9ab5c648d08ee (diff)

diff --git a/libmat2/office.py b/libmat2/office.py
index 365c230..dfad3b3 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2	import os	2	import os
3	import re	3	import re
4	import zipfile	4	import zipfile
5	from typing import Dict, Set, Pattern, Tuple, Union	5	from typing import Dict, Set, Pattern, Tuple, Union, Any
6		6
7	import xml.etree.ElementTree as ET # type: ignore	7	import xml.etree.ElementTree as ET # type: ignore
8		8
@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
295		295
296	return True	296	return True
297		297
298	def get_meta(self) -> Dict[str, Union[str, dict]]:	298	def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
299	"""	299	"""
300	Yes, I know that parsing xml with regexp ain't pretty,	300	Yes, I know that parsing xml with regexp ain't pretty,
301	be my guest and fix it if you want.	301	be my guest and fix it if you want.
302	"""	302	"""
303	metadata = super().get_meta()	303	if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
304	zipin = zipfile.ZipFile(self.filename)	304	return {}
305	for item in zipin.infolist():	305
306	if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):	306	with open(full_path, encoding='utf-8') as f:
307	try:	307	try:
308	content = zipin.read(item).decode('utf-8')	308	results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
309	results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)	309	return {k:v for (k, v) in results}
310	for (key, value) in results:	310	except (TypeError, UnicodeDecodeError):
311	metadata[key] = value	311	# We didn't manage to parse the xml file
312	except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file	312	return {file_path: 'harmful content', }
313	metadata[item.filename] = 'harmful content'
314	for key, value in self._get_zipinfo_meta(item).items():
315	metadata[key] = value
316	zipin.close()
317	return metadata
318		313
319		314
320	class LibreOfficeParser(ArchiveBasedAbstractParser):	315	class LibreOfficeParser(ArchiveBasedAbstractParser):