Use of the archive refactoring for the office documents too

author: jvoisin 2019-02-07 21:58:10 +0100
committer: jvoisin 2019-02-07 22:19:37 +0100
commit: e1dd439fc86ba15816e2331e8bed67dd7147e368 (patch)
tree: 0c8e368fcb9c409fa2182018b166ec4f18cdd98c /libmat2/office.py
parent: b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (diff)
1 files changed, 11 insertions, 17 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index dfad3b3..0c9caa8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
 import os
 import re
 import zipfile
-from typing import Dict, Set, Pattern, Tuple, Union, Any
+from typing import Dict, Set, Pattern, Tuple, Any
 import xml.etree.ElementTree as ET  # type: ignore
@@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
                return False
        return True
-    def get_meta(self) -> Dict[str, Union[str, dict]]:
+    def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
-        metadata = {}
+        if file_path != 'meta.xml':
-        zipin = zipfile.ZipFile(self.filename)
+            return {}
-        for item in zipin.infolist():
+        with open(full_path, encoding='utf-8') as f:
-            if item.filename == 'meta.xml':
+            try:
-                try:
+                results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", f.read(), re.I|re.M)
-                    content = zipin.read(item).decode('utf-8')
+                return {k:v for (k, v) in results}
-                    results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
+            except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
-                    for (key, value) in results:
+                # We didn't manage to parse the xml file
-                        metadata[key] = value
+                return {file_path: 'harmful content', }
-                except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
-                    metadata[item.filename] = 'harmful content'
-            for key, value in self._get_zipinfo_meta(item).items():
-                metadata[key] = value
-        zipin.close()
-        return metadata
author	jvoisin	2019-02-07 21:58:10 +0100
committer	jvoisin	2019-02-07 22:19:37 +0100
commit	e1dd439fc86ba15816e2331e8bed67dd7147e368 (patch)
tree	0c8e368fcb9c409fa2182018b166ec4f18cdd98c /libmat2/office.py
parent	b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (diff)

diff --git a/libmat2/office.py b/libmat2/office.py
index dfad3b3..0c9caa8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2	import os	2	import os
3	import re	3	import re
4	import zipfile	4	import zipfile
5	from typing import Dict, Set, Pattern, Tuple, Union, Any	5	from typing import Dict, Set, Pattern, Tuple, Any
6		6
7	import xml.etree.ElementTree as ET # type: ignore	7	import xml.etree.ElementTree as ET # type: ignore
8		8
@@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
375	return False	375	return False
376	return True	376	return True
377		377
378	def get_meta(self) -> Dict[str, Union[str, dict]]:	378	def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
379	"""	379	"""
380	Yes, I know that parsing xml with regexp ain't pretty,	380	Yes, I know that parsing xml with regexp ain't pretty,
381	be my guest and fix it if you want.	381	be my guest and fix it if you want.
382	"""	382	"""
383	metadata = {}	383	if file_path != 'meta.xml':
384	zipin = zipfile.ZipFile(self.filename)	384	return {}
385	for item in zipin.infolist():	385	with open(full_path, encoding='utf-8') as f:
386	if item.filename == 'meta.xml':	386	try:
387	try:	387	results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", f.read(), re.I|re.M)
388	content = zipin.read(item).decode('utf-8')	388	return {k:v for (k, v) in results}
389	results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)	389	except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
390	for (key, value) in results:	390	# We didn't manage to parse the xml file
391	metadata[key] = value	391	return {file_path: 'harmful content', }
392	except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
393	metadata[item.filename] = 'harmful content'
394	for key, value in self._get_zipinfo_meta(item).items():
395	metadata[key] = value
396	zipin.close()
397	return metadata