diff options
Diffstat (limited to 'libmat2')
| -rw-r--r-- | libmat2/office.py | 16 |
1 file changed, 12 insertions, 4 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 914fd39..6ab7e80 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -78,8 +78,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 78 | for item in zipin.infolist(): | 78 | for item in zipin.infolist(): |
| 79 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): | 79 | if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): |
| 80 | content = zipin.read(item).decode('utf-8') | 80 | content = zipin.read(item).decode('utf-8') |
| 81 | for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I): | 81 | try: |
| 82 | metadata[key] = value | 82 | results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) |
| 83 | for (key, value) in results: | ||
| 84 | metadata[key] = value | ||
| 85 | except TypeError: # We didn't manage to parse the xml file | ||
| 86 | pass | ||
| 83 | if not metadata: # better safe than sorry | 87 | if not metadata: # better safe than sorry |
| 84 | metadata[item] = 'harmful content' | 88 | metadata[item] = 'harmful content' |
| 85 | 89 | ||
| @@ -140,8 +144,12 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): | |||
| 140 | for item in zipin.infolist(): | 144 | for item in zipin.infolist(): |
| 141 | if item.filename == 'meta.xml': | 145 | if item.filename == 'meta.xml': |
| 142 | content = zipin.read(item).decode('utf-8') | 146 | content = zipin.read(item).decode('utf-8') |
| 143 | for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I): | 147 | try: |
| 144 | metadata[key] = value | 148 | results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) |
| 149 | for (key, value) in results: | ||
| 150 | metadata[key] = value | ||
| 151 | except TypeError: # We didn't manage to parse the xml file | ||
| 152 | pass | ||
| 145 | if not metadata: # better safe than sorry | 153 | if not metadata: # better safe than sorry |
| 146 | metadata[item] = 'harmful content' | 154 | metadata[item] = 'harmful content' |
| 147 | for key, value in self._get_zipinfo_meta(item).items(): | 155 | for key, value in self._get_zipinfo_meta(item).items(): |
