diff options
Diffstat (limited to 'libmat/office.py')
| -rw-r--r-- | libmat/office.py | 89 |
1 file changed, 1 insertion, 88 deletions
diff --git a/libmat/office.py b/libmat/office.py
index b23ec84..b4a05a7 100644
--- a/libmat/office.py
+++ b/libmat/office.py
| @@ -18,94 +18,7 @@ except ImportError: | |||
| 18 | logging.info('office.py loaded without PDF support') | 18 | logging.info('office.py loaded without PDF support') |
| 19 | 19 | ||
| 20 | from libmat import parser | 20 | from libmat import parser |
| 21 | from libmat import archive | 21 | #from libmat import archive |
| 22 | |||
| 23 | |||
| 24 | class OpenDocumentStripper(archive.TerminalZipStripper): | ||
| 25 | """ An open document file is a zip, with xml file into. | ||
| 26 | The one that interest us is meta.xml | ||
| 27 | """ | ||
| 28 | |||
| 29 | def get_meta(self): | ||
| 30 | """ Return a dict with all the meta of the file by | ||
| 31 | trying to read the meta.xml file. | ||
| 32 | """ | ||
| 33 | metadata = super(OpenDocumentStripper, self).get_meta() | ||
| 34 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 35 | try: | ||
| 36 | content = zipin.read('meta.xml') | ||
| 37 | dom1 = minidom.parseString(content) | ||
| 38 | elements = dom1.getElementsByTagName('office:meta') | ||
| 39 | for i in elements[0].childNodes: | ||
| 40 | if i.tagName != 'meta:document-statistic': | ||
| 41 | nodename = ''.join(i.nodeName.split(':')[1:]) | ||
| 42 | metadata[nodename] = ''.join([j.data for j in i.childNodes]) | ||
| 43 | except KeyError: # no meta.xml file found | ||
| 44 | logging.debug('%s has no opendocument metadata', self.filename) | ||
| 45 | zipin.close() | ||
| 46 | return metadata | ||
| 47 | |||
| 48 | def remove_all(self): | ||
| 49 | """ Removes metadata | ||
| 50 | """ | ||
| 51 | return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) | ||
| 52 | |||
| 53 | def is_clean(self): | ||
| 54 | """ Check if the file is clean from harmful metadatas | ||
| 55 | """ | ||
| 56 | clean_super = super(OpenDocumentStripper, self).is_clean() | ||
| 57 | if clean_super is False: | ||
| 58 | return False | ||
| 59 | |||
| 60 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 61 | try: | ||
| 62 | zipin.getinfo('meta.xml') | ||
| 63 | except KeyError: # no meta.xml in the file | ||
| 64 | return True | ||
| 65 | zipin.close() | ||
| 66 | return False | ||
| 67 | |||
| 68 | |||
| 69 | class OpenXmlStripper(archive.TerminalZipStripper): | ||
| 70 | """ Represent an office openxml document, which is like | ||
| 71 | an opendocument format, with some tricky stuff added. | ||
| 72 | It contains mostly xml, but can have media blobs, crap, ... | ||
| 73 | (I don't like this format.) | ||
| 74 | """ | ||
| 75 | |||
| 76 | def remove_all(self): | ||
| 77 | """ Remove harmful metadata, by deleting everything that doesn't end with '.rels' in the | ||
| 78 | 'docProps' folder. """ | ||
| 79 | return super(OpenXmlStripper, self).remove_all( | ||
| 80 | beginning_blacklist=['docProps/'], whitelist=['.rels']) | ||
| 81 | |||
| 82 | def is_clean(self): | ||
| 83 | """ Check if the file is clean from harmful metadatas. | ||
| 84 | This implementation is faster than something like | ||
| 85 | "return this.get_meta() == {}". | ||
| 86 | """ | ||
| 87 | clean_super = super(OpenXmlStripper, self).is_clean() | ||
| 88 | if clean_super is False: | ||
| 89 | return False | ||
| 90 | |||
| 91 | zipin = zipfile.ZipFile(self.filename) | ||
| 92 | for item in zipin.namelist(): | ||
| 93 | if item.startswith('docProps/'): | ||
| 94 | return False | ||
| 95 | zipin.close() | ||
| 96 | return True | ||
| 97 | |||
| 98 | def get_meta(self): | ||
| 99 | """ Return a dict with all the meta of the file | ||
| 100 | """ | ||
| 101 | metadata = super(OpenXmlStripper, self).get_meta() | ||
| 102 | |||
| 103 | zipin = zipfile.ZipFile(self.filename) | ||
| 104 | for item in zipin.namelist(): | ||
| 105 | if item.startswith('docProps/'): | ||
| 106 | metadata[item] = 'harmful content' | ||
| 107 | zipin.close() | ||
| 108 | return metadata | ||
| 109 | 22 | ||
| 110 | 23 | ||
| 111 | class PdfStripper(parser.GenericParser): | 24 | class PdfStripper(parser.GenericParser): |
