diff options
Diffstat (limited to 'MAT/office.py')
| -rw-r--r-- | MAT/office.py | 187 |
1 files changed, 54 insertions, 133 deletions
diff --git a/MAT/office.py b/MAT/office.py index f60fc64..97405b3 100644 --- a/MAT/office.py +++ b/MAT/office.py | |||
| @@ -1,13 +1,12 @@ | |||
| 1 | ''' Care about office's formats | 1 | ''' Care about office's formats |
| 2 | ''' | 2 | ''' |
| 3 | 3 | ||
| 4 | import os | ||
| 5 | import logging | 4 | import logging |
| 6 | import zipfile | 5 | import os |
| 7 | import fileinput | ||
| 8 | import tempfile | ||
| 9 | import shutil | 6 | import shutil |
| 7 | import tempfile | ||
| 10 | import xml.dom.minidom as minidom | 8 | import xml.dom.minidom as minidom |
| 9 | import zipfile | ||
| 11 | 10 | ||
| 12 | try: | 11 | try: |
| 13 | import cairo | 12 | import cairo |
| @@ -16,7 +15,6 @@ except ImportError: | |||
| 16 | logging.info('office.py loaded without PDF support') | 15 | logging.info('office.py loaded without PDF support') |
| 17 | pass | 16 | pass |
| 18 | 17 | ||
| 19 | import mat | ||
| 20 | import parser | 18 | import parser |
| 21 | import archive | 19 | import archive |
| 22 | 20 | ||
| @@ -30,89 +28,83 @@ class OpenDocumentStripper(archive.ZipStripper): | |||
| 30 | ''' Return a dict with all the meta of the file by | 28 | ''' Return a dict with all the meta of the file by |
| 31 | trying to read the meta.xml file. | 29 | trying to read the meta.xml file. |
| 32 | ''' | 30 | ''' |
| 31 | metadata = super(OpenDocumentStripper, self).get_meta() | ||
| 33 | zipin = zipfile.ZipFile(self.filename, 'r') | 32 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 34 | metadata = {} | ||
| 35 | try: | 33 | try: |
| 36 | content = zipin.read('meta.xml') | 34 | content = zipin.read('meta.xml') |
| 37 | dom1 = minidom.parseString(content) | 35 | dom1 = minidom.parseString(content) |
| 38 | elements = dom1.getElementsByTagName('office:meta') | 36 | elements = dom1.getElementsByTagName('office:meta') |
| 39 | for i in elements[0].childNodes: | 37 | for i in elements[0].childNodes: |
| 40 | if i.tagName != 'meta:document-statistic': | 38 | if i.tagName != 'meta:document-statistic': |
| 41 | nodename = ''.join([k for k in i.nodeName.split(':')[1:]]) | 39 | nodename = ''.join(i.nodeName.split(':')[1:]) |
| 42 | metadata[nodename] = ''.join([j.data for j in i.childNodes]) | 40 | metadata[nodename] = ''.join([j.data for j in i.childNodes]) |
| 43 | else: | 41 | else: |
| 44 | # thank you w3c for not providing a nice | 42 | # thank you w3c for not providing a nice |
| 45 | # method to get all attributes of a node | 43 | # method to get all attributes of a node |
| 46 | pass | 44 | pass |
| 47 | zipin.close() | ||
| 48 | except KeyError: # no meta.xml file found | 45 | except KeyError: # no meta.xml file found |
| 49 | logging.debug('%s has no opendocument metadata' % self.filename) | 46 | logging.debug('%s has no opendocument metadata' % self.filename) |
| 47 | zipin.close() | ||
| 50 | return metadata | 48 | return metadata |
| 51 | 49 | ||
| 52 | def remove_all(self): | 50 | def remove_all(self): |
| 51 | ''' Removes metadata | ||
| 53 | ''' | 52 | ''' |
| 54 | FIXME ? | 53 | return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) |
| 55 | There is a patch implementing the Zipfile.remove() | 54 | |
| 56 | method here : http://bugs.python.org/issue6818 | 55 | def is_clean(self): |
| 56 | ''' Check if the file is clean from harmful metadatas | ||
| 57 | ''' | 57 | ''' |
| 58 | clean_super = super(OpenDocumentStripper, self).is_clean() | ||
| 59 | if clean_super is False: | ||
| 60 | return False | ||
| 61 | |||
| 58 | zipin = zipfile.ZipFile(self.filename, 'r') | 62 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 59 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) | 63 | try: |
| 64 | zipin.getinfo('meta.xml') | ||
| 65 | except KeyError: # no meta.xml in the file | ||
| 66 | return True | ||
| 67 | zipin.close() | ||
| 68 | return False | ||
| 60 | 69 | ||
| 61 | for item in zipin.namelist(): | ||
| 62 | name = os.path.join(self.tempdir, item) | ||
| 63 | _, ext = os.path.splitext(name) | ||
| 64 | 70 | ||
| 65 | if item.endswith('manifest.xml'): | 71 | class OpenXmlStripper(archive.ZipStripper): |
| 66 | # contain the list of all files present in the archive | 72 | ''' Represent an office openxml document, which is like |
| 67 | zipin.extract(item, self.tempdir) | 73 | an opendocument format, with some tricky stuff added. |
| 68 | for line in fileinput.input(name, inplace=1): | 74 | It contains mostly xml, but can have media blobs, crap, ... |
| 69 | # remove the line which contains "meta.xml" | 75 | (I don't like this format.) |
| 70 | line = line.strip() | 76 | ''' |
| 71 | if not 'meta.xml' in line: | 77 | def remove_all(self): |
| 72 | print line | 78 | return super(OpenXmlStripper, self).remove_all( |
| 73 | zipout.write(name, item) | 79 | beginning_blacklist=('docProps/'), whitelist=('.rels')) |
| 74 | 80 | ||
| 75 | elif ext in parser.NOMETA or item == 'mimetype': | 81 | def is_clean(self): |
| 76 | # keep NOMETA files, and the "manifest" file | 82 | ''' Check if the file is clean from harmful metadatas. |
| 77 | if item != 'meta.xml': # contains the metadata | 83 | This implementation is faster than something like |
| 78 | zipin.extract(item, self.tempdir) | 84 | "return this.get_meta() == {}". |
| 79 | zipout.write(name, item) | 85 | ''' |
| 86 | clean_super = super(OpenXmlStripper, self).is_clean() | ||
| 87 | if clean_super is False: | ||
| 88 | return False | ||
| 80 | 89 | ||
| 81 | else: | 90 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 82 | zipin.extract(item, self.tempdir) | 91 | for item in zipin.namelist(): |
| 83 | if os.path.isfile(name): | 92 | if item.startswith('docProps/'): |
| 84 | try: | 93 | return False |
| 85 | cfile = mat.create_class_file(name, False, | ||
| 86 | add2archive=self.add2archive) | ||
| 87 | cfile.remove_all() | ||
| 88 | logging.debug('Processing %s from %s' % (item, | ||
| 89 | self.filename)) | ||
| 90 | zipout.write(name, item) | ||
| 91 | except: | ||
| 92 | logging.info('%s\'s fileformat is not supported' % item) | ||
| 93 | if self.add2archive: | ||
| 94 | zipout.write(name, item) | ||
| 95 | zipout.comment = '' | ||
| 96 | logging.info('%s processed' % self.filename) | ||
| 97 | zipin.close() | 94 | zipin.close() |
| 98 | zipout.close() | ||
| 99 | self.do_backup() | ||
| 100 | return True | 95 | return True |
| 101 | 96 | ||
| 102 | def is_clean(self): | 97 | def get_meta(self): |
| 103 | ''' Check if the file is clean from harmful metadatas | 98 | ''' Return a dict with all the meta of the file |
| 104 | ''' | 99 | ''' |
| 100 | metadata = super(OpenXmlStripper, self).get_meta() | ||
| 101 | |||
| 105 | zipin = zipfile.ZipFile(self.filename, 'r') | 102 | zipin = zipfile.ZipFile(self.filename, 'r') |
| 106 | try: | 103 | for item in zipin.namelist(): |
| 107 | zipin.getinfo('meta.xml') | 104 | if item.startswith('docProps/'): |
| 108 | except KeyError: # no meta.xml in the file | 105 | metadata[item] = 'harmful content' |
| 109 | czf = archive.ZipStripper(self.filename, self.parser, | ||
| 110 | 'application/zip', False, True, add2archive=self.add2archive) | ||
| 111 | if czf.is_clean(): | ||
| 112 | zipin.close() | ||
| 113 | return True | ||
| 114 | zipin.close() | 106 | zipin.close() |
| 115 | return False | 107 | return metadata |
| 116 | 108 | ||
| 117 | 109 | ||
| 118 | class PdfStripper(parser.GenericParser): | 110 | class PdfStripper(parser.GenericParser): |
| @@ -128,8 +120,8 @@ class PdfStripper(parser.GenericParser): | |||
| 128 | self.pdf_quality = False | 120 | self.pdf_quality = False |
| 129 | 121 | ||
| 130 | self.document = Poppler.Document.new_from_file(uri, self.password) | 122 | self.document = Poppler.Document.new_from_file(uri, self.password) |
| 131 | self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator', | 123 | self.meta_list = frozenset(['title', 'author', 'subject', |
| 132 | 'producer', 'metadata']) | 124 | 'keywords', 'creator', 'producer', 'metadata']) |
| 133 | 125 | ||
| 134 | def is_clean(self): | 126 | def is_clean(self): |
| 135 | ''' Check if the file is clean from harmful metadatas | 127 | ''' Check if the file is clean from harmful metadatas |
| @@ -168,7 +160,7 @@ class PdfStripper(parser.GenericParser): | |||
| 168 | surface.finish() | 160 | surface.finish() |
| 169 | shutil.move(output, self.output) | 161 | shutil.move(output, self.output) |
| 170 | except: | 162 | except: |
| 171 | logging.error('Something went wrong when cleaning %s. File not cleaned' % self.filename) | 163 | logging.error('Something went wrong when cleaning %s.' % self.filename) |
| 172 | return False | 164 | return False |
| 173 | 165 | ||
| 174 | try: | 166 | try: |
| @@ -182,8 +174,7 @@ class PdfStripper(parser.GenericParser): | |||
| 182 | writer.write(self.output) | 174 | writer.write(self.output) |
| 183 | self.do_backup() | 175 | self.do_backup() |
| 184 | except: | 176 | except: |
| 185 | logging.error('Unable to remove all metadata from %s, please install\ | 177 | logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output) |
| 186 | pdfrw' % self.output) | ||
| 187 | return False | 178 | return False |
| 188 | return True | 179 | return True |
| 189 | 180 | ||
| @@ -195,73 +186,3 @@ pdfrw' % self.output) | |||
| 195 | if self.document.get_property(key): | 186 | if self.document.get_property(key): |
| 196 | metadata[key] = self.document.get_property(key) | 187 | metadata[key] = self.document.get_property(key) |
| 197 | return metadata | 188 | return metadata |
| 198 | |||
| 199 | |||
| 200 | class OpenXmlStripper(archive.GenericArchiveStripper): | ||
| 201 | ''' | ||
| 202 | Represent an office openxml document, which is like | ||
| 203 | an opendocument format, with some tricky stuff added. | ||
| 204 | It contains mostly xml, but can have media blobs, crap, ... | ||
| 205 | (I don't like this format.) | ||
| 206 | ''' | ||
| 207 | def remove_all(self): | ||
| 208 | ''' | ||
| 209 | FIXME ? | ||
| 210 | There is a patch implementing the Zipfile.remove() | ||
| 211 | method here : http://bugs.python.org/issue6818 | ||
| 212 | ''' | ||
| 213 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 214 | zipout = zipfile.ZipFile(self.output, 'w', | ||
| 215 | allowZip64=True) | ||
| 216 | for item in zipin.namelist(): | ||
| 217 | name = os.path.join(self.tempdir, item) | ||
| 218 | _, ext = os.path.splitext(name) | ||
| 219 | if item.startswith('docProps/'): # metadatas | ||
| 220 | pass | ||
| 221 | elif ext in parser.NOMETA or item == '.rels': | ||
| 222 | # keep parser.NOMETA files, and the file named ".rels" | ||
| 223 | zipin.extract(item, self.tempdir) | ||
| 224 | zipout.write(name, item) | ||
| 225 | else: | ||
| 226 | zipin.extract(item, self.tempdir) | ||
| 227 | if os.path.isfile(name): # don't care about folders | ||
| 228 | try: | ||
| 229 | cfile = mat.create_class_file(name, False, | ||
| 230 | add2archive=self.add2archive) | ||
| 231 | cfile.remove_all() | ||
| 232 | logging.debug('Processing %s from %s' % (item, | ||
| 233 | self.filename)) | ||
| 234 | zipout.write(name, item) | ||
| 235 | except: | ||
| 236 | logging.info('%s\'s fileformat is not supported' % item) | ||
| 237 | if self.add2archive: | ||
| 238 | zipout.write(name, item) | ||
| 239 | zipout.comment = '' | ||
| 240 | logging.info('%s processed' % self.filename) | ||
| 241 | zipin.close() | ||
| 242 | zipout.close() | ||
| 243 | self.do_backup() | ||
| 244 | return True | ||
| 245 | |||
| 246 | def is_clean(self): | ||
| 247 | ''' Check if the file is clean from harmful metadatas | ||
| 248 | ''' | ||
| 249 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 250 | for item in zipin.namelist(): | ||
| 251 | if item.startswith('docProps/'): | ||
| 252 | return False | ||
| 253 | zipin.close() | ||
| 254 | czf = archive.ZipStripper(self.filename, self.parser, | ||
| 255 | 'application/zip', False, True, add2archive=self.add2archive) | ||
| 256 | return czf.is_clean() | ||
| 257 | |||
| 258 | def get_meta(self): | ||
| 259 | ''' Return a dict with all the meta of the file | ||
| 260 | ''' | ||
| 261 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 262 | metadata = {} | ||
| 263 | for item in zipin.namelist(): | ||
| 264 | if item.startswith('docProps/'): | ||
| 265 | metadata[item] = 'harmful content' | ||
| 266 | zipin.close() | ||
| 267 | return metadata | ||
