diff options
| author | jvoisin | 2011-08-16 18:11:24 +0200 |
|---|---|---|
| committer | jvoisin | 2011-08-16 18:11:24 +0200 |
| commit | 4bd3e47da02fde08acfada1795cc55170abdb00a (patch) | |
| tree | f8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /mat | |
| parent | baf8e080125614326ba9c96ca8f2404fd12b050e (diff) | |
setup.py now works !
Diffstat (limited to 'mat')
| -rw-r--r-- | mat/__init__.py | 1 | ||||
| -rw-r--r-- | mat/archive.py | 289 | ||||
| -rw-r--r-- | mat/audio.py | 98 | ||||
| -rw-r--r-- | mat/images.py | 37 | ||||
| -rw-r--r-- | mat/mat.py | 162 | ||||
| -rw-r--r-- | mat/misc.py | 62 | ||||
| -rw-r--r-- | mat/office.py | 280 | ||||
| -rw-r--r-- | mat/parser.py | 104 |
8 files changed, 1033 insertions, 0 deletions
diff --git a/mat/__init__.py b/mat/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/mat/__init__.py | |||
| @@ -0,0 +1 @@ | |||
diff --git a/mat/archive.py b/mat/archive.py new file mode 100644 index 0000000..77db71c --- /dev/null +++ b/mat/archive.py | |||
| @@ -0,0 +1,289 @@ | |||
| 1 | ''' | ||
| 2 | Take care of archives formats | ||
| 3 | ''' | ||
| 4 | |||
import logging
import os
import shutil
# BUG FIX: the original read "from tarfile import tarfile", which raises
# ImportError (the tarfile module has no "tarfile" attribute); TarStripper
# calls tarfile.open(), so the module itself must be imported.
import tarfile
import tempfile
import zipfile

import mat
import parser
| 14 | |||
| 15 | |||
class GenericArchiveStripper(parser.GenericParser):
    '''
    Base class for every archive format: owns a scratch directory for
    extraction and dispatches the two public cleaning entry points to the
    subclass-provided _remove_all().
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(GenericArchiveStripper, self).__init__(filename, parser, mime,
            backup, add2archive)
        self.compression = ''  # subclasses append e.g. ':gz' for tarfile modes
        self.add2archive = add2archive
        self.tempdir = tempfile.mkdtemp()

    def __del__(self):
        '''
        Securely wipe every file extracted into the scratch directory,
        then drop the directory tree itself.
        '''
        for dirpath, _, filenames in os.walk(self.tempdir):
            for fname in filenames:
                mat.secure_remove(os.path.join(dirpath, fname))
        shutil.rmtree(self.tempdir)

    def remove_all(self):
        '''
        Clean the archive the lossless way.
        '''
        self._remove_all('normal')

    def remove_all_ugly(self):
        '''
        Clean the archive the aggressive way (possible data loss).
        '''
        self._remove_all('ugly')

    def _remove_all(self, method):
        '''
        Subclasses implement the actual cleaning; method is either
        'normal' or 'ugly'.
        '''
        raise NotImplementedError
| 56 | |||
| 57 | |||
class ZipStripper(GenericArchiveStripper):
    '''
    Represent a zip file
    '''
    def is_file_clean(self, fileinfo):
        '''
        Check if a ZipInfo object is clean of the metadata added by zip
        itself, independently of the metadata of the file it describes.
        BUG FIX: the original compared against literals with "is not",
        which tests object identity, not equality, and is not guaranteed
        to work for equal strings/ints.
        '''
        if fileinfo.comment != '':
            return False
        elif fileinfo.date_time != 0:
            return False
        elif fileinfo.create_system != 0:
            return False
        elif fileinfo.create_version != 0:
            return False
        else:
            return True

    def is_clean(self):
        '''
        Check if the given file is clean from harmful metadata:
        the archive's own comment plus every supported contained file.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        if zipin.comment != '':
            logging.debug('%s has a comment' % self.filename)
            zipin.close()  # BUG FIX: the handle leaked on this early return
            return False
        for item in zipin.infolist():
            #I have not found a way to remove the crap added by zipfile :/
            #if not self.is_file_clean(item):
            #    logging.debug('%s from %s has compromizing zipinfo' %
            #        (item.filename, self.filename))
            #    return False
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(name):
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    if not cfile.is_clean():
                        zipin.close()  # BUG FIX: close before early return
                        return False
                except Exception:
                    # BUG FIX: narrowed from a bare except, which also
                    # swallowed KeyboardInterrupt/SystemExit
                    logging.info('%s\'s fileformat is not supported, or is a \
harmless format' % item.filename)
                    _, ext = os.path.splitext(name)
                    bname = os.path.basename(item.filename)
                    if ext not in parser.NOMETA:
                        if bname != 'mimetype' and bname != '.rels':
                            zipin.close()
                            return False
        zipin.close()
        return True

    def get_meta(self):
        '''
        Return all the metadata of a ZipFile (don't return metadatas
        of contained files : should it ?)
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        for field in zipin.infolist():
            zipmeta = {}
            zipmeta['comment'] = field.comment
            zipmeta['modified'] = field.date_time
            zipmeta['system'] = field.create_system
            zipmeta['zip_version'] = field.create_version
            metadata[field.filename] = zipmeta
        metadata["%s comment" % self.filename] = zipin.comment
        zipin.close()
        return metadata

    def _remove_all(self, method):
        '''
        Rebuild the archive from cleaned members.
        So far, the zipfile module does not allow to write a ZipInfo
        object into a zipfile (and it's a shame !) : so data added
        by zipfile itself could not be removed. It's a big concern.
        Is shiping a patched version of zipfile.py a good idea ?
        BUG FIX: "method is 'normal'" relied on string identity; use ==.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(name):
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    if method == 'normal':
                        cfile.remove_all()
                    else:
                        cfile.remove_all_ugly()
                    logging.debug('Processing %s from %s' % (item.filename,
                        self.filename))
                    zipout.write(name, item.filename)
                except Exception:
                    logging.info('%s\'s format is not supported or harmless' %
                        item.filename)
                    _, ext = os.path.splitext(name)
                    if self.add2archive or ext in parser.NOMETA:
                        zipout.write(name, item.filename)
        zipout.comment = ''  # strip the archive-level comment as well
        zipin.close()
        zipout.close()
        logging.info('%s treated' % self.filename)
        self.do_backup()
| 164 | |||
| 165 | |||
class TarStripper(GenericArchiveStripper):
    '''
    Represent a tarfile archive
    '''
    def _remove(self, current_file):
        '''
        Anonymise the metadata tar itself stores for current_file;
        used as the filter= callback of TarFile.add().
        '''
        current_file.mtime = 0
        current_file.uid = 0
        current_file.gid = 0
        current_file.uname = ''
        current_file.gname = ''
        return current_file

    def _remove_all(self, method):
        '''
        Extract every member, clean the supported ones, and re-add them
        through the _remove() filter.
        BUG FIX: "is"/"is not" comparisons against literals replaced by
        ==/!=; identity is not guaranteed for equal values.
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        tarout = tarfile.open(self.output, 'w' + self.compression)
        for item in tarin.getmembers():
            tarin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.name)
            if item.type == tarfile.REGTYPE:  # is item a regular file ?
                #no backup file
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    if method == 'normal':
                        cfile.remove_all()
                    else:
                        cfile.remove_all_ugly()
                    tarout.add(name, item.name, filter=self._remove)
                except Exception:
                    # BUG FIX: narrowed from a bare except
                    logging.info('%s\'s format is not supported or harmless' %
                        item.name)
                    _, ext = os.path.splitext(name)
                    if self.add2archive or ext in parser.NOMETA:
                        tarout.add(name, item.name, filter=self._remove)
        tarin.close()
        tarout.close()
        self.do_backup()

    def is_file_clean(self, current_file):
        '''
        Check the metadata fields added by tar itself.
        BUG FIX: ==/!= instead of is/is not (see _remove_all).
        '''
        if current_file.mtime != 0:
            return False
        elif current_file.uid != 0:
            return False
        elif current_file.gid != 0:
            return False
        elif current_file.uname != '':
            return False
        elif current_file.gname != '':
            return False
        else:
            return True

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas,
        both tar-level and inside every supported member.
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        for item in tarin.getmembers():
            if not self.is_file_clean(item):
                tarin.close()
                return False
            tarin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.name)
            if item.type == tarfile.REGTYPE:  # is item a regular file ?
                try:
                    class_file = mat.create_class_file(name,
                        False, self.add2archive)  # no backup file
                    if not class_file.is_clean():
                        tarin.close()
                        return False
                except Exception:
                    # BUG FIX: TarInfo has no "filename" attribute (it is
                    # .name), so the original log line itself raised
                    # AttributeError; also fixed the "foramt" typo
                    logging.error('%s\'s format is not supported or harmless' %
                        item.name)
                    _, ext = os.path.splitext(name)
                    if ext not in parser.NOMETA:
                        tarin.close()
                        return False
        tarin.close()
        return True

    def get_meta(self):
        '''
        Return a dict with, per regular member, the tar-added meta.
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        metadata = {}
        for current_file in tarin.getmembers():
            if current_file.type == tarfile.REGTYPE:
                if not self.is_file_clean(current_file):  # if there is meta
                    current_meta = {}
                    current_meta['mtime'] = current_file.mtime
                    current_meta['uid'] = current_file.uid
                    current_meta['gid'] = current_file.gid
                    current_meta['uname'] = current_file.uname
                    current_meta['gname'] = current_file.gname
                    metadata[current_file.name] = current_meta
        tarin.close()
        return metadata
| 270 | |||
| 271 | |||
class GzipStripper(TarStripper):
    '''
    Represent a tar.gz archive: a TarStripper whose tarfile streams
    are opened with gzip compression.
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(GzipStripper, self).__init__(
            filename, parser, mime, backup, add2archive)
        self.compression = ':gz'
| 280 | |||
| 281 | |||
class Bzip2Stripper(TarStripper):
    '''
    Represents a tar.bz2 archive: a TarStripper whose tarfile streams
    are opened with bzip2 compression.
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(Bzip2Stripper, self).__init__(
            filename, parser, mime, backup, add2archive)
        self.compression = ':bz2'
diff --git a/mat/audio.py b/mat/audio.py new file mode 100644 index 0000000..21a94be --- /dev/null +++ b/mat/audio.py | |||
| @@ -0,0 +1,98 @@ | |||
| 1 | ''' | ||
| 2 | Care about audio fileformat | ||
| 3 | ''' | ||
| 4 | try: | ||
| 5 | from mutagen.flac import FLAC | ||
| 6 | from mutagen.oggvorbis import OggVorbis | ||
| 7 | except ImportError: | ||
| 8 | pass | ||
| 9 | |||
| 10 | |||
| 11 | import parser | ||
| 12 | import shutil | ||
| 13 | |||
| 14 | |||
class MpegAudioStripper(parser.GenericParser):
    '''
    Represent mpeg audio file (mp3, ...)
    '''
    def _should_remove(self, field):
        '''
        The id3 tag blocks are the fields to strip from an mpeg stream.
        '''
        return field.name in ("id3v1", "id3v2")
| 24 | |||
| 25 | |||
class OggStripper(parser.GenericParser):
    '''
    Represent an ogg vorbis file
    '''
    def remove_all(self):
        '''
        Delete the whole vorbis-comment block, either in place or on a
        backup copy when self.backup is set.
        '''
        if self.backup is True:
            shutil.copy2(self.filename, self.output)
            self.filename = self.output

        mfile = OggVorbis(self.filename)
        mfile.delete()
        mfile.save()

    def is_clean(self):
        '''
        Check if the "metadata" block is present in the file
        '''
        return OggVorbis(self.filename).tags == []

    def get_meta(self):
        '''
        Return the content of the metadata block if present
        '''
        # the tag block iterates as (key, value) pairs
        return dict(OggVorbis(self.filename).tags)
| 58 | |||
| 59 | |||
class FlacStripper(parser.GenericParser):
    '''
    Represent a Flac audio file
    '''
    def remove_all(self):
        '''
        Remove the "metadata" block and the embedded pictures,
        either in place or on a backup copy when self.backup is set.
        '''
        if self.backup is True:
            shutil.copy2(self.filename, self.output)
            self.filename = self.output

        mfile = FLAC(self.filename)
        mfile.delete()
        mfile.clear_pictures()
        mfile.save()

    def is_clean(self):
        '''
        Check if the "metadata" block is present in the file
        '''
        mfile = FLAC(self.filename)
        return mfile.tags is None and mfile.pictures == []

    def get_meta(self):
        '''
        Return the content of the metadata block if present
        '''
        mfile = FLAC(self.filename)
        if mfile.tags is None:
            # NOTE: mirrors the original behaviour — with no tag block,
            # embedded pictures are not reported either
            return {}
        metadata = {}
        if mfile.pictures != []:
            metadata['picture :'] = 'yes'
        for key, value in mfile.tags:
            metadata[key] = value
        return metadata
diff --git a/mat/images.py b/mat/images.py new file mode 100644 index 0000000..d090015 --- /dev/null +++ b/mat/images.py | |||
| @@ -0,0 +1,37 @@ | |||
| 1 | ''' | ||
| 2 | Takes care about pictures formats | ||
| 3 | ''' | ||
| 4 | |||
| 5 | import parser | ||
| 6 | |||
| 7 | |||
class JpegStripper(parser.GenericParser):
    '''
    represents a jpeg file
    '''
    def _should_remove(self, field):
        '''
        return True if the field is compromizing
        '''
        name = field.name
        if name.startswith('comment'):
            return True
        return name in ("photoshop", "exif", "adobe")
| 22 | |||
| 23 | |||
class PngStripper(parser.GenericParser):
    '''
    represents a png file
    '''
    def _should_remove(self, field):
        '''
        Return True if the field is compromizing.
        BUG FIX: the original tested `field.name is "time"`, an identity
        comparison that is not guaranteed to match an equal string; use ==.
        '''
        if field.name.startswith("text["):  # presumably hachoir's tEXt chunks
            return True
        return field.name == "time"  # last-modification chunk — verify name
diff --git a/mat/mat.py b/mat/mat.py new file mode 100644 index 0000000..fd13287 --- /dev/null +++ b/mat/mat.py | |||
| @@ -0,0 +1,162 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | ''' | ||
| 4 | Metadata anonymisation toolkit library | ||
| 5 | ''' | ||
| 6 | |||
| 7 | import os | ||
| 8 | import subprocess | ||
| 9 | import logging | ||
| 10 | import mimetypes | ||
| 11 | import xml.sax | ||
| 12 | |||
| 13 | import hachoir_core.cmd_line | ||
| 14 | import hachoir_parser | ||
| 15 | |||
| 16 | import images | ||
| 17 | import audio | ||
| 18 | import office | ||
| 19 | import archive | ||
| 20 | import misc | ||
| 21 | |||
__version__ = '0.1'
__author__ = 'jvoisin'

# Module-wide verbosity; applied once via basicConfig below.
LOGGING_LEVEL = logging.DEBUG

logging.basicConfig(level=LOGGING_LEVEL)

# Dispatch table: mime type -> stripper class.  create_class_file() looks
# the (possibly normalised) mime type up here; optional formats are
# registered below only when their third-party backends import cleanly.
STRIPPERS = {
    'application/x-tar': archive.TarStripper,
    'application/x-gzip': archive.GzipStripper,
    'application/x-bzip2': archive.Bzip2Stripper,
    'application/zip': archive.ZipStripper,
    'audio/mpeg': audio.MpegAudioStripper,
    'image/jpeg': images.JpegStripper,
    'image/png': images.PngStripper,
    'application/x-bittorrent': misc.TorrentStripper,
    'application/opendocument': office.OpenDocumentStripper,
    'application/officeopenxml': office.OpenXmlStripper,
}

# pdf support needs both python-poppler (reading) and python-cairo
# (re-rendering); degrade gracefully when either is missing.
try:
    import poppler
    import cairo
    STRIPPERS['application/x-pdf'] = office.PdfStripper
    STRIPPERS['application/pdf'] = office.PdfStripper
except ImportError:
    print('Unable to import python-poppler and/or python-cairo: no pdf \
support')

# flac/ogg support is backed by python-mutagen; optional as well.
try:
    import mutagen
    STRIPPERS['audio/x-flac'] = audio.FlacStripper
    STRIPPERS['audio/vorbis'] = audio.OggStripper
except ImportError:
    print('unable to import python-mutagen : limited audio format support')
| 58 | |||
class XMLParser(xml.sax.handler.ContentHandler):
    '''
    Parse the supported-formats xml and build, in self.list, one dict
    per <format> section encountered.
    '''
    def __init__(self):
        self.dict = {}
        self.list = []
        self.content = ''
        self.key = ''
        self.between = False

    def startElement(self, name, attrs):
        '''
        Opening tag: remember the tag name and start accumulating
        its text content.
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        '''
        Closing tag: a closing <format> flushes the accumulated dict
        into the list; any other tag stores its accumulated text.
        '''
        if name == 'format':  # exiting a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        else:
            # NOTE: replaces the literal two characters backslash-s
            # (not a regex class) — kept as in the original
            self.dict[self.key] = self.content.replace('\s', ' ')
        self.between = False

    def characters(self, characters):
        '''
        Accumulate the text found between the current pair of tags.
        '''
        if self.between is True:
            self.content += characters
| 96 | |||
| 97 | |||
def secure_remove(filename):
    '''
    Securely remove the file with shred(1), falling back to a plain
    os.remove() when shred is unavailable or fails.

    BUG FIX: the original interpolated the filename into a shell command
    (shell-injection prone despite is_secure()), and ignored shred's exit
    status — a missing shred binary returned 127 yet was treated as a
    successful secure removal.
    '''
    removed = False
    try:
        # argv list + implicit shell=False: the filename reaches shred
        # verbatim, so shell metacharacters in it are inert
        removed = subprocess.call(['shred', '--remove', filename]) == 0
    except OSError:  # shred binary not found / not executable
        logging.error('Unable to securely remove %s' % filename)

    if removed is False:
        try:
            os.remove(filename)
        except OSError:
            logging.error('Unable to remove %s' % filename)
| 114 | |||
| 115 | |||
def is_secure(filename):
    '''
    Refuse anything that is not an existing regular file
    (first line of defence against shell injection in callers).
    '''
    if os.path.isfile(filename):
        return True
    logging.error('%s is not a valid file' % filename)
    return False
| 125 | |||
| 126 | |||
def create_class_file(name, backup, add2archive):
    '''
    Return a $FILETYPEStripper() instance matching the filetype of the
    given file, or None when the file is missing, unparseable, or of an
    unsupported mime type.
    '''
    if not is_secure(name):
        return

    try:
        filename = hachoir_core.cmd_line.unicodeFilename(name)
    except TypeError:  # get rid of "decoding Unicode is not supported"
        filename = name

    parser = hachoir_parser.createParser(filename)
    if not parser:
        logging.info('Unable to parse %s' % filename)
        return

    mime = parser.mime_type

    if mime == 'application/zip':  # some formats are zipped stuff
        mime = mimetypes.guess_type(name)[0]
        # BUG FIX: guess_type() returns None for an unknown extension;
        # the startswith() calls below would then raise AttributeError
        if mime is None:
            logging.info('Unable to guess the mime type of %s' % name)
            return

    if mime.startswith('application/vnd.oasis.opendocument'):
        mime = 'application/opendocument'  # opendocument fileformat
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/officeopenxml'  # office openxml

    try:
        stripper_class = STRIPPERS[mime]
    except KeyError:
        logging.info('Don\'t have stripper for %s format' % mime)
        return

    return stripper_class(filename, parser, mime, backup, add2archive)
diff --git a/mat/misc.py b/mat/misc.py new file mode 100644 index 0000000..f7b256f --- /dev/null +++ b/mat/misc.py | |||
| @@ -0,0 +1,62 @@ | |||
| 1 | ''' | ||
| 2 | Care about misc formats | ||
| 3 | ''' | ||
| 4 | |||
| 5 | import parser | ||
| 6 | |||
| 7 | from bencode import bencode | ||
| 8 | |||
| 9 | |||
class TorrentStripper(parser.GenericParser):
    '''
    Represent a torrent file with the help
    of the bencode lib from Petru Paler
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(TorrentStripper, self).__init__(filename, parser, mime,
            backup, add2archive)
        # top-level bencode keys that may carry metadata
        self.fields = ['comment', 'creation date', 'created by']

    def _decode(self):
        '''
        Read and bdecode the whole torrent file.
        BUG FIX: the module-level "from bencode import bencode" binds the
        *encoding function*, so the original "bencode.bdecode(...)" calls
        raised AttributeError; a function-scope module import restores
        access to bdecode()/bencode().
        '''
        import bencode
        with open(self.filename, 'r') as f:
            return bencode.bdecode(f.read())

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        decoded = self._decode()
        for key in self.fields:
            if decoded.get(key, '') != '':
                return False
        return True

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        decoded = self._decode()
        metadata = {}
        for key in self.fields:
            if decoded.get(key, '') != '':
                metadata[key] = decoded[key]
        return metadata

    def remove_all(self):
        '''
        Remove all the fields that are compromizing.
        BUG FIX: a dict assignment never raises, so the original
        try/except was dead code and the loop unconditionally *added*
        empty metadata keys to torrents that lacked them; dropping the
        keys is what "remove" means.
        '''
        import bencode
        decoded = self._decode()
        for key in self.fields:
            decoded.pop(key, None)
        with open(self.output, 'w') as f:  # re-encode the cleaned torrent
            f.write(bencode.bencode(decoded))  # and write it in self.output
        self.do_backup()
diff --git a/mat/office.py b/mat/office.py new file mode 100644 index 0000000..cb9c609 --- /dev/null +++ b/mat/office.py | |||
| @@ -0,0 +1,280 @@ | |||
| 1 | ''' | ||
| 2 | Care about office's formats | ||
| 3 | ''' | ||
| 4 | |||
| 5 | import os | ||
| 6 | import logging | ||
| 7 | import zipfile | ||
| 8 | import fileinput | ||
| 9 | |||
| 10 | try: | ||
| 11 | import cairo | ||
| 12 | import poppler | ||
| 13 | except ImportError: | ||
| 14 | pass | ||
| 15 | |||
| 16 | import mat | ||
| 17 | import parser | ||
| 18 | import archive | ||
| 19 | import pdfrw | ||
| 20 | |||
| 21 | |||
| 22 | class OpenDocumentStripper(archive.GenericArchiveStripper): | ||
| 23 | ''' | ||
| 24 | An open document file is a zip, with xml file into. | ||
| 25 | The one that interest us is meta.xml | ||
| 26 | ''' | ||
| 27 | |||
| 28 | def get_meta(self): | ||
| 29 | ''' | ||
| 30 | Return a dict with all the meta of the file by | ||
| 31 | trying to read the meta.xml file. | ||
| 32 | ''' | ||
| 33 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 34 | metadata = {} | ||
| 35 | try: | ||
| 36 | content = zipin.read('meta.xml') | ||
| 37 | zipin.close() | ||
| 38 | metadata[self.filename] = 'harful meta' | ||
| 39 | except KeyError: # no meta.xml file found | ||
| 40 | logging.debug('%s has no opendocument metadata' % self.filename) | ||
| 41 | return metadata | ||
| 42 | |||
| 43 | def _remove_all(self, method): | ||
| 44 | ''' | ||
| 45 | FIXME ? | ||
| 46 | There is a patch implementing the Zipfile.remove() | ||
| 47 | method here : http://bugs.python.org/issue6818 | ||
| 48 | ''' | ||
| 49 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 50 | zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True) | ||
| 51 | |||
| 52 | for item in zipin.namelist(): | ||
| 53 | name = os.path.join(self.tempdir, item) | ||
| 54 | _, ext = os.path.splitext(name) | ||
| 55 | |||
| 56 | if item.endswith('manifest.xml'): | ||
| 57 | # contain the list of all files present in the archive | ||
| 58 | zipin.extract(item, self.tempdir) | ||
| 59 | for line in fileinput.input(name, inplace=1): | ||
| 60 | #remove the line which contains "meta.xml" | ||
| 61 | line = line.strip() | ||
| 62 | if not 'meta.xml' in line: | ||
| 63 | print line | ||
| 64 | zipout.write(name, item) | ||
| 65 | |||
| 66 | elif ext in parser.NOMETA or item == 'mimetype': | ||
| 67 | #keep NOMETA files, and the "manifest" file | ||
| 68 | if item != 'meta.xml': # contains the metadata | ||
| 69 | zipin.extract(item, self.tempdir) | ||
| 70 | zipout.write(name, item) | ||
| 71 | |||
| 72 | else: | ||
| 73 | zipin.extract(item, self.tempdir) | ||
| 74 | if os.path.isfile(name): | ||
| 75 | try: | ||
| 76 | cfile = mat.create_class_file(name, False, | ||
| 77 | self.add2archive) | ||
| 78 | if method == 'normal': | ||
| 79 | cfile.remove_all() | ||
| 80 | else: | ||
| 81 | cfile.remove_all_ugly() | ||
| 82 | logging.debug('Processing %s from %s' % (item, | ||
| 83 | self.filename)) | ||
| 84 | zipout.write(name, item) | ||
| 85 | except: | ||
| 86 | logging.info('%s\' fileformat is not supported' % item) | ||
| 87 | if self.add2archive: | ||
| 88 | zipout.write(name, item) | ||
| 89 | zipout.comment = '' | ||
| 90 | logging.info('%s treated' % self.filename) | ||
| 91 | zipin.close() | ||
| 92 | zipout.close() | ||
| 93 | self.do_backup() | ||
| 94 | |||
| 95 | def is_clean(self): | ||
| 96 | ''' | ||
| 97 | Check if the file is clean from harmful metadatas | ||
| 98 | ''' | ||
| 99 | zipin = zipfile.ZipFile(self.filename, 'r') | ||
| 100 | try: | ||
| 101 | zipin.getinfo('meta.xml') | ||
| 102 | except KeyError: # no meta.xml in the file | ||
| 103 | czf = archive.ZipStripper(self.filename, self.parser, | ||
| 104 | 'application/zip', self.backup, self.add2archive) | ||
| 105 | if czf.is_clean(): | ||
| 106 | zipin.close() | ||
| 107 | return True | ||
| 108 | zipin.close() | ||
| 109 | return False | ||
| 110 | |||
| 111 | |||
class PdfStripper(parser.GenericParser):
    '''
    Represent a pdf file
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(PdfStripper, self).__init__(filename, parser, mime, backup,
            add2archive)
        uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.document = poppler.document_new_from_file(uri, self.password)
        # poppler document properties that may carry metadata
        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
            'producer', 'creation-date', 'mod-date', 'metadata')

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        for key in self.meta_list:
            if key == 'creation-date' or key == 'mod-date':
                # the date properties read -1 when unset
                if self.document.get_property(key) != -1:
                    return False
            elif self.document.get_property(key) is not None and \
                    self.document.get_property(key) != '':
                return False
        return True

    def remove_all(self):
        '''
        Opening the pdf with poppler, then doing a render
        on a cairo pdfsurface for each pages.
        Thanks to Lunar^ for the idea.
        http://cairographics.org/documentation/pycairo/2/
        python-poppler is not documented at all : have fun ;)
        '''
        page = self.document.get_page(0)
        page_width, page_height = page.get_size()
        surface = cairo.PDFSurface(self.output, page_width, page_height)
        context = cairo.Context(surface)  # context draws on the surface
        logging.debug('Pdf rendering of %s' % self.filename)
        for pagenum in xrange(self.document.get_n_pages()):
            page = self.document.get_page(pagenum)
            context.translate(0, 0)
            page.render(context)  # render the page on context
            context.show_page()  # draw context on surface
        surface.finish()

        #For now, poppler cannot write meta, so we must use pdfrw
        logging.debug('Removing %s\'s superficial metadata' % self.filename)
        trailer = pdfrw.PdfReader(self.output)
        trailer.Info.Producer = trailer.Info.Creator = None
        writer = pdfrw.PdfWriter()
        writer.trailer = trailer
        writer.write(self.output)
        self.do_backup()

    def remove_all_ugly(self):
        '''
        For pdf there is no more aggressive strategy than the full
        re-render, so this is the same operation as remove_all().
        IMPROVEMENT: the original duplicated the whole method body;
        delegating keeps the two code paths from drifting apart.
        '''
        self.remove_all()

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        for key in self.meta_list:
            if key == 'creation-date' or key == 'mod-date':
                #creation and modification are set to -1 when unset
                if self.document.get_property(key) != -1:
                    metadata[key] = self.document.get_property(key)
            elif self.document.get_property(key) is not None and \
                    self.document.get_property(key) != '':
                metadata[key] = self.document.get_property(key)
        return metadata
| 204 | |||
| 205 | |||
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
    Represent an office openxml document, which is like
    an opendocument format, with some tricky stuff added.
    It contains mostly xml, but can have media blobs, crap, ...
    (I don't like this format.)
    '''
    def _remove_all(self, method):
        '''
        Rebuild the archive into self.output: drop every member under
        docProps/ (the openxml metadata), copy parser.NOMETA members
        as-is, and clean every other file with its own parser
        (method == 'normal' -> remove_all(), else remove_all_ugly()).

        FIXME ?
        There is a patch implementing the Zipfile.remove()
        method here : http://bugs.python.org/issue6818
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w',
            allowZip64=True)
        try:
            for item in zipin.namelist():
                name = os.path.join(self.tempdir, item)
                _, ext = os.path.splitext(name)
                if item.startswith('docProps/'):  # metadatas: drop them
                    continue
                elif ext in parser.NOMETA or item == '.rels':
                    #keep parser.NOMETA files, and the file named ".rels"
                    zipin.extract(item, self.tempdir)
                    zipout.write(name, item)
                else:
                    zipin.extract(item, self.tempdir)
                    if os.path.isfile(name):  # don't care about folders
                        try:
                            cfile = mat.create_class_file(name, False,
                                self.add2archive)
                            if method == 'normal':
                                cfile.remove_all()
                            else:
                                cfile.remove_all_ugly()
                            logging.debug('Processing %s from %s' % (item,
                                self.filename))
                            zipout.write(name, item)
                        # was a bare "except:": keep the best-effort behaviour
                        # but stop swallowing KeyboardInterrupt/SystemExit
                        except Exception:
                            logging.info('%s\' fileformat is not supported' % item)
                            if self.add2archive:
                                zipout.write(name, item)
            zipout.comment = ''  # the zip comment can also carry metadata
            logging.info('%s treated' % self.filename)
        finally:
            # close both archives even if a member blows up mid-way
            zipin.close()
            zipout.close()
        self.do_backup()

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas:
        dirty if a docProps/ member is present, or if the zip
        container itself carries metadata.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    return False
        finally:
            zipin.close()  # was leaked on the early return
        czf = archive.ZipStripper(self.filename, self.parser,
            'application/zip', self.backup, self.add2archive)
        return czf.is_clean()

    def get_meta(self):
        '''
        Return a dict with all the meta of the file:
        every docProps/ member is reported as harmful.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        try:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    metadata[item] = 'harmful content'
        finally:
            zipin.close()
        return metadata
diff --git a/mat/parser.py b/mat/parser.py new file mode 100644 index 0000000..58dd7fa --- /dev/null +++ b/mat/parser.py | |||
| @@ -0,0 +1,104 @@ | |||
| 1 | ''' | ||
| 2 | Parent class of all parsers | ||
| 3 | ''' | ||
| 4 | |||
| 5 | import hachoir_core | ||
| 6 | import hachoir_editor | ||
| 7 | |||
| 8 | import os | ||
| 9 | |||
| 10 | import mat | ||
| 11 | |||
| 12 | NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels') | ||
| 13 | #bmp : image | ||
| 14 | #rdf : text | ||
| 15 | #txt : plain text | ||
| 16 | #xml : formatted text | ||
| 17 | #rels : openxml formatted text | ||
| 18 | |||
| 19 | |||
class GenericParser(object):
    '''
    Parent class of all parsers: one instance cleans one file.
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        '''
        filename    : path of the file to process
        parser      : hachoir parser for this file
        mime        : mimetype of the file
        backup      : if True keep the original, else overwrite it in place
        add2archive : keep unsupported files inside cleaned archives
                      (used by archive-handling subclasses)
        '''
        self.filename = ''
        self.parser = parser
        self.mime = mime
        self.backup = backup
        self.editor = hachoir_editor.createEditor(parser)
        self.realname = filename
        try:
            self.filename = hachoir_core.cmd_line.unicodeFilename(filename)
        except TypeError:  # get rid of "decoding Unicode is not supported"
            self.filename = filename
        basename, ext = os.path.splitext(filename)
        # the cleaned copy is written next to the original
        self.output = basename + '.cleaned' + ext
        self.basename = os.path.basename(filename)  # only filename

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        for field in self.editor:
            if self._should_remove(field):
                return False
        return True

    def remove_all(self):
        '''
        Remove all the fields that are compromising,
        then write the cleaned file and back up / replace the original.
        '''
        for field in self.editor:
            if self._should_remove(field):
                self._remove(field.name)
        hachoir_core.field.writeIntoFile(self.editor, self.output)
        self.do_backup()

    def remove_all_ugly(self):
        '''
        If the remove_all() is not efficient enough,
        this method is implemented :
        It is efficient, but destructive.
        In a perfect world, with nice fileformats,
        this method would not exist.
        '''
        self.remove_all()

    def _remove(self, field):
        '''
        Delete the field with the given name from the editor
        '''
        del self.editor[field]

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        for field in self.editor:
            if self._should_remove(field):
                try:
                    metadata[field.name] = field.value
                # was a bare "except:"; not every hachoir field has a value
                except Exception:
                    metadata[field.name] = 'harmful content'
        return metadata

    def _should_remove(self, key):
        '''
        Return True if the field is compromising.
        Abstract method: subclasses must implement it.
        '''
        raise NotImplementedError

    def do_backup(self):
        '''
        Do a backup of the file if asked,
        and change its creation/access date
        '''
        if self.backup:  # was "is True": accept any truthy value
            os.utime(self.output, (0, 0))
        else:
            mat.secure_remove(self.filename)
            os.rename(self.output, self.filename)
            os.utime(self.filename, (0, 0))
