summaryrefslogtreecommitdiff
path: root/mat
diff options
context:
space:
mode:
authorjvoisin2011-08-16 18:11:24 +0200
committerjvoisin2011-08-16 18:11:24 +0200
commit4bd3e47da02fde08acfada1795cc55170abdb00a (patch)
treef8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /mat
parentbaf8e080125614326ba9c96ca8f2404fd12b050e (diff)
setup.py now works !
Diffstat (limited to 'mat')
-rw-r--r--mat/__init__.py1
-rw-r--r--mat/archive.py289
-rw-r--r--mat/audio.py98
-rw-r--r--mat/images.py37
-rw-r--r--mat/mat.py162
-rw-r--r--mat/misc.py62
-rw-r--r--mat/office.py280
-rw-r--r--mat/parser.py104
8 files changed, 1033 insertions, 0 deletions
diff --git a/mat/__init__.py b/mat/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/mat/__init__.py
@@ -0,0 +1 @@
diff --git a/mat/archive.py b/mat/archive.py
new file mode 100644
index 0000000..77db71c
--- /dev/null
+++ b/mat/archive.py
@@ -0,0 +1,289 @@
1'''
2 Take care of archives formats
3'''
4
import logging
import os
import shutil
# BUG FIX: was "from tarfile import tarfile", which raises ImportError
# (the tarfile module has no "tarfile" attribute); the code below calls
# tarfile.open(), so import the module itself.
import tarfile
import tempfile
import zipfile

import parser
import mat
14
15
class GenericArchiveStripper(parser.GenericParser):
    '''
    Represent a generic archive
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(GenericArchiveStripper, self).__init__(filename, parser, mime,
            backup, add2archive)
        self.compression = ''
        self.add2archive = add2archive
        # scratch directory where archive members are extracted
        self.tempdir = tempfile.mkdtemp()

    def __del__(self):
        '''
        Securely wipe every file extracted under the temporary
        directory, then delete the directory tree itself.
        '''
        for dirpath, _dirnames, filenames in os.walk(self.tempdir):
            for fname in filenames:
                mat.secure_remove(os.path.join(dirpath, fname))
        shutil.rmtree(self.tempdir)

    def remove_all(self):
        '''
        Strip all metadata using the "normal" (lossless) method.
        '''
        self._remove_all('normal')

    def remove_all_ugly(self):
        '''
        Strip all metadata using the "ugly" method
        (may cause data loss).
        '''
        self._remove_all('ugly')

    def _remove_all(self, method):
        '''
        Abstract: subclasses implement the actual stripping.
        method is either "normal" or "ugly".
        '''
        raise NotImplementedError
56
57
class ZipStripper(GenericArchiveStripper):
    '''
    Represent a zip file
    '''
    def is_file_clean(self, fileinfo):
        '''
        Check if a ZipInfo object is clean of metadatas added
        by zip itself, independently of the corresponding file metadatas.
        Return True if the zipinfo carries no zip-level metadata.
        '''
        # BUG FIX: the original compared with "is not" (object identity),
        # which is not a reliable equality test for strings/ints/tuples;
        # use "!=" instead.
        if fileinfo.comment != '':
            return False
        elif fileinfo.date_time != 0:
            return False
        elif fileinfo.create_system != 0:
            return False
        elif fileinfo.create_version != 0:
            return False
        else:
            return True

    def is_clean(self):
        '''
        Check if the given file is clean from harmful metadata:
        the archive's own comment, plus the metadata of every
        supported file contained in it.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        if zipin.comment != '':
            logging.debug('%s has a comment' % self.filename)
            zipin.close()  # BUG FIX: don't leak the handle on early return
            return False
        for item in zipin.infolist():
            #I have not found a way to remove the crap added by zipfile :/
            #if not self.is_file_clean(item):
            #    logging.debug('%s from %s has compromizing zipinfo' %
            #        (item.filename, self.filename))
            #    return False
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(name):
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    if not cfile.is_clean():
                        zipin.close()  # BUG FIX: close before early return
                        return False
                except Exception:  # BUG FIX: no more bare except
                    #best solution I have found
                    logging.info('%s\'s fileformat is not supported, or is a \
harmless format' % item.filename)
                    _, ext = os.path.splitext(name)
                    bname = os.path.basename(item.filename)
                    if ext not in parser.NOMETA:
                        if bname != 'mimetype' and bname != '.rels':
                            zipin.close()
                            return False
        zipin.close()
        return True

    def get_meta(self):
        '''
        Return all the metadata of a ZipFile (don't return metadatas
        of contained files : should it ?)
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        for field in zipin.infolist():
            zipmeta = {}
            zipmeta['comment'] = field.comment
            zipmeta['modified'] = field.date_time
            zipmeta['system'] = field.create_system
            zipmeta['zip_version'] = field.create_version
            metadata[field.filename] = zipmeta
        metadata["%s comment" % self.filename] = zipin.comment
        zipin.close()
        return metadata

    def _remove_all(self, method):
        '''
        So far, the zipfile module does not allow to write a ZipInfo
        object into a zipfile (and it's a shame !) : so data added
        by zipfile itself could not be removed. It's a big concern.
        Is shiping a patched version of zipfile.py a good idea ?
        method is "normal" or "ugly".
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
        for item in zipin.infolist():
            zipin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.filename)
            if os.path.isfile(name):
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    # BUG FIX: "method is 'normal'" relied on string
                    # interning; use equality.
                    if method == 'normal':
                        cfile.remove_all()
                    else:
                        cfile.remove_all_ugly()
                    logging.debug('Processing %s from %s' % (item.filename,
                        self.filename))
                    zipout.write(name, item.filename)
                except Exception:  # BUG FIX: no more bare except
                    logging.info('%s\'s format is not supported or harmless' %
                        item.filename)
                    _, ext = os.path.splitext(name)
                    if self.add2archive or ext in parser.NOMETA:
                        zipout.write(name, item.filename)
        zipout.comment = ''
        zipin.close()
        zipout.close()
        logging.info('%s treated' % self.filename)
        self.do_backup()
164
165
class TarStripper(GenericArchiveStripper):
    '''
    Represent a tarfile archive
    '''
    def _remove(self, current_file):
        '''
        Remove the meta added by tar itself to the file
        (used as a tarfile.add() filter).
        '''
        current_file.mtime = 0
        current_file.uid = 0
        current_file.gid = 0
        current_file.uname = ''
        current_file.gname = ''
        return current_file

    def _remove_all(self, method):
        '''
        Extract every member, clean the supported ones, and re-add
        them to the output archive through the _remove() filter.
        method is "normal" or "ugly".
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        tarout = tarfile.open(self.output, 'w' + self.compression)
        for item in tarin.getmembers():
            tarin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.name)
            # BUG FIX: compare with "==", not "is" (object identity)
            if item.type == '0':  # is item a regular file ?
                #no backup file
                try:
                    cfile = mat.create_class_file(name, False,
                        self.add2archive)
                    if method == 'normal':  # BUG FIX: "is" -> "=="
                        cfile.remove_all()
                    else:
                        cfile.remove_all_ugly()
                    tarout.add(name, item.name, filter=self._remove)
                except Exception:  # BUG FIX: no more bare except
                    logging.info('%s\' format is not supported or harmless' %
                        item.name)
                    _, ext = os.path.splitext(name)
                    if self.add2archive or ext in parser.NOMETA:
                        tarout.add(name, item.name, filter=self._remove)
        tarin.close()
        tarout.close()
        self.do_backup()

    def is_file_clean(self, current_file):
        '''
        Check metadatas added by tar itself on the given TarInfo.
        '''
        # BUG FIX: the original used "is not" (identity) instead of "!=".
        if current_file.mtime != 0:
            return False
        elif current_file.uid != 0:
            return False
        elif current_file.gid != 0:
            return False
        elif current_file.uname != '':
            return False
        elif current_file.gname != '':
            return False
        else:
            return True

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        for item in tarin.getmembers():
            if not self.is_file_clean(item):
                tarin.close()
                return False
            tarin.extract(item, self.tempdir)
            name = os.path.join(self.tempdir, item.name)
            if item.type == '0':  # BUG FIX: "is" -> "==" ; regular file ?
                try:
                    class_file = mat.create_class_file(name,
                        False, self.add2archive)  # no backup file
                    if not class_file.is_clean():
                        tarin.close()
                        return False
                except Exception:
                    # BUG FIX: TarInfo has no "filename" attribute (the
                    # original raised AttributeError here); also fixed the
                    # "foramt" typo.
                    logging.error('%s\'s format is not supported or harmless' %
                        item.name)
                    _, ext = os.path.splitext(name)
                    if ext not in parser.NOMETA:
                        tarin.close()
                        return False
        tarin.close()
        return True

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        tarin = tarfile.open(self.filename, 'r' + self.compression)
        metadata = {}
        for current_file in tarin.getmembers():
            if current_file.type == '0':  # BUG FIX: "is" -> "=="
                if not self.is_file_clean(current_file):  # if there is meta
                    current_meta = {}
                    current_meta['mtime'] = current_file.mtime
                    current_meta['uid'] = current_file.uid
                    current_meta['gid'] = current_file.gid
                    current_meta['uname'] = current_file.uname
                    current_meta['gname'] = current_file.gname
                    metadata[current_file.name] = current_meta
        tarin.close()
        return metadata
270
271
class GzipStripper(TarStripper):
    '''
    Represent a tar.gz archive
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(GzipStripper, self).__init__(
            filename, parser, mime, backup, add2archive)
        self.compression = ':gz'  # tarfile open-mode suffix (r:gz / w:gz)
280
281
class Bzip2Stripper(TarStripper):
    '''
    Represents a tar.bz2 archive
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(Bzip2Stripper, self).__init__(
            filename, parser, mime, backup, add2archive)
        self.compression = ':bz2'  # tarfile open-mode suffix (r:bz2 / w:bz2)
diff --git a/mat/audio.py b/mat/audio.py
new file mode 100644
index 0000000..21a94be
--- /dev/null
+++ b/mat/audio.py
@@ -0,0 +1,98 @@
1'''
2 Care about audio fileformat
3'''
4try:
5 from mutagen.flac import FLAC
6 from mutagen.oggvorbis import OggVorbis
7except ImportError:
8 pass
9
10
11import parser
12import shutil
13
14
class MpegAudioStripper(parser.GenericParser):
    '''
    Represent mpeg audio file (mp3, ...)
    '''
    def _should_remove(self, field):
        '''
        Return True if the given hachoir field is compromising:
        the id3v1/id3v2 tag blocks are the ones carrying metadata.
        '''
        return field.name in ("id3v1", "id3v2")
24
25
class OggStripper(parser.GenericParser):
    '''
    Represent an ogg vorbis file
    '''
    def remove_all(self):
        '''
        Drop the whole vorbis-comment block. When a backup is
        requested, the original is kept and the cleaned copy is
        written to self.output.
        '''
        if self.backup is True:
            shutil.copy2(self.filename, self.output)
            self.filename = self.output

        mfile = OggVorbis(self.filename)
        mfile.delete()
        mfile.save()

    def is_clean(self):
        '''
        Check if the "metadata" block is present in the file
        '''
        return OggVorbis(self.filename).tags == []

    def get_meta(self):
        '''
        Return the content of the metadata block if present
        '''
        metadata = {}
        for name, content in OggVorbis(self.filename).tags:
            metadata[name] = content
        return metadata
58
59
class FlacStripper(parser.GenericParser):
    '''
    Represent a Flac audio file
    '''
    def remove_all(self):
        '''
        Strip the vorbis-comment block and every embedded picture.
        When a backup is requested, the original is kept and the
        cleaned copy is written to self.output.
        '''
        if self.backup is True:
            shutil.copy2(self.filename, self.output)
            self.filename = self.output

        mfile = FLAC(self.filename)
        mfile.delete()
        mfile.clear_pictures()
        mfile.save()

    def is_clean(self):
        '''
        Check if the "metadata" block is present in the file
        '''
        mfile = FLAC(self.filename)
        return mfile.tags is None and mfile.pictures == []

    def get_meta(self):
        '''
        Return the content of the metadata block if present
        '''
        metadata = {}
        mfile = FLAC(self.filename)
        if mfile.tags is not None:
            if mfile.pictures != []:
                metadata['picture :'] = 'yes'
            for name, content in mfile.tags:
                metadata[name] = content
        return metadata
diff --git a/mat/images.py b/mat/images.py
new file mode 100644
index 0000000..d090015
--- /dev/null
+++ b/mat/images.py
@@ -0,0 +1,37 @@
1'''
2 Takes care about pictures formats
3'''
4
5import parser
6
7
class JpegStripper(parser.GenericParser):
    '''
    represents a jpeg file
    '''
    def _should_remove(self, field):
        '''
        Return True if the field is compromizing: any comment
        block, or the photoshop/exif/adobe segments.
        '''
        if field.name.startswith('comment'):
            return True
        return field.name in ("photoshop", "exif", "adobe")
22
23
class PngStripper(parser.GenericParser):
    '''
    represents a png file
    '''
    def _should_remove(self, field):
        '''
        Return True if the field is compromizing: tEXt chunks
        (named "text[...]" by hachoir) and the time chunk.
        '''
        if field.name.startswith("text["):
            return True
        # BUG FIX: the original used "is" (object identity), which is not
        # guaranteed to be True even when the strings are equal.
        elif field.name == "time":
            return True
        else:
            return False
diff --git a/mat/mat.py b/mat/mat.py
new file mode 100644
index 0000000..fd13287
--- /dev/null
+++ b/mat/mat.py
@@ -0,0 +1,162 @@
1#!/usr/bin/env python
2
3'''
4 Metadata anonymisation toolkit library
5'''
6
7import os
8import subprocess
9import logging
10import mimetypes
11import xml.sax
12
13import hachoir_core.cmd_line
14import hachoir_parser
15
16import images
17import audio
18import office
19import archive
20import misc
21
22__version__ = '0.1'
23__author__ = 'jvoisin'
24
25LOGGING_LEVEL = logging.DEBUG
26
27logging.basicConfig(level=LOGGING_LEVEL)
28
# Map of mime type -> stripper class; extended below when the optional
# pdf (poppler + cairo) and audio (mutagen) dependencies are importable.
STRIPPERS = {
    'application/x-tar': archive.TarStripper,
    'application/x-gzip': archive.GzipStripper,
    'application/x-bzip2': archive.Bzip2Stripper,
    'application/zip': archive.ZipStripper,
    'audio/mpeg': audio.MpegAudioStripper,
    'image/jpeg': images.JpegStripper,
    'image/png': images.PngStripper,
    'application/x-bittorrent': misc.TorrentStripper,
    'application/opendocument': office.OpenDocumentStripper,
    'application/officeopenxml': office.OpenXmlStripper,
}

# pdf support needs python-poppler (reading/rendering) and python-cairo
# (re-writing the rendered pages)
try:
    import poppler
    import cairo
    STRIPPERS['application/x-pdf'] = office.PdfStripper
    STRIPPERS['application/pdf'] = office.PdfStripper
except ImportError:
    print('Unable to import python-poppler and/or python-cairo: no pdf \
 support')

# flac/ogg vorbis support needs python-mutagen
try:
    import mutagen
    STRIPPERS['audio/x-flac'] = audio.FlacStripper
    STRIPPERS['audio/vorbis'] = audio.OggStripper
except ImportError:
    print('unable to import python-mutagen : limited audio format support')
57
58
class XMLParser(xml.sax.handler.ContentHandler):
    '''
    Parse the supported format xml, and return a corresponding
    list of dict
    '''
    def __init__(self):
        # current section being built: element name -> text content
        self.dict = {}
        # one dict appended per completed <format> section
        self.list = []
        self.content, self.key = '', ''
        # True while inside an element (characters() only accumulates then)
        self.between = False

    def startElement(self, name, attrs):
        '''
        Called when entering into xml balise
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        '''
        Called when exiting a xml balise
        '''
        if name == 'format':  # exiting a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        else:
            # NOTE(review): '\s' here is the literal two-character sequence
            # backslash-s (not a regex class) — presumably present in the
            # source xml files; confirm against them.
            content = self.content.replace('\s', ' ')
            self.dict[self.key] = content
        self.between = False

    def characters(self, characters):
        '''
        Concatenate the content between opening and closing balises
        '''
        if self.between is True:
            self.content += characters
96
97
def secure_remove(filename):
    '''
    Securely remove the file: try to wipe it with shred(1) first;
    if shred is missing or failed, fall back to a plain os.remove().
    '''
    removed = False
    try:
        # BUG FIX: the original ran "shred --remove %s" with shell=True on
        # the raw filename (shell-injection prone) and ignored shred's exit
        # status, so a failed shred was treated as a success and the file
        # was never removed. Use an argv list and check the return code.
        if subprocess.call(['shred', '--remove', filename]) == 0:
            removed = True
    except OSError:  # shred binary not available
        logging.error('Unable to securely remove %s' % filename)

    if removed is False:
        try:
            os.remove(filename)
        except OSError:
            logging.error('Unable to remove %s' % filename)
114
115
def is_secure(filename):
    '''
    Sanity check before processing: the target must be an
    existing regular file. Return True when it is, False (and
    log an error) otherwise.
    '''
    if os.path.isfile(filename):
        return True
    logging.error('%s is not a valid file' % filename)
    return False
125
126
def create_class_file(name, backup, add2archive):
    '''
    Return a $FILETYPEStripper() instance corresponding to the
    filetype of the given file, or None when the file is missing,
    unparseable or of an unsupported format.
    '''
    if not is_secure(name):
        return

    try:
        filename = hachoir_core.cmd_line.unicodeFilename(name)
    except TypeError:  # get rid of "decoding Unicode is not supported"
        filename = name

    parser = hachoir_parser.createParser(filename)
    if not parser:
        logging.info('Unable to parse %s' % filename)
        return

    mime = parser.mime_type

    if mime == 'application/zip':  # some formats are zipped stuff
        mime = mimetypes.guess_type(name)[0]
        # BUG FIX: guess_type() returns None for an unknown extension;
        # the original then crashed on mime.startswith().
        if mime is None:
            logging.info('Unable to guess the mimetype of %s' % name)
            return

    if mime.startswith('application/vnd.oasis.opendocument'):
        mime = 'application/opendocument'  # opendocument fileformat
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/officeopenxml'  # office openxml

    try:
        stripper_class = STRIPPERS[mime]
    except KeyError:
        logging.info('Don\'t have stripper for %s format' % mime)
        return

    return stripper_class(filename, parser, mime, backup, add2archive)
diff --git a/mat/misc.py b/mat/misc.py
new file mode 100644
index 0000000..f7b256f
--- /dev/null
+++ b/mat/misc.py
@@ -0,0 +1,62 @@
1'''
2 Care about misc formats
3'''
4
import parser

# BUG FIX: was "from bencode import bencode", which binds the bencode()
# *function*; the code below calls bencode.bdecode()/bencode.bencode()
# on the *module*, so import the module itself.
import bencode
8
9
class TorrentStripper(parser.GenericParser):
    '''
    Represent a torrent file with the help
    of the bencode lib from Petru Paler
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(TorrentStripper, self).__init__(filename, parser, mime,
            backup, add2archive)
        # top-level bencoded keys that carry metadata
        self.fields = ['comment', 'creation date', 'created by']

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas:
        none of self.fields may be present with a non-empty value.
        '''
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        for key in self.fields:
            # BUG FIX: narrowed the bare "except:" — a missing key is the
            # only expected failure here.
            try:
                if decoded[key] != '':
                    return False
            except KeyError:
                pass
        return True

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        for key in self.fields:
            try:
                if decoded[key] != '':
                    metadata[key] = decoded[key]
            except KeyError:
                pass
        return metadata

    def remove_all(self):
        '''
        Remove all the fields that are compromizing
        '''
        with open(self.filename, 'r') as f:
            decoded = bencode.bdecode(f.read())
        for key in self.fields:
            # BUG FIX: the original did "decoded[key] = ''", which left the
            # metadata keys in place (and even *added* empty ones when
            # absent); drop the keys entirely instead.
            try:
                del decoded[key]
            except KeyError:
                pass
        with open(self.output, 'w') as f:  # encode the decoded torrent
            f.write(bencode.bencode(decoded))  # and write it in self.output
        self.do_backup()
diff --git a/mat/office.py b/mat/office.py
new file mode 100644
index 0000000..cb9c609
--- /dev/null
+++ b/mat/office.py
@@ -0,0 +1,280 @@
1'''
2 Care about office's formats
3'''
4
5import os
6import logging
7import zipfile
8import fileinput
9
10try:
11 import cairo
12 import poppler
13except ImportError:
14 pass
15
16import mat
17import parser
18import archive
19import pdfrw
20
21
22class OpenDocumentStripper(archive.GenericArchiveStripper):
23 '''
24 An open document file is a zip, with xml file into.
25 The one that interest us is meta.xml
26 '''
27
28 def get_meta(self):
29 '''
30 Return a dict with all the meta of the file by
31 trying to read the meta.xml file.
32 '''
33 zipin = zipfile.ZipFile(self.filename, 'r')
34 metadata = {}
35 try:
36 content = zipin.read('meta.xml')
37 zipin.close()
38 metadata[self.filename] = 'harful meta'
39 except KeyError: # no meta.xml file found
40 logging.debug('%s has no opendocument metadata' % self.filename)
41 return metadata
42
43 def _remove_all(self, method):
44 '''
45 FIXME ?
46 There is a patch implementing the Zipfile.remove()
47 method here : http://bugs.python.org/issue6818
48 '''
49 zipin = zipfile.ZipFile(self.filename, 'r')
50 zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
51
52 for item in zipin.namelist():
53 name = os.path.join(self.tempdir, item)
54 _, ext = os.path.splitext(name)
55
56 if item.endswith('manifest.xml'):
57 # contain the list of all files present in the archive
58 zipin.extract(item, self.tempdir)
59 for line in fileinput.input(name, inplace=1):
60 #remove the line which contains "meta.xml"
61 line = line.strip()
62 if not 'meta.xml' in line:
63 print line
64 zipout.write(name, item)
65
66 elif ext in parser.NOMETA or item == 'mimetype':
67 #keep NOMETA files, and the "manifest" file
68 if item != 'meta.xml': # contains the metadata
69 zipin.extract(item, self.tempdir)
70 zipout.write(name, item)
71
72 else:
73 zipin.extract(item, self.tempdir)
74 if os.path.isfile(name):
75 try:
76 cfile = mat.create_class_file(name, False,
77 self.add2archive)
78 if method == 'normal':
79 cfile.remove_all()
80 else:
81 cfile.remove_all_ugly()
82 logging.debug('Processing %s from %s' % (item,
83 self.filename))
84 zipout.write(name, item)
85 except:
86 logging.info('%s\' fileformat is not supported' % item)
87 if self.add2archive:
88 zipout.write(name, item)
89 zipout.comment = ''
90 logging.info('%s treated' % self.filename)
91 zipin.close()
92 zipout.close()
93 self.do_backup()
94
95 def is_clean(self):
96 '''
97 Check if the file is clean from harmful metadatas
98 '''
99 zipin = zipfile.ZipFile(self.filename, 'r')
100 try:
101 zipin.getinfo('meta.xml')
102 except KeyError: # no meta.xml in the file
103 czf = archive.ZipStripper(self.filename, self.parser,
104 'application/zip', self.backup, self.add2archive)
105 if czf.is_clean():
106 zipin.close()
107 return True
108 zipin.close()
109 return False
110
111
class PdfStripper(parser.GenericParser):
    '''
    Represent a pdf file
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        super(PdfStripper, self).__init__(filename, parser, mime, backup,
            add2archive)
        uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.document = poppler.document_new_from_file(uri, self.password)
        # poppler document properties that may carry metadata
        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
            'producer', 'creation-date', 'mod-date', 'metadata')

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        for key in self.meta_list:
            if key == 'creation-date' or key == 'mod-date':
                # unset dates are reported as -1 by poppler
                if self.document.get_property(key) != -1:
                    return False
            elif self.document.get_property(key) is not None and \
                self.document.get_property(key) != '':
                return False
        return True

    def remove_all(self):
        '''
        Opening the pdf with poppler, then doing a render
        on a cairo pdfsurface for each pages.
        Thanks to Lunar^ for the idea.
        http://cairographics.org/documentation/pycairo/2/
        python-poppler is not documented at all : have fun ;)
        '''
        page = self.document.get_page(0)
        page_width, page_height = page.get_size()
        surface = cairo.PDFSurface(self.output, page_width, page_height)
        context = cairo.Context(surface)  # context draws on the surface
        logging.debug('Pdf rendering of %s' % self.filename)
        for pagenum in xrange(self.document.get_n_pages()):
            page = self.document.get_page(pagenum)
            context.translate(0, 0)
            page.render(context)  # render the page on context
            context.show_page()  # draw context on surface
        surface.finish()

        #For now, poppler cannot write meta, so we must use pdfrw
        logging.debug('Removing %s\'s superficial metadata' % self.filename)
        trailer = pdfrw.PdfReader(self.output)
        trailer.Info.Producer = trailer.Info.Creator = None
        writer = pdfrw.PdfWriter()
        writer.trailer = trailer
        writer.write(self.output)
        self.do_backup()

    def remove_all_ugly(self):
        '''
        For pdf the "ugly" method is the same as the normal one.
        BUG FIX (maintainability): the original duplicated the whole
        body of remove_all() here; delegate instead.
        '''
        self.remove_all()

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        for key in self.meta_list:
            if key == 'creation-date' or key == 'mod-date':
                #creation and modification are set to -1
                if self.document.get_property(key) != -1:
                    metadata[key] = self.document.get_property(key)
            elif self.document.get_property(key) is not None and \
                self.document.get_property(key) != '':
                metadata[key] = self.document.get_property(key)
        return metadata
204
205
class OpenXmlStripper(archive.GenericArchiveStripper):
    '''
    Represent an office openxml document, which is like
    an opendocument format, with some tricky stuff added.
    It contains mostly xml, but can have media blobs, crap, ...
    (I don't like this format.)
    '''
    def _remove_all(self, method):
        '''
        FIXME ?
        There is a patch implementing the Zipfile.remove()
        method here : http://bugs.python.org/issue6818

        Rebuild the archive without the docProps/ metadata
        members; method is "normal" or "ugly".
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        zipout = zipfile.ZipFile(self.output, 'w',
            allowZip64=True)
        for item in zipin.namelist():
            name = os.path.join(self.tempdir, item)
            _, ext = os.path.splitext(name)
            if item.startswith('docProps/'):  # metadatas
                pass
            elif ext in parser.NOMETA or item == '.rels':
                #keep parser.NOMETA files, and the file named ".rels"
                zipin.extract(item, self.tempdir)
                zipout.write(name, item)
            else:
                zipin.extract(item, self.tempdir)
                if os.path.isfile(name):  # don't care about folders
                    try:
                        cfile = mat.create_class_file(name, False,
                            self.add2archive)
                        if method == 'normal':
                            cfile.remove_all()
                        else:
                            cfile.remove_all_ugly()
                        logging.debug('Processing %s from %s' % (item,
                            self.filename))
                        zipout.write(name, item)
                    except Exception:  # BUG FIX: no more bare except
                        logging.info('%s\' fileformat is not supported' % item)
                        if self.add2archive:
                            zipout.write(name, item)
        zipout.comment = ''
        logging.info('%s treated' % self.filename)
        zipin.close()
        zipout.close()
        self.do_backup()

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas:
        no docProps/ member, and clean as a generic zip.
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        for item in zipin.namelist():
            if item.startswith('docProps/'):
                zipin.close()  # BUG FIX: don't leak the handle on early return
                return False
        zipin.close()
        czf = archive.ZipStripper(self.filename, self.parser,
            'application/zip', self.backup, self.add2archive)
        return czf.is_clean()

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        zipin = zipfile.ZipFile(self.filename, 'r')
        metadata = {}
        for item in zipin.namelist():
            if item.startswith('docProps/'):
                metadata[item] = 'harmful content'
        zipin.close()
        return metadata
diff --git a/mat/parser.py b/mat/parser.py
new file mode 100644
index 0000000..58dd7fa
--- /dev/null
+++ b/mat/parser.py
@@ -0,0 +1,104 @@
1'''
2 Parent class of all parser
3'''
4
5import hachoir_core
6import hachoir_editor
7
8import os
9
10import mat
11
# File extensions that cannot carry metadata: files with these extensions
# are considered harmless and kept as-is when cleaning archives.
NOMETA = ('.bmp', '.rdf', '.txt', '.xml', '.rels')
#bmp : image
#rdf : text
#txt : plain text
#xml : formatted text
#rels : openxml formatted text
19
class GenericParser(object):
    '''
    Parent class of all parsers
    '''
    def __init__(self, filename, parser, mime, backup, add2archive):
        self.filename = ''
        self.parser = parser
        self.mime = mime
        self.backup = backup
        self.editor = hachoir_editor.createEditor(parser)
        self.realname = filename
        try:
            # NOTE(review): assumes "import hachoir_core" exposes the
            # cmd_line submodule — confirm an explicit import isn't needed
            self.filename = hachoir_core.cmd_line.unicodeFilename(filename)
        except TypeError:  # get rid of "decoding Unicode is not supported"
            self.filename = filename
        root, extension = os.path.splitext(filename)
        # cleaned copy goes next to the original, e.g. foo.cleaned.jpg
        self.output = root + '.cleaned' + extension
        self.basename = os.path.basename(filename)  # only filename

    def is_clean(self):
        '''
        Check if the file is clean from harmful metadatas
        '''
        return all(not self._should_remove(field) for field in self.editor)

    def remove_all(self):
        '''
        Remove all the compromizing fields, then write the
        cleaned file to self.output.
        '''
        for current in self.editor:
            if self._should_remove(current):
                self._remove(current.name)
        hachoir_core.field.writeIntoFile(self.editor, self.output)
        self.do_backup()

    def remove_all_ugly(self):
        '''
        If the remove_all() is not efficient enough,
        this method is implemented :
        It is efficient, but destructive.
        In a perfect world, with nice fileformat,
        this method would not exist.
        '''
        self.remove_all()

    def _remove(self, field):
        '''
        Delete the given field
        '''
        del self.editor[field]

    def get_meta(self):
        '''
        Return a dict with all the meta of the file
        '''
        metadata = {}
        for field in self.editor:
            if not self._should_remove(field):
                continue
            try:
                metadata[field.name] = field.value
            except:
                metadata[field.name] = 'harmful content'
        return metadata

    def _should_remove(self, key):
        '''
        return True if the field is compromizing
        abstract method
        '''
        raise NotImplementedError

    def do_backup(self):
        '''
        Do a backup of the file if asked,
        and change his creation/access date
        '''
        if self.backup is True:
            os.utime(self.output, (0, 0))
            return
        mat.secure_remove(self.filename)
        os.rename(self.output, self.filename)
        os.utime(self.filename, (0, 0))