summaryrefslogtreecommitdiff
path: root/mat.py
blob: 200fc0409cfd42b2fa9387daac161b25be298d89 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/python

'''
    Metadata anonymisation toolkit library
'''

import hachoir_core.error
import hachoir_core.field
import hachoir_core.cmd_line
import hachoir_parser
import hachoir_metadata
import hachoir_editor

import sys
import os
import hachoir_parser.image

__version__ = "0.1"
__author__ = "jvoisin"

POSTFIX = ".cleaned"

class file():
    def __init__(self, realname, filename, parser, editor):
        self.meta = {}
        self.filename = filename
        self.realname = realname
        self.parser = parser
        self.editor = editor
        self.meta = self.__fill_meta()

    def __fill_meta(self):
        metadata = {}
        try:
            meta = hachoir_metadata.extractMetadata(self.parser)
        except hachoir_core.error.HachoirError, err:
            print("Metadata extraction error: %s" % err)

        if not meta:
            print("Unable to extract metadata from the file %s" % self.filename)
            sys.exit(1)

        for title in meta:
            #fixme i'm so dirty
            if title.values != []: #if the field is not empty
                value = ""
                for item in title.values:
                    value = item.text
                metadata[title.key] = value
        return metadata

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadatas
        '''
        for field in self.editor:
            if self._should_remove(field):
                return False
        return True

    def remove_all(self):
        '''
            Remove all the files that are compromizing
        '''
        for field in self.editor:
            if self._should_remove(field):
                self._remove(field)
        hachoir_core.field.writeIntoFile(self.editor, self.filename + POSTFIX)

    def _remove(self, field):
        '''
            Remove the given field
        '''
        del self.editor[field.name]


    def get_meta(self):
        '''
            return a dict with all the meta of the file
        '''
        #am I useless ?
        return self.meta

    def _should_remove(self, key):
        '''
            return True if the field is compromizing
            abstract method
        '''
        raise NotImplementedError()

class JpegStripper(file):
    def _should_remove(self, field):
        if field.name.startswith('comment'):
            return True
        elif field.name in ("photoshop", "exif", "adobe"):
            return True
        else:
            return False

class PngStripper(file):
    def _should_remove(self, field):
        if field.name in ('comment'):
            return True
        else:
            return False

strippers = {
    hachoir_parser.image.JpegFile: JpegStripper,
    hachoir_parser.image.PngFile: PngStripper,
}

def create_class_file(name):
    '''
        return a $FILETYPEStripper() class,
        corresponding to the filetype of the given file
    '''
    if not(os.path.isfile(name)): #check if the file exist
        print("Error: %s is not a valid file" % name)
        sys.exit(1)

    filename = ""
    realname = name
    filename = hachoir_core.cmd_line.unicodeFilename(name)
    parser = hachoir_parser.createParser(filename)
    if not parser:
        print("Unable to parse the file %s : sorry" % filename)
        sys.exit(1)

    editor = hachoir_editor.createEditor(parser)
    try:
        '''this part is a little tricky :
        stripper_class will receice the name of the class $FILETYPEStripper,
        (which herits from the "file" class), based on the editor
        of given file (name)
        '''
        stripper_class = strippers[editor.input.__class__]
    except KeyError:
        #Place for another lib than hachoir
        print("Don't have stripper for file type: %s" % editor.description)
        sys.exit(1)
    return stripper_class(realname, filename, parser, editor)