summaryrefslogtreecommitdiff
path: root/lib/mat.py
blob: fd132873a8e1d8691bdec9821d49f85eb6b31a92 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python

'''
    Metadata anonymisation toolkit library
'''

import os
import subprocess
import logging
import mimetypes
import xml.sax

import hachoir_core.cmd_line
import hachoir_parser

import images
import audio
import office
import archive
import misc

__version__ = '0.1'
__author__ = 'jvoisin'

# Level handed to logging.basicConfig() when this module is imported
LOGGING_LEVEL = logging.DEBUG

logging.basicConfig(level=LOGGING_LEVEL)

# Registry: mimetype string -> stripper class registered for that mimetype
STRIPPERS = {
    'application/x-tar': archive.TarStripper,
    'application/x-gzip': archive.GzipStripper,
    'application/x-bzip2': archive.Bzip2Stripper,
    'application/zip': archive.ZipStripper,
    'audio/mpeg': audio.MpegAudioStripper,
    'image/jpeg': images.JpegStripper,
    'image/png': images.PngStripper,
    'application/x-bittorrent': misc.TorrentStripper,
    'application/opendocument': office.OpenDocumentStripper,
    'application/officeopenxml': office.OpenXmlStripper,
}

# The pdf strippers are only registered when both poppler and cairo
# can be imported
try:
    import poppler
    import cairo
    STRIPPERS['application/x-pdf'] = office.PdfStripper
    STRIPPERS['application/pdf'] = office.PdfStripper
except ImportError:
    # Adjacent literals instead of a backslash continuation inside the
    # string, which leaked the source indentation into the message
    print('Unable to import python-poppler and/or python-cairo: '
          'no pdf support')

# The flac and vorbis strippers are only registered when mutagen
# can be imported
try:
    import mutagen
    STRIPPERS['audio/x-flac'] = audio.FlacStripper
    STRIPPERS['audio/vorbis'] = audio.OggStripper
except ImportError:
    print('unable to import python-mutagen : limited audio format support')

class XMLParser(xml.sax.handler.ContentHandler):
    '''
        Parse the supported format xml, and return a corresponding
        list of dict

        Every element closed inside a <format> section is stored as
        a "tag name -> text content" entry; when the <format> section
        is closed, the collected entries are appended to self.list
        as one dict.
    '''
    def __init__(self):
        # Explicit base-class call (instead of super()), so that it also
        # works where ContentHandler is an old-style class
        xml.sax.handler.ContentHandler.__init__(self)
        self.dict = {}  # entries of the format section being parsed
        self.list = []  # one dict per completed format section
        self.content, self.key = '', ''  # text and name of current tag
        self.between = False  # True while inside an opened tag

    def startElement(self, name, attrs):
        '''
            Called when entering an xml tag: remember its name and
            start collecting its text from scratch
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        '''
            Called when exiting an xml tag
        '''
        if name == 'format':  # exiting a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        else:
            # a literal backslash followed by "s" stands for a space
            content = self.content.replace('\\s', ' ')
            self.dict[self.key] = content
        # whatever tag was closed, we are no longer inside it
        self.between = False

    def characters(self, characters):
        '''
            Concatenate the content between opening and closing tags
            (the parser may deliver the text in several chunks)
        '''
        if self.between:
            self.content += characters


def secure_remove(filename):
    '''
        Securely remove the file

        Try to overwrite and delete the file with the external
        "shred" tool; if shred can not be run or reports a failure,
        fall back to a plain os.remove(). Failures are logged, never
        raised. Return None.
    '''
    try:
        # Pass an argument list without a shell, so that spaces or shell
        # metacharacters in the filename can neither break nor inject
        # commands; "--" stops a leading dash being read as an option.
        shredded = subprocess.call(
            ['shred', '--remove', '--', filename]) == 0
    except OSError:  # shred is not installed or not executable
        shredded = False

    if shredded:
        return

    # call() does not raise on a non-zero exit status, so the status
    # has to be checked to know that the fallback is needed
    logging.error('Unable to securely remove %s' % filename)
    try:
        os.remove(filename)
    except OSError:
        logging.error('Unable to remove %s' % filename)


def is_secure(filename):
    '''
        Tell whether the given path can be processed

        Return True when filename is an existing regular file;
        otherwise log an error and return False.
    '''
    if os.path.isfile(filename):
        return True

    # missing path, directory, or anything that is not a regular file
    logging.error('%s is not a valid file' % filename)
    return False


def create_class_file(name, backup, add2archive):
    '''
        return a $FILETYPEStripper() class,
        corresponding to the filetype of the given file

        name is the path of the file; backup and add2archive are
        passed untouched to the stripper constructor.

        Return None when name is not an existing file, when hachoir
        can not create a parser for it, or when no stripper is
        registered for its mimetype.
    '''
    if not is_secure(name):
        return

    try:
        filename = hachoir_core.cmd_line.unicodeFilename(name)
    except TypeError:  # get rid of "decoding Unicode is not supported"
        filename = name

    parser = hachoir_parser.createParser(filename)
    if not parser:
        logging.info('Unable to parse %s' % filename)
        return

    mime = parser.mime_type

    if mime == 'application/zip':  # some formats are zipped stuff
        # Refine the type from the file extension. guess_type() gives
        # None for an unknown or missing extension: keep plain zip then,
        # instead of crashing on None.startswith() below.
        mime = mimetypes.guess_type(name)[0] or mime

    if mime.startswith('application/vnd.oasis.opendocument'):
        mime = 'application/opendocument'  # opendocument fileformat
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/officeopenxml'  # office openxml

    try:
        stripper_class = STRIPPERS[mime]
    except KeyError:
        logging.info('Don\'t have stripper for %s format' % mime)
        return

    return stripper_class(filename, parser, mime, backup, add2archive)