summaryrefslogtreecommitdiff
path: root/lib/mat.py
blob: dfcfc57844a1bf1604b3b2e75357581f82d6b7c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python

'''
    Metadata anonymisation toolkit library
'''

import os
import subprocess
import logging
import mimetypes
import xml.sax

import hachoir_core.cmd_line
import hachoir_parser

import strippers

__version__ = '0.2.2'
__author__ = 'jvoisin'

# Quiet settings: log only critical messages and set hachoir's quiet flag.
# NOTE(review): hachoir_core.config is never imported explicitly in this
# file; presumably it is loaded as a side effect of
# "import hachoir_core.cmd_line" -- confirm.
LOGGING_LEVEL = logging.CRITICAL
hachoir_core.config.quiet = True
fname = ''  # log file name handed to logging.basicConfig() below

# Verbose settings.
# NOTE(review): the assignment below is active, so it overrides the CRITICAL
# level chosen above and the module ends up logging at DEBUG, while the two
# other verbose lines stay commented out -- confirm this is intended.
LOGGING_LEVEL = logging.DEBUG
#hachoir_core.config.quiet = False
#logname = 'report.log'

# Configure the root logger at import time with the values selected above.
logging.basicConfig(filename=fname, level=LOGGING_LEVEL)


def get_sharedir():
    '''
        Locate the directory that holds the "FORMATS" file.

        Candidates are probed in this order: the current working
        directory, /usr/local/share/mat/, then /usr/share/mat/.

        Return '' when "FORMATS" is in the current directory, the first
        existing system directory otherwise, or None when no candidate
        is found. A returned directory always ends with a slash, so a
        file name can be appended to it either by plain concatenation
        or with os.path.join().
    '''
    if os.path.isfile('FORMATS'):
        return ''

    for sharedir in ('/usr/local/share/mat/', '/usr/share/mat/'):
        if os.path.exists(sharedir):
            return sharedir

    # Nothing was found: callers have always received None in that case.
    return None


class XMLParser(xml.sax.handler.ContentHandler):
    '''
        SAX handler for the supported-formats xml file.

        Each <format> element is turned into a dict that maps the name
        of every tag closed inside it to the text that tag contained.
        The dicts are collected, in document order, in self.list.
    '''
    def __init__(self):
        # Initialise the base handler state as well. The explicit call
        # (instead of super()) also works with Python 2.
        xml.sax.handler.ContentHandler.__init__(self)
        self.dict = {}  # fields of the <format> section being read
        self.list = []  # one dict per completed <format> section
        self.content, self.key = '', ''
        self.between = False  # True while inside an opened tag

    def startElement(self, name, attrs):
        '''
            Called when entering an xml tag: remember its name and
            start collecting its text from scratch
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        '''
            Called when exiting an xml tag: store the collected text,
            or flush the current dict when a <format> section ends
        '''
        if name == 'format':  # exiting a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        else:
            # The literal two-character marker "\s" stands for a space
            # in the file. A raw string keeps that exact value without
            # relying on an invalid escape sequence.
            content = self.content.replace(r'\s', ' ')
            self.dict[self.key] = content
            self.between = False

    def characters(self, characters):
        '''
            Concatenate the text found between the opening and the
            closing tag (the parser may deliver it in several chunks)
        '''
        if self.between:
            self.content += characters


def secure_remove(filename):
    '''
        Securely remove the file.

        The file is first handed to the external "shred" tool
        (shred --remove). When shred cannot be started, or exits with a
        non-zero status, fall back to a plain os.remove().

        Removal is best effort: failures are logged, never raised.
    '''
    shredded = False
    try:
        # "--" ends option parsing, so a file name starting with "-"
        # cannot be mistaken for a shred option.
        status = subprocess.call(['shred', '--remove', '--', filename])
        # subprocess.call() does not raise when the tool fails; the exit
        # status has to be checked explicitly.
        shredded = (status == 0)
    except Exception:
        # e.g. shred is not installed. Exception (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        shredded = False

    if shredded:
        return

    logging.error('Unable to securely remove %s', filename)
    try:
        os.remove(filename)
    except Exception:
        logging.error('Unable to remove %s', filename)


def create_class_file(name, backup, add2archive):
    '''
        Return a $FILETYPEStripper() object corresponding to the
        filetype of the given file, or None when it cannot be handled.

        name: path of the file to process
        backup, add2archive: passed unchanged to the stripper
        constructor

        None is returned (and a message is logged) when the path is not
        an existing regular file, is not readable or not writable,
        cannot be parsed by hachoir, or when strippers.STRIPPERS has no
        entry for its mime type.
    '''
    if not os.path.isfile(name):
        # check if the file exists
        logging.error('%s is not a valid file', name)
        return None

    if not os.access(name, os.R_OK):
        # check read permissions
        logging.error('%s is is not readable', name)
        return None

    if not os.access(name, os.W_OK):
        # check write permission
        logging.error('%s is not writtable', name)
        return None

    try:
        filename = hachoir_core.cmd_line.unicodeFilename(name)
    except TypeError:  # get rid of "decoding Unicode is not supported"
        filename = name

    parser = hachoir_parser.createParser(filename)
    if not parser:
        logging.info('Unable to parse %s', filename)
        return None

    mime = parser.mime_type

    if mime == 'application/zip':  # some formats are zipped stuff
        # Refine the generic zip type from the file name. guess_type()
        # yields None for an unknown extension; keep 'application/zip'
        # then instead of failing on None.startswith() below.
        guessed = mimetypes.guess_type(name)[0]
        if guessed is not None:
            mime = guessed

    if mime.startswith('application/vnd.oasis.opendocument'):
        mime = 'application/opendocument'  # opendocument fileformat
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/officeopenxml'  # office openxml

    try:
        stripper_class = strippers.STRIPPERS[mime]
    except KeyError:
        logging.info('Don\'t have stripper for %s format', mime)
        return None

    return stripper_class(filename, parser, mime, backup, add2archive)