1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
#!/usr/bin/env python
'''
Metadata anonymisation toolkit library
'''
import os
import subprocess
import logging
import mimetypes
import xml.sax
import hachoir_core.cmd_line
import hachoir_parser
import images
import audio
import office
import archive
# Package metadata.
__author__ = 'jvoisin'
__version__ = '0.1'

# The root logger is configured as a side effect of importing this module.
LOGGING_LEVEL = logging.DEBUG
logging.basicConfig(level=LOGGING_LEVEL)

# Registry: mime type -> stripper class able to handle it.
# The two office keys are mime-type prefixes: create_class_file() collapses
# the full mime type to one of these prefixes before looking it up.
STRIPPERS = {}

# Archives
STRIPPERS['application/x-tar'] = archive.TarStripper
STRIPPERS['application/x-gzip'] = archive.GzipStripper
STRIPPERS['application/x-bzip2'] = archive.Bzip2Stripper
STRIPPERS['application/zip'] = archive.ZipStripper

# Audio
STRIPPERS['audio/mpeg'] = audio.MpegAudioStripper

# Images
STRIPPERS['image/jpeg'] = images.JpegStripper
STRIPPERS['image/png'] = images.PngStripper

# Office documents
STRIPPERS['application/vnd.oasis.opendocument'] = office.OpenDocumentStripper
STRIPPERS['application/vnd.openxmlformats-officedocument'] = \
    office.OpenXmlStripper
# Optional PDF support: registered only when both python-poppler and
# python-cairo can be imported. The imports act as availability probes;
# the modules are not referenced again in this section.
try:
    import poppler
    import cairo
except ImportError:
    print('Unable to import python-poppler and/or python-cairo: no pdf support')
else:
    STRIPPERS.update(dict.fromkeys(
        ('application/x-pdf', 'application/pdf'), office.PdfStripper))

# Optional FLAC / Ogg Vorbis support: registered only when python-mutagen
# can be imported.
try:
    import mutagen
except ImportError:
    print('unable to import python-mutagen : limited audio format support')
else:
    STRIPPERS.update({
        'audio/x-flac': audio.FlacStripper,
        'audio/vorbis': audio.OggStripper,
    })
class XMLParser(xml.sax.handler.ContentHandler):
    '''
        SAX content handler for the supported-formats xml.

        Every element closed inside a <format> section is stored as a
        key/value pair (element name -> text content); when the <format>
        element itself is closed, the collected pairs are appended to
        self.list as one dict. After parsing, self.list therefore holds
        one dict per <format> element.
    '''
    def __init__(self):
        # Initialise the base handler explicitly. The unbound-method form
        # is used because it also works when the base is an old-style class.
        xml.sax.handler.ContentHandler.__init__(self)
        self.dict = {}  # key/value pairs of the <format> being read
        self.list = []  # one dict per completed <format>
        self.content, self.key = '', ''  # text and name of current element
        self.between = False  # True between an opening and a closing tag

    def startElement(self, name, attrs):
        '''
            Called when entering an xml element: remember its name and
            start collecting its text from scratch.
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        '''
            Called when leaving an xml element.

            Leaving a <format> element flushes the collected pairs into
            self.list; leaving any other element stores its text, with
            newlines replaced by spaces.
        '''
        if name == 'format':  # leaving a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        else:
            content = self.content.replace('\n', ' ')
            # The key is the most recently opened element, not `name`.
            self.dict[self.key] = content
            self.between = False

    def characters(self, characters):
        '''
            Accumulate the text found between an opening tag and the
            next closing tag; text seen elsewhere is ignored.
        '''
        if self.between:
            self.content += characters
def secure_remove(filename):
    '''
        Securely remove the file.

        The file is first handed to the external `shred --remove`
        command. If shred cannot be run or reports a failure, an error
        is logged and the file is removed with os.remove() instead.
        Failures are logged, never raised.

        filename: path of the file to remove
    '''
    shredded = False
    try:
        # Argument list, no shell: the filename is passed verbatim, so
        # spaces or shell metacharacters in it cannot alter the command.
        # '--' stops a name starting with '-' being read as an option.
        status = subprocess.call(['shred', '--remove', '--', filename])
        # subprocess.call() does not raise on failure; check the exit code.
        shredded = (status == 0)
    except OSError:
        # shred could not be executed at all (e.g. not installed)
        shredded = False

    if shredded:
        return

    logging.error('Unable to securely remove %s', filename)
    try:
        os.remove(filename)
    except OSError:
        logging.error('Unable to remove %s', filename)
def is_secure(filename):
    '''
        Tell whether filename can safely be processed.

        The only check performed is that filename designates an
        existing regular file; an error is logged when it does not.

        Return True if the check passes, False otherwise.
    '''
    if os.path.isfile(filename):
        return True
    logging.error('%s is not a valid file' % filename)
    return False
def create_class_file(name, backup, add2archive):
    '''
        Return an instance of the $FILETYPEStripper class
        corresponding to the filetype of the given file.

        name: path of the file to handle
        backup, add2archive: passed through unchanged to the stripper
            constructor

        Return None when the file fails the is_secure() check, cannot be
        parsed by hachoir, or has a mime type with no registered stripper.
    '''
    if not is_secure(name):
        return None

    try:
        filename = hachoir_core.cmd_line.unicodeFilename(name)
    except TypeError:  # get rid of "decoding Unicode is not supported"
        filename = name

    parser = hachoir_parser.createParser(filename)
    if not parser:
        logging.info('Unable to parse %s', filename)
        return None

    mime = parser.mime_type

    if mime == 'application/zip':  # some formats are zipped stuff
        # Refine the type from the file name. guess_type() yields None
        # for an unknown extension; keep the generic zip type then.
        guessed = mimetypes.guess_type(name)[0]
        if guessed is not None:
            mime = guessed

    # Collapse every member of an office family onto the prefix used as
    # key in STRIPPERS.
    if mime.startswith('application/vnd.oasis.opendocument'):
        mime = 'application/vnd.oasis.opendocument'  # opendocument fileformat
    elif mime.startswith('application/vnd.openxmlformats-officedocument'):
        mime = 'application/vnd.openxmlformats-officedocument'

    stripper_class = STRIPPERS.get(mime)
    if stripper_class is None:
        logging.info('Don\'t have stripper for %s format', mime)
        return None

    return stripper_class(filename, parser, mime, backup, add2archive)
|