summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2011-08-03 18:39:53 +0200
committerjvoisin2011-08-03 18:39:53 +0200
commitbc2fb9a3944a013e05c2f84c1e324c35c26a1827 (patch)
tree4affa3fe9c077ee121ee8eea218760dac49e54d8
parent73e80a3859da461ca363cde6c4ab050e53159362 (diff)
Add (in xml) the supported fileformat list, and a parser
-rw-r--r--FORMATS86
-rw-r--r--lib/mat.py42
2 files changed, 126 insertions, 2 deletions
diff --git a/FORMATS b/FORMATS
new file mode 100644
index 0000000..cc38bae
--- /dev/null
+++ b/FORMATS
@@ -0,0 +1,86 @@
1<xml>
2 <format>
3 <name>Portable Network Graphics</name>
4 <extension>.png</extension>
5 <support>full</support>
6 <metadata>textual metadata + date</metadata>
7 <method>removal of harmful fields is done with hachoir</method>
8 </format>
9
10 <format>
11 <name>Jpeg</name>
12 <extension>.jpeg, .jpg</extension>
13 <support>full</support>
14 <metadata>comment + exif/photoshop/adobe</metadata>
15 <method>removal of harmful fields is done with hachoir</method>
16 </format>
17
18 <format>
19 <name>Open Document</name>
20 <extension>.odt, .odx, .ods, ...</extension>
21 <support>full</support>
22 <metadata>a meta.xml file</metadata>
23 <method>removal of the meta.xml file</method>
24 </format>
25
26 <format>
27 <name>Portable Document Fileformat</name>
28 <extension>.pdf</extension>
29 <support>full</support>
30 <metadata>a lot</metadata>
31 <method>rendering of the pdf file on a cairo surface with the help of
32 poppler in order to remove all the internal metadata,
33 then removal of the remaining metadata fields of the pdf itself with
34 pdfrw (the next version of python-cairo will support metadata,
35 so we should get rid of pdfrw)</method>
36 </format>
37
38 <format>
39 <name>Tape ARchive</name>
40 <extension>.tar, .tar.bz2, .tar.gz</extension>
41 <support>full</support>
42 <metadata>metadata from the file itself, metadata from the files contained
43 in the archive, and metadata added by tar to the file at the
44 creation of the archive</metadata>
45 <method>extraction of each file, treatment of the file, add treated file
46 to a new archive, right before the add, remove the metadata added by tar
47 itself. When the new archive is complete, remove all its metadata.</method>
48 </format>
49
50 <format>
51 <name>Zip</name>
52 <extension>.zip</extension>
53 <support>partial</support>
54 <metadata>metadata from the file itself, metadata from the files contained
55 in the archive, and metadata added by zip to the file when added to
56 the archive.
57 </metadata>
58 <method>extraction of each file, treatment of the file, add treated file
59 to a new archive. When the new archive is complete, remove all its metadata</method>
60 <remaining>metadata added by zip itself to internal files</remaining>
61 </format>
62
63 <format>
64 <name>MPEG Audio</name>
65 <extension>.mp3, .mp2, .mp1</extension>
66 <support>full</support>
67 <metadata>id3</metadata>
68 <method>removal of harmful fields is done with hachoir</method>
69 </format>
70
71 <format>
72 <name>Ogg Vorbis</name>
73 <extension>.ogg</extension>
74 <support>full</support>
75 <metadata>Vorbis</metadata>
76 <method>removal of harmful fields is done with mutagen</method>
77 </format>
78
79 <format>
80 <name>Free Lossless Audio Codec</name>
81 <extension>.flac</extension>
82 <support>full</support>
83 <metadata>Flac, Vorbis</metadata>
84 <method>removal of harmful fields is done with mutagen</method>
85 </format>
86</xml>
diff --git a/lib/mat.py b/lib/mat.py
index 8226c7e..8fe6fb4 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -7,6 +7,7 @@
7import os 7import os
8import subprocess 8import subprocess
9import logging 9import logging
10import xml.sax
10 11
11import hachoir_core.cmd_line 12import hachoir_core.cmd_line
12import hachoir_parser 13import hachoir_parser
@@ -45,13 +46,50 @@ except ImportError:
45try: 46try:
46 import mutagen 47 import mutagen
47 STRIPPERS['audio/x-flac'] = audio.FlacStripper 48 STRIPPERS['audio/x-flac'] = audio.FlacStripper
48 STRIPPERS['audio/x-ape'] = audio.Apev2Stripper
49 STRIPPERS['audio/x-wavpack'] = audio.Apev2Stripper
50 STRIPPERS['audio/vorbis'] = audio.OggStripper 49 STRIPPERS['audio/vorbis'] = audio.OggStripper
51except ImportError: 50except ImportError:
52 print('unable to import python-mutagen : limited audio format support') 51 print('unable to import python-mutagen : limited audio format support')
53 52
54 53
class XMLParser(xml.sax.handler.ContentHandler):
    '''
        SAX content handler for the supported-formats xml file.

        Each <format> section is turned into a dict mapping the name of
        every leaf tag it contains to the text of that tag; the dicts are
        collected, in document order, in self.list.
    '''
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.dict = {}  # fields of the <format> section being parsed
        self.list = []  # one dict per completed <format> section
        self.content, self.key = '', ''
        # True from an opening tag until the next closing tag, i.e. only
        # while the parser is inside a leaf element
        self.between = False

    def startElement(self, name, attrs):
        '''
            Called when entering a xml tag: start collecting its text
        '''
        self.between = True
        self.key = name
        self.content = ''

    def endElement(self, name):
        '''
            Called when exiting a xml tag: store the collected text,
            or close the current <format> section
        '''
        if name == 'format':  # exiting a fileformat section
            self.list.append(self.dict.copy())
            self.dict.clear()
        elif self.between:
            # Only a leaf tag (no closing tag seen since it was opened)
            # carries a value. Container tags such as the document root
            # are skipped, otherwise the text of the last leaf would be
            # written back into the freshly cleared dict.
            self.dict[name] = self.content.replace('\n', ' ')
        self.between = False

    def characters(self, characters):
        '''
            Concatenate the text found between an opening tag and the
            next closing tag (the parser may deliver it in several chunks)
        '''
        if self.between:
            self.content += characters
91
92
55def secure_remove(filename): 93def secure_remove(filename):
56 ''' 94 '''
57 securely remove the file 95 securely remove the file