From bc2fb9a3944a013e05c2f84c1e324c35c26a1827 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Wed, 3 Aug 2011 18:39:53 +0200 Subject: Add (in xml) the supported fileformat list, and a parser --- FORMATS | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/mat.py | 42 ++++++++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 FORMATS diff --git a/FORMATS b/FORMATS new file mode 100644 index 0000000..cc38bae --- /dev/null +++ b/FORMATS @@ -0,0 +1,86 @@ + + + Portable Network Graphics + .png + full + textual metadata + date + removal of harmful fields is done with hachoir + + + + Jpeg + .jpeg, .jpg + full + comment + exif/photoshop/adobe + removal of harmful fields is done with hachoir + + + + Open Document + .odt, .odx, .ods, ... + full + a meta.xml file + removal of the meta.xml file + + + + Portable Document Format + .pdf + full + a lot + rendering of the pdf file on a cairo surface with the help of + poppler in order to remove all the internal metadata, + then removal of the remaining metadata fields of the pdf itself with + pdfrw (the next version of python-cairo will support metadata, + so we should get rid of pdfrw) + + + + Tape ARchive + .tar, .tar.bz2, .tar.gz + full + metadata from the file itself, metadata from the files contained + in the archive, and metadata added by tar to the file at the + creation of the archive + extraction of each file, treatment of the file, add treated file + to a new archive, right before the add, remove the metadata added by tar + itself. When the new archive is complete, remove all its metadata. + + + + Zip + .zip + partial + metadata from the file itself, metadata from the files contained + in the archive, and metadata added by zip to the file when added to + the archive. + + extraction of each file, treatment of the file, add treated file + to a new archive. 
When the new archive is complete, remove all its metadata + metadata added by zip itself to internal files + + + + MPEG Audio + .mp3, .mp2, .mp1 + full + id3 + removal of harmful fields is done with hachoir + + + + Ogg Vorbis + .ogg + full + Vorbis + removal of harmful fields is done with mutagen + + + + Free Lossless Audio Codec + .flac + full + Flac, Vorbis + removal of harmful fields is done with mutagen + + diff --git a/lib/mat.py b/lib/mat.py index 8226c7e..8fe6fb4 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -7,6 +7,7 @@ import os import subprocess import logging +import xml.sax import hachoir_core.cmd_line import hachoir_parser @@ -45,13 +46,50 @@ except ImportError: try: import mutagen STRIPPERS['audio/x-flac'] = audio.FlacStripper - STRIPPERS['audio/x-ape'] = audio.Apev2Stripper - STRIPPERS['audio/x-wavpack'] = audio.Apev2Stripper STRIPPERS['audio/vorbis'] = audio.OggStripper except ImportError: print('unable to import python-mutagen : limited audio format support') +class XMLParser(xml.sax.handler.ContentHandler): + ''' + Parse the supported format xml, and return a corresponding + list of dict + ''' + def __init__(self): + self.dict = {} + self.list = [] + self.content, self.key = '', '' + self.between= False + + def startElement(self, name, attrs): + ''' + Called when entering into xml balise + ''' + self.between = True + self.key = name + self.content = '' + + def endElement(self, name): + ''' + Called when exiting a xml balise + ''' + if name == 'format': # exiting a fileformat section + self.list.append(self.dict.copy()) + self.dict.clear() + else: + content = self.content.replace('\n', ' ') + self.dict[self.key] = content + self.between = False + + def characters(self, characters): + ''' + Concatenate the content between opening and closing balises + ''' + if self.between is True: + self.content += characters + + def secure_remove(filename): ''' securely remove the file -- cgit v1.3