From 9e0f6cf0ea0a992450c12aec73b459403de5f3c2 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Tue, 21 Jun 2011 17:27:11 +0200 Subject: Pdf metadata support --- lib/images.py | 4 +++- lib/mat.py | 8 +++++++- lib/parser.py | 30 +++++------------------------- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/lib/images.py b/lib/images.py index 21229c2..76696fd 100644 --- a/lib/images.py +++ b/lib/images.py @@ -11,7 +11,9 @@ class JpegStripper(parser.Generic_parser): class PngStripper(parser.Generic_parser): def _should_remove(self, field): - if field.name in ('comment'): + if field.name.startswith("text["): + return True + elif field.name is "time": return True else: return False diff --git a/lib/mat.py b/lib/mat.py index 5641c62..3cbd81b 100644 --- a/lib/mat.py +++ b/lib/mat.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python ''' Metadata anonymisation toolkit library @@ -12,6 +12,8 @@ import hachoir_parser import hachoir_editor import images +import audio +import misc __version__ = "0.1" __author__ = "jvoisin" @@ -19,6 +21,8 @@ __author__ = "jvoisin" strippers = { hachoir_parser.image.JpegFile: images.JpegStripper, hachoir_parser.image.PngFile: images.PngStripper, + hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, + hachoir_parser.misc.PDFDocument: misc.PdfStripper, } def create_class_file(name): @@ -50,4 +54,6 @@ def create_class_file(name): #Place for another lib than hachoir print("Don't have stripper for file type: %s" % editor.description) sys.exit(1) + if editor.input.__class__ == hachoir_parser.misc.PDFDocument: + return stripper_class(filename) return stripper_class(realname, filename, parser, editor) diff --git a/lib/parser.py b/lib/parser.py index 6c9f6f3..1084de5 100644 --- a/lib/parser.py +++ b/lib/parser.py @@ -4,7 +4,6 @@ import hachoir_core.error import hachoir_parser -import hachoir_metadata import hachoir_editor import sys @@ -12,32 +11,10 @@ POSTFIX = ".cleaned" class Generic_parser(): def __init__(self, realname, filename, parser, editor): - self.meta = {} self.filename = filename self.realname = realname self.parser = parser self.editor = editor - #self.meta = self.__fill_meta() - - def __fill_meta(self): - metadata = {} - try: - meta = hachoir_metadata.extractMetadata(self.parser) - except hachoir_core.error.HachoirError, err: - print("Metadata extraction error: %s" % err) - - if not meta: - print("Unable to extract metadata from the file %s" % self.filename) - #sys.exit(1) - - for title in meta: - #fixme i'm so dirty - if title.values != []: #if the field is not empty - value = "" - for item in title.values: - value = item.text - metadata[title.key] = value - return metadata def is_clean(self): ''' @@ -84,10 +61,13 @@ class Generic_parser(): ''' return a dict with all the meta of the file ''' - metadata = [] + metadata = {} for field in self.editor: if self._should_remove(field): - metadata.append(field.name) + try: + metadata[field.name] = field.value + except: + metadata[field.name] = "harmful content" return metadata def _should_remove(self, key): -- cgit v1.3