summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: jvoisin, 2011-06-21 17:27:11 +0200
committer: jvoisin, 2011-06-21 17:27:11 +0200
commit: 9e0f6cf0ea0a992450c12aec73b459403de5f3c2 (patch)
tree: abce28e477f429f6bc174409a23955e0e1ca65a1
parent: 5aa90822631b80502afaca18c222432d4f8c3cc1 (diff)
Pdf metadata support
-rw-r--r-- lib/images.py | 4
-rw-r--r-- lib/mat.py | 8
-rw-r--r-- lib/parser.py | 30
3 files changed, 15 insertions, 27 deletions
diff --git a/lib/images.py b/lib/images.py
index 21229c2..76696fd 100644
--- a/lib/images.py
+++ b/lib/images.py
@@ -11,7 +11,9 @@ class JpegStripper(parser.Generic_parser):
11 11
12class PngStripper(parser.Generic_parser): 12class PngStripper(parser.Generic_parser):
13 def _should_remove(self, field): 13 def _should_remove(self, field):
14 if field.name in ('comment'): 14 if field.name.startswith("text["):
15 return True
16 elif field.name is "time":
15 return True 17 return True
16 else: 18 else:
17 return False 19 return False
diff --git a/lib/mat.py b/lib/mat.py
index 5641c62..3cbd81b 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -1,4 +1,4 @@
1#!/usr/bin/python 1#!/usr/bin/env python
2 2
3''' 3'''
4 Metadata anonymisation toolkit library 4 Metadata anonymisation toolkit library
@@ -12,6 +12,8 @@ import hachoir_parser
12import hachoir_editor 12import hachoir_editor
13 13
14import images 14import images
15import audio
16import misc
15 17
16__version__ = "0.1" 18__version__ = "0.1"
17__author__ = "jvoisin" 19__author__ = "jvoisin"
@@ -19,6 +21,8 @@ __author__ = "jvoisin"
19strippers = { 21strippers = {
20 hachoir_parser.image.JpegFile: images.JpegStripper, 22 hachoir_parser.image.JpegFile: images.JpegStripper,
21 hachoir_parser.image.PngFile: images.PngStripper, 23 hachoir_parser.image.PngFile: images.PngStripper,
24 hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper,
25 hachoir_parser.misc.PDFDocument: misc.PdfStripper,
22} 26}
23 27
24def create_class_file(name): 28def create_class_file(name):
@@ -50,4 +54,6 @@ def create_class_file(name):
50 #Place for another lib than hachoir 54 #Place for another lib than hachoir
51 print("Don't have stripper for file type: %s" % editor.description) 55 print("Don't have stripper for file type: %s" % editor.description)
52 sys.exit(1) 56 sys.exit(1)
57 if editor.input.__class__ == hachoir_parser.misc.PDFDocument:
58 return stripper_class(filename)
53 return stripper_class(realname, filename, parser, editor) 59 return stripper_class(realname, filename, parser, editor)
diff --git a/lib/parser.py b/lib/parser.py
index 6c9f6f3..1084de5 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -4,7 +4,6 @@
4 4
5import hachoir_core.error 5import hachoir_core.error
6import hachoir_parser 6import hachoir_parser
7import hachoir_metadata
8import hachoir_editor 7import hachoir_editor
9import sys 8import sys
10 9
@@ -12,32 +11,10 @@ POSTFIX = ".cleaned"
12 11
13class Generic_parser(): 12class Generic_parser():
14 def __init__(self, realname, filename, parser, editor): 13 def __init__(self, realname, filename, parser, editor):
15 self.meta = {}
16 self.filename = filename 14 self.filename = filename
17 self.realname = realname 15 self.realname = realname
18 self.parser = parser 16 self.parser = parser
19 self.editor = editor 17 self.editor = editor
20 #self.meta = self.__fill_meta()
21
22 def __fill_meta(self):
23 metadata = {}
24 try:
25 meta = hachoir_metadata.extractMetadata(self.parser)
26 except hachoir_core.error.HachoirError, err:
27 print("Metadata extraction error: %s" % err)
28
29 if not meta:
30 print("Unable to extract metadata from the file %s" % self.filename)
31 #sys.exit(1)
32
33 for title in meta:
34 #fixme i'm so dirty
35 if title.values != []: #if the field is not empty
36 value = ""
37 for item in title.values:
38 value = item.text
39 metadata[title.key] = value
40 return metadata
41 18
42 def is_clean(self): 19 def is_clean(self):
43 ''' 20 '''
@@ -84,10 +61,13 @@ class Generic_parser():
84 ''' 61 '''
85 return a dict with all the meta of the file 62 return a dict with all the meta of the file
86 ''' 63 '''
87 metadata = [] 64 metadata = {}
88 for field in self.editor: 65 for field in self.editor:
89 if self._should_remove(field): 66 if self._should_remove(field):
90 metadata.append(field.name) 67 try:
68 metadata[field.name] = field.value
69 except:
70 metadata[field.name] = "harmful content"
91 return metadata 71 return metadata
92 72
93 def _should_remove(self, key): 73 def _should_remove(self, key):