diff options
| author | jvoisin | 2011-06-21 17:27:11 +0200 |
|---|---|---|
| committer | jvoisin | 2011-06-21 17:27:11 +0200 |
| commit | 9e0f6cf0ea0a992450c12aec73b459403de5f3c2 (patch) | |
| tree | abce28e477f429f6bc174409a23955e0e1ca65a1 | |
| parent | 5aa90822631b80502afaca18c222432d4f8c3cc1 (diff) | |
Pdf metadata support
| -rw-r--r-- | lib/images.py | 4 | ||||
| -rw-r--r-- | lib/mat.py | 8 | ||||
| -rw-r--r-- | lib/parser.py | 30 |
3 files changed, 15 insertions, 27 deletions
diff --git a/lib/images.py b/lib/images.py index 21229c2..76696fd 100644 --- a/lib/images.py +++ b/lib/images.py | |||
| @@ -11,7 +11,9 @@ class JpegStripper(parser.Generic_parser): | |||
| 11 | 11 | ||
| 12 | class PngStripper(parser.Generic_parser): | 12 | class PngStripper(parser.Generic_parser): |
| 13 | def _should_remove(self, field): | 13 | def _should_remove(self, field): |
| 14 | if field.name in ('comment'): | 14 | if field.name.startswith("text["): |
| 15 | return True | ||
| 16 | elif field.name is "time": | ||
| 15 | return True | 17 | return True |
| 16 | else: | 18 | else: |
| 17 | return False | 19 | return False |
diff --git a/lib/mat.py b/lib/mat.py --- a/lib/mat.py +++ b/lib/mat.py | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | #!/usr/bin/python | 1 | #!/usr/bin/env python |
| 2 | 2 | ||
| 3 | ''' | 3 | ''' |
| 4 | Metadata anonymisation toolkit library | 4 | Metadata anonymisation toolkit library |
| @@ -12,6 +12,8 @@ import hachoir_parser | |||
| 12 | import hachoir_editor | 12 | import hachoir_editor |
| 13 | 13 | ||
| 14 | import images | 14 | import images |
| 15 | import audio | ||
| 16 | import misc | ||
| 15 | 17 | ||
| 16 | __version__ = "0.1" | 18 | __version__ = "0.1" |
| 17 | __author__ = "jvoisin" | 19 | __author__ = "jvoisin" |
| @@ -19,6 +21,8 @@ __author__ = "jvoisin" | |||
| 19 | strippers = { | 21 | strippers = { |
| 20 | hachoir_parser.image.JpegFile: images.JpegStripper, | 22 | hachoir_parser.image.JpegFile: images.JpegStripper, |
| 21 | hachoir_parser.image.PngFile: images.PngStripper, | 23 | hachoir_parser.image.PngFile: images.PngStripper, |
| 24 | hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, | ||
| 25 | hachoir_parser.misc.PDFDocument: misc.PdfStripper, | ||
| 22 | } | 26 | } |
| 23 | 27 | ||
| 24 | def create_class_file(name): | 28 | def create_class_file(name): |
| @@ -50,4 +54,6 @@ def create_class_file(name): | |||
| 50 | #Place for another lib than hachoir | 54 | #Place for another lib than hachoir |
| 51 | print("Don't have stripper for file type: %s" % editor.description) | 55 | print("Don't have stripper for file type: %s" % editor.description) |
| 52 | sys.exit(1) | 56 | sys.exit(1) |
| 57 | if editor.input.__class__ == hachoir_parser.misc.PDFDocument: | ||
| 58 | return stripper_class(filename) | ||
| 53 | return stripper_class(realname, filename, parser, editor) | 59 | return stripper_class(realname, filename, parser, editor) |
diff --git a/lib/parser.py b/lib/parser.py index 6c9f6f3..1084de5 100644 --- a/lib/parser.py +++ b/lib/parser.py | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | 4 | ||
| 5 | import hachoir_core.error | 5 | import hachoir_core.error |
| 6 | import hachoir_parser | 6 | import hachoir_parser |
| 7 | import hachoir_metadata | ||
| 8 | import hachoir_editor | 7 | import hachoir_editor |
| 9 | import sys | 8 | import sys |
| 10 | 9 | ||
| @@ -12,32 +11,10 @@ POSTFIX = ".cleaned" | |||
| 12 | 11 | ||
| 13 | class Generic_parser(): | 12 | class Generic_parser(): |
| 14 | def __init__(self, realname, filename, parser, editor): | 13 | def __init__(self, realname, filename, parser, editor): |
| 15 | self.meta = {} | ||
| 16 | self.filename = filename | 14 | self.filename = filename |
| 17 | self.realname = realname | 15 | self.realname = realname |
| 18 | self.parser = parser | 16 | self.parser = parser |
| 19 | self.editor = editor | 17 | self.editor = editor |
| 20 | #self.meta = self.__fill_meta() | ||
| 21 | |||
| 22 | def __fill_meta(self): | ||
| 23 | metadata = {} | ||
| 24 | try: | ||
| 25 | meta = hachoir_metadata.extractMetadata(self.parser) | ||
| 26 | except hachoir_core.error.HachoirError, err: | ||
| 27 | print("Metadata extraction error: %s" % err) | ||
| 28 | |||
| 29 | if not meta: | ||
| 30 | print("Unable to extract metadata from the file %s" % self.filename) | ||
| 31 | #sys.exit(1) | ||
| 32 | |||
| 33 | for title in meta: | ||
| 34 | #fixme i'm so dirty | ||
| 35 | if title.values != []: #if the field is not empty | ||
| 36 | value = "" | ||
| 37 | for item in title.values: | ||
| 38 | value = item.text | ||
| 39 | metadata[title.key] = value | ||
| 40 | return metadata | ||
| 41 | 18 | ||
| 42 | def is_clean(self): | 19 | def is_clean(self): |
| 43 | ''' | 20 | ''' |
| @@ -84,10 +61,13 @@ class Generic_parser(): | |||
| 84 | ''' | 61 | ''' |
| 85 | return a dict with all the meta of the file | 62 | return a dict with all the meta of the file |
| 86 | ''' | 63 | ''' |
| 87 | metadata = [] | 64 | metadata = {} |
| 88 | for field in self.editor: | 65 | for field in self.editor: |
| 89 | if self._should_remove(field): | 66 | if self._should_remove(field): |
| 90 | metadata.append(field.name) | 67 | try: |
| 68 | metadata[field.name] = field.value | ||
| 69 | except: | ||
| 70 | metadata[field.name] = "harmful content" | ||
| 91 | return metadata | 71 | return metadata |
| 92 | 72 | ||
| 93 | def _should_remove(self, key): | 73 | def _should_remove(self, key): |
