diff options
| author | jvoisin | 2011-06-21 17:27:11 +0200 |
|---|---|---|
| committer | jvoisin | 2011-06-21 17:27:11 +0200 |
| commit | 9e0f6cf0ea0a992450c12aec73b459403de5f3c2 (patch) | |
| tree | abce28e477f429f6bc174409a23955e0e1ca65a1 /lib/parser.py | |
| parent | 5aa90822631b80502afaca18c222432d4f8c3cc1 (diff) | |
Pdf metadata support
Diffstat (limited to 'lib/parser.py')
| -rw-r--r-- | lib/parser.py | 30 |
1 file changed, 5 insertions, 25 deletions
diff --git a/lib/parser.py b/lib/parser.py index 6c9f6f3..1084de5 100644 --- a/lib/parser.py +++ b/lib/parser.py | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | 4 | ||
| 5 | import hachoir_core.error | 5 | import hachoir_core.error |
| 6 | import hachoir_parser | 6 | import hachoir_parser |
| 7 | import hachoir_metadata | ||
| 8 | import hachoir_editor | 7 | import hachoir_editor |
| 9 | import sys | 8 | import sys |
| 10 | 9 | ||
| @@ -12,32 +11,10 @@ POSTFIX = ".cleaned" | |||
| 12 | 11 | ||
| 13 | class Generic_parser(): | 12 | class Generic_parser(): |
| 14 | def __init__(self, realname, filename, parser, editor): | 13 | def __init__(self, realname, filename, parser, editor): |
| 15 | self.meta = {} | ||
| 16 | self.filename = filename | 14 | self.filename = filename |
| 17 | self.realname = realname | 15 | self.realname = realname |
| 18 | self.parser = parser | 16 | self.parser = parser |
| 19 | self.editor = editor | 17 | self.editor = editor |
| 20 | #self.meta = self.__fill_meta() | ||
| 21 | |||
| 22 | def __fill_meta(self): | ||
| 23 | metadata = {} | ||
| 24 | try: | ||
| 25 | meta = hachoir_metadata.extractMetadata(self.parser) | ||
| 26 | except hachoir_core.error.HachoirError, err: | ||
| 27 | print("Metadata extraction error: %s" % err) | ||
| 28 | |||
| 29 | if not meta: | ||
| 30 | print("Unable to extract metadata from the file %s" % self.filename) | ||
| 31 | #sys.exit(1) | ||
| 32 | |||
| 33 | for title in meta: | ||
| 34 | #fixme i'm so dirty | ||
| 35 | if title.values != []: #if the field is not empty | ||
| 36 | value = "" | ||
| 37 | for item in title.values: | ||
| 38 | value = item.text | ||
| 39 | metadata[title.key] = value | ||
| 40 | return metadata | ||
| 41 | 18 | ||
| 42 | def is_clean(self): | 19 | def is_clean(self): |
| 43 | ''' | 20 | ''' |
| @@ -84,10 +61,13 @@ class Generic_parser(): | |||
| 84 | ''' | 61 | ''' |
| 85 | return a dict with all the meta of the file | 62 | return a dict with all the meta of the file |
| 86 | ''' | 63 | ''' |
| 87 | metadata = [] | 64 | metadata = {} |
| 88 | for field in self.editor: | 65 | for field in self.editor: |
| 89 | if self._should_remove(field): | 66 | if self._should_remove(field): |
| 90 | metadata.append(field.name) | 67 | try: |
| 68 | metadata[field.name] = field.value | ||
| 69 | except: | ||
| 70 | metadata[field.name] = "harmful content" | ||
| 91 | return metadata | 71 | return metadata |
| 92 | 72 | ||
| 93 | def _should_remove(self, key): | 73 | def _should_remove(self, key): |
