summary | refs | log | tree | commit | diff
path: root/lib/parser.py
diff options
context:
space:
mode:
author jvoisin 2011-06-21 17:27:11 +0200
committer jvoisin 2011-06-21 17:27:11 +0200
commit 9e0f6cf0ea0a992450c12aec73b459403de5f3c2 (patch)
tree abce28e477f429f6bc174409a23955e0e1ca65a1 /lib/parser.py
parent 5aa90822631b80502afaca18c222432d4f8c3cc1 (diff)
Pdf metadata support
Diffstat (limited to 'lib/parser.py')
-rw-r--r-- lib/parser.py 30
1 files changed, 5 insertions, 25 deletions
diff --git a/lib/parser.py b/lib/parser.py
index 6c9f6f3..1084de5 100644
--- a/lib/parser.py
+++ b/lib/parser.py
@@ -4,7 +4,6 @@
4 4
5import hachoir_core.error 5import hachoir_core.error
6import hachoir_parser 6import hachoir_parser
7import hachoir_metadata
8import hachoir_editor 7import hachoir_editor
9import sys 8import sys
10 9
@@ -12,32 +11,10 @@ POSTFIX = ".cleaned"
12 11
13class Generic_parser(): 12class Generic_parser():
14 def __init__(self, realname, filename, parser, editor): 13 def __init__(self, realname, filename, parser, editor):
15 self.meta = {}
16 self.filename = filename 14 self.filename = filename
17 self.realname = realname 15 self.realname = realname
18 self.parser = parser 16 self.parser = parser
19 self.editor = editor 17 self.editor = editor
20 #self.meta = self.__fill_meta()
21
22 def __fill_meta(self):
23 metadata = {}
24 try:
25 meta = hachoir_metadata.extractMetadata(self.parser)
26 except hachoir_core.error.HachoirError, err:
27 print("Metadata extraction error: %s" % err)
28
29 if not meta:
30 print("Unable to extract metadata from the file %s" % self.filename)
31 #sys.exit(1)
32
33 for title in meta:
34 #fixme i'm so dirty
35 if title.values != []: #if the field is not empty
36 value = ""
37 for item in title.values:
38 value = item.text
39 metadata[title.key] = value
40 return metadata
41 18
42 def is_clean(self): 19 def is_clean(self):
43 ''' 20 '''
@@ -84,10 +61,13 @@ class Generic_parser():
84 ''' 61 '''
85 return a dict with all the meta of the file 62 return a dict with all the meta of the file
86 ''' 63 '''
87 metadata = [] 64 metadata = {}
88 for field in self.editor: 65 for field in self.editor:
89 if self._should_remove(field): 66 if self._should_remove(field):
90 metadata.append(field.name) 67 try:
68 metadata[field.name] = field.value
69 except:
70 metadata[field.name] = "harmful content"
91 return metadata 71 return metadata
92 72
93 def _should_remove(self, key): 73 def _should_remove(self, key):