summary | refs | log | tree | commit | diff
path: root/lib/office.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/office.py')
-rw-r--r-- lib/office.py 24
1 files changed, 11 insertions, 13 deletions
diff --git a/lib/office.py b/lib/office.py
index 27677d2..432bc0b 100644
--- a/lib/office.py
+++ b/lib/office.py
@@ -5,17 +5,16 @@ import tempfile
5import glob 5import glob
6import logging 6import logging
7import zipfile 7import zipfile
8import shutil
9import re 8import re
10from xml.etree import ElementTree 9from xml.etree import ElementTree
11 10
12import hachoir_core
13 11
14import pdfrw 12import pdfrw
15import mat 13import mat
16import parser 14import parser
17import archive 15import archive
18 16
17
19class OpenDocumentStripper(archive.GenericArchiveStripper): 18class OpenDocumentStripper(archive.GenericArchiveStripper):
20 ''' 19 '''
21 An open document file is a zip, with xml file into. 20 An open document file is a zip, with xml file into.
@@ -32,11 +31,10 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
32 for node in tree.iter(): 31 for node in tree.iter():
33 key = re.sub('{.*}', '', node.tag) 32 key = re.sub('{.*}', '', node.tag)
34 metadata[key] = node.text 33 metadata[key] = node.text
35 except KeyError:#no meta.xml file found 34 except KeyError: # no meta.xml file found
36 logging.debug('%s has no opendocument metadata' % self.filename) 35 logging.debug('%s has no opendocument metadata' % self.filename)
37 return metadata 36 return metadata
38 37
39
40 def _remove_all(self, method): 38 def _remove_all(self, method):
41 ''' 39 '''
42 FIXME ? 40 FIXME ?
@@ -50,7 +48,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
50 name = os.path.join(self.tempdir, item) 48 name = os.path.join(self.tempdir, item)
51 if item.endswith('.xml') or item == 'mimetype': 49 if item.endswith('.xml') or item == 'mimetype':
52 #keep .xml files, and the "manifest" file 50 #keep .xml files, and the "manifest" file
53 if item != 'meta.xml':#contains the metadata 51 if item != 'meta.xml': # contains the metadata
54 zipin.extract(item, self.tempdir) 52 zipin.extract(item, self.tempdir)
55 zipout.write(name, item) 53 zipout.write(name, item)
56 mat.secure_remove(name) 54 mat.secure_remove(name)
@@ -73,7 +71,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
73 self.filename)) 71 self.filename))
74 zipout.write(name, item) 72 zipout.write(name, item)
75 except: 73 except:
76 logging.info('%s\' fileformat is not supported' % item) 74 logging.info('%s\' fileformat is not supported' % item)
77 if self.add2archive: 75 if self.add2archive:
78 zipout.write(name, item) 76 zipout.write(name, item)
79 mat.secure_remove(name) 77 mat.secure_remove(name)
@@ -88,7 +86,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
88 try: 86 try:
89 zipin.getinfo('meta.xml') 87 zipin.getinfo('meta.xml')
90 return False 88 return False
91 except KeyError:#no meta.xml in the file 89 except KeyError: # no meta.xml in the file
92 zipin.close() 90 zipin.close()
93 czf = archive.ZipStripper(self.realname, self.filename, 91 czf = archive.ZipStripper(self.realname, self.filename,
94 self.parser, self.editor, self.backup, self.add2archive) 92 self.parser, self.editor, self.backup, self.add2archive)
@@ -104,7 +102,7 @@ class PdfStripper(parser.Generic_parser):
104 Represent a pdf file, with the help of pdfrw 102 Represent a pdf file, with the help of pdfrw
105 ''' 103 '''
106 def __init__(self, filename, realname, backup): 104 def __init__(self, filename, realname, backup):
107 name, path = os.path.splitext(filename) 105 name, ext = os.path.splitext(filename)
108 self.output = name + '.cleaned' + ext 106 self.output = name + '.cleaned' + ext
109 self.filename = filename 107 self.filename = filename
110 self.backup = backup 108 self.backup = backup
@@ -137,7 +135,7 @@ class PdfStripper(parser.Generic_parser):
137 ''' 135 '''
138 _, self.tmpdir = tempfile.mkstemp() 136 _, self.tmpdir = tempfile.mkstemp()
139 subprocess.call(self.convert % (self.filename, self.tmpdir + 137 subprocess.call(self.convert % (self.filename, self.tmpdir +
140 'temp.jpg'), shell=True)#Convert pages to jpg 138 'temp.jpg'), shell=True) # Convert pages to jpg
141 139
142 for current_file in glob.glob(self.tmpdir + 'temp*'): 140 for current_file in glob.glob(self.tmpdir + 'temp*'):
143 #Clean every jpg image 141 #Clean every jpg image
@@ -145,18 +143,18 @@ class PdfStripper(parser.Generic_parser):
145 class_file.remove_all() 143 class_file.remove_all()
146 144
147 subprocess.call(self.convert % (self.tmpdir + 145 subprocess.call(self.convert % (self.tmpdir +
148 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf 146 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf
149 147
150 for current_file in glob.glob(self.tmpdir + 'temp*'): 148 for current_file in glob.glob(self.tmpdir + 'temp*'):
151 #remove jpg files 149 #remove jpg files
152 mat.secure_remove(current_file) 150 mat.secure_remove(current_file)
153 151
154 if self.backup is False: 152 if self.backup is False:
155 mat.secure_remove(self.filename) #remove the old file 153 mat.secure_remove(self.filename) # remove the old file
156 os.rename(self.output, self.filename)#rename the new 154 os.rename(self.output, self.filename) # rename the new
157 name = self.realname 155 name = self.realname
158 else: 156 else:
159 name = output_file 157 name = self.output
160 class_file = mat.create_class_file(name, False) 158 class_file = mat.create_class_file(name, False)
161 class_file.remove_all() 159 class_file.remove_all()
162 160