diff options
Diffstat (limited to 'lib/office.py')
| -rw-r--r-- | lib/office.py | 24 |
1 files changed, 11 insertions, 13 deletions
diff --git a/lib/office.py b/lib/office.py
index 27677d2..432bc0b 100644
--- a/lib/office.py
+++ b/lib/office.py
| @@ -5,17 +5,16 @@ import tempfile | |||
| 5 | import glob | 5 | import glob |
| 6 | import logging | 6 | import logging |
| 7 | import zipfile | 7 | import zipfile |
| 8 | import shutil | ||
| 9 | import re | 8 | import re |
| 10 | from xml.etree import ElementTree | 9 | from xml.etree import ElementTree |
| 11 | 10 | ||
| 12 | import hachoir_core | ||
| 13 | 11 | ||
| 14 | import pdfrw | 12 | import pdfrw |
| 15 | import mat | 13 | import mat |
| 16 | import parser | 14 | import parser |
| 17 | import archive | 15 | import archive |
| 18 | 16 | ||
| 17 | |||
| 19 | class OpenDocumentStripper(archive.GenericArchiveStripper): | 18 | class OpenDocumentStripper(archive.GenericArchiveStripper): |
| 20 | ''' | 19 | ''' |
| 21 | An open document file is a zip, with xml file into. | 20 | An open document file is a zip, with xml file into. |
| @@ -32,11 +31,10 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 32 | for node in tree.iter(): | 31 | for node in tree.iter(): |
| 33 | key = re.sub('{.*}', '', node.tag) | 32 | key = re.sub('{.*}', '', node.tag) |
| 34 | metadata[key] = node.text | 33 | metadata[key] = node.text |
| 35 | except KeyError:#no meta.xml file found | 34 | except KeyError: # no meta.xml file found |
| 36 | logging.debug('%s has no opendocument metadata' % self.filename) | 35 | logging.debug('%s has no opendocument metadata' % self.filename) |
| 37 | return metadata | 36 | return metadata |
| 38 | 37 | ||
| 39 | |||
| 40 | def _remove_all(self, method): | 38 | def _remove_all(self, method): |
| 41 | ''' | 39 | ''' |
| 42 | FIXME ? | 40 | FIXME ? |
| @@ -50,7 +48,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 50 | name = os.path.join(self.tempdir, item) | 48 | name = os.path.join(self.tempdir, item) |
| 51 | if item.endswith('.xml') or item == 'mimetype': | 49 | if item.endswith('.xml') or item == 'mimetype': |
| 52 | #keep .xml files, and the "manifest" file | 50 | #keep .xml files, and the "manifest" file |
| 53 | if item != 'meta.xml':#contains the metadata | 51 | if item != 'meta.xml': # contains the metadata |
| 54 | zipin.extract(item, self.tempdir) | 52 | zipin.extract(item, self.tempdir) |
| 55 | zipout.write(name, item) | 53 | zipout.write(name, item) |
| 56 | mat.secure_remove(name) | 54 | mat.secure_remove(name) |
| @@ -73,7 +71,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 73 | self.filename)) | 71 | self.filename)) |
| 74 | zipout.write(name, item) | 72 | zipout.write(name, item) |
| 75 | except: | 73 | except: |
| 76 | logging.info('%s\' fileformat is not supported' % item) | 74 | logging.info('%s\' fileformat is not supported' % item) |
| 77 | if self.add2archive: | 75 | if self.add2archive: |
| 78 | zipout.write(name, item) | 76 | zipout.write(name, item) |
| 79 | mat.secure_remove(name) | 77 | mat.secure_remove(name) |
| @@ -88,7 +86,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 88 | try: | 86 | try: |
| 89 | zipin.getinfo('meta.xml') | 87 | zipin.getinfo('meta.xml') |
| 90 | return False | 88 | return False |
| 91 | except KeyError:#no meta.xml in the file | 89 | except KeyError: # no meta.xml in the file |
| 92 | zipin.close() | 90 | zipin.close() |
| 93 | czf = archive.ZipStripper(self.realname, self.filename, | 91 | czf = archive.ZipStripper(self.realname, self.filename, |
| 94 | self.parser, self.editor, self.backup, self.add2archive) | 92 | self.parser, self.editor, self.backup, self.add2archive) |
| @@ -104,7 +102,7 @@ class PdfStripper(parser.Generic_parser): | |||
| 104 | Represent a pdf file, with the help of pdfrw | 102 | Represent a pdf file, with the help of pdfrw |
| 105 | ''' | 103 | ''' |
| 106 | def __init__(self, filename, realname, backup): | 104 | def __init__(self, filename, realname, backup): |
| 107 | name, path = os.path.splitext(filename) | 105 | name, ext = os.path.splitext(filename) |
| 108 | self.output = name + '.cleaned' + ext | 106 | self.output = name + '.cleaned' + ext |
| 109 | self.filename = filename | 107 | self.filename = filename |
| 110 | self.backup = backup | 108 | self.backup = backup |
| @@ -137,7 +135,7 @@ class PdfStripper(parser.Generic_parser): | |||
| 137 | ''' | 135 | ''' |
| 138 | _, self.tmpdir = tempfile.mkstemp() | 136 | _, self.tmpdir = tempfile.mkstemp() |
| 139 | subprocess.call(self.convert % (self.filename, self.tmpdir + | 137 | subprocess.call(self.convert % (self.filename, self.tmpdir + |
| 140 | 'temp.jpg'), shell=True)#Convert pages to jpg | 138 | 'temp.jpg'), shell=True) # Convert pages to jpg |
| 141 | 139 | ||
| 142 | for current_file in glob.glob(self.tmpdir + 'temp*'): | 140 | for current_file in glob.glob(self.tmpdir + 'temp*'): |
| 143 | #Clean every jpg image | 141 | #Clean every jpg image |
| @@ -145,18 +143,18 @@ class PdfStripper(parser.Generic_parser): | |||
| 145 | class_file.remove_all() | 143 | class_file.remove_all() |
| 146 | 144 | ||
| 147 | subprocess.call(self.convert % (self.tmpdir + | 145 | subprocess.call(self.convert % (self.tmpdir + |
| 148 | 'temp.jpg*', self.output), shell=True)#Assemble jpg into pdf | 146 | 'temp.jpg*', self.output), shell=True) # Assemble jpg into pdf |
| 149 | 147 | ||
| 150 | for current_file in glob.glob(self.tmpdir + 'temp*'): | 148 | for current_file in glob.glob(self.tmpdir + 'temp*'): |
| 151 | #remove jpg files | 149 | #remove jpg files |
| 152 | mat.secure_remove(current_file) | 150 | mat.secure_remove(current_file) |
| 153 | 151 | ||
| 154 | if self.backup is False: | 152 | if self.backup is False: |
| 155 | mat.secure_remove(self.filename) #remove the old file | 153 | mat.secure_remove(self.filename) # remove the old file |
| 156 | os.rename(self.output, self.filename)#rename the new | 154 | os.rename(self.output, self.filename) # rename the new |
| 157 | name = self.realname | 155 | name = self.realname |
| 158 | else: | 156 | else: |
| 159 | name = output_file | 157 | name = self.output |
| 160 | class_file = mat.create_class_file(name, False) | 158 | class_file = mat.create_class_file(name, False) |
| 161 | class_file.remove_all() | 159 | class_file.remove_all() |
| 162 | 160 | ||
