summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- FORMATS 4
-rw-r--r-- README 14
-rw-r--r-- mat/office.py 10
-rw-r--r-- mat/strippers.py 4
-rw-r--r-- test/test.py 2
5 files changed, 17 insertions, 17 deletions
diff --git a/FORMATS b/FORMATS
index 6061a79..51fdb26 100644
--- a/FORMATS
+++ b/FORMATS
@@ -39,11 +39,11 @@
39 <extension>.pdf</extension> 39 <extension>.pdf</extension>
40 <support>full</support> 40 <support>full</support>
41 <metadata>a lot</metadata> 41 <metadata>a lot</metadata>
42 <method> rendering of the pdf file on a cairo surface with the help of 42 <method> rendering of the PDF file on a cairo surface with the help of
43 poppler in order to remove all the internal metadata. 43 poppler in order to remove all the internal metadata.
44 For now, cairo creates some metadata. 44 For now, cairo creates some metadata.
45 They can be removed if you install either exiftool, or python-pdfrw. 45 They can be removed if you install either exiftool, or python-pdfrw.
46 The next version of python-cairo will support pdf metadata. 46 The next version of python-cairo will support PDF metadata.
47 </format> 47 </format>
48 48
49 <format> 49 <format>
diff --git a/README b/README
index 27c307a..3943ff2 100644
--- a/README
+++ b/README
@@ -7,7 +7,7 @@ METADATA:
7METADATA AND PRIVACY: 7METADATA AND PRIVACY:
8 Metadata within a file can tell a lot about you. 8 Metadata within a file can tell a lot about you.
9 Cameras record data about when a picture was taken and what 9 Cameras record data about when a picture was taken and what
10 camera was used. Office documents like pdf or Office automatically add 10 camera was used. Office documents like PDF or Office automatically add
11 author and company information to documents and spreadsheets. 11 author and company information to documents and spreadsheets.
12 Maybe you don't want to disclose that information on the web. 12 Maybe you don't want to disclose that information on the web.
13 13
@@ -23,12 +23,12 @@ WARNING :
23DEPENDENCIES: 23DEPENDENCIES:
24 python2.6 (at least) 24 python2.6 (at least)
25 python-hachoir-core and python-hachoir-parser 25 python-hachoir-core and python-hachoir-parser
26 python-pdfrw or exiftool for full pdf support 26 python-pdfrw or exiftool for full PDF support
27 shred (should be already installed) 27 shred (should be already installed)
28 28
29 29
30OPTIONAL DEPENDENCIES: 30OPTIONAL DEPENDENCIES:
31 python-poppler and python-cairo : for pdf support 31 python-poppler and python-cairo : for PDF support
32 python-mutagen : for massive audio format support 32 python-mutagen : for massive audio format support
33 exiftool : for _massive_ image format support 33 exiftool : for _massive_ image format support
34 34
@@ -67,11 +67,11 @@ SUPPORTED FORMAT:
67 Portable Document Fileformat (.pdf) 67 Portable Document Fileformat (.pdf)
68 support : full 68 support : full
69 metadata : a lot 69 metadata : a lot
70 method : rendering of the pdf file on a cairo surface with the help of 70 method : rendering of the PDF file on a cairo surface with the help of
71 poppler in order to remove all the internal metadata. 71 poppler in order to remove all the internal metadata.
72 For now, cairo creates some metadata. 72 For now, cairo creates some metadata.
73 They can be removed if you install either exiftool, or python-pdfrw. 73 They can be removed if you install either exiftool, or python-pdfrw.
74 The next version of python-cairo will support pdf metadata. 74 The next version of python-cairo will support PDF metadata.
75 75
76 76
77 Tape ARchive (.tar, .tar.bz2, .tar.gz) 77 Tape ARchive (.tar, .tar.bz2, .tar.gz)
@@ -139,8 +139,8 @@ for images:
139 exiv2 (C++) : metadata manipulation 139 exiv2 (C++) : metadata manipulation
140 graphicsmagick (a fork from imagemagick) : cli image manipulation 140 graphicsmagick (a fork from imagemagick) : cli image manipulation
141 141
142for pdf: 142for PDF:
143 pdfminer (python) : pdf manipulation 143 pdfminer (python) : PDF manipulation
144 144
145other tools: 145other tools:
146 a hexadecimal editor 146 a hexadecimal editor
diff --git a/mat/office.py b/mat/office.py
index d1e781e..d6ad367 100644
--- a/mat/office.py
+++ b/mat/office.py
@@ -112,7 +112,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper):
112 112
113class PdfStripper(parser.GenericParser): 113class PdfStripper(parser.GenericParser):
114 ''' 114 '''
115 Represent a pdf file 115 Represent a PDF file
116 ''' 116 '''
117 def __init__(self, filename, parser, mime, backup, add2archive): 117 def __init__(self, filename, parser, mime, backup, add2archive):
118 super(PdfStripper, self).__init__(filename, parser, mime, backup, 118 super(PdfStripper, self).__init__(filename, parser, mime, backup,
@@ -143,7 +143,7 @@ class PdfStripper(parser.GenericParser):
143 143
144 def remove_all_ugly(self): 144 def remove_all_ugly(self):
145 ''' 145 '''
146 Opening the pdf with poppler, then doing a render 146 Opening the PDF with poppler, then doing a render
147 on a cairo pdfsurface for each page. 147 on a cairo pdfsurface for each page.
148 Thanks to Lunar^ for the idea. 148 Thanks to Lunar^ for the idea.
149 http://cairographics.org/documentation/pycairo/2/ 149 http://cairographics.org/documentation/pycairo/2/
@@ -153,7 +153,7 @@ class PdfStripper(parser.GenericParser):
153 page_width, page_height = page.get_size() 153 page_width, page_height = page.get_size()
154 surface = cairo.PDFSurface(self.output, page_width, page_height) 154 surface = cairo.PDFSurface(self.output, page_width, page_height)
155 context = cairo.Context(surface) # context draws on the surface 155 context = cairo.Context(surface) # context draws on the surface
156 logging.debug('Pdf rendering of %s' % self.filename) 156 logging.debug('PDF rendering of %s' % self.filename)
157 for pagenum in xrange(self.document.get_n_pages()): 157 for pagenum in xrange(self.document.get_n_pages()):
158 page = self.document.get_page(pagenum) 158 page = self.document.get_page(pagenum)
159 context.translate(0, 0) 159 context.translate(0, 0)
@@ -165,7 +165,7 @@ class PdfStripper(parser.GenericParser):
165 def _remove_meta(self): 165 def _remove_meta(self):
166 ''' 166 '''
167 Remove superficial/external metadata 167 Remove superficial/external metadata
168 from a pdf file, using exiftool, 168 from a PDF file, using exiftool,
169 or pdfrw if exiftool is not installed 169 or pdfrw if exiftool is not installed
170 ''' 170 '''
171 processed = False 171 processed = False
@@ -203,7 +203,7 @@ class PdfStripper(parser.GenericParser):
203 203
204 if processed is False: 204 if processed is False:
205 logging.error('Please install either pdfrw, or exiftool to\ 205 logging.error('Please install either pdfrw, or exiftool to\
206 fully handle pdf files') 206 fully handle PDF files')
207 return processed 207 return processed
208 208
209 def get_meta(self): 209 def get_meta(self):
diff --git a/mat/strippers.py b/mat/strippers.py
index 1cf2271..7d27874 100644
--- a/mat/strippers.py
+++ b/mat/strippers.py
@@ -20,13 +20,13 @@ STRIPPERS = {
20 'application/officeopenxml': office.OpenXmlStripper, 20 'application/officeopenxml': office.OpenXmlStripper,
21} 21}
22 22
23try: # pdf support 23try: # PDF support
24 import poppler 24 import poppler
25 import cairo 25 import cairo
26 STRIPPERS['application/x-pdf'] = office.PdfStripper 26 STRIPPERS['application/x-pdf'] = office.PdfStripper
27 STRIPPERS['application/pdf'] = office.PdfStripper 27 STRIPPERS['application/pdf'] = office.PdfStripper
28except ImportError: 28except ImportError:
29 print('Unable to import python-poppler and/or python-cairo: no pdf \ 29 print('Unable to import python-poppler and/or python-cairo: no PDF \
30 support') 30 support')
31 31
32try: # python-mutagen : audio format support 32try: # python-mutagen : audio format support
diff --git a/test/test.py b/test/test.py
index 601f5ef..c414c77 100644
--- a/test/test.py
+++ b/test/test.py
@@ -21,7 +21,7 @@ dirty.sort()
21 21
22FILE_LIST = zip(clean, dirty) 22FILE_LIST = zip(clean, dirty)
23 23
24try: # pdf render processing 24try: # PDF render processing
25 import poppler 25 import poppler
26 import cairo 26 import cairo
27except: 27except: