diff options
| author | jvoisin | 2011-12-04 13:53:04 +0100 |
|---|---|---|
| committer | jvoisin | 2011-12-04 13:53:04 +0100 |
| commit | 2ba7a313fd0dd3d5e61927e93be4ed71e2fbaee1 (patch) | |
| tree | 366ab80fa188ce436f264d5b2baf13971a0d2de1 | |
| parent | 697cf6187671c92bf9a81572e5329a1dea3c21e7 (diff) | |
"PDF" and not "pdf"
| -rw-r--r-- | FORMATS | 4 | ||||
| -rw-r--r-- | README | 14 | ||||
| -rw-r--r-- | mat/office.py | 10 | ||||
| -rw-r--r-- | mat/strippers.py | 4 | ||||
| -rw-r--r-- | test/test.py | 2 |
5 files changed, 17 insertions, 17 deletions
| @@ -39,11 +39,11 @@ | |||
| 39 | <extension>.pdf</extension> | 39 | <extension>.pdf</extension> |
| 40 | <support>full</support> | 40 | <support>full</support> |
| 41 | <metadata>a lot</metadata> | 41 | <metadata>a lot</metadata> |
| 42 | <method> rendering of the pdf file on a cairo surface with the help of | 42 | <method> rendering of the PDF file on a cairo surface with the help of |
| 43 | poppler in order to remove all the internal metadata. | 43 | poppler in order to remove all the internal metadata. |
| 44 | For now, cairo creates some metadata. | 44 | For now, cairo creates some metadata. |
| 45 | They can be removed if you install either exiftool, or python-pdfrw. | 45 | They can be removed if you install either exiftool, or python-pdfrw. |
| 46 | The next version of python-cairo will support pdf metadata. | 46 | The next version of python-cairo will support PDF metadata. |
| 47 | </format> | 47 | </format> |
| 48 | 48 | ||
| 49 | <format> | 49 | <format> |
| @@ -7,7 +7,7 @@ METADATA: | |||
| 7 | METADATA AND PRIVACY: | 7 | METADATA AND PRIVACY: |
| 8 | Metadata within a file can tell a lot about you. | 8 | Metadata within a file can tell a lot about you. |
| 9 | Cameras record data about when a picture was taken and what | 9 | Cameras record data about when a picture was taken and what |
| 10 | camera was used. Office documents like pdf or Office automatically adds | 10 | camera was used. Office documents like PDF or Office automatically adds |
| 11 | author and company information to documents and spreadsheets. | 11 | author and company information to documents and spreadsheets. |
| 12 | Maybe you don't want to disclose that information on the web. | 12 | Maybe you don't want to disclose that information on the web. |
| 13 | 13 | ||
| @@ -23,12 +23,12 @@ WARNING : | |||
| 23 | DEPENDENCIES: | 23 | DEPENDENCIES: |
| 24 | python2.6 (at least) | 24 | python2.6 (at least) |
| 25 | python-hachoir-core and python-hachoir-parser | 25 | python-hachoir-core and python-hachoir-parser |
| 26 | python-pdfrw or exiftool for full pdf support | 26 | python-pdfrw or exiftool for full PDF support |
| 27 | shred (should be already installed) | 27 | shred (should be already installed) |
| 28 | 28 | ||
| 29 | 29 | ||
| 30 | OPTIONALS DEPENDENCIES: | 30 | OPTIONALS DEPENDENCIES: |
| 31 | python-poppler and python-cairo : for pdf support | 31 | python-poppler and python-cairo : for PDF support |
| 32 | python-mutagen : for massive audio format support | 32 | python-mutagen : for massive audio format support |
| 33 | exiftool : for _massive_ image format support | 33 | exiftool : for _massive_ image format support |
| 34 | 34 | ||
| @@ -67,11 +67,11 @@ SUPPORTED FORMAT: | |||
| 67 | Portable Document Fileformat (.pdf) | 67 | Portable Document Fileformat (.pdf) |
| 68 | support : full | 68 | support : full |
| 69 | metadata : a lot | 69 | metadata : a lot |
| 70 | method : rendering of the pdf file on a cairo surface with the help of | 70 | method : rendering of the PDF file on a cairo surface with the help of |
| 71 | poppler in order to remove all the internal metadata. | 71 | poppler in order to remove all the internal metadata. |
| 72 | For now, cairo creates some metadata. | 72 | For now, cairo creates some metadata. |
| 73 | They can be removed if you install either exiftool, or python-pdfrw. | 73 | They can be removed if you install either exiftool, or python-pdfrw. |
| 74 | The next version of python-cairo will support pdf metadata. | 74 | The next version of python-cairo will support PDF metadata. |
| 75 | 75 | ||
| 76 | 76 | ||
| 77 | Tape ARchive (.tar, .tar.bz2, .tar.gz) | 77 | Tape ARchive (.tar, .tar.bz2, .tar.gz) |
| @@ -139,8 +139,8 @@ for images: | |||
| 139 | exiv2 (C++) : metadata manipulation | 139 | exiv2 (C++) : metadata manipulation |
| 140 | graphicsmagick (a fork from imagemagick) : cli image manipulation | 140 | graphicsmagick (a fork from imagemagick) : cli image manipulation |
| 141 | 141 | ||
| 142 | for pdf: | 142 | for PDF: |
| 143 | pdfminer (python) : pdf manipulation | 143 | pdfminer (python) : PDF manipulation |
| 144 | 144 | ||
| 145 | other tools: | 145 | other tools: |
| 146 | a hexadecimal editor | 146 | a hexadecimal editor |
diff --git a/mat/office.py b/mat/office.py index d1e781e..d6ad367 100644 --- a/mat/office.py +++ b/mat/office.py | |||
| @@ -112,7 +112,7 @@ class OpenDocumentStripper(archive.GenericArchiveStripper): | |||
| 112 | 112 | ||
| 113 | class PdfStripper(parser.GenericParser): | 113 | class PdfStripper(parser.GenericParser): |
| 114 | ''' | 114 | ''' |
| 115 | Represent a pdf file | 115 | Represent a PDF file |
| 116 | ''' | 116 | ''' |
| 117 | def __init__(self, filename, parser, mime, backup, add2archive): | 117 | def __init__(self, filename, parser, mime, backup, add2archive): |
| 118 | super(PdfStripper, self).__init__(filename, parser, mime, backup, | 118 | super(PdfStripper, self).__init__(filename, parser, mime, backup, |
| @@ -143,7 +143,7 @@ class PdfStripper(parser.GenericParser): | |||
| 143 | 143 | ||
| 144 | def remove_all_ugly(self): | 144 | def remove_all_ugly(self): |
| 145 | ''' | 145 | ''' |
| 146 | Opening the pdf with poppler, then doing a render | 146 | Opening the PDF with poppler, then doing a render |
| 147 | on a cairo pdfsurface for each page. | 147 | on a cairo pdfsurface for each page. |
| 148 | Thanks to Lunar^for the idea. | 148 | Thanks to Lunar^for the idea. |
| 149 | http://cairographics.org/documentation/pycairo/2/ | 149 | http://cairographics.org/documentation/pycairo/2/ |
| @@ -153,7 +153,7 @@ class PdfStripper(parser.GenericParser): | |||
| 153 | page_width, page_height = page.get_size() | 153 | page_width, page_height = page.get_size() |
| 154 | surface = cairo.PDFSurface(self.output, page_width, page_height) | 154 | surface = cairo.PDFSurface(self.output, page_width, page_height) |
| 155 | context = cairo.Context(surface) # context draws on the surface | 155 | context = cairo.Context(surface) # context draws on the surface |
| 156 | logging.debug('Pdf rendering of %s' % self.filename) | 156 | logging.debug('PDF rendering of %s' % self.filename) |
| 157 | for pagenum in xrange(self.document.get_n_pages()): | 157 | for pagenum in xrange(self.document.get_n_pages()): |
| 158 | page = self.document.get_page(pagenum) | 158 | page = self.document.get_page(pagenum) |
| 159 | context.translate(0, 0) | 159 | context.translate(0, 0) |
| @@ -165,7 +165,7 @@ class PdfStripper(parser.GenericParser): | |||
| 165 | def _remove_meta(self): | 165 | def _remove_meta(self): |
| 166 | ''' | 166 | ''' |
| 167 | Remove superficial/external metadata | 167 | Remove superficial/external metadata |
| 168 | from a pdf file, using exiftool, | 168 | from a PDF file, using exiftool, |
| 169 | or pdfrw if exiftool is not installed | 169 | or pdfrw if exiftool is not installed |
| 170 | ''' | 170 | ''' |
| 171 | processed = False | 171 | processed = False |
| @@ -203,7 +203,7 @@ class PdfStripper(parser.GenericParser): | |||
| 203 | 203 | ||
| 204 | if processed is False: | 204 | if processed is False: |
| 205 | logging.error('Please install either pdfrw, or exiftool to\ | 205 | logging.error('Please install either pdfrw, or exiftool to\ |
| 206 | fully handle pdf files') | 206 | fully handle PDF files') |
| 207 | return processed | 207 | return processed |
| 208 | 208 | ||
| 209 | def get_meta(self): | 209 | def get_meta(self): |
diff --git a/mat/strippers.py b/mat/strippers.py index 1cf2271..7d27874 100644 --- a/mat/strippers.py +++ b/mat/strippers.py | |||
| @@ -20,13 +20,13 @@ STRIPPERS = { | |||
| 20 | 'application/officeopenxml': office.OpenXmlStripper, | 20 | 'application/officeopenxml': office.OpenXmlStripper, |
| 21 | } | 21 | } |
| 22 | 22 | ||
| 23 | try: # pdf support | 23 | try: # PDF support |
| 24 | import poppler | 24 | import poppler |
| 25 | import cairo | 25 | import cairo |
| 26 | STRIPPERS['application/x-pdf'] = office.PdfStripper | 26 | STRIPPERS['application/x-pdf'] = office.PdfStripper |
| 27 | STRIPPERS['application/pdf'] = office.PdfStripper | 27 | STRIPPERS['application/pdf'] = office.PdfStripper |
| 28 | except ImportError: | 28 | except ImportError: |
| 29 | print('Unable to import python-poppler and/or python-cairo: no pdf \ | 29 | print('Unable to import python-poppler and/or python-cairo: no PDF \ |
| 30 | support') | 30 | support') |
| 31 | 31 | ||
| 32 | try: # mutangen-python : audio format support | 32 | try: # mutangen-python : audio format support |
diff --git a/test/test.py b/test/test.py index 601f5ef..c414c77 100644 --- a/test/test.py +++ b/test/test.py | |||
| @@ -21,7 +21,7 @@ dirty.sort() | |||
| 21 | 21 | ||
| 22 | FILE_LIST = zip(clean, dirty) | 22 | FILE_LIST = zip(clean, dirty) |
| 23 | 23 | ||
| 24 | try: # pdf render processing | 24 | try: # PDF render processing |
| 25 | import poppler | 25 | import poppler |
| 26 | import cairo | 26 | import cairo |
| 27 | except: | 27 | except: |
