/*
     This file is part of libextractor.
     (C) 2002, 2003 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 2, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     Boston, MA 02111-1307, USA.

     This code was inspired by pdfinfo and depends heavily on the
     xpdf code that pdfinfo is a part of. See also the INFO file
     in this directory.
 */

#include "platform.h"
#include "extractor.h"
#include "../convert.h"
/* NOTE(review): the listing this was restored from had one #include at this
   position whose header name was missing.  The standard headers that the code
   below demonstrably needs are named explicitly instead -- confirm that no
   other header is required. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "parseargs.h"
#include "GString.h"
#include "gmem.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "Params.h"
#include "Error.h"
#include "config.h"

extern "C" {

  /**
   * Prepend a keyword to a keyword list.
   *
   * @param type    keyword type to record in the new list entry
   * @param keyword string to store; the pointer itself is stored (no copy).
   *                If NULL, nothing is added.
   * @param next    current head of the list (may be NULL)
   * @return the new head of the list, or 'next' unchanged if keyword was
   *         NULL or the list entry could not be allocated
   */
  static struct EXTRACTOR_Keywords *
  addKeyword(EXTRACTOR_KeywordType type,
             char * keyword,
             struct EXTRACTOR_Keywords * next) {
    EXTRACTOR_KeywordList * result;

    if (keyword == NULL)
      return next;
    result = (EXTRACTOR_KeywordList*) malloc(sizeof(EXTRACTOR_KeywordList));
    if (result == NULL) {
      /* Out of memory: drop this keyword instead of dereferencing NULL.
         Every caller in this file hands over a freshly allocated string
         (strdup() or convertToUtf8()); it is assumed here that both are
         released with free() -- TODO confirm for convertToUtf8(). */
      free(keyword);
      return next;
    }
    result->next = next;
    result->keyword = keyword;
    result->keywordType = type;
    return result;
  }

  /**
   * Test whether a byte string starts with the two bytes 0xFE 0xFF, which
   * the callers treat as the marker for UTF-16BE encoded text.
   *
   * @param s   start of the string (must not be NULL)
   * @param len number of bytes available at s
   * @return non-zero if at least two bytes are present and they are FE FF
   */
  static int
  hasUtf16BEMarker(const char * s,
                   int len) {
    return (len >= 2)
      && (((unsigned char) s[0]) == 0xfe)
      && (((unsigned char) s[1]) == 0xff);
  }

  /**
   * Decide whether a character should be stripped from the end of a value.
   *
   * Besides the usual ASCII blanks this also matches the ISO-8859
   * non-breaking space (NBSP, 0xA0).  The test is spelled out rather than
   * using isspace() because there seem to exist isspace() implementations
   * whose result for NBSP is locale-dependent.  Remove the 0xA0 comparison
   * if it turns out to cause locale issues.
   *
   * @return non-zero if c is one of ' ', NBSP, '\r', '\n', '\t', '\v', '\f'
   */
  static int
  isTrailingBlank(char c) {
    return (c == ' ')
      || (c == (char) 0xA0)
      || (c == '\r')
      || (c == '\n')
      || (c == '\t')
      || (c == '\v')
      || (c == '\f');
  }

  /**
   * Look up a string entry in the PDF info dictionary and, if present,
   * add it (converted to UTF-8) to the keyword list.
   *
   * Values that start with FE FF are converted from UTF-16BE; all other
   * values have trailing blanks removed and are converted from ISO-8859-1.
   * Values that are empty after trimming are not added.
   *
   * @param infoDict dictionary to query
   * @param key      name of the entry to look up
   * @param type     keyword type to use for the new entry
   * @param next     current head of the keyword list
   * @return the (possibly extended) keyword list
   */
  static struct EXTRACTOR_Keywords *
  printInfoString(Dict *infoDict,
                  char *key,
                  EXTRACTOR_KeywordType type,
                  struct EXTRACTOR_Keywords * next) {
    Object obj;
    GString *s1;
    char * s;

    if (infoDict->lookup(key, &obj)->isString()) {
      s1 = obj.getString();
      s = s1->getCString();
      /* Check for NULL before the first byte is inspected. */
      if (s != NULL) {
        if (hasUtf16BEMarker(s, s1->getLength())) {
          char * result;

          result = convertToUtf8((const char*) &s[2],
                                 s1->getLength() - 2,
                                 "UTF-16BE");
          next = addKeyword(type, result, next);
        } else {
          unsigned int len = strlen(s);

          /* Avoid outputting trailing spaces.  Squeezing out all
             non-printable characters might also be useful. */
          while ( (0 < len) && isTrailingBlank(s[len - 1]) )
            len--;

          /* there should be a check to truncate preposterously long values. */
          if (0 < len) {
            next = addKeyword(type,
                              convertToUtf8(s, len, "ISO-8859-1"),
                              next);
          }
        }
      }
    }
    obj.free();
    return next;
  }

  /**
   * Look up a date entry in the PDF info dictionary and, if present,
   * add it to the keyword list.
   *
   * Values that start with FE FF are converted from UTF-16BE to UTF-8.
   * Otherwise a leading "D:" prefix is skipped and the remainder is
   * copied verbatim.
   *
   * @param infoDict dictionary to query
   * @param key      name of the entry to look up
   * @param type     keyword type to use for the new entry
   * @param next     current head of the keyword list
   * @return the (possibly extended) keyword list
   */
  static struct EXTRACTOR_Keywords *
  printInfoDate(Dict *infoDict,
                char *key,
                EXTRACTOR_KeywordType type,
                struct EXTRACTOR_Keywords * next) {
    Object obj;
    char *s;
    GString *s1;

    if (infoDict->lookup(key, &obj)->isString()) {
      s1 = obj.getString();
      s = s1->getCString();
      if (s != NULL) {
        if (hasUtf16BEMarker(s, s1->getLength())) {
          /* isUnicode */
          char * result;

          result = convertToUtf8((const char*) &s[2],
                                 s1->getLength() - 2,
                                 "UTF-16BE");
          next = addKeyword(type, result, next);
        } else {
          if (s[0] == 'D' && s[1] == ':') {
            s += 2;
          }
          /* strdup() may return NULL; addKeyword() ignores NULL keywords. */
          next = addKeyword(type, strdup(s), next);
        }
      }
    }
    obj.free();
    return next;
  }

  /* which mime-types should not be subjected to
     the PDF extractor? (no use trying!) */
  static const char * const blacklist[] = {
    "image/jpeg",
    "image/gif",
    "image/png",
    "image/x-png",
    "audio/real",
    "audio/mpeg",
    "application/x-gzip",
    "application/x-dpkg",
    "application/bz2",
    "application/x-rpm",
    "application/x-rar",
    "application/x-zip",
    "application/x-arj",
    "application/x-compress",
    "application/x-tar",
    "application/x-lha",
    "application/x-gtar",
    "application/ogg",
    "video/real",
    "video/asf",
    "video/quicktime",
    NULL,
  };

  /**
   * Find the keyword of the given type that occurs last when walking the
   * list from its head.
   *
   * @param type     keyword type to search for
   * @param keywords head of the list to search (may be NULL)
   * @return the keyword string of the last matching entry, or NULL if
   *         no entry has the requested type
   */
  static const char *
  extractLast (const EXTRACTOR_KeywordType type,
               EXTRACTOR_KeywordList * keywords) {
    const char * found = NULL;
    EXTRACTOR_KeywordList * pos;

    for (pos = keywords; pos != NULL; pos = pos->next) {
      if (pos->keywordType == type)
        found = pos->keyword;
    }
    return found;
  }

  /**
   * Extract keywords from a PDF document held in memory.
   *
   * If the keyword list already carries a mime-type that is on the
   * blacklist, or if the data cannot be opened as a PDF document, the
   * list is returned unchanged.  Otherwise the mime-type
   * "application/pdf" is added and, if the document has an info
   * dictionary, title, subject, keywords, author, creator, producer,
   * page count, PDF version and the creation/modification dates.
   *
   * @param filename name of the file (not used by this function)
   * @param data     contents of the file
   * @param size     number of bytes in data
   * @param prev     keyword list built so far
   * @return the (possibly extended) keyword list
   */
  struct EXTRACTOR_Keywords *
  libextractor_pdf_extract(const char * filename,
                           char * data,
                           size_t size,
                           struct EXTRACTOR_Keywords * prev) {
    PDFDoc * doc;
    Object info;
    Object obj;
    BaseStream * stream;
    struct EXTRACTOR_Keywords * result;
    const char * mime;

    /* if the mime-type of the file is blacklisted, don't
       run the PDF extractor! */
    mime = extractLast(EXTRACTOR_MIMETYPE, prev);
    if (mime != NULL) {
      int j;

      for (j = 0; blacklist[j] != NULL; j++) {
        if (0 == strcmp(blacklist[j], mime))
          return prev;
      }
    }

    /* errorInit();   -- keep commented out, otherwise errors are printed to
       stderr for non-pdf files! */
    obj.initNull();
    stream = new MemStream(data, 0, size, &obj);
    /* NOTE(review): 'stream' is never deleted here; presumably PDFDoc takes
       ownership of it -- confirm against the xpdf sources. */
    doc = new PDFDoc(stream, NULL, NULL);
    if (! doc->isOk()) {
      delete doc;
      return prev;
    }

    result = addKeyword(EXTRACTOR_MIMETYPE,
                        strdup("application/pdf"),
                        prev);
    if ( (NULL != doc->getDocInfo(&info)) &&
         (info.isDict()) ) {
      result = printInfoString(info.getDict(),
                               "Title",
                               EXTRACTOR_TITLE,
                               result);
      result = printInfoString(info.getDict(),
                               "Subject",
                               EXTRACTOR_SUBJECT,
                               result);
      result = printInfoString(info.getDict(),
                               "Keywords",
                               EXTRACTOR_KEYWORDS,
                               result);
      result = printInfoString(info.getDict(),
                               "Author",
                               EXTRACTOR_AUTHOR,
                               result);
      /*
       * we now believe that Adobe's Creator
       * is not a person nor an organisation,
       * but just a piece of software.
       */
      result = printInfoString(info.getDict(),
                               "Creator",
                               EXTRACTOR_SOFTWARE,
                               result);
      result = printInfoString(info.getDict(),
                               "Producer",
                               EXTRACTOR_PRODUCER,
                               result);
      {
        char pcnt[20];
        int n;

        n = snprintf(pcnt, sizeof(pcnt), "%d", doc->getNumPages());
        if ( (n > 0) && ((size_t) n < sizeof(pcnt)) )
          result = addKeyword(EXTRACTOR_PAGE_COUNT,
                              strdup(pcnt),
                              result);
      }
      {
        char pcnt[20];
        int n;

        /* "%.1f" expands a large value (e.g. 1e300) to hundreds of digits,
           so the write must be bounded.  A version that does not fit is
           not reported at all rather than reported truncated. */
        n = snprintf(pcnt, sizeof(pcnt), "PDF %.1f", doc->getPDFVersion());
        if ( (n > 0) && ((size_t) n < sizeof(pcnt)) )
          result = addKeyword(EXTRACTOR_FORMAT,
                              strdup(pcnt),
                              result);
      }
      result = printInfoDate(info.getDict(),
                             "CreationDate",
                             EXTRACTOR_CREATION_DATE,
                             result);
      result = printInfoDate(info.getDict(),
                             "ModDate",
                             EXTRACTOR_MODIFICATION_DATE,
                             result);
    }
    info.free();
    delete doc;

    return result;
  }
}

/**
 * Runs automatically when the library is loaded (GCC constructor
 * attribute): initialises the xpdf parameters.
 */
void __attribute__ ((constructor)) xpdf_init(void) {
  initParams(".xpdfrc", ".xpdfrc");
}

/**
 * Runs automatically when the library is unloaded (GCC destructor
 * attribute): releases the xpdf parameters.
 */
void __attribute__ ((destructor)) xpdf_done(void) {
  freeParams();
}