/*
     This file is part of libextractor.
     (C) 2006 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 2, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     Boston, MA 02111-1307, USA.
 */

/**
 * TODO:
 * - code clean up (factor out some parsing aspects?)
 * - proper dictionary support
 * - filters (compression!)
 * - page count (and other document catalog information,
 *   such as language, viewer preferences, page layout,
 *   Metadatastreams (10.2.2), legal and permissions info)
 * - pdf 1.5 support ((compressed) cross reference streams)
 */

#include "platform.h"
#include "extractor.h"
/* NOTE(review): the header name of the following #include is missing in
   this copy of the file -- restore it from the upstream source. */
#include
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 1
#endif
/* NOTE(review): the header name of the following #include is missing in
   this copy of the file -- restore it from the upstream source. */
#include
#include "convert.h"

/**
 * Copy the first n bytes of a buffer into a newly allocated,
 * NUL-terminated string.
 *
 * @param str source bytes; exactly n bytes are read from it
 * @param n number of bytes to copy
 * @return buffer of n+1 bytes obtained from malloc, with '\0' stored
 *         at index n; the caller releases it with free
 *
 * NOTE(review): the result of malloc is written to without a NULL
 * check, so an allocation failure is not reported to the caller.
 */
static char * stndup(const char * str, size_t n) {
  char * tmp;
  tmp = malloc(n+1);
  tmp[n] = '\0';
  memcpy(tmp, str, n);
  return tmp;
}

/**
 * Put a new keyword entry in front of an existing keyword list.
 *
 * @param type value stored in the keywordType field of the new entry
 * @param keyword string stored in the new entry; the pointer itself is
 *        stored, no copy is made.  If NULL, nothing is added.
 * @param next list the new entry is linked in front of
 * @return the new entry, or next unchanged when keyword is NULL
 *
 * NOTE(review): the result of malloc is dereferenced without a NULL
 * check.
 */
static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type,
                                              char * keyword,
                                              struct EXTRACTOR_Keywords * next) {
  EXTRACTOR_KeywordList * result;
  if (keyword == NULL)
    return next;
  result = malloc(sizeof(EXTRACTOR_KeywordList));
  result->next = next;
  result->keyword = keyword;
  result->keywordType = type;
  return result;
}

/**
 * Extract the payload of a date value: returns a copy of pdfString
 * without its first three characters and without its last character.
 * The dropped characters are not inspected.
 * (Presumably they are the "(D:" prefix and the closing ")" of a PDF
 * date string -- TODO confirm against the callers' input.)
 *
 * @param pdfString NUL-terminated input, may be NULL
 * @return newly allocated string (see stndup), or NULL if pdfString is
 *         NULL or shorter than 4 characters
 */
static unsigned char * dateDecode(const char * pdfString) {
  if (pdfString == NULL)
    return NULL;
  if (strlen(pdfString) < 4)
    return NULL;
  return (unsigned char*) stndup(&pdfString[3], strlen(pdfString) - 4);
}

/**
 * Decode a PDF string object given in its textual form.  Two forms are
 * recognised by the first character: "(" ... ")" (literal string with
 * backslash escapes) and "<" ... ">" (hexadecimal string).
 *
 * @param pdfString NUL-terminated input; must not be NULL since strlen
 *        is applied to it unconditionally
 * @param size set to the number of decoded bytes on success
 * @return newly allocated buffer with the decoded bytes, or NULL if the
 *         input is shorter than 2 characters, does not end with the
 *         matching delimiter, or contains an invalid escape
 *
 * NOTE(review): parts of this function are missing in this copy of the
 * file (see the notes inside the body); only the visible statements
 * are documented.
 */
static unsigned char * stringDecode(const char * pdfString, size_t *
size) {
  size_t slen;
  size_t r;
  size_t w;
  unsigned char * ret;
  char hex[3];
  int i;
  int val;
  slen = strlen(pdfString);
  if (slen < 2)
    return NULL;
  switch (pdfString[0]) {
  case '(':
    if (pdfString[slen-1] != ')')
      return NULL;
    /* The decoded output is written into a buffer as large as the input.
       NOTE(review): the malloc result is not checked. */
    ret = malloc(slen);
    w = 0;
    /* NOTE(review): the text between "for (r=1;r" and "= '0')" is missing
       in this copy of the file (loop header, the start of the escape
       switch and the declarations of buf and u).  The surviving tail
       below belongs to the octal-escape case: further digits are
       collected into buf and the result is parsed with "%o".  The
       follow-up digit tests accept characters up to '9', although only
       '0'..'7' are octal digits -- confirm whether that is intended. */
    for (r=1;r= '0') && (pdfString[r] <= '9') ) buf[1] = pdfString[r++];
          if ( (pdfString[r] >= '0') && (pdfString[r] <= '9') )
            buf[2] = pdfString[r++];
          if (1 == sscanf(buf, "%o", &u)) {
            ret[w++] = (char) u;
          } else {
            free(ret);
            return NULL; /* invalid! */
          }
          break;
        }
        default: /* invalid */
          free(ret);
          return NULL;
        }
      } else {
        /* not an escape sequence: copy the character unchanged */
        ret[w++] = pdfString[r];
      }
    }
    /* NOTE(review): this stores a '/' character after the decoded bytes;
       a '\0' terminator looks intended here -- confirm.  The decoded
       length reported through *size does not include this byte. */
    ret[w] = '/';
    *size = w;
    return ret;
  case '<':
    if (pdfString[slen-1] != '>')
      return NULL;
    /* hex holds the digits handed to the conversion; index 2 is the
       terminator */
    hex[2] = '\0';
    /* NOTE(review): the malloc result is not checked. */
    ret = malloc(1 + ((slen - 1) / 2));
    /* NOTE(review): the text between "for (i=0;i" and " 0) &&" is missing
       in this copy of the file.  The lost span contains the rest of
       stringDecode and the beginning of the next function, including
       its header and the declarations of every identifier used from
       here on that is not defined above (data, size, prev, buf, pcnt,
       version, pos, steps, startxref, xrefpos, xstart, xcount, xinfo,
       info_offset, haveValidXref, spos, meta, dmeta, mlen, tagmap,
       charsetDecode and the macros PDF_*, IS_NL, SKIP, MIN, MAX_STEPS).
       What follows is the body of that next function.  Its first
       surviving statement shrinks size while the last byte satisfies
       IS_NL (presumably a newline test -- the macro is not visible). */
    for (i=0;i 0) && (IS_NL(data[size-1])) ) size--;
  /* The data must be large enough for header, EOF marker and startxref
     marker, must start with PDF_HEADER and must end with PDF_EOF. */
  if (size < strlen(PDF_HEADER) + strlen(PDF_EOF) + strlen(PDF_SXR) + 3)
    return prev;
  if (0 != memcmp(data, PDF_HEADER, strlen(PDF_HEADER)))
    return prev;
  if (0 != memcmp(&data[size - strlen(PDF_EOF)], PDF_EOF, strlen(PDF_EOF)))
    return prev;
  /* PDF format is pretty much sure by now */
  /* Parse the version number out of the first 8 bytes and report it as
     an EXTRACTOR_FORMAT keyword of the form "PDF x.y". */
  memcpy(buf, data, 8);
  buf[8] = '\0';
  if (1 != sscanf(buf, "%%PDF-%f", &version)) {
    return prev;
  }
  sprintf(pcnt, "PDF %.1f", version);
  prev = addKeyword(EXTRACTOR_FORMAT, strdup(pcnt), prev);
  /* Search backwards from just before the EOF marker for PDF_SXR.  The
     search gives up when the step counter reaches MAX_STEPS or pos
     reaches 0. */
  pos = size - strlen(PDF_EOF) - strlen(PDF_SXR);
  steps = 0;
  while ( (steps++ < MAX_STEPS) &&
          (pos > 0) &&
          (0 != memcmp(&data[pos], PDF_SXR, strlen(PDF_SXR))) )
    pos--;
  if (0 != memcmp(&data[pos], PDF_SXR, strlen(PDF_SXR))) {
    /* cross reference streams not yet supported! */
    return prev;
  }
  /* Copy the text following the marker and parse it as the byte offset
     of the cross reference table.
     NOTE(review): steps is incremented on every evaluation of the loop
     condition above and may have passed MAX_STEPS by now; the
     declaration of buf is not visible here, so confirm that buf[steps]
     is always inside the buffer. */
  memcpy(buf, &data[pos + strlen(PDF_SXR)], steps);
  buf[steps] = '\0';
  if (1 != sscanf(buf, "%llu", &startxref))
    return prev;
  if (startxref >= size - strlen(PDF_XREF))
    return prev;
  if (0 != memcmp(&data[startxref], PDF_XREF, strlen(PDF_XREF)))
    return prev;
  /* First pass over the cross reference table: walk over all
     subsections to find where the table ends.  Each subsection starts
     with a "<start> <count>" line and is stepped over as count entries
     of 20 bytes each.  The loop ends at the first line that does not
     parse as two unsigned numbers. */
  haveValidXref = 0;
  xrefpos = startxref + strlen(PDF_XREF);
  while (1) {
    pos = xrefpos;
    while ( (pos < size) && (IS_NL(data[pos])) )
      pos++;
    memcpy(buf, &data[pos], MIN(MAX_STEPS, size - pos));
    buf[MIN(MAX_STEPS,size-pos)] = '\0';
    if (2 != sscanf(buf, "%u %u", &xstart, &xcount))
      break;
    /* advance to the end of the subsection header line */
    while ( (pos < size) && (! IS_NL(data[pos])) )
      pos++;
    if ( (pos < size) && IS_NL(data[pos]))
      pos++;
    xrefpos = 20 * xcount + pos;
    /* reject subsections that reach past the data or wrap around */
    if ( (xrefpos >= size) || (xrefpos < pos) )
      return prev; /* invalid xref size */
    haveValidXref = 1;
  }
  if (! haveValidXref)
    return prev;
  /* pos is where the last header parse failed; PDF_TRAILER is required
     at exactly this position. */
  if (size - pos < strlen(PDF_TRAILER))
    return prev;
  if (0 != memcmp(&data[pos], PDF_TRAILER, strlen(PDF_TRAILER)))
    return prev;
  pos += strlen(PDF_TRAILER);
  /* SKIP is defined in the part of the file that is missing here;
     presumably it advances pos over any of the listed characters --
     TODO confirm. */
  SKIP("<< \n\r", pos, data, size);
  /* Scan the trailer line by line until a line starts with PDF_INFO.
     Meeting ">>" first means the trailer has no info entry. */
  while ( (pos < size) &&
          (pos + strlen(PDF_INFO) < size) &&
          (0 != memcmp(&data[pos], PDF_INFO, strlen(PDF_INFO))) ) {
    while ( (pos < size) && (! IS_NL(data[pos]) ) ) {
      if ( (data[pos] == '>') && (pos + 1 < size) && (data[pos+1] == '>') )
        return prev; /* no info */
      pos++;
    }
    while ( (pos < size) && (IS_NL(data[pos]) || isspace(data[pos]) ) )
      pos++;
  }
  if ( ! ( (pos < size) &&
           (pos + strlen(PDF_INFO) < size) &&
           (0 == memcmp(&data[pos], PDF_INFO, strlen(PDF_INFO))) ) )
    return prev;
  pos += strlen(PDF_INFO);
  /* copy the text following PDF_INFO for parsing */
  memcpy(buf, &data[pos], MIN(MAX_STEPS, size - pos));
  buf[MIN(MAX_STEPS,size-pos)] = '\0';
  /* NOTE(review): the text between "for (i=0;i" and " xstart) &&" is
     missing in this copy of the file (the parsing of xinfo from buf and
     the start of a second loop over the cross reference subsections).
     The surviving tail handles the subsection whose range contains
     xinfo: it reads one 20-byte entry and parses its leading 10-digit
     number into info_offset.
     NOTE(review): "20 * xinfo - xstart" is evaluated as
     (20 * xinfo) - xstart; the entry of object xinfo inside a
     subsection starting at xstart presumably lies at
     20 * (xinfo - xstart) -- confirm.  The result of the sscanf call
     is not checked either. */
  for (i=0;i xstart) && (xinfo < xstart + xcount) ) {
      haveValidXref = 1;
      pos += 20 * xinfo - xstart;
      memcpy(buf, &data[pos], 20);
      buf[20] = '\0';
      sscanf(buf, "%10llu %*5u %*c", &info_offset);
      break;
    }
    xrefpos = 20 * xcount + pos;
    if ( (xrefpos >= size) || (xrefpos < pos) )
      return prev; /* invalid xref size */
  }
  if (!
haveValidXref)
    return prev;
  /* Continue at the offset read from the cross reference entry and look
     for the "<<" that opens the dictionary. */
  pos = info_offset;
  while ( (pos < size - 4) &&
          (! ( (data[pos] == '<') && (data[pos+1] == '<') ) ) )
    pos++;
  pos++;
  if (pos >= size - 4)
    return prev;
  /* step over one blank, LF (10) or CR (13) */
  if ( (data[pos] == ' ') || (data[pos] == 10) || (data[pos] == 13) )
    pos++;
  /* Walk over the dictionary until ">>".  At each position every tagmap
     entry is tried; tagmap is terminated by an entry whose name is
     NULL. */
  while ( (pos < size - 2) &&
          ( ! ( (data[pos] == '>') && (data[pos+1] == '>') ) ) ) {
    i = 0;
    while (tagmap[i].name != NULL) {
      /* the first comparison guards against wrap-around of pos + length */
      if ( (pos + strlen(tagmap[i].name) > pos) &&
           (pos + strlen(tagmap[i].name) + 1 < size) &&
           (0 == memcmp(&data[pos], tagmap[i].name, strlen(tagmap[i].name))) ) {
        pos += strlen(tagmap[i].name);
        if (isspace(data[pos]))
          pos++;
        spos = pos;
        /* The value runs up to the next newline, '/' or ">>".
           NOTE(review): the bound "pos < size + 2" permits reading
           data[pos] and data[pos+1] at and beyond index size -- this
           looks like it should keep pos + 1 below size; confirm. */
        while ( (pos < size + 2) &&
                (! IS_NL(data[pos])) &&
                (data[pos] != '/') &&
                (! ( (data[pos] == '>') && (data[pos+1] == '>') ) ) )
          pos++;
        meta = stndup(&data[spos], pos - spos);
        /* tagmap entry 0 is decoded with dateDecode, every other entry
           with stringDecode */
        if (i == 0) {
          dmeta = dateDecode(meta);
          if (dmeta != NULL)
            mlen = strlen((const char*)dmeta);
          else
            mlen = 0;
        } else {
          dmeta = stringDecode(meta, &mlen);
        }
        if (meta != NULL)
          free(meta);
        if (dmeta != NULL) {
          /* charsetDecode is defined in the part of the file that is
             missing here; its non-NULL result is stored in the keyword
             list as is. */
          meta = charsetDecode(dmeta, mlen);
          /* NOTE(review): dmeta was already tested by the enclosing if,
             so this check is always true. */
          if (dmeta != NULL)
            free(dmeta);
          if (meta != NULL) {
            prev = addKeyword(tagmap[i].type, meta, prev);
          }
        }
        break;
      }
      i++;
    }
    /* no tagmap entry matched here: skip the rest of the line */
    if (tagmap[i].name == NULL) {
      while ( (pos < size) && (! IS_NL(data[pos])) )
        pos++;
    }
    while ( (pos < size) && (IS_NL(data[pos])) )
      pos++;
  }
  return prev;
}