/* This file is part of libextractor. (C) 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. libextractor is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with libextractor; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. This code makes extensive use of libgsf -- the Gnome Structured File Library Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org) Part of this code was borrowed from wordleaker.cpp. See also the README file in this directory. */ #include "platform.h" #include "extractor.h" #include "../convert.h" #include #include #include #include #include #include #include #include #include #define DEBUG_OLE2 0 /* ******************************** main extraction code ************************ */ /* using libgobject, needs init! */ void __attribute__ ((constructor)) ole_gobject_init(void) { g_type_init(); } static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordList *oldhead, const char *phrase, EXTRACTOR_KeywordType type) { EXTRACTOR_KeywordList * keyword; if (strlen(phrase) == 0) return oldhead; if (0 == strcmp(phrase, "\"\"")) return oldhead; if (0 == strcmp(phrase, "\" \"")) return oldhead; if (0 == strcmp(phrase, " ")) return oldhead; keyword = malloc(sizeof(EXTRACTOR_KeywordList)); keyword->next = oldhead; keyword->keyword = strdup(phrase); keyword->keywordType = type; return keyword; } static guint8 const component_guid [] = { 0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10, 0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9 }; static guint8 const document_guid [] = { 0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10, 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae }; static guint8 const user_guid [] = { 0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10, 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae }; typedef struct { char * text; EXTRACTOR_KeywordType type; } Matches; static Matches tmap[] = { { "Title", EXTRACTOR_TITLE }, { "PresentationFormat", EXTRACTOR_FORMAT }, { "Category", EXTRACTOR_DESCRIPTION }, { "Manager", EXTRACTOR_MANAGER }, { "Company", EXTRACTOR_COMPANY }, { "Subject", EXTRACTOR_SUBJECT }, { "Author", EXTRACTOR_AUTHOR }, { "Keywords", EXTRACTOR_KEYWORDS }, { "Comments", EXTRACTOR_COMMENT }, { "Template", EXTRACTOR_TEMPLATE }, { "NumPages", EXTRACTOR_PAGE_COUNT }, { "AppName", EXTRACTOR_SOFTWARE }, { "RevisionNumber", EXTRACTOR_VERSIONNUMBER }, { "Dictionary", EXTRACTOR_LANGUAGE }, { "NumBytes", EXTRACTOR_SIZE }, { "CreatedTime", EXTRACTOR_CREATION_DATE }, { "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE }, { "gsf:company", EXTRACTOR_COMPANY }, /* { "gsf:security", EXTRACTOR_SECURITY }, */ { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT }, { "gsf:page-count", EXTRACTOR_PAGE_COUNT }, { "gsf:line-count", EXTRACTOR_LINE_COUNT }, { "gsf:word-count", EXTRACTOR_WORD_COUNT }, { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT }, { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY }, /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ { "gsf:manager", EXTRACTOR_MANAGER }, { "dc:title", EXTRACTOR_TITLE }, { "dc:creator", EXTRACTOR_CREATOR }, { "dc:date", EXTRACTOR_DATE }, { "dc:subject", EXTRACTOR_SUBJECT }, { "dc:keywords", EXTRACTOR_KEYWORDS }, { "dc:last-printed", EXTRACTOR_LAST_PRINTED }, { "dc:description", EXTRACTOR_DESCRIPTION }, { "meta:creation-date", EXTRACTOR_CREATION_DATE }, /* { "meta:editing-duration", EXTRACTOR_TOTAL_EDITING_TIME }, // encoding? */ { "meta:generator", EXTRACTOR_GENERATOR }, { "meta:template", EXTRACTOR_TEMPLATE }, /* { "meta:editing-cycles", EXTRACTOR_EDITING_CYCLES }, // usually "FALSE" */ /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */ { NULL, 0 }, }; static void processMetadata(gpointer key, gpointer value, gpointer user_data) { struct EXTRACTOR_Keywords ** pprev = user_data; const char * type = key; const GsfDocProp * prop = value; const GValue * gval; char * contents; int pos; if ( (key == NULL) || (value == NULL) ) return; gval = gsf_doc_prop_get_val(prop); if (G_VALUE_TYPE(gval) == G_TYPE_STRING) { contents = strdup(g_value_get_string(gval)); } else { /* convert other formats? */ contents = g_strdup_value_contents(gval); } if (contents == NULL) return; if ( (strlen(contents) > 0) && (contents[strlen(contents)-1] == '\n') ) contents[strlen(contents)-1] = '\0'; pos = 0; while (tmap[pos].text != NULL) { if (0 == strcmp(tmap[pos].text, type)) break; pos++; } if (tmap[pos].text != NULL) *pprev = addKeyword(*pprev, contents, tmap[pos].type); #if DEBUG_OLE2 else printf("No match for type `%s'\n", type); #endif free(contents); } static struct EXTRACTOR_Keywords * process(GsfInput * in, struct EXTRACTOR_Keywords * prev) { GsfDocMetaData * sections; GError * error; sections = gsf_doc_meta_data_new(); error = gsf_msole_metadata_read(in, sections); if (error == NULL) { gsf_doc_meta_data_foreach(sections, &processMetadata, &prev); } g_object_unref(G_OBJECT(sections)); return prev; } static struct EXTRACTOR_Keywords * processSO(GsfInput * src, struct EXTRACTOR_Keywords * prev) { off_t size; char * buf; size = gsf_input_size(src); if (size < 0x374) /* == 0x375?? */ return prev; buf = malloc(size); gsf_input_read(src, size, (unsigned char*) buf); if ( (buf[0] != 0x0F) || (buf[1] != 0x0) || (0 != strncmp(&buf[2], "SfxDocumentInfo", strlen("SfxDocumentInfo"))) || (buf[0x11] != 0x0B) || (buf[0x13] != 0x00) || /* pw protected! */ (buf[0x12] != 0x00) ) { free(buf); return prev; } buf[0xd3] = '\0'; if (buf[0x94] + buf[0x93] > 0) prev = addKeyword(prev, &buf[0x95], EXTRACTOR_TITLE); buf[0x114] = '\0'; if (buf[0xd5] + buf[0xd4] > 0) prev = addKeyword(prev, &buf[0xd6], EXTRACTOR_SUBJECT); buf[0x215] = '\0'; if (buf[0x115] + buf[0x116] > 0) prev = addKeyword(prev, &buf[0x117], EXTRACTOR_COMMENT); buf[0x296] = '\0'; if (buf[0x216] + buf[0x217] > 0) prev = addKeyword(prev, &buf[0x218], EXTRACTOR_KEYWORDS); /* fixme: do timestamps, mime-type, user-defined info's */ free(buf); return prev; } /* *************** wordleaker stuff *************** */ #define __(a) dgettext("iso-639", a) static const char * lidToLanguage( unsigned int lid ) { switch ( lid ) { case 0x0400: return _("No Proofing"); case 0x0401: return __("Arabic"); case 0x0402: return __("Bulgarian"); case 0x0403: return __("Catalan"); case 0x0404: return _("Traditional Chinese"); case 0x0804: return _("Simplified Chinese"); case 0x0405: return __("Chechen"); case 0x0406: return __("Danish"); case 0x0407: return __("German"); case 0x0807: return _("Swiss German"); case 0x0408: return __("Greek"); case 0x0409: return _("U.S. English"); case 0x0809: return _("U.K. English"); case 0x0c09: return _("Australian English"); case 0x040a: return _("Castilian Spanish"); case 0x080a: return _("Mexican Spanish"); case 0x040b: return __("Finnish"); case 0x040c: return __("French"); case 0x080c: return _("Belgian French"); case 0x0c0c: return _("Canadian French"); case 0x100c: return _("Swiss French"); case 0x040d: return __("Hebrew"); case 0x040e: return __("Hungarian"); case 0x040f: return __("Icelandic"); case 0x0410: return __("Italian"); case 0x0810: return _("Swiss Italian"); case 0x0411: return __("Japanese"); case 0x0412: return __("Korean"); case 0x0413: return __("Dutch"); case 0x0813: return _("Belgian Dutch"); case 0x0414: return _("Norwegian Bokmal"); case 0x0814: return __("Norwegian Nynorsk"); case 0x0415: return __("Polish"); case 0x0416: return __("Brazilian Portuguese"); case 0x0816: return __("Portuguese"); case 0x0417: return _("Rhaeto-Romanic"); case 0x0418: return __("Romanian"); case 0x0419: return __("Russian"); case 0x041a: return _("Croato-Serbian (Latin)"); case 0x081a: return _("Serbo-Croatian (Cyrillic)"); case 0x041b: return __("Slovak"); case 0x041c: return __("Albanian"); case 0x041d: return __("Swedish"); case 0x041e: return __("Thai"); case 0x041f: return __("Turkish"); case 0x0420: return __("Urdu"); case 0x0421: return __("Bahasa"); case 0x0422: return __("Ukrainian"); case 0x0423: return __("Byelorussian"); case 0x0424: return __("Slovenian"); case 0x0425: return __("Estonian"); case 0x0426: return __("Latvian"); case 0x0427: return __("Lithuanian"); case 0x0429: return _("Farsi"); case 0x042D: return __("Basque"); case 0x042F: return __("Macedonian"); case 0x0436: return __("Afrikaans"); case 0x043E: return __("Malayalam"); default: return NULL; } } static struct EXTRACTOR_Keywords * history_extract(GsfInput * stream, unsigned int lcbSttbSavedBy, unsigned int fcSttbSavedBy, struct EXTRACTOR_Keywords * prev) { unsigned int where = 0; unsigned char * lbuffer; unsigned int i; unsigned int length; char * author; char * filename; char * rbuf; unsigned int nRev; // goto offset of revision gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET); if (gsf_input_remaining(stream) < lcbSttbSavedBy) return prev; lbuffer = malloc(lcbSttbSavedBy); // read all the revision history gsf_input_read(stream, lcbSttbSavedBy, lbuffer); // there are n strings, so n/2 revisions (author & file) nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2; where = 6; for (i=0; i < nRev; i++) { if (where >= lcbSttbSavedBy) break; length = lbuffer[where++]; if ( (where + 2 * length + 2 >= lcbSttbSavedBy) || (where + 2 * length + 2 <= where) ) break; author = convertToUtf8((const char*) &lbuffer[where], length * 2, "UTF-16BE"); where += length * 2 + 1; length = lbuffer[where++]; if ( (where + 2 * length >= lcbSttbSavedBy) || (where + 2 * length + 1 <= where) ) { free(author); break; } filename = convertToUtf8((const char*) &lbuffer[where], length * 2, "UTF-16BE"); where += length * 2 + 1; rbuf = malloc(strlen(author) + strlen(filename) + 512); snprintf(rbuf, 512 + strlen(author) + strlen(filename), _("Revision #%u: Author '%s' worked on '%s'"), i, author, filename); free(author); free(filename); prev = addKeyword(prev, rbuf, EXTRACTOR_REVISION_HISTORY); free(rbuf); } free(lbuffer); return prev; } /* ************** main method *********** */ struct EXTRACTOR_Keywords * libextractor_ole2_extract(const char * filename, const char * data, size_t size, struct EXTRACTOR_Keywords * prev) { GsfInput * input; GsfInfile * infile; GsfInput * src; GError * err = NULL; const char * name; const char * generator = NULL; int i; unsigned int lcb; unsigned int fcb; const unsigned char * data512; unsigned int lid; const char * lang; if (size < 512 + 898) return prev; /* can hardly be OLE2 */ input = gsf_input_memory_new((const guint8 *) data, (gsf_off_t) size, FALSE); if (input == NULL) return prev; infile = gsf_infile_msole_new(input, &err); if (infile == NULL) { g_object_unref(G_OBJECT(input)); return prev; } lcb = 0; fcb = 0; for (i=0;i= 6) { for (i=0;i