/* ** Modular Logfile Analyzer ** Copyright 2000 Jan Kneschke ** ** Homepage: http://www.modlogan.org ** This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version, and provided that the above copyright and permission notice is included with all distributed copies of this or derived software. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA ** ** $Id: process.c,v 1.73 2004/08/27 20:06:17 ostborn Exp $ */ #include #include #include #include #include #include #include #include #include "config.h" #include "mrecord.h" #include "mlocale.h" #include "mconfig.h" #include "mplugins.h" #include "mstate.h" #include "mdatatypes.h" #include "datatypes/count/datatype.h" #include "datatypes/visited/datatype.h" #include "datatypes/state/datatype.h" #include "datatypes/sublist/datatype.h" #include "datatypes/visit/datatype.h" #include "datatypes/brokenlink/datatype.h" #include "datatypes/location/datatype.h" #include "misc.h" #include "plugin_config.h" #include "md5_global.h" #include "md5.h" #ifdef HAVE_LIBLOCALIZER #include #endif #define UNRESOLVED_TLD "unre" #define M_WEB_IGNORE_HOST 1 #define M_WEB_IGNORE_REQ_URL 2 #define M_WEB_IGNORE_USERAGENT 3 #define M_WEB_IGNORE_HOSTMASK 4 #define M_WEB_IGNORE_SEARCHENGINE 5 #define M_WEB_HIDE_HOST 1 #define M_WEB_HIDE_REQ_URL 2 #define M_WEB_HIDE_REFERRER 3 #define M_WEB_HIDE_BROKENLINK 4 #define M_WEB_HIDE_EXTENSION 5 #define M_WEB_HIDE_HOSTMASK 6 #define M_WEB_HIDE_BROKENLINK_REF 
7 #define M_WEB_GROUP_REFERRER 1 #define M_WEB_GROUP_HOST 2 #define M_WEB_GROUP_OS 3 #define M_WEB_GROUP_UA 4 #define M_WEB_GROUP_REQ_URL 5 #define M_WEB_GROUP_BROKENLINK 6 #define M_WEB_GROUP_SEARCHSTRING 7 #define M_WEB_GROUP_EXTENSION 8 #define M_WEB_GROUP_SEARCHENGINE 9 int is_matched(mlist *l, const char *url) { int url_len; if (!url || !l) return 0; url_len = strlen(url); for (; l; l = l->next) { mdata *data = l->data; if (data == NULL) { continue; } if (data->type != M_DATA_TYPE_MATCH) { fprintf(stderr, "%s.%d: wrong datatype for a match: %d\n", __FILE__, __LINE__, data->type); continue; } if (!data->data.match.match) { fprintf(stderr, "%s.%d: where is my match: %d\n", __FILE__, __LINE__, data->type); continue; } if (strmatch(data->data.match.match, data->data.match.study, url, url_len)) return 1; } return 0; } int hostmask_match(const char *hostmask, const char *host) { long hm, nm, ip; const char *h; int i; /* 0-3 are for the hostmask, 4 is for the netmask (/24) */ int hm_elem[5]; /* 0-3 are for the host */ int h_elem[4]; if (!hostmask) return 0; if (!host) return 0; memset(hm_elem, 0, sizeof(hm_elem)); memset(h_elem, 0, sizeof(h_elem)); i = 0; for (h = hostmask; *h; h++) { switch(*h) { case '.': i++; /* number of dots in the IP field */ if (i > 3) { fprintf(stderr, "%s.%d: too much dots in hostmask: '%s'\n", __FILE__, __LINE__, hostmask ); return 0; } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': hm_elem[i] = hm_elem[i] * 10 + (*h - '0'); /* overflow */ if (hm_elem[i] > 255) { fprintf(stderr, "%s.%d: value is too high '%d' in ip: '%s'\n", __FILE__, __LINE__, h_elem[i], hostmask ); return 0; } break; case '/': /* we need 3 dot before our netmask */ if (i != 3) { fprintf(stderr, "%s.%d: not enough dots in hostmask: '%s'\n", __FILE__, __LINE__, hostmask ); return 0; } i++; break; default: /* every after character is invalid */ fprintf(stderr, "%s.%d: invalid character '%c' in hostmask: '%s'\n", 
/**
 * Test whether an IPv4 address lies inside a network given as
 * "a.b.c.d/bits".
 *
 * @param hostmask  network in dotted-quad notation followed by '/' and
 *                  the prefix length (0..32)
 * @param host      address in dotted-quad notation
 * @return 1 if the leading `bits` bits of host equal the hostmask,
 *         0 otherwise (also for NULL or malformed input)
 *
 * The network address is compared exactly as written: bits of the
 * hostmask that lie outside the prefix are not masked away, so a
 * hostmask like "192.168.1.5/24" matches no address.
 *
 * A malformed hostmask is reported on stderr; a malformed host (for
 * example a hostname instead of an IP) is rejected silently.
 */
int hostmask_match(const char *hostmask, const char *host) {
	/* 0-3 are for the hostmask, 4 is for the netmask (/24) */
	unsigned int hm_elem[5] = { 0, 0, 0, 0, 0 };
	/* 0-3 are for the host */
	unsigned int h_elem[4] = { 0, 0, 0, 0 };
	/* unsigned arithmetic: octet 255 shifted by 24 must not overflow */
	unsigned long hm, nm, ip;
	const char *h;
	int i;

	if (!hostmask) return 0;
	if (!host) return 0;

	i = 0;
	for (h = hostmask; *h; h++) {
		switch (*h) {
		case '.':
			i++; /* number of dots in the IP field */
			if (i > 3) {
				fprintf(stderr, "%s.%d: too much dots in hostmask: '%s'\n",
					__FILE__, __LINE__, hostmask);
				return 0;
			}
			break;
		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
			hm_elem[i] = hm_elem[i] * 10 + (unsigned int)(*h - '0');
			/* overflow; report the hostmask element, h_elem has no slot 4 */
			if (hm_elem[i] > 255) {
				fprintf(stderr, "%s.%d: value is too high '%u' in hostmask: '%s'\n",
					__FILE__, __LINE__, hm_elem[i], hostmask);
				return 0;
			}
			break;
		case '/':
			/* we need 3 dots before our netmask */
			if (i != 3) {
				fprintf(stderr, "%s.%d: not enough dots in hostmask: '%s'\n",
					__FILE__, __LINE__, hostmask);
				return 0;
			}
			i++;
			break;
		default:
			/* every other character is invalid */
			fprintf(stderr, "%s.%d: invalid character '%c' in hostmask: '%s'\n",
				__FILE__, __LINE__, *h, hostmask);
			return 0;
		}
	}

	/* everything set ? */
	if (i != 4) return 0;

	/* an IPv4 prefix has at most 32 bits; more would shift out of range */
	if (hm_elem[4] > 32) {
		fprintf(stderr, "%s.%d: netmask is too long '%u' in hostmask: '%s'\n",
			__FILE__, __LINE__, hm_elem[4], hostmask);
		return 0;
	}

	/* generate hostmask */
	hm = ((unsigned long)hm_elem[0] << 24) |
	     ((unsigned long)hm_elem[1] << 16) |
	     ((unsigned long)hm_elem[2] <<  8) |
	      (unsigned long)hm_elem[3];

	/* generate netmask: the upper hm_elem[4] bits of a 32 bit word */
	nm = 0;
	if (hm_elem[4] > 0) {
		nm = (0xFFFFFFFFUL << (32 - hm_elem[4])) & 0xFFFFFFFFUL;
	}

	i = 0;
	for (h = host; *h; h++) {
		switch (*h) {
		case '.':
			i++; /* number of dots in the IP field */
			if (i > 3) {
				fprintf(stderr, "%s.%d: too much dots in ip: '%s'\n",
					__FILE__, __LINE__, host);
				return 0;
			}
			break;
		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
			h_elem[i] = h_elem[i] * 10 + (unsigned int)(*h - '0');
			/* overflow */
			if (h_elem[i] > 255) {
				fprintf(stderr, "%s.%d: value is too high '%u' in ip: '%s'\n",
					__FILE__, __LINE__, h_elem[i], host);
				return 0;
			}
			break;
		default:
			/* anything else means this is not a plain IP: no match, no message */
			return 0;
		}
	}

	/* everything set ? */
	if (i != 3) return 0;

	/* generate host-ip */
	ip = ((unsigned long)h_elem[0] << 24) |
	     ((unsigned long)h_elem[1] << 16) |
	     ((unsigned long)h_elem[2] <<  8) |
	      (unsigned long)h_elem[3];

	return ((ip & nm) == hm) ? 1 : 0;
}
1 : 0); } int is_matched_hostmask(mlist *l, const char *url) { if (!url || !l) return 0; for (; l; l = l->next) { mdata *data = l->data; if (data == NULL) { continue; } if (data->type != M_DATA_TYPE_COUNT) { fprintf(stderr, "%s.%d: wrong datatype for a match_hostmask: %d\n", __FILE__, __LINE__, data->type); continue; } if (hostmask_match(data->key, url)) return 1; } return 0; } int hide_field(mconfig *ext_conf, const char *url, int field) { config_processor *conf = ext_conf->plugin_conf; mlist *l = NULL; int ret; switch (field) { case M_WEB_HIDE_REFERRER: l = conf->hide_referrer; break; case M_WEB_HIDE_BROKENLINK: l = conf->hide_brokenlinks; break; case M_WEB_HIDE_REQ_URL: l = conf->hide_url; break; case M_WEB_HIDE_HOST: l = conf->hide_host; break; case M_WEB_HIDE_HOSTMASK: l = conf->hide_hostmask; break; case M_WEB_HIDE_EXTENSION: l = conf->hide_extension; break; case M_WEB_HIDE_BROKENLINK_REF: l = conf->hide_brokenlinks_ref; break; default: fprintf(stderr, "%s.%d: Unknown hide field: %d\n", __FILE__, __LINE__, field); break; } if (!url || !l) return 0; if (field != M_WEB_HIDE_HOSTMASK) { ret = is_matched(l, url); } else { ret = is_matched_hostmask(l, url); } return ret; } int ignore_field(mconfig *ext_conf, buffer *url, int field) { config_processor *conf = ext_conf->plugin_conf; mlist *l = NULL; int ret; switch (field) { case M_WEB_IGNORE_REQ_URL: l = conf->ignore_url; break; case M_WEB_IGNORE_HOST: l = conf->ignore_host; break; case M_WEB_IGNORE_USERAGENT: l = conf->ignore_ua; break; case M_WEB_IGNORE_HOSTMASK: l = conf->ignore_hostmask; break; case M_WEB_IGNORE_SEARCHENGINE: l = conf->ignore_searchengine; break; default: fprintf(stderr, "%s.%d: Unknown ignore field: %d\n", __FILE__, __LINE__, field); break; } if (!url->used || !l) return 0; if (field != M_WEB_IGNORE_HOSTMASK) { ret = is_matched(l, url->ptr); } else { ret = is_matched_hostmask(l, url->ptr); } return ret; } int is_grouped (mconfig *ext_conf, buffer *grouped, mlist *l, const char *str) { char *r 
= NULL; int str_len; if (!str || !l) return 0; str_len = strlen(str); for (; (l != NULL) && (r == NULL); l = l->next) { mdata *data = l->data; if (data == NULL) { continue; } if (data->type != M_DATA_TYPE_MATCH) { fprintf(stderr, "%s.%d: wrong datatype for a match: %d\n", __FILE__, __LINE__, data->type); continue; } if (!data->data.match.match) { fprintf(stderr, "%s.%d: %s %s\n", __FILE__, __LINE__, "no match", str); continue; } r = substitute(ext_conf, data->data.match.match, data->data.match.study, data->key, str, str_len); } if (r) { buffer_copy_string(grouped, r); free(r); return 1; } else { return 0; } } int is_grouped_field (mconfig *ext_conf, buffer *grouped, const char *str, int field) { config_processor *conf = ext_conf->plugin_conf; mlist *l = NULL; switch (field) { case M_WEB_GROUP_REFERRER: l = conf->group_referrer; break; case M_WEB_GROUP_HOST: l = conf->group_hosts; break; case M_WEB_GROUP_OS: l = conf->group_os; break; case M_WEB_GROUP_UA: l = conf->group_ua; break; case M_WEB_GROUP_REQ_URL: l = conf->group_url; break; case M_WEB_GROUP_BROKENLINK: l = conf->group_brokenlinks; break; case M_WEB_GROUP_SEARCHSTRING: l = conf->group_searchstrings; break; case M_WEB_GROUP_SEARCHENGINE: l = conf->group_searchengine; break; case M_WEB_GROUP_EXTENSION: l = conf->group_extension; break; default: fprintf(stderr, "%s.%d: Unknown group field: %d\n", __FILE__, __LINE__, field); break; } if (!str || !l) return 0; return is_grouped (ext_conf, grouped, l, str); } int is_existent(mlogrec_web *record) { return (record->req_status >= 200 && record->req_status < 400 && record->req_status != 301 && record->req_status != 300); } int is_file(mlogrec_web *record) { return record->req_status == 200; } int is_page(mconfig *ext_conf, mlogrec_web *record) { config_processor *conf = ext_conf->plugin_conf; mlist *l; buffer *url = record->req_url; if (url->used == 0) return 0; for (l = conf->page_type; l; l = l->next) { mdata *data = l->data; if (data && 
strmatch(data->data.match.match, data->data.match.study, url->ptr, url->used - 1)) { return 1; } } return 0; } int is_robot(buffer *url) { if (!url->used) return 0; return !strcmp(url->ptr, "/robots.txt"); } int insert_view_to_views(mconfig *ext_conf, mstate *state, time_t last_timestamp, mdata *visit, int is_hit) { config_processor *conf = ext_conf->plugin_conf; mlist* hlist = NULL; int debug_me = conf->debug_visits; mstate_web *staweb = state->ext; /* lookup the last hit which this duration applies */ for (hlist = visit->data.visit->hits; hlist->next && hlist->next->data; hlist = hlist->next) ; if (hlist->data) { const char* last_url = mdata_get_key(hlist->data, state); time_t duration = 0; /* check to see if this hit should be recorded as a page view */ if (!hide_field(ext_conf, last_url, M_WEB_HIDE_REQ_URL)) { mdata* data = NULL; /* use input generated timediff or calculate it for page views */ if (visit->data.visit->timediff) { duration = visit->data.visit->timediff; } else { duration = last_timestamp - visit->data.visit->timestamp; if (duration >= conf->visit_timeout) { duration = 5; /* XXXST create config variable or average over time */ } } if (is_grouped_field(ext_conf, conf->grouped, last_url, M_WEB_GROUP_REQ_URL)) { const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); data = mdata_Visited_create(key, duration, M_DATA_STATE_GROUPED, is_hit ? 1 : 0); } else { const char *key = splaytree_insert(ext_conf->strings, last_url); data = mdata_Visited_create(key, duration, M_DATA_STATE_PLAIN, is_hit ? 
1 : 0); } mhash_insert_sorted(staweb->views, data); } } else { if (debug_me) { fprintf(stderr, "process.is_visit: No data for last hit!!\n"); } } return 0; } int append_hit_to_visit(mconfig *ext_conf, mstate *state, mlogrec *record, mdata *visit) { config_processor *conf = ext_conf->plugin_conf; mlogrec_web *recweb = record->ext; mlogrec_web_extclf *recext = NULL; mstate_web *staweb = state->ext; if (recweb == NULL) return -1; if (recweb->req_url->used == 0) return -1; if (recweb->ext_type == M_RECORD_TYPE_WEB_EXTCLF) { recext = recweb->ext; } /* set last visited page */ if (!hide_field(ext_conf,recweb->req_url->ptr, M_WEB_HIDE_REQ_URL)) { if (visit->data.visit->type == M_DATA_VISIT_ROBOT) { mdata *data; const char *key = splaytree_insert(ext_conf->strings, recweb->req_url->ptr); /* add the page the index pages */ data = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->indexed_pages, data); } if (conf->max_hits_per_visit == 0 || visit->data.visit->count < conf->max_hits_per_visit) { mdata * hit; /* add the hits to the list of pages for this visit */ const char *key = splaytree_insert(ext_conf->strings, recweb->req_url->ptr); const char *ref = splaytree_insert(ext_conf->strings, ""); hit = mdata_BrokenLink_create(key, 1, M_DATA_STATE_PLAIN, record->timestamp, ref); mlist_append(visit->data.visit->hits, hit); /* using the count value as the number of elements in data.visit->hits */ visit->data.visit->count++; } } /* save the last record's stats */ visit->data.visit->timediff = recext ? 
recext->duration : 0; visit->data.visit->timestamp = record->timestamp; return 0; } int cleanup_visits(mconfig *ext_conf, mstate *state, time_t last_timestamp) { config_processor *conf = ext_conf->plugin_conf; mstate_web *staweb; mhash *h; int debug_me = conf->debug_visits; int j; static int vc = 0; if (!state) return -1; staweb = state->ext; if (!staweb) return -1; h = staweb->visit_hash; for ( j = 0; j < h->size; j++) { if (h->data[j]->list) { mlist *hl; for(hl = h->data[j]->list; hl; hl = hl->next) { mdata *data = hl->data; if (!hl->data) continue; /* check if the visits has timed out */ if (last_timestamp - data->data.visit->timestamp > conf->visit_timeout) { mlist *act; mlist *vlist, *l; mdata *d; int i; /* md5 */ char md5str[33]; MD5_CTX context; unsigned char digest[16]; char *r; const char *key; if (debug_me) { fprintf(stderr, "process.is_visit: <- %20s (%20s), time: %ld - %ld\n", data->key, data->data.visit->useragent, data->data.visit->timestamp, last_timestamp - data->data.visit->timestamp); } /* record the last hit as a view */ insert_view_to_views(ext_conf, state, last_timestamp, data, 1); /* extract the visit list */ vlist = data->data.visit->hits; data->data.visit->hits = NULL; /* build key */ md5str[0] = '\0'; MD5Init(&context); for (act = vlist; act && act->data; act = act->next) { const char *key = NULL; key = act->data->key; if (key == NULL) return -1; MD5Update(&context, key, strlen(key)); } MD5Final(digest, &context); for (i = 0, r = md5str; i < 16; i++, r += 2) { sprintf(r, "%02x", digest[i]); } *r = '\0'; for (l = vlist; l; l = l->next) vc++; key = splaytree_insert(ext_conf->strings, md5str); d = mdata_SubList_create(key, vlist); mhash_insert_sorted(staweb->visits, d); act = hl; if (hl->next) { hl = hl->next; if (act->next) act->next->prev = act->prev; if (act->prev) { act->prev->next = act->next; } else { h->data[j]->list = hl; } mlist_free_entry(act); } else { mdata_free(data); hl->data = NULL; } } } } } return 0; } /** * check if the 
record is the start of a visit * * */ int is_visit(mconfig *ext_conf, mstate *state,mlogrec *record) { config_processor *conf = ext_conf->plugin_conf; mstate_web *staweb = state->ext; mhash *h = staweb->visit_hash; int visited = 0; static int lc = 0; static int i; int debug_me = conf->debug_visits; mlogrec_web *recweb = record->ext; mlogrec_web_extclf *recext = NULL; static mtimer t[5]; /* md5 */ char md5str[33]; MD5_CTX context; unsigned char digest[16]; char *r, *key; if (recweb == NULL) return -1; if (recweb->req_url->used == 0) return -1; /* only pages are valid for our visit handling */ if (!is_page(ext_conf, recweb)) return 0; /* and only if the are valid */ if (!is_existent(recweb)) return 0; if (recweb->ext && recweb->ext_type == M_RECORD_TYPE_WEB_EXTCLF) { recext = recweb->ext; } if (lc == 0) { MTIMER_RESET(t[0]); MTIMER_RESET(t[1]); MTIMER_RESET(t[2]); MTIMER_RESET(t[3]); MTIMER_RESET(t[4]); } /* we need a hostname or an ip */ if (recweb->req_host_name->used == 0 && recweb->req_host_ip->used == 0) return -1; /* clean up visits every 100 lines */ if (((lc + 1) % 1000) == 0) { if (0 != cleanup_visits(ext_conf, state, record->timestamp)) { M_DEBUG0(ext_conf->debug_level, M_DEBUG_SECTION_PROCESSING, M_DEBUG_LEVEL_ERRORS, "cleaning up visits failed"); } } MTIMER_STOP(t[0]); MTIMER_CALC(t[0]); /* generate key from hostname/ip and useragent */ md5str[0] = '\0'; MD5Init(&context); key = recweb->req_host_name->used ? 
recweb->req_host_name->ptr : recweb->req_host_ip->ptr; MD5Update(&context, key, strlen(key)); if (recext && recext->req_useragent->used) { MD5Update(&context, recext->req_useragent->ptr, recext->req_useragent->used); } MD5Final(digest, &context); for (i = 0, r = md5str; i < 16; i++, r += 2) { sprintf(r, "%02x", digest[i]); } *r = '\0'; if (mhash_in_hash(h, md5str)) { mdata * data = mhash_get_data(h, md5str); if (record->timestamp - data->data.visit->timestamp > conf->visit_timeout) { /* visit timed out and hasn't been cleanup up, yet */ cleanup_visits(ext_conf, state, record->timestamp); /* set a new visit */ if (!hide_field(ext_conf,recweb->req_url->ptr, M_WEB_HIDE_REQ_URL)) { int type = is_robot(recweb->req_url); mdata *data; const char *key, *ua; MTIMER_START(t[1]); visited = 1; if (debug_me) { fprintf(stderr, "process.is_visit: -> %20s (%20s), time: %ld\n", /*recweb->req_host_name ? recweb->req_host_name : recweb->req_host_ip,*/ md5str, (recext && recext->req_useragent->used) ? recext->req_useragent->ptr : NULL, record->timestamp); } key = splaytree_insert(ext_conf->strings, md5str); ua = splaytree_insert(ext_conf->strings, (recext && recext->req_useragent->used) ? 
recext->req_useragent->ptr : ""); data = mdata_Visit_create(key, ua, 1, record->timestamp, 0, type); if (append_hit_to_visit(ext_conf, state, record, data)) { fprintf(stderr, "%s.%d (%s)\n", __FILE__, __LINE__, __FUNCTION__); } mhash_insert_sorted(h, data); } } else { if (debug_me) { fprintf(stderr, "process.is_visit: -- %20s (%20s), time: %ld - %ld\n", data->key, data->data.visit->useragent, record->timestamp, record->timestamp - data->data.visit->timestamp); } MTIMER_START(t[3]); insert_view_to_views(ext_conf, state, record->timestamp, data, 1); MTIMER_STOP(t[3]); MTIMER_CALC(t[3]); MTIMER_START(t[4]); append_hit_to_visit(ext_conf, state, record, data); MTIMER_STOP(t[4]); MTIMER_CALC(t[4]); } } else if (!hide_field(ext_conf, recweb->req_url->ptr, M_WEB_HIDE_REQ_URL)) { int type = is_robot(recweb->req_url); mdata *data; const char *key, *ua; MTIMER_START(t[1]); visited = 1; if (debug_me) { fprintf(stderr, "process.is_visit: -> %20s (%20s), time: %ld\n", /*recweb->req_host_name ? recweb->req_host_name : recweb->req_host_ip,*/ md5str, (recext && recext->req_useragent->used) ? recext->req_useragent->ptr : NULL, record->timestamp); } key = splaytree_insert(ext_conf->strings, md5str); ua = splaytree_insert(ext_conf->strings, (recext && recext->req_useragent->used) ? 
recext->req_useragent->ptr : ""); data = mdata_Visit_create(key, ua, 1, record->timestamp, 0, type); if (append_hit_to_visit(ext_conf, state, record, data)) { fprintf(stderr, "%s.%d (%s)\n", __FILE__, __LINE__, __FUNCTION__); } mhash_insert_sorted(h, data); } MTIMER_STOP(t[1]); MTIMER_CALC(t[1]); if(lc++ % 1000 == 0) { #if 0 printf("[%4d]", vc); printf("[ all: %ld, del: %ld, ins: (%ld, %ld) new: %ld ]\n", MTIMER_GET_USER_MSEC(t[0]), MTIMER_GET_USER_MSEC(t[2]), MTIMER_GET_USER_MSEC(t[3]), MTIMER_GET_USER_MSEC(t[4]), MTIMER_GET_USER_MSEC(t[1]) ); #endif } return visited; } int process_searchengine(mconfig *ext_conf, mstate *state, mlogrec_web_extclf *record) { config_processor *conf = ext_conf->plugin_conf; int site_found = 0; buffer *full; mlist *l = conf->searchengines; mstate_web *staweb = state->ext; if (!l) return 0; if (!(record->ref_getvars->used && record->ref_url->used)) return 0; full = buffer_init(); buffer_prepare_copy(full, record->ref_url->used + record->ref_getvars->used + 1); buffer_copy_string_buffer(full, record->ref_url); BUFFER_APPEND_STRING_CONST(full, "?"); buffer_append_string_buffer(full, record->ref_getvars); if (ignore_field(ext_conf, full, M_WEB_IGNORE_SEARCHENGINE)) { buffer_free(full); return 0; } for (l = conf->match_searchengine; l; l = l->next) { mdata *data = l->data; #define N 20 int ovector[3 * N], n; if (data == NULL) { continue; } if (data->type != M_DATA_TYPE_MATCH) { fprintf(stderr, "%s.%d: wrong datatype for a match: %d\n", __FILE__, __LINE__, data->type); continue; } /* match the string */ if ((n = pcre_exec(data->data.match.match, data->data.match.study, full->ptr, full->used - 1, 0, 0, ovector, 3 * N)) < 0) { if (n != PCRE_ERROR_NOMATCH) { fprintf(stderr, "%s.%d: execution error while matching: %d\n", __FILE__, __LINE__, n); return 0; } } #undef N /* the string has matched somehow */ if (n >= 0) { char *searchstring; /* get searchstring */ pcre_get_substring(full->ptr, ovector, n, 1, (const char **)&searchstring); if 
(conf->decode_searchstrings) url_decode_on_self(searchstring); if ((is_grouped_field(ext_conf, conf->grouped, searchstring, M_WEB_GROUP_SEARCHSTRING))) { mdata * mdata; const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); mdata = mdata_Count_create(key, 1, M_DATA_STATE_GROUPED); mhash_insert_sorted(staweb->searchstring, mdata); } else { mdata * mdata; const char *key = splaytree_insert(ext_conf->strings, searchstring); mdata = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->searchstring, mdata); } pcre_free_substring(searchstring); /* get searchengine */ if ((is_grouped_field(ext_conf, conf->grouped, full->ptr, M_WEB_GROUP_SEARCHENGINE))) { mdata * mdata; const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); mdata = mdata_Count_create(key, 1, M_DATA_STATE_GROUPED); mhash_insert_sorted(staweb->searchsite, mdata); } else { mdata * mdata; const char *key = splaytree_insert(ext_conf->strings, record->ref_url->ptr); mdata = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->searchsite, mdata); if (conf->log_ungrouped_FILE) { fprintf(conf->log_ungrouped_FILE, "%s\n", full->ptr); } } site_found = 1; break; } } buffer_free(full); #undef N return site_found; } /** * transform the protocol-part and the host-part to lowercase * * http://WWW.KDE.ORG/aBc -> http://www.kde.org/aBc */ char *urltolower(buffer *str) { char *s = str->ptr, *c; if (!str->used) return NULL; if (NULL != (c = strstr(str->ptr, "://"))) { /* transform protocoll */ for (;*s && *s != '/'; s++) { *s = tolower(*s); } /* move the pointer to the start of the host-part */ s = c + 3; } /* transform host-part */ for (;*s && *s != '/'; s++) { *s = tolower(*s); } return s; } mstate *splitter(mconfig *ext_conf, mlist *state_list, mlogrec *record) { config_processor *conf = ext_conf->plugin_conf; char *name = NULL; /* name if the state -> directory-name */ mstate *state = NULL; int split_enable = 0; /* record extension */ 
mlogrec_web *recweb = NULL; /* record web extensions */ mlogrec_web_extclf *recext = NULL; mlogrec_web_squid *recsquid = NULL; mlogrec_web_ftp *recftp = NULL; recweb = record->ext; switch (recweb->ext_type) { case M_RECORD_TYPE_WEB_EXTCLF: recext = recweb->ext; break; case M_RECORD_TYPE_WEB_FTP: recftp = recweb->ext; break; case M_RECORD_TYPE_WEB_SQUID: recsquid = recweb->ext; break; } if (conf->split_def) { mlist *l; for (l = conf->split_def; l; l = l->next) { mdata *data = l->data; char *str = NULL; if (!data) break; split_enable = 1; #define M(x) \ x->used ? x->ptr : NULL /* decide which field we shall look at */ switch(data->data.split.fieldtype) { case M_SPLIT_FIELD_REQURL: str = M(recweb->req_url); break; case M_SPLIT_FIELD_REQUSER: str = M(recweb->req_user); break; case M_SPLIT_FIELD_SRVHOST: if (recext) str = M(recext->srv_host); break; case M_SPLIT_FIELD_SRVPORT: if (recext) str = M(recext->srv_port); break; case M_SPLIT_FIELD_REQHOST: str = M(recweb->req_host_name); break; case M_SPLIT_FIELD_REFURL: if (recext) str = M(recext->ref_url); break; case M_SPLIT_FIELD_DEFAULT: break; default: fprintf(stderr, "%s.%d: unknown type: %d\n", __FILE__, __LINE__, data->type); } #undef M if (ext_conf->debug_level > 3) fprintf(stderr, "%s.%d: -1- type: %d - %s\n", __FILE__, __LINE__, data->type, str); if (str != NULL) { /* do the test on the string */ name = substitute(ext_conf, data->data.split.match, NULL, data->key, str, strlen(str)); #if 0 /* this should happen whenever a split doesn't apply. 
*/ if (name == NULL) fprintf(stderr, "%s.%d: substitute failed\n", __FILE__, __LINE__); #endif } else if (data->data.split.fieldtype == M_SPLIT_FIELD_DEFAULT) { /* if a default is specified it is used when it occures */ name = strdup(data->key); if (ext_conf->debug_level > 3) fprintf(stderr, "%s.%d: (def) state-name: %s\n", __FILE__, __LINE__, name); } if (name) break; } } if (split_enable == 0) { /* splitter isn't enabled, take a default name */ name = malloc(1); *name = '\0'; } if (name) { /* we've got a name. try to find the list entry with this name */ mlist *l; for (l = state_list; l; l = l->next) { mdata *data = l->data; if (!data) break; if (!strcmp(name, data->key)) { state = data->data.state.state; break; } } if (!state) { const char *key = splaytree_insert(ext_conf->strings, name); mdata *data; data = mdata_State_create(key, NULL, NULL); mlist_insert_sorted(state_list, data); state = data->data.state.state; } free(name); } else { fprintf(stderr, "%s.%d: no match found by the splitter. 
isn't there a default ??\n", __FILE__, __LINE__); } return state; } int mplugins_processor_web_insert_record(mconfig *ext_conf, mlist *state_list, mlogrec *record) { config_processor *conf = ext_conf->plugin_conf; struct tm *tm; int isvisit = 0, isfile = 0, ispage = 0, i; #define TIMERS 6 static mtimer t[TIMERS]; static int lc = 0; /* record extension */ mlogrec_web *recweb = NULL; /* record web extensions */ mlogrec_web_extclf *recext = NULL; mlogrec_web_squid *recsquid = NULL; mlogrec_web_ftp *recftp = NULL; mstate_web *staweb = NULL; mstate *state = NULL; mdata *data = NULL; if (record->ext_type != M_RECORD_TYPE_WEB) return -1; if (record->ext == NULL) return -1; recweb = record->ext; state = splitter(ext_conf, state_list, record); if (state == NULL) return -1; switch (recweb->ext_type) { case M_RECORD_TYPE_WEB_EXTCLF: recext = recweb->ext; break; case M_RECORD_TYPE_WEB_FTP: recftp = recweb->ext; break; case M_RECORD_TYPE_WEB_SQUID: recsquid = recweb->ext; break; } if (state->ext) { switch(state->ext_type) { case M_STATE_TYPE_WEB: staweb = state->ext; break; default: fprintf(stderr, "%s.%d: unsupport state subtype\n", __FILE__, __LINE__); return -1; } } else { state->ext = mstate_init_web(); state->ext_type = M_STATE_TYPE_WEB; staweb = state->ext; } if (!lc) { for (i = 0; i < TIMERS; i++) { MTIMER_RESET(t[i]); } lc ++; } MTIMER_START(t[0]); urltolower(recweb->req_url); urltolower(recweb->req_host_name); if (recext != NULL) urltolower(recext->ref_url); /* skip ignored records */ if ( ignore_field(ext_conf, recweb->req_url, M_WEB_IGNORE_REQ_URL) || ignore_field(ext_conf, recweb->req_host_name, M_WEB_IGNORE_HOST ) || ignore_field(ext_conf, recweb->req_host_ip, M_WEB_IGNORE_HOST ) || ignore_field(ext_conf, recweb->req_host_ip, M_WEB_IGNORE_HOSTMASK ) || ( recext != NULL && ignore_field(ext_conf, recext->req_useragent, M_WEB_IGNORE_USERAGENT )) ) { return 0; } MTIMER_START(t[1]); /* hourly/daily stats */ if ((tm = localtime(&(record->timestamp)))) { /* perhaps we 
have created a new state */ if (!state->timestamp) { state->year = tm->tm_year+1900; state->month = tm->tm_mon+1; } state->timestamp = record->timestamp; staweb->hours[tm->tm_hour].xfersize += recweb->xfersize; staweb->days[tm->tm_mday-1].xfersize += recweb->xfersize; staweb->hours[tm->tm_hour].hits++; staweb->days[tm->tm_mday-1].hits++; if (is_page(ext_conf, recweb)) { staweb->hours[tm->tm_hour].pages++; staweb->days[tm->tm_mday-1].pages++; ispage = 1; } if (is_file(recweb) || recweb->ext_type == M_RECORD_TYPE_WEB_FTP) { staweb->hours[tm->tm_hour].files++; staweb->days[tm->tm_mday-1].files++; isfile = 1; } #if 1 if (is_visit(ext_conf, state, record) == 1) { staweb->hours[tm->tm_hour].visits++; staweb->days[tm->tm_mday-1].visits++; isvisit = 1; } #endif } MTIMER_STOP(t[1]); MTIMER_CALC(t[1]); /* rewrite the urls */ /* Used Protocol for this query*/ MTIMER_START(t[2]); if (recweb->req_protocol->used) { const char *key = splaytree_insert(ext_conf->strings, recweb->req_protocol->ptr); data = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->req_prot_hash, data); } /* User Method for this query (GET, POST, PUT, HEAD, OPTIONS) */ if (recweb->req_method->used) { const char *key = splaytree_insert(ext_conf->strings, recweb->req_method->ptr); data = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->req_meth_hash, data); } if (recweb->req_status && recweb->req_url->used) { char buf[4]; const char *key; sprintf(buf, "%3d", recweb->req_status); key = splaytree_insert(ext_conf->strings, buf); data = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->status_hash, data); /* FIXME: specific to HTTP */ switch (recweb->req_status) { case 404: if (!hide_field(ext_conf, recweb->req_url->ptr, M_WEB_HIDE_BROKENLINK) && (recext == NULL || recext->ref_url->used == 0 || !hide_field(ext_conf, recext->ref_url->ptr, M_WEB_HIDE_BROKENLINK_REF))) { mdata *link; if ((is_grouped_field(ext_conf, conf->grouped, 
recweb->req_url->ptr, M_WEB_GROUP_BROKENLINK))) { const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); const char *ref = splaytree_insert(ext_conf->strings, (recext && recext->ref_url->used) ? recext->ref_url->ptr : ""); link = mdata_BrokenLink_create(key, 1, M_DATA_STATE_GROUPED, record->timestamp, ref); } else { const char *key = splaytree_insert(ext_conf->strings, recweb->req_url->ptr); const char *ref = splaytree_insert(ext_conf->strings, (recext && recext->ref_url->used) ? recext->ref_url->ptr : ""); link = mdata_BrokenLink_create(key, 1, M_DATA_STATE_PLAIN, record->timestamp, ref); } mhash_insert_sorted(staweb->status_missing_file, link); } break; case 500: { mdata *link; const char *key = splaytree_insert(ext_conf->strings, recweb->req_url->ptr); const char *ref = splaytree_insert(ext_conf->strings, (recext && recext->ref_url->used) ? recext->ref_url->ptr : ""); link = mdata_BrokenLink_create(key, 1, M_DATA_STATE_PLAIN, record->timestamp, ref); mhash_insert_sorted(staweb->status_internal_error, link); break; } } } MTIMER_STOP(t[2]); MTIMER_CALC(t[2]); MTIMER_START(t[3]); if ((recweb->req_host_name->used && !hide_field(ext_conf, recweb->req_host_name->ptr, M_WEB_HIDE_HOST)) || (recweb->req_host_ip->used && !hide_field(ext_conf, recweb->req_host_ip->ptr, M_WEB_HIDE_HOSTMASK))) { /* * NOTE: (FIXME) * the following grouping affects the "number of hosts" as most of the * hosts are grouped. 
Perhaps we should lists of hosts which are in a * specific group */ if (recweb->req_host_name->used) { char *entry, *key; /* * is_grouped_field() is expensive, caching reduces the problem here * */ /* cache decision */ for (i = 0, entry = NULL; i < conf->host_cache_max; i++) { if (conf->host_cache->entry[i]->key->used && recweb->req_host_name->used == conf->host_cache->entry[i]->key->used && 0 == strcmp(recweb->req_host_name->ptr, conf->host_cache->entry[i]->key->ptr)) { entry = conf->host_cache->entry[i]->value->ptr; break; } } if (entry) { /* hit */ const char *key = splaytree_insert(ext_conf->strings, entry); data = mdata_Visited_create(key, 1, M_DATA_STATE_GROUPED, isvisit); } else if ((is_grouped_field(ext_conf, conf->grouped, recweb->req_host_name->ptr, M_WEB_GROUP_HOST))) { /* miss - grouped */ int oldest = -1; time_t oldest_tstmp = conf->host_cache->last_tstmp; const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_GROUPED, isvisit); /* insert into the cache */ for (i = 0; i < conf->host_cache_max; i++) { if (0 == conf->host_cache->entry[i]->key->used) continue; if (conf->host_cache->entry[i]->tstmp < oldest_tstmp) { oldest = i; oldest_tstmp = conf->host_cache->entry[i]->tstmp; } } /* free the oldest entry and insert */ if (oldest != -1) { buffer_copy_string_buffer(conf->host_cache->entry[oldest]->key, recweb->req_host_name); buffer_copy_string_buffer(conf->host_cache->entry[oldest]->value, conf->grouped); conf->host_cache->entry[oldest]->tstmp = conf->host_cache->last_tstmp++; } } else { /* miss - not grouped */ const char *key = splaytree_insert(ext_conf->strings, recweb->req_host_name->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, isvisit); } #if 1 mhash_insert_sorted(staweb->host_hash, data); #else marray_insert(staweb->host_array, data); #endif /* splitting the TLD from the FQDN */ if ((entry = strrchr(recweb->req_host_name->ptr, '.'))) { if (misoname(entry + 1)) { key 
= splaytree_insert(ext_conf->strings, isondx(entry + 1) != M_RESOLV_UNRESOLVED ? entry + 1 : UNRESOLVED_TLD); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, isvisit); mhash_insert_sorted(staweb->country_hash, data); } } /* * proxies-reports want * - traffic by req-host * */ key = splaytree_insert(ext_conf->strings, recweb->req_host_name->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, recweb->xfersize); mhash_insert_sorted(staweb->host_traffic, data); } else { /* no countries */ const char *key = splaytree_insert(ext_conf->strings, recweb->req_host_ip->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, isvisit); mhash_insert_sorted(staweb->host_hash, data); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, recweb->xfersize); mhash_insert_sorted(staweb->host_traffic, data); } if (recweb->req_host_ip) { #ifdef HAVE_LIBLOCALIZER if (conf->localizer) { long ip = localizer_ip2int(recweb->req_host_ip->ptr); l_data_export ld; if (0 == (localizer_search(conf->localizer, ip, &ld))) { /* md5 */ char md5str[33]; MD5_CTX context; unsigned char digest[16]; char *r, *key; /* build key */ md5str[0] = '\0'; MD5Init(&context); #define M(x) \ MD5Update(&context, ld.x, strlen(ld.x)); M(city); M(province); M(country); M(provider); #undef M MD5Final(digest, &context); for (i = 0, r = md5str; i < 16; i++, r += 2) { sprintf(r, "%02x", digest[i]); } *r = '\0'; key = splaytree_insert(ext_conf->strings, md5str); data = mdata_Location_create(key, 1, ld.city, ld.province, ld.country, ld.provider); mhash_insert_sorted(staweb->location, data); } } #endif } } MTIMER_STOP(t[3]); MTIMER_CALC(t[3]); MTIMER_START(t[4]); if (recweb->req_url->used) { mdata *data; char *c1; /* * NOTE: (FIXME) * favicon.ico is requested by other browser than MSIE too * (Mozilla does that --Zas) * They are not using it exclusivly for bookmarking. 
*/ if ((c1 = strstr(recweb->req_url->ptr, "favicon.ico")) ) { char c2 = *c1; const char *key; *c1 = '\0'; key = splaytree_insert(ext_conf->strings, recweb->req_url->ptr); data = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->bookmarks, data); *c1 = c2; } else if (is_robot(recweb->req_url)) { if (recext == NULL || recext->req_useragent->used == 0) { /* worth nothing */ if (ext_conf->debug_level > 10) fprintf(stderr, "%s requested w/o useragent set - ignored.\n", recweb->req_url->ptr); } else { const char *key = splaytree_insert(ext_conf->strings, recext->req_useragent->ptr); data = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); mhash_insert_sorted(staweb->robots, data); } } /* file extensions */ if (is_existent(recweb) && !hide_field(ext_conf, recweb->req_url->ptr, M_WEB_HIDE_EXTENSION)) { if ((is_grouped_field(ext_conf, conf->grouped, recweb->req_url->ptr, M_WEB_GROUP_EXTENSION))) { const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_GROUPED, recweb->xfersize); mhash_insert_sorted(staweb->extension, data); } else { if (ext_conf->debug_level > 5) fprintf(stderr,"%s.%d: the default rule for groupextension is missing for '%s'!!\n", __FILE__, __LINE__, recweb->req_url->ptr); } } /* hide url */ if (!hide_field(ext_conf, recweb->req_url->ptr, M_WEB_HIDE_REQ_URL)) { if ((is_grouped_field(ext_conf, conf->grouped, recweb->req_url->ptr, M_WEB_GROUP_REQ_URL))) { const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_GROUPED, recweb->xfersize); } else { const char *key = splaytree_insert(ext_conf->strings, recweb->req_url->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, recweb->xfersize); } mhash_insert_sorted(staweb->req_url_hash, data); } } MTIMER_STOP(t[4]); MTIMER_CALC(t[4]); /* more traffic-wise reports */ if (recweb->req_user->used) { /* handle the users traffic */ const char *key = 
splaytree_insert(ext_conf->strings, recweb->req_user->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, recweb->xfersize); mhash_insert_sorted(staweb->users, data); } /* ** Extensions */ MTIMER_START(t[5]); /* User Operating System */ if (recext != NULL) { if (recext->req_useros->used) { if ((is_grouped_field(ext_conf, conf->grouped, recext->req_useros->ptr, M_WEB_GROUP_OS))) { const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_GROUPED, isvisit); } else { const char *key = splaytree_insert(ext_conf->strings, recext->req_useros->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, isvisit); } mhash_insert_sorted(staweb->os_hash, data); } /* User Agent */ if (recext->req_useragent->used) { if ((is_grouped_field(ext_conf, conf->grouped, recext->req_useragent->ptr, M_WEB_GROUP_UA))) { const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_GROUPED, isvisit); } else { const char *key = splaytree_insert(ext_conf->strings, recext->req_useragent->ptr); data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, isvisit); } mhash_insert_sorted(staweb->ua_hash, data); } if (recext->ref_url->used) { if (!hide_field(ext_conf, recext->ref_url->ptr, M_WEB_HIDE_REFERRER)) { /* if the referrer is a searchengine, include it in the searchengine reports, too */ process_searchengine(ext_conf, state, recext); if ((is_grouped_field(ext_conf, conf->grouped, recext->ref_url->ptr, M_WEB_GROUP_REFERRER))) { const char *key = splaytree_insert(ext_conf->strings, conf->grouped->ptr); data = mdata_Count_create(key, 1, M_DATA_STATE_GROUPED); } else { const char *key = splaytree_insert(ext_conf->strings, recext->ref_url->ptr); data = mdata_Count_create(key, 1, M_DATA_STATE_PLAIN); } mhash_insert_sorted(staweb->ref_url_hash, data); } } if (recext->srv_host->used) { const char *key = splaytree_insert(ext_conf->strings, recext->srv_host->ptr); 
data = mdata_Visited_create(key, 1, M_DATA_STATE_PLAIN, isvisit); mhash_insert_sorted(staweb->vhost_hash, data); } /* * counting parallel processes */ if (recext->duration) { /* server started at timestamp - duration * died at timestamp */ time_t srv_start, srv_end; srv_start = record->timestamp - recext->duration; srv_end = record->timestamp; for (i = srv_start; i < srv_end; i++) { /* tag all seconds */ /* * seconds[i]++; */ } } /* * thoughput calculations ? */ } MTIMER_STOP(t[5]); MTIMER_CALC(t[5]); MTIMER_STOP(t[0]); MTIMER_CALC(t[0]); if (conf->debug_timing && (lc++ % 5000 == 0)) { fprintf(stderr, "--> "); for (i = 0; i < TIMERS; i++) { fprintf(stderr, "%ld ", MTIMER_GET_USER_MSEC(t[i])); } fprintf(stderr, "\n"); } #undef TIMERS return 0; }