/* elmo - ELectronic Mail Operator Copyright (C) 2003 rzyjontko, University of Wroclaw This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ---------------------------------------------------------------------- This file is a part of implementation of Bayesian Mail Filter, which is coverd by Paul Graham's article at http://www.paulgraham.com/spam.html and http://www.paulgraham.com/better.html This module was written for course of Artifficial Intelligence at University of Wroclaw, Institute of Computer Science. */ /**************************************************************************** * IMPLEMENTATION HEADERS ****************************************************************************/ #include #include #include #include "token.h" #include "error.h" /**************************************************************************** * IMPLEMENTATION PRIVATE DEFINITIONS / ENUMERATIONS / SIMPLE TYPEDEFS ****************************************************************************/ #define STREAM_COUNT 2 /**************************************************************************** * IMPLEMENTATION PRIVATE CLASS PROTOTYPES / EXTERNAL CLASS REFERENCES ****************************************************************************/ /**************************************************************************** * IMPLEMENTATION PRIVATE STRUCTURES / UTILITY CLASSES ****************************************************************************/ struct tstream { /* internal values */ int unused; regex_t compiled; regmatch_t matches[1]; char *position; char *last_token; /* user specified */ char *buffer; int max_len; int may_modify; }; /**************************************************************************** * IMPLEMENTATION REQUIRED EXTERNAL REFERENCES (AVOID) ****************************************************************************/ /**************************************************************************** * IMPLEMENTATION PRIVATE DATA ****************************************************************************/ static struct tstream streams[STREAM_COUNT]; /**************************************************************************** * INTERFACE DATA ****************************************************************************/ /**************************************************************************** * IMPLEMENTATION PRIVATE FUNCTION PROTOTYPES ****************************************************************************/ /**************************************************************************** * IMPLEMENTATION PRIVATE FUNCTIONS ****************************************************************************/ /**************************************************************************** * INTERFACE FUNCTIONS ****************************************************************************/ void token_init (void) { int i; for (i = 0; i < STREAM_COUNT; i++) streams[i].unused = 1; } int token_open (char *buffer, int max_len, const char *regex, int may_modify) { int ret; int i; if (buffer == NULL) return -1; for (i = 0; i < STREAM_COUNT; i++){ if (streams[i].unused) break; } if (i == STREAM_COUNT) return -1; streams[i].unused = 0; streams[i].buffer = buffer; streams[i].max_len = max_len; streams[i].may_modify = may_modify; streams[i].position = buffer; streams[i].last_token = NULL; ret = regcomp (&streams[i].compiled, regex, REG_ICASE | REG_EXTENDED | REG_NEWLINE); if (ret){ error_regex (ret, &streams[i].compiled, regex); regfree (&streams[i].compiled); return -1; } return i; } void token_close (int td) { if (streams[td].unused) return; streams[td].unused = 1; regfree (&streams[td].compiled); } char * token_read_next (int td) { int ret; int len; char *string; struct tstream *stream = streams + td; if (stream->unused) return NULL; string = stream->position; if (*string == '\0') return NULL; do { stream->position = string; ret = regexec (&stream->compiled, stream->position, 1, stream->matches, 0); if (ret){ if (ret != REG_NOMATCH){ error_regex (ret, &stream->compiled, NULL); } return NULL; } len = stream->matches[0].rm_eo - stream->matches[0].rm_so; string += stream->matches[0].rm_eo + 1; } while (stream->max_len > 0 && len > stream->max_len); stream->last_token = stream->position + stream->matches[0].rm_so; if (stream->may_modify){ *(stream->position + stream->matches[0].rm_eo) = '\0'; } stream->position += stream->matches[0].rm_eo; if (*stream->position) stream->position++; return stream->last_token; } /**************************************************************************** * INTERFACE CLASS BODIES ****************************************************************************/ /**************************************************************************** * * END MODULE token.c * ****************************************************************************/