/* * Copyright (C) 2002 Laird Breyer * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Author: Laird Breyer */ #ifdef HAVE_CONFIG_H #undef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #if defined HAVE_UNISTD_H #include #endif #include "util.h" #include "dbacl.h" extern options_t u_options; extern options_t m_options; extern myregex_t re[MAX_RE]; extern regex_count_t regex_count; extern regex_count_t antiregex_count; extern MBOX_State mbox; extern XML_State xml; extern char *textbuf; extern charbuf_len_t textbuf_len; extern char *aux_textbuf; extern charbuf_len_t aux_textbuf_len; #if defined HAVE_MBRTOWC extern wchar_t *wc_textbuf; extern charbuf_len_t wc_textbuf_len; #endif extern token_order_t ngram_order; extern long system_pagesize; extern void *in_iobuf; extern void *out_iobuf; extern int cmd; extern char *inputfile; extern long inputline; /*********************************************************** * EXPERIMENTAL: * * this code is an experiment to see if memory mapping is * * faster than buffered I/O. Surprisingly, the gain is at * * best marginal (less than two percent on my test). * * The code is left in for future experiments * * * * I suspect that mmap() isn't faster because we only read * * the input files sequentially, never backtrack, and my * * Debian development system probably reads ahead quite * * agressively. So normal buffered I/O does about the same * * amount of work, In any case, most of the time is spent * * in the hash tables and calculations anyway. * * * * The mmap() code below is written so as to easily replace* * the buffered I/O functions * ***********************************************************/ #undef EXPERIMENTAL #if defined EXPERIMENTAL #include #include typedef struct { FILE *f; size_t fsize; size_t fmappos; size_t fmapsize; char *seekpos; char *mstart; } MMFILE; int MMEOF(MMFILE *stream) { return stream->mstart && ((stream->seekpos - stream->mstart) + stream->fmappos >= stream->fsize); } MMFILE *MMOPEN(FILE *f) { MMFILE *stream = NULL; struct stat sb; if( f ) { stream = (MMFILE *)malloc(sizeof(MMFILE)); if( stream ) { stream->f = f; #define MEGABYTES 20 stream->fmapsize = MEGABYTES*1024*1024; if( fstat(fileno(f), &sb) == -1 ) { free(stream); return NULL; } stream->fsize = sb.st_size; stream->fmappos = 0; stream->mstart = MMAP(NULL, stream->fmapsize, PROT_READ, MAP_SHARED, fileno(stream->f), stream->fmappos); if( stream->mstart == MAP_FAILED ) { free(stream); return NULL; } stream->seekpos = stream->mstart; if( stream->mstart ) { MADVISE(stream->mstart, stream->fmapsize, MADV_SEQUENTIAL); } } } return stream; } int MMCLOSE(MMFILE *stream) { int r = 0; if( stream ) { r = MUNMAP(stream->mstart, stream->fmapsize); free(stream); } return r; } void MMFORWARD(MMFILE *stream) { size_t offset; if( stream->fmappos + stream->fmapsize < stream->fsize ) { offset = stream->seekpos - stream->mstart; if( MUNMAP(stream->mstart, stream->fmapsize) == -1 ) { free(stream); exit(0); } stream->fmappos += system_pagesize * (offset/system_pagesize); stream->mstart = MMAP(0, stream->fmapsize, PROT_READ, MAP_SHARED, fileno(stream->f), stream->fmappos); if( stream->mstart == MAP_FAILED ) { free(stream); exit(0); } stream->seekpos = stream->mstart + (offset - system_pagesize * (offset/system_pagesize)); if( stream->mstart ) { MADVISE(stream->mstart, stream->fmapsize, MADV_SEQUENTIAL); } } } char *MMGETS(char *s, size_t size, MMFILE *stream) { char *result = NULL; size_t left; left = stream->fmapsize - (stream->seekpos - stream->mstart); if( left < size ) { MMFORWARD(stream); left = stream->fmapsize - (stream->seekpos - stream->mstart); } if( left <= 0 ) { return NULL; } else if( size <= left + 1) { result = memccpy(s, stream->seekpos, '\n', size - 1); if( result ) { *result = '\0'; stream->seekpos += result - s; return s; } } else { result = memccpy(s, stream->seekpos, '\n', left); if( result ) { *result = '\0'; stream->seekpos = result; } else { memcpy(s, stream->seekpos, left); s[left] = '\0'; stream->seekpos += left; } return s; } return NULL; } #endif /*********************************************************** * MISCELLANEOUS FILE HANDLING * ***********************************************************/ void init_file_handling() { init_buffers(); init_mbox_line_filter(&mbox); } void cleanup_file_handling() { free_mbox_line_filter(&mbox); cleanup_buffers(); } /* Given a mime type, this selects an appropriate * xml default setting for the character filter. */ XML_Reset select_xml_defaults(MIME_Struct *mime) { switch(mime->type) { case ctUNDEF: case ctMESSAGE_RFC822: case ctTEXT_PLAIN: return (m_options & (1<hide = VISIBLE; xml->state = TEXT; if( m_options & (1<parser = xpSMART; } else if( m_options & (1<parser = xpDUMB; } else { xml->parser = xpSMART; } break; case xmlDISABLE: xml->state = DISABLED; break; case xmlSMART: if( (xml->state == DISABLED) || ((xml->parser != xpHTML) && (xml->parser != xpSMART)) ) { xml->state = TEXT; xml->hide = VISIBLE; xml->parser = xpSMART; } break; case xmlHTML: if( (xml->state == DISABLED) || ((xml->parser != xpHTML) && (xml->parser != xpSMART)) ) { xml->state = TEXT; xml->hide = VISIBLE; xml->parser = xpHTML; } break; case xmlDUMB: if( (xml->state == DISABLED) || (xml->parser != xpDUMB) ) { xml->state = TEXT; xml->hide = VISIBLE; xml->parser = xpDUMB; break; } case xmlUNDEF: /* ignore */ break; } } } void reset_mbox_line_filter(MBOX_State *mbox) { mbox->state = msUNDEF; mbox->substate = msuUNDEF; mbox->header.type = mbox->body.type = ctUNDEF; mbox->header.encoding = mbox->body.encoding = ceUNDEF; mbox->prev_line_empty = 1; mbox->corruption_check = 0; mbox->skip_header = 0; mbox->skip_until_boundary = 0; mbox->strip_header_char = '\0'; #if defined HAVE_MBRTOWC mbox->w_strip_header_char = L'\0'; #endif mbox->plainstate = psPLAIN; memset(&mbox->boundary, 0, sizeof(mbox->boundary)); /* no need to reserve space for both char and wide char caches */ if( m_options & (1< 0 ) { strcmp(textbuf, "\r\n\r\n"); return 1; } return 0; } void process_directory(char *name, int (*line_filter)(MBOX_State *, char *), void (*character_filter)(XML_State *, char *), void (*word_fun)(char *, token_type_t, regex_count_t), char *(*pre_line_fun)(char *), void (*post_line_fun)(char *)) { DIR *d; struct dirent *sd; FILE *input; struct stat statinfo; char fullp[_POSIX_PATH_MAX + 1]; char *fp; d = opendir(name); if( d ) { /* directory returns relative file names, but we need full paths */ strcpy(fullp, name); fp = fullp + strlen(name); *fp++ = '/'; for(sd = readdir(d); sd; sd = readdir(d)) { strcpy(fp, sd->d_name); if( stat(fullp, &statinfo) == 0 ) { switch(statinfo.st_mode & S_IFMT) { case S_IFREG: input = fopen(fullp, "rb"); if( input ) { inputfile = fullp; /* set some initial options */ reset_xml_character_filter(&xml, xmlRESET); if( m_options & (1<d_name); if( stat(fullp, &statinfo) == 0 ) { switch(statinfo.st_mode & S_IFMT) { case S_IFREG: input = fopen(fullp, "rb"); if( input ) { inputfile = fullp; /* set some initial options */ reset_xml_character_filter(&xml, xmlRESET); if( m_options & (1<