/************************************************************************************************* * Utility for indexing document files into a database of Odeum * Copyright (C) 2000-2003 Mikio Hirabayashi * This file is part of QDBM, Quick Database Manager. * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU * Lesser General Public License as published by the Free Software Foundation; either version * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * You should have received a copy of the GNU Lesser General Public License along with QDBM; if * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA * 02111-1307 USA. *************************************************************************************************/ #include #include #include #include #include #include #include #include #include #include #undef TRUE #define TRUE 1 /* boolean true */ #undef FALSE #define FALSE 0 /* boolean false */ #define PATHCHR '/' /* delimiter character of path */ #define EXTCHR '.' /* delimiter character of extension */ #define CDIRSTR "." /* string of current directory */ #define PDIRSTR ".." /* string of parent directory */ #define PATHBUFSIZ 2048 /* size of a path buffer */ #define MAXLOAD 0.85 /* max ratio of bucket loading */ /* for Windows and RISC OS */ #if defined(_WIN32) #undef PATHCHR #define PATHCHR '\\' #undef EXTCHR #define EXTCHR '.' #undef CDIRSTR #define CDIRSTR "." #undef PDIRSTR #define PDIRSTR ".." #elif defined(__riscos__) || defined(__riscos) #include int __riscosify_control = __RISCOSIFY_NO_PROCESS; #undef PATHCHR #define PATHCHR '.' #undef EXTCHR #define EXTCHR '/' #undef CDIRSTR #define CDIRSTR "@" #undef PDIRSTR #define PDIRSTR "^" #endif /* global variables */ const char *progname; /* program name */ int sigterm; /* flag for termination signal */ /* function prototypes */ int main(int argc, char **argv); void setsignals(void); void sigtermhandler(int num); void usage(void); int runregister(int argc, char **argv); int runpurge(int argc, char **argv); int fwimatch(const char *str, const char *key); int bwimatch(const char *str, const char *key); int bwimatchlist(const char *str, const CBLIST *keys); char *fgetl(FILE *IN); void pdperror(const char *name); void printferror(const char *format, ...); void printfinfo(const char *format, ...); const char *datestr(int t); int proclist(const char *name, const char *lfile, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist); int procdir(const char *name, const char *dir, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist); int indexdir(ODEUM *odeum, const char *name, const char *dir, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist); int indexfile(ODEUM *odeum, const char *name, const char *file, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist); ODDOC *makedocplain(const char *uri, const char *text, const char *date); ODDOC *makedochtml(const char *uri, const char *html, const char *date); CBLIST *htmllist(const char *html); int procpurge(const char *name); /* main routine */ int main(int argc, char **argv){ int rv; progname = argv[0]; sigterm = FALSE; setsignals(); if(argc < 2) usage(); rv = 0; if(!strcmp(argv[1], "register")){ rv = runregister(argc, argv); } else if(!strcmp(argv[1], "purge")){ rv = runpurge(argc, argv); } else { usage(); } return rv; } /* set signal handlers */ void setsignals(void){ signal(1, sigtermhandler); signal(2, sigtermhandler); signal(3, sigtermhandler); signal(13, sigtermhandler); signal(15, sigtermhandler); } /* handler of termination signal */ void sigtermhandler(int num){ signal(num, SIG_DFL); sigterm = TRUE; printfinfo("the termination signal %d catched", num); } /* print the usage and exit */ void usage(void){ fprintf(stderr, "%s: indexer of document files\n", progname); fprintf(stderr, "\n"); fprintf(stderr, "usage:\n"); fprintf(stderr, " %s register [-l file] [-wmax num] [-tsuf sufs] [-hsuf sufs] [-ft]" " name [dir]\n", progname); fprintf(stderr, " %s purge name\n", progname); exit(1); } /* parse arguments of register command */ int runregister(int argc, char **argv){ char *name, *dir, *lfile, *tsuf, *hsuf, path[PATHBUFSIZ]; int i, wmax, ft, plen, rv; CBLIST *tsuflist, *hsuflist; name = NULL; dir = NULL; lfile = NULL; tsuf = NULL; hsuf = NULL; wmax = -1; ft = FALSE; for(i = 2; i < argc; i++){ if(!name && argv[i][0] == '-'){ if(!strcmp(argv[i], "-l")){ if(++i >= argc) usage(); lfile = argv[i]; } else if(!strcmp(argv[i], "-wmax")){ if(++i >= argc) usage(); wmax = atoi(argv[i]); } else if(!strcmp(argv[i], "-tsuf")){ if(++i >= argc) usage(); tsuf = argv[i]; } else if(!strcmp(argv[i], "-hsuf")){ if(++i >= argc) usage(); hsuf = argv[i]; } else if(!strcmp(argv[i], "-ft")){ ft = TRUE; } else { usage(); } } else if(!name){ name = argv[i]; } else if(!dir){ dir = argv[i]; } else { usage(); } } if(!name) usage(); if(!dir) dir = CDIRSTR; plen = sprintf(path, "%s", dir); if(plen > 1 && path[plen-1] == PATHCHR) path[plen-1] = '\0'; tsuflist = cbsplit(tsuf ? tsuf : ".txt,.text", -1, ","); hsuflist = cbsplit(hsuf ? hsuf : ".html,.htm", -1, ","); if(lfile){ rv = proclist(name, lfile, wmax, ft, tsuflist, hsuflist); } else { rv = procdir(name, path, wmax, ft, tsuflist, hsuflist); } cblistclose(hsuflist); cblistclose(tsuflist); return rv; } /* parse arguments of register command */ int runpurge(int argc, char **argv){ char *name; int i, rv; name = NULL; for(i = 2; i < argc; i++){ if(!name && argv[i][0] == '-'){ usage(); } else if(!name){ name = argv[i]; } else { usage(); } } if(!name) usage(); rv = procpurge(name); return rv; } /* case insensitive forward matching */ int fwimatch(const char *str, const char *key){ int len, i; len = strlen(key); for(i = 0; i < len; i++){ if(tolower(str[i]) != tolower(key[i]) || str[i] == '\0') return FALSE; } return TRUE; } /* case insensitive backward matching */ int bwimatch(const char *str, const char *key){ int slen, klen, i; slen = strlen(str); klen = strlen(key); for(i = 1; i <= klen; i++){ if(tolower(str[slen-i]) != tolower(key[klen-i]) || i > slen) return FALSE; } return TRUE; } /* case insensitive backward matching with a list */ int bwimatchlist(const char *str, const CBLIST *keys){ int i; for(i = 0; i < cblistnum(keys); i++){ if(bwimatch(str, cblistval(keys, i, NULL))) return TRUE; } return FALSE; } /* read a line */ char *fgetl(FILE *IN){ char *buf; int c, len, blen; buf = NULL; len = 0; blen = 256; while((c = fgetc(IN)) != EOF){ if(blen <= len) blen *= 2; buf = cbrealloc(buf, blen + 1); if(c == '\n') c = '\0'; buf[len++] = c; if(c == '\0') break; } if(!buf) return NULL; buf[len] = '\0'; return buf; } /* print an error message */ void pdperror(const char *name){ printf("%s: ERROR: %s: %s\n", progname, name, dperrmsg(dpecode)); fflush(stdout); } /* print formatted error string and flush the buffer */ void printferror(const char *format, ...){ va_list ap; va_start(ap, format); printf("%s: ERROR: ", progname); vprintf(format, ap); putchar('\n'); fflush(stdout); va_end(ap); } /* print formatted information string and flush the buffer */ void printfinfo(const char *format, ...){ va_list ap; va_start(ap, format); printf("%s: INFO: ", progname); vprintf(format, ap); putchar('\n'); fflush(stdout); va_end(ap); } /* get static string of the date */ const char *datestr(int t){ static char buf[32]; struct tm *stp; time_t tt; tt = (time_t)t; if(!(stp = localtime(&tt))) return "0000/00/00 00:00:00"; sprintf(buf, "%04d/%02d/%02d %02d:%02d:%02d", stp->tm_year + 1900, stp->tm_mon + 1, stp->tm_mday, stp->tm_hour, stp->tm_min, stp->tm_sec); return buf; } /* processing with finding files in a list file */ int proclist(const char *name, const char *lfile, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist){ ODEUM *odeum; FILE *IN; char *line; int err, fatal; if(!strcmp(lfile, "-")){ IN = stdin; } else { if(!(IN = fopen(lfile, "rb"))){ printferror("%s: file cannot be opened", lfile); return 1; } } printfinfo("%s: registration started", name); if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){ pdperror(name); return 1; } printfinfo("%s: database opened: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); err = FALSE; while((line = fgetl(IN)) != NULL){ if(sigterm){ printferror("aborting due to a termination signal"); free(line); err = TRUE; break; } if(!indexfile(odeum, name, line, wmax, ft, tsuflist, hsuflist)) err = TRUE; free(line); } fatal = odfatalerror(odeum); printfinfo("%s: database closing: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); if(!odclose(odeum)){ pdperror(name); err = TRUE; } if(IN != stdin) fclose(IN); if(err){ printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : ""); } else { printfinfo("%s: registration completed successfully", name); } return err ? 1 : 0; } /* processing with finding files in a directory */ int procdir(const char *name, const char *dir, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist){ ODEUM *odeum; int err, fatal; printfinfo("%s: registration started", name); if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){ pdperror(name); return 1; } printfinfo("%s: database opened: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); err = FALSE; if(!indexdir(odeum, name, dir, wmax, ft, tsuflist, hsuflist)) err = TRUE; fatal = odfatalerror(odeum); printfinfo("%s: database closing: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); if(!odclose(odeum)){ pdperror(name); err = TRUE; } if(err){ printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : ""); } else { printfinfo("%s: registration completed successfully", name); } return err ? 1 : 0; } /* find and index files in a directory */ int indexdir(ODEUM *odeum, const char *name, const char *dir, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist){ CBLIST *files; const char *file; char path[PATHBUFSIZ]; int i, isroot, isdir, err; if(!(files = cbdirlist(dir))){ printferror("%s: directory cannot be opened", dir); return FALSE; } isroot = dir[0] == PATHCHR && dir[1] == '\0'; err = FALSE; for(i = 0; i < cblistnum(files); i++){ if(sigterm){ printferror("aborting due to a termination signal"); cblistclose(files); return FALSE; } file = cblistval(files, i, NULL); if(!strcmp(file, CDIRSTR) || !strcmp(file, PDIRSTR)) continue; if(isroot){ sprintf(path, "%s%s", dir, file); } else { sprintf(path, "%s%c%s", dir, PATHCHR, file); } if(!cbfilestat(path, &isdir, NULL, NULL)){ printferror("%s: file does not exist", file); err = TRUE; continue; } if(isdir){ if(!indexdir(odeum, name, path, wmax, ft, tsuflist, hsuflist)) err = TRUE; } else { if(!indexfile(odeum, name, path, wmax, ft, tsuflist, hsuflist)) err = TRUE; } } cblistclose(files); return err ? FALSE : TRUE; } /* index a file into the database */ int indexfile(ODEUM *odeum, const char *name, const char *file, int wmax, int ft, const CBLIST *tsuflist, const CBLIST *hsuflist){ static int cnt = 0; char *buf; const char *title, *odate; int size, mtime, hot, wnum, bnum; ODDOC *doc, *old; if(!cbfilestat(file, NULL, &size, &mtime)){ printferror("%s: file does not exist", file); return FALSE; } doc = NULL; if(bwimatchlist(file, tsuflist)){ if(!(buf = cbreadfile(file, NULL))){ printferror("%s: file cannot be opened", file); return FALSE; } doc = makedocplain(file, buf, datestr(mtime)); free(buf); } else if(bwimatchlist(file, hsuflist)){ if(!(buf = cbreadfile(file, NULL))){ printferror("%s: file cannot be opened", file); return FALSE; } doc = makedochtml(file, buf, datestr(mtime)); free(buf); } if(doc){ if(ft && (!(title = oddocgetattr(doc, "title")) || strlen(title) < 1)){ if((title = strrchr(file, PATHCHR)) != NULL){ title++; } else { title = file; } oddocaddattr(doc, "title", title); } hot = TRUE; if((old = odget(odeum, file)) != NULL){ odate = oddocgetattr(old, "date"); if(odate && strcmp(oddocgetattr(doc, "date"), odate) <= 0) hot = FALSE; oddocclose(old); } if(hot){ if(odput(odeum, doc, wmax, TRUE)){ printfinfo("%s: registered: id=%d wnum=%d", file, oddocid(doc), cblistnum(oddocnwords(doc))); cnt++; } else { pdperror(file); } } else { printfinfo("%s: passed", file); } oddocclose(doc); } wnum = odwnum(odeum); bnum = odbnum(odeum); if(wnum != -1 && bnum != -1 && (double)wnum / (double)bnum > MAXLOAD){ printfinfo("%s: optimizing started: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); if(!odoptimize(odeum)){ pdperror(file); return FALSE; } printfinfo("%s: optimizing completed: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); } if(cnt >= 256){ printfinfo("%s: database status: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); cnt = 0; } return TRUE; } /* make a document of plain text */ ODDOC *makedocplain(const char *uri, const char *text, const char *date){ ODDOC *doc; CBLIST *awords; const char *asis; char *normal; int i; doc = oddocopen(uri); if(date) oddocaddattr(doc, "date", date); awords = odbreaktext(text); for(i = 0; i < cblistnum(awords); i++){ asis = cblistval(awords, i, NULL); normal = odnormalizeword(asis); oddocaddword(doc, normal, asis); free(normal); } cblistclose(awords); return doc; } /* make a document of HTML */ ODDOC *makedochtml(const char *uri, const char *html, const char *date){ ODDOC *doc; CBMAP *pairs; CBLIST *elems, *awords; const char *text, *asis; char *rtext, *normal; int i, j, body; doc = oddocopen(uri); if(date) oddocaddattr(doc, "date", date); pairs = cbmapopen(); cbmapput(pairs, "&", -1, "&", -1, TRUE); cbmapput(pairs, "<", -1, "<", -1, TRUE); cbmapput(pairs, ">", -1, ">", -1, TRUE); cbmapput(pairs, """, -1, "\"", -1, TRUE); cbmapput(pairs, " ", -1, " ", -1, TRUE); cbmapput(pairs, "&", -1, "&", -1, TRUE); cbmapput(pairs, "<", -1, "<", -1, TRUE); cbmapput(pairs, ">", -1, ">", -1, TRUE); cbmapput(pairs, "'", -1, "\"", -1, TRUE); cbmapput(pairs, " ", -1, " ", -1, TRUE); elems = htmllist(html); body = FALSE; for(i = 0; i < cblistnum(elems); i++){ text = cblistval(elems, i, NULL); if(fwimatch(text, "= 0){ if(rtext[j] != ' ') break; rtext[j] = '\0'; } for(j = 0; rtext[j] != '\0'; j++){ if(rtext[j] != ' ') break; } oddocaddattr(doc, "title", rtext + j); awords = odbreaktext(rtext); for(j = 0; j < cblistnum(awords); j++){ asis = cblistval(awords, j, NULL); normal = odnormalizeword(asis); oddocaddword(doc, normal, ""); free(normal); } cblistclose(awords); free(rtext); } } else if(fwimatch(text, " pv) cblistpush(list, html + pv, i - pv); break; } else if(fwimatch(html + i, "")) != NULL){ i = ep - html + 2; pv = i + 1; } } else if(!tag && html[i] == '<'){ if(i > pv) cblistpush(list, html + pv, i - pv); tag = TRUE; pv = i; } else if(tag && html[i] == '>'){ if(i > pv) cblistpush(list, html + pv, i - pv + 1); tag = FALSE; pv = i + 1; } i++; } return list; } /* purge documents which is not existing. */ int procpurge(const char *name){ ODEUM *odeum; ODDOC *doc; const char *file; int cnt, err, fatal; printfinfo("%s: purge started", name); if(!(odeum = odopen(name, OD_OWRITER))){ pdperror(name); return 1; } printfinfo("%s: database opened: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); err = FALSE; cnt = 0; if(!oditerinit(odeum)){ pdperror(name); err = TRUE; } else { while(TRUE){ if(sigterm){ printferror("aborting due to a termination signal"); err = TRUE; break; } if(!(doc = oditernext(odeum))){ if(dpecode != DP_ENOITEM){ pdperror(name); err = TRUE; } break; } file = oddocuri(doc); if(cbfilestat(file, NULL, NULL, NULL)){ printfinfo("%s: passed", file); } else { if(!odout(odeum, file)){ pdperror(file); err = TRUE; } printfinfo("%s: purged", file); cnt++; } oddocclose(doc); } } fatal = odfatalerror(odeum); printfinfo("%s: database closing: fsiz=%d dnum=%d wnum=%d bnum=%d", name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); if(!odclose(odeum)){ pdperror(name); err = TRUE; } if(err){ printfinfo("%s: purge was over%s", name, fatal ? " with fatal error" : ""); } else { printfinfo("%s: purge completed successfully", name); } return err ? 1 : 0; } /* END OF FILE */