/************************************************************************************************* * Implementation of Odeum * Copyright (C) 2000-2003 Mikio Hirabayashi * This file is part of QDBM, Quick Database Manager. * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU * Lesser General Public License as published by the Free Software Foundation; either version * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * You should have received a copy of the GNU Lesser General Public License along with QDBM; if * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA * 02111-1307 USA. *************************************************************************************************/ #include "odeum.h" #include "myconf.h" #define OD_NAMEMAX 256 /* max size of a database name */ #define OD_DIRMODE 00755 /* permission of a creating directory */ #define OD_PATHBUFSIZ 1024 /* size of a path buffer */ #define OD_NUMBUFSIZ 32 /* size of a buffer for a number */ #define OD_DOCSNAME "docs" /* name of the database for documents */ #define OD_INDEXNAME "index" /* name of the database for inverted index */ #define OD_RDOCSNAME "rdocs" /* name of the database for reverse dictionary */ #define OD_DOCSBNUM 509 /* initial bucket number of document database */ #define OD_DOCSDNUM 9 /* division number of document database */ #define OD_DOCSALIGN -4 /* alignment of document database */ #define OD_INDEXBNUM 8191 /* initial bucket number of inverted index */ #define OD_INDEXDNUM 5 /* division number of inverted index */ #define OD_INDEXALIGN -3 /* alignment of inverted index */ #define OD_RDOCSLRM 49 /* records in a leaf node of reverse dictionary */ #define OD_RDOCSNIM 192 /* records in a non-leaf node of reverse dictionary */ #define OD_RDOCSLCN 512 /* number of leaf cache of reverse dictionary */ #define OD_RDOCSNCN 256 /* number of non-leaf cache of reverse dictionary */ #define OD_SORTMAPMAX 262144 /* max number of records of sortmap */ #define OD_DMAXEXPR "dmax" /* key of max number of the document ID */ #define OD_DNUMEXPR "dnum" /* key of number of the documents */ #define OD_URIEXPR "1" /* map key of URI */ #define OD_ATTRSEXPR "2" /* map key of attributes */ #define OD_NWORDSEXPR "3" /* map key of normal words */ #define OD_AWORDSEXPR "4" /* map key of as-is words */ #define OD_WTOPRATE 0.1 /* ratio of top words */ #define OD_WTOPBONUS 5000 /* bonus points of top words */ #define OD_WOCCRPOINT 10000 /* points per occurence */ #define OD_KEYCANDS 64 /* max number of candidates for keywords */ #define OD_SPACECHARS "\t\n\v\f\r " /* space characters */ #define OD_DELIMCHARS "!\"#$%&'()*,./:;<=>?@[\\]^`{|}~" /* delimiter characters */ #define OD_MAXWORDLEN 48 /* max length of a word */ typedef struct { /* type of structure for word counting */ const char *word; /* pointer to the word */ int num; /* frequency of the word */ } ODWORD; /* private function prototypes */ static int odsortindex(ODEUM *odeum); static int odsortcompare(const void *a, const void *b); static int odpurgeindex(ODEUM *odeum); static CBMAP *odpairsmap(const ODPAIR *pairs, int num); static int odwordcompare(const void *a, const void *b); /************************************************************************************************* * public objects *************************************************************************************************/ /* Get a database handle. */ ODEUM *odopen(const char *name, int omode){ int cromode, vlomode, inode, dmax, dnum; char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ], *tmp; struct stat sbuf; CURIA *docsdb, *indexdb; VILLA *rdocsdb; CBMAP *sortmap; ODEUM *odeum; assert(name); if(strlen(name) > OD_NAMEMAX){ dpecode = DP_EMISC; return NULL; } cromode = CR_OREADER; vlomode = VL_OREADER; if(omode & OD_OWRITER){ cromode = CR_OWRITER; vlomode = VL_OWRITER; if(omode & OD_OCREAT){ cromode |= CR_OCREAT; vlomode |= VL_OCREAT; } if(omode & OD_OTRUNC){ cromode |= CR_OTRUNC; vlomode |= VL_OTRUNC; } } if(omode & OD_ONOLCK){ cromode |= CR_ONOLCK; vlomode |= VL_ONOLCK; } sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME); sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME); sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME); docsdb = NULL; indexdb = NULL; rdocsdb = NULL; if((omode & OD_OWRITER) && (omode & OD_OCREAT)){ if(mkdir(name, OD_DIRMODE) == -1 && errno != EEXIST){ dpecode = DP_EMKDIR; return NULL; } } if(stat(name, &sbuf) == -1){ dpecode = DP_ESTAT; return NULL; } inode = sbuf.st_ino; if(!(docsdb = cropen(docsname, cromode, OD_DOCSBNUM, OD_DOCSDNUM))) return NULL; if(!(indexdb = cropen(indexname, cromode, OD_INDEXBNUM, OD_INDEXDNUM))){ crclose(docsdb); return NULL; } if(omode & OD_OWRITER){ if(!crsetalign(docsdb, OD_DOCSALIGN) || !crsetalign(indexdb, OD_INDEXALIGN)){ crclose(indexdb); crclose(docsdb); return NULL; } } if(!(rdocsdb = vlopen(rdocsname, vlomode, VL_CMPLEX))){ crclose(indexdb); crclose(docsdb); return NULL; } vlsettuning(rdocsdb, OD_RDOCSLRM, OD_RDOCSNIM, OD_RDOCSLCN, OD_RDOCSNCN); if(omode & OD_OWRITER){ sortmap = cbmapopen(); } else { sortmap = NULL; } if(vlrnum(rdocsdb) > 0){ dmax = -1; dnum = -1; if((tmp = vlget(rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), NULL)) != NULL){ dmax = atoi(tmp); free(tmp); } if((tmp = vlget(rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), NULL)) != NULL){ dnum = atoi(tmp); free(tmp); } if(dmax < 0 || dnum < 0){ vlclose(rdocsdb); crclose(indexdb); crclose(docsdb); dpecode = DP_EBROKEN; return NULL; } } else { dmax = 0; dnum = 0; } odeum = cbmalloc(sizeof(ODEUM)); odeum->name = cbmemdup(name, -1); odeum->wmode = omode & OD_OWRITER; odeum->fatal = FALSE; odeum->inode = inode; odeum->docsdb = docsdb; odeum->indexdb = indexdb; odeum->rdocsdb = rdocsdb; odeum->sortmap = sortmap; odeum->dmax = dmax; odeum->dnum = dnum; return odeum; } /* Close a database handle. */ int odclose(ODEUM *odeum){ char numbuf[OD_NUMBUFSIZ]; int err; assert(odeum); err = FALSE; if(odeum->wmode){ sprintf(numbuf, "%d", odeum->dmax); if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)) err = TRUE; sprintf(numbuf, "%d", odeum->dnum); if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)) err = TRUE; if(!odsortindex(odeum)) err = TRUE; cbmapclose(odeum->sortmap); } if(!vlclose(odeum->rdocsdb)) err = TRUE; if(!crclose(odeum->indexdb)) err = TRUE; if(!crclose(odeum->docsdb)) err = TRUE; free(odeum->name); free(odeum); return err ? FALSE : TRUE; } /* Store a document. */ int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over){ char *tmp, *zbuf; const char *word, *ctmp; int i, docid, tsiz, wsiz, wnum, tmax, num, zsiz; double wlog; ODPAIR pair; CBMAP *map; CBLIST *tlist; assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return FALSE; } if(!odeum->wmode){ dpecode = DP_EMODE; return FALSE; } if((tmp = vlget(odeum->rdocsdb, doc->uri, -1, &tsiz)) != NULL){ if(!over){ free(tmp); dpecode = DP_EKEEP; return FALSE; } if(tsiz != sizeof(int) || !odoutbyid(odeum, *(int *)tmp)){ free(tmp); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return FALSE; } free(tmp); } odeum->dmax++; odeum->dnum++; docid = odeum->dmax; map = cbmapopen(); cbmapput(map, OD_URIEXPR, sizeof(OD_URIEXPR), doc->uri, -1, TRUE); tmp = cbmapdump(doc->attrs, &tsiz); cbmapput(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), tmp, tsiz, TRUE); free(tmp); if(wmax >= 0 && wmax < cblistnum(doc->nwords)){ tlist = cblistopen(); for(i = 0; i < wmax; i++){ ctmp = cblistval(doc->nwords, i, &wsiz); cblistpush(tlist, ctmp, wsiz); } tmp = cblistdump(tlist, &tsiz); cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE); free(tmp); cblistclose(tlist); tlist = cblistopen(); for(i = 0; i < wmax; i++){ ctmp = cblistval(doc->awords, i, &wsiz); cblistpush(tlist, ctmp, wsiz); } tmp = cblistdump(tlist, &tsiz); cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE); free(tmp); cblistclose(tlist); } else { tmp = cblistdump(doc->nwords, &tsiz); cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE); free(tmp); tmp = cblistdump(doc->awords, &tsiz); cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE); free(tmp); } tmp = cbmapdump(map, &tsiz); cbmapclose(map); if(_qdbm_deflate){ if(!(zbuf = _qdbm_deflate(tmp, tsiz, &zsiz))){ free(tmp); dpecode = DP_EMISC; odeum->fatal = TRUE; return FALSE; } free(tmp); tmp = zbuf; tsiz = zsiz; } if(!crput(odeum->docsdb, (char *)&docid, sizeof(int), tmp, tsiz, CR_DKEEP)){ free(tmp); if(dpecode == DP_EKEEP) dpecode = DP_EBROKEN; odeum->fatal = TRUE; return FALSE; } free(tmp); if(!vlput(odeum->rdocsdb, doc->uri, -1, (char *)&docid, sizeof(int), VL_DOVER)){ odeum->fatal = TRUE; return FALSE; } map = cbmapopen(); wnum = cblistnum(doc->nwords); tmax = wnum * OD_WTOPRATE; for(i = 0; i < wnum; i++){ word = cblistval(doc->nwords, i, &wsiz); if(wsiz < 1) continue; if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){ num = *(int *)ctmp + OD_WOCCRPOINT; } else { num = i <= tmax ? OD_WTOPBONUS + OD_WOCCRPOINT : OD_WOCCRPOINT; } cbmapput(map, word, wsiz, (char *)&num, sizeof(int), TRUE); } wlog = odlogarithm(wnum); wlog = (wlog * wlog) / 4.0; if(wlog < 4.0) wlog = 4.0; cbmapiterinit(map); while((word = cbmapiternext(map, &wsiz)) != NULL){ pair.id = docid; pair.score = *(int *)cbmapget(map, word, wsiz, NULL) / wlog; if(!crput(odeum->indexdb, word, wsiz, (char *)&pair, sizeof(pair), CR_DCAT)){ cbmapclose(map); odeum->fatal = TRUE; return FALSE; } cbmapput(odeum->sortmap, word, wsiz, "", 0, FALSE); } cbmapclose(map); if(cbmaprnum(odeum->sortmap) > OD_SORTMAPMAX){ if(!odsortindex(odeum)){ odeum->fatal = TRUE; return FALSE; } } doc->id = docid; return TRUE; } /* Delete a document by a URL. */ int odout(ODEUM *odeum, const char *uri){ char *tmp; int tsiz, docid; assert(odeum && uri); if(odeum->fatal){ dpecode = DP_EFATAL; return FALSE; } if(!odeum->wmode){ dpecode = DP_EMODE; return FALSE; } if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; return FALSE; } if(tsiz != sizeof(int)){ free(tmp); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return FALSE; } docid = *(int *)tmp; free(tmp); return odoutbyid(odeum, docid); } /* Delete a document specified by an ID number. */ int odoutbyid(ODEUM *odeum, int id){ char *tmp, *zbuf; const char *uritmp; int tsiz, uritsiz, zsiz; CBMAP *map; assert(odeum && id > 0); if(odeum->fatal){ dpecode = DP_EFATAL; return FALSE; } if(!odeum->wmode){ dpecode = DP_EMODE; return FALSE; } if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){ if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; return FALSE; } if(_qdbm_inflate){ if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz))){ free(tmp); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return FALSE; } free(tmp); tmp = zbuf; tsiz = zsiz; } map = cbmapload(tmp, tsiz); free(tmp); uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz); if(!uritmp || !vlout(odeum->rdocsdb, uritmp, uritsiz)){ cbmapclose(map); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return FALSE; } cbmapclose(map); if(!crout(odeum->docsdb, (char *)&id, sizeof(int))){ odeum->fatal = TRUE; return FALSE; } odeum->dnum--; return TRUE; } /* Retrieve a document by a URL. */ ODDOC *odget(ODEUM *odeum, const char *uri){ char *tmp; int tsiz, docid; assert(odeum && uri); if(odeum->fatal){ dpecode = DP_EFATAL; return NULL; } if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; return NULL; } if(tsiz != sizeof(int)){ free(tmp); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return NULL; } docid = *(int *)tmp; free(tmp); return odgetbyid(odeum, docid); } /* Retrieve a document by an ID number. */ ODDOC *odgetbyid(ODEUM *odeum, int id){ char *tmp, *zbuf; const char *uritmp, *attrstmp, *nwordstmp, *awordstmp; int tsiz, uritsiz, attrstsiz, nwordstsiz, awordstsiz, zsiz; ODDOC *doc; CBMAP *map; assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return NULL; } if(id < 1){ dpecode = DP_ENOITEM; return NULL; } if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){ if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; return NULL; } if(_qdbm_inflate){ if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz))){ free(tmp); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return NULL; } free(tmp); tmp = zbuf; tsiz = zsiz; } map = cbmapload(tmp, tsiz); free(tmp); uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz); attrstmp = cbmapget(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), &attrstsiz); nwordstmp = cbmapget(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), &nwordstsiz); awordstmp = cbmapget(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), &awordstsiz); if(!uritmp || !attrstmp || !nwordstmp || !awordstmp){ cbmapclose(map); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return NULL; } doc = cbmalloc(sizeof(ODDOC)); doc->id = id; doc->uri = cbmemdup(uritmp, uritsiz); doc->attrs = cbmapload(attrstmp, attrstsiz); doc->nwords = cblistload(nwordstmp, nwordstsiz); doc->awords = cblistload(awordstmp, awordstsiz); cbmapclose(map); return doc; } /* Search the inverted index for documents including a word. */ ODPAIR *odsearch(ODEUM *odeum, const char *word, int max, int *np){ char *tmp; int tsiz; assert(odeum && word && np); if(odeum->fatal){ dpecode = DP_EFATAL; return NULL; } if(odeum->wmode && cbmaprnum(odeum->sortmap) > 0 && !odsortindex(odeum)){ odeum->fatal = TRUE; return NULL; } max = max < 0 ? -1 : max * sizeof(ODPAIR); if(!(tmp = crget(odeum->indexdb, word, -1, 0, max, &tsiz))){ if(dpecode != DP_ENOITEM){ odeum->fatal = TRUE; return NULL; } *np = 0; return cbmalloc(1); } *np = tsiz / sizeof(ODPAIR); return (ODPAIR *)tmp; } /* Get the number of documents including a word. */ int odsearchdnum(ODEUM *odeum, const char *word){ int rv; assert(odeum && word); if(odeum->fatal){ dpecode = DP_EFATAL; return -1; } rv = crvsiz(odeum->indexdb, word, -1); return rv < 0 ? -1 : rv / sizeof(ODPAIR); } /* Initialize the iterator of a database handle. */ int oditerinit(ODEUM *odeum){ assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return FALSE; } return criterinit(odeum->docsdb); } /* Get the next key of the iterator. */ ODDOC *oditernext(ODEUM *odeum){ char *tmp; int tsiz, docsid; ODDOC *doc; assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return NULL; } doc = NULL; while(TRUE){ if(!(tmp = criternext(odeum->docsdb, &tsiz))){ if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; return NULL; } if(tsiz != sizeof(int)){ free(tmp); dpecode = DP_EBROKEN; odeum->fatal = TRUE; return NULL; } docsid = *(int *)tmp; free(tmp); if((doc = odgetbyid(odeum, docsid)) != NULL) break; if(dpecode != DP_ENOITEM){ odeum->fatal = TRUE; return NULL; } } return doc; } /* Synchronize updating contents with the files and the devices. */ int odsync(ODEUM *odeum){ char numbuf[OD_NUMBUFSIZ]; assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return FALSE; } if(!odeum->wmode){ dpecode = DP_EMODE; return FALSE; } sprintf(numbuf, "%d", odeum->dmax); if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)){ odeum->fatal = TRUE; return FALSE; } sprintf(numbuf, "%d", odeum->dnum); if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)){ odeum->fatal = TRUE; return FALSE; } if(!odsortindex(odeum)){ odeum->fatal = TRUE; return FALSE; } if(!crsync(odeum->docsdb)){ odeum->fatal = TRUE; return FALSE; } if(!crsync(odeum->indexdb)){ odeum->fatal = TRUE; return FALSE; } if(!vlsync(odeum->rdocsdb)){ odeum->fatal = TRUE; return FALSE; } return TRUE; } /* Optimize a database. */ int odoptimize(ODEUM *odeum){ assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return FALSE; } if(!odeum->wmode){ dpecode = DP_EMODE; return FALSE; } if(!odpurgeindex(odeum)){ odeum->fatal = TRUE; return FALSE; } if(!odsortindex(odeum)){ odeum->fatal = TRUE; return FALSE; } if(!croptimize(odeum->docsdb, -1)){ odeum->fatal = TRUE; return FALSE; } if(!croptimize(odeum->indexdb, -1)){ odeum->fatal = TRUE; return FALSE; } if(!vloptimize(odeum->rdocsdb)){ odeum->fatal = TRUE; return FALSE; } return TRUE; } /* Get the name of a database. */ char *odname(ODEUM *odeum){ assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return NULL; } return cbmemdup(odeum->name, -1); } /* Get the total size of database files. */ int odfsiz(ODEUM *odeum){ int fsiz, rv; assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return -1; } fsiz = 0; if((rv = crfsiz(odeum->docsdb)) == -1) return -1; fsiz += rv; if((rv = crfsiz(odeum->indexdb)) == -1) return -1; fsiz += rv; if((rv = vlfsiz(odeum->rdocsdb)) == -1) return -1; fsiz += rv; return fsiz; } /* Get the number of the elements of the bucket array used in the inverted index. */ int odbnum(ODEUM *odeum){ assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return -1; } return crbnum(odeum->indexdb); } /* Get the number of the documents stored in a database. */ int oddnum(ODEUM *odeum){ assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return -1; } return odeum->dnum; } /* Get the number of the words stored in a database. */ int odwnum(ODEUM *odeum){ assert(odeum); if(odeum->fatal){ dpecode = DP_EFATAL; return -1; } return crrnum(odeum->indexdb); } /* Check whether a database handle is a writer or not. */ int odwritable(ODEUM *odeum){ assert(odeum); return odeum->wmode; } /* Check whether a database has a fatal error or not. */ int odfatalerror(ODEUM *odeum){ assert(odeum); return odeum->fatal; } /* Get the inode number of a database directory. */ int odinode(ODEUM *odeum){ assert(odeum); return odeum->inode; } /* Remove a database directory. */ int odremove(const char *name){ char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ]; struct stat sbuf; assert(name); sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME); sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME); sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME); if(stat(name, &sbuf) == -1){ dpecode = DP_ESTAT; return FALSE; } if(stat(docsname, &sbuf) != -1 && !crremove(docsname)) return FALSE; if(stat(indexname, &sbuf) != -1 && !crremove(indexname)) return FALSE; if(stat(rdocsname, &sbuf) != -1 && !vlremove(rdocsname)) return FALSE; if(rmdir(name) == -1){ dpecode = DP_ERMDIR; return FALSE; } return TRUE; } /* Get a document handle. */ ODDOC *oddocopen(const char *uri){ ODDOC *doc; assert(uri); doc = cbmalloc(sizeof(ODDOC)); doc->id = -1; doc->uri = cbmemdup(uri, -1); doc->attrs = cbmapopen(); doc->nwords = cblistopen(); doc->awords = cblistopen(); return doc; } /* Close a document handle. */ void oddocclose(ODDOC *doc){ assert(doc); cblistclose(doc->awords); cblistclose(doc->nwords); cbmapclose(doc->attrs); free(doc->uri); free(doc); } /* Add an attribute to a document. */ void oddocaddattr(ODDOC *doc, const char *name, const char *value){ assert(doc && name && value); cbmapput(doc->attrs, name, -1, value, -1, TRUE); } /* Add a word to a document. */ void oddocaddword(ODDOC *doc, const char *normal, const char *asis){ assert(doc && normal && asis); cblistpush(doc->nwords, normal, -1); cblistpush(doc->awords, asis, -1); } /* Get the ID number of a document. */ int oddocid(const ODDOC *doc){ assert(doc); return doc->id; } /* Get the URI of a document. */ const char *oddocuri(const ODDOC *doc){ assert(doc); return doc->uri; } /* Get the value of an attribute of a document. */ const char *oddocgetattr(const ODDOC *doc, const char *name){ assert(doc && name); return cbmapget(doc->attrs, name, -1, NULL); } /* Get the list handle contains words in normalized form of a document. */ const CBLIST *oddocnwords(const ODDOC *doc){ assert(doc); return doc->nwords; } /* Get the list handle contains words in appearance form of a document. */ const CBLIST *oddocawords(const ODDOC *doc){ assert(doc); return doc->awords; } /* Get the list handle contains keywords in appearance form of a document. */ CBLIST *oddockwords(const ODDOC *doc, int max, ODEUM *odeum){ CBLIST *kwords; const CBLIST *nwords; CBMAP *map; const char *word, *ctmp; ODWORD *owords; int i, wsiz, wnum, hnum, mnum; double ival; assert(doc && max >= 0); map = cbmapopen(); nwords = oddocnwords(doc); for(i = 0; i < cblistnum(nwords); i++){ word = cblistval(nwords, i, &wsiz); if(wsiz < 1) continue; if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){ wnum = *(int *)ctmp + OD_WOCCRPOINT; } else { wnum = OD_WOCCRPOINT; } cbmapput(map, word, wsiz, (char *)&wnum, sizeof(int), TRUE); } mnum = cbmaprnum(map); owords = cbmalloc(mnum * sizeof(ODWORD) + 1); cbmapiterinit(map); for(i = 0; (word = cbmapiternext(map, &wsiz)) != NULL; i++){ owords[i].word = word; owords[i].num = *(int *)cbmapget(map, word, wsiz, NULL); } cbqsort(owords, mnum, sizeof(ODWORD), odwordcompare); if(odeum){ if(mnum > OD_KEYCANDS) mnum = OD_KEYCANDS; for(i = 0; i < mnum; i++){ if((hnum = odsearchdnum(odeum, owords[i].word)) < 0) hnum = 0; ival = odlogarithm(hnum); ival = (ival * ival * ival) / 8.0; if(ival < 8.0) ival = 8.0; owords[i].num = owords[i].num / ival; } cbqsort(owords, mnum, sizeof(ODWORD), odwordcompare); } kwords = cblistopen(); if(mnum > max) mnum = max; for(i = 0; i < mnum; i++){ cblistpush(kwords, owords[i].word, -1); } free(owords); cbmapclose(map); return kwords; } /* Break a text into words in appearance form. */ CBLIST *odbreaktext(const char *text){ const char *word; CBLIST *elems, *words; int i, j, dif, wsiz, pv, delim; assert(text); words = cblistopen(); elems = cbsplit(text, -1, OD_SPACECHARS); for(i = 0; i < cblistnum(elems); i++){ word = cblistval(elems, i, &wsiz); delim = FALSE; j = 0; pv = 0; while(TRUE){ dif = j - pv; if(j >= wsiz){ if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); break; } if(delim){ if(!strchr(OD_DELIMCHARS, word[j])){ if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); pv = j; delim = FALSE; } } else { if(strchr(OD_DELIMCHARS, word[j])){ if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); pv = j; delim = TRUE; } } j++; } } cblistclose(elems); return words; } /* Make the normalized form of a word. */ char *odnormalizeword(const char *asis){ char *nword; int i; assert(asis); for(i = 0; asis[i] != '\0'; i++){ if(!strchr(OD_DELIMCHARS, asis[i])) break; } if(asis[i] == '\0') return cbmemdup("", 0); nword = cbmemdup(asis, -1); for(i = 0; nword[i] != '\0'; i++){ if(nword[i] >= 'A' && nword[i] <= 'Z') nword[i] += 'a' - 'A'; } return nword; } /* Get the common elements of two sets of documents. */ ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ CBMAP *map; ODPAIR *result; const char *tmp; int i, rnum; assert(apairs && anum >= 0 && bpairs && bnum >= 0); map = odpairsmap(bpairs, bnum); result = cbmalloc(sizeof(ODPAIR) * anum + 1); rnum = 0; for(i = 0; i < anum; i++){ if(!(tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL))) continue; result[rnum].id = apairs[i].id; result[rnum].score = apairs[i].score + *(int *)tmp; rnum++; } cbmapclose(map); cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare); *np = rnum; return result; } /* Get the sum of elements of two sets of documents. */ ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ CBMAP *map; ODPAIR *result; const char *tmp; int i, score, rnum; assert(apairs && anum >= 0 && bpairs && bnum >= 0); map = odpairsmap(bpairs, bnum); for(i = 0; i < anum; i++){ score = 0; if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) score = *(int *)tmp; score += apairs[i].score; cbmapput(map, (char *)&(apairs[i].id), sizeof(int), (char *)&score, sizeof(int), TRUE); } rnum = cbmaprnum(map); result = cbmalloc(rnum * sizeof(ODPAIR) + 1); cbmapiterinit(map); for(i = 0; (tmp = cbmapiternext(map, NULL)) != NULL; i++){ result[i].id = *(int *)tmp; result[i].score = *(int *)cbmapget(map, tmp, sizeof(int), NULL); } cbmapclose(map); cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare); *np = rnum; return result; } /* Get the difference set of documents. */ ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ CBMAP *map; ODPAIR *result; const char *tmp; int i, rnum; assert(apairs && anum >= 0 && bpairs && bnum >= 0); map = odpairsmap(bpairs, bnum); result = cbmalloc(sizeof(ODPAIR) * anum + 1); rnum = 0; for(i = 0; i < anum; i++){ if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) continue; result[rnum].id = apairs[i].id; result[rnum].score = apairs[i].score; rnum++; } cbmapclose(map); cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare); *np = rnum; return result; } /* Get the natural logarithm of a number. */ double odlogarithm(double x){ int i; if(x <= 1.0) return 0.0; x = x * x * x * x * x * x * x * x * x * x; for(i = 0; x > 1.0; i++){ x /= 2.718281828459; } return (double)i / 10.0; } /************************************************************************************************* * private objects *************************************************************************************************/ /* Sort the records of inverted index. `odeum' specifies a database handle. If successful, the return value is true, else, it is false. */ static int odsortindex(ODEUM *odeum){ const char *word; char *tmp; int wsiz, tsiz; ODPAIR *pairs; assert(odeum); cbmapiterinit(odeum->sortmap); while((word = cbmapiternext(odeum->sortmap, &wsiz)) != NULL){ if((tmp = crget(odeum->indexdb, word, wsiz, 0, -1, &tsiz)) != NULL){ pairs = (ODPAIR *)tmp; cbqsort(pairs, tsiz / sizeof(ODPAIR), sizeof(ODPAIR), odsortcompare); if(!crput(odeum->indexdb, word, wsiz, tmp, tsiz, CR_DOVER)){ free(tmp); return FALSE; } free(tmp); } else if(dpecode != DP_ENOITEM){ return FALSE; } } cbmapclose(odeum->sortmap); odeum->sortmap = cbmapopen(); return TRUE; } /* Compare two pairs of structures of a search result. `a' specifies the pointer to the region of one pair. `b' specifies the pointer to the region of the other pair. The return value is positive if the former is big, negative if the latter is big, 0 if both are equivalent. */ static int odsortcompare(const void *a, const void *b){ ODPAIR *ap, *bp; int rv; assert(a && b); ap = (ODPAIR *)a; bp = (ODPAIR *)b; rv = bp->score - ap->score; if(rv != 0) return rv; return ap->id - bp->id; } /* Purge the elements of the deleted documents from the inverted index. `odeum' specifies a database handle. If successful, the return value is true, else, it is false. */ static int odpurgeindex(ODEUM *odeum){ ODPAIR *pairs; char *kbuf, *vbuf; int i, ksiz, vsiz, pnum, wi; assert(odeum); if(!criterinit(odeum->indexdb)) return FALSE; while(TRUE){ if(!(kbuf = criternext(odeum->indexdb, &ksiz))){ if(dpecode != DP_ENOITEM) return FALSE; break; } if(!(vbuf = crget(odeum->indexdb, kbuf, ksiz, 0, -1, &vsiz))){ dpecode = DP_EBROKEN; free(kbuf); return FALSE; } pairs = (ODPAIR *)vbuf; pnum = vsiz / sizeof(ODPAIR); wi = 0; for(i = 0; i < pnum; i++){ if(crvsiz(odeum->docsdb, (char *)&(pairs[i].id), sizeof(int)) != -1){ pairs[wi++] = pairs[i]; } } if(wi > 0){ if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, wi * sizeof(ODPAIR), CR_DOVER)){ free(vbuf); free(kbuf); return FALSE; } } else { if(!crout(odeum->indexdb, kbuf, ksiz)){ free(vbuf); free(kbuf); return FALSE; } } free(vbuf); free(kbuf); } return TRUE; } /* Create a map of a document array. `pairs' specifies the pointer to a document array. `num' specifies the number of elements of the array. The return value is a map of the document array. */ static CBMAP *odpairsmap(const ODPAIR *pairs, int num){ CBMAP *map; int i; assert(pairs && num >= 0); map = cbmapopen(); for(i = 0; i < num; i++){ cbmapput(map, (char *)&(pairs[i].id), sizeof(int), (char *)&(pairs[i].score), sizeof(int), TRUE); } return map; } /* compare two pairs of structures of words in a document. `a' specifies the pointer to the region of one word. `b' specifies the pointer to the region of the other word. The return value is positive if the former is big, negative if the latter is big, 0 if both are equivalent. */ static int odwordcompare(const void *a, const void *b){ ODWORD *ap, *bp; int rv; assert(a && b); ap = (ODWORD *)a; bp = (ODWORD *)b; if((rv = bp->num - ap->num) != 0) return rv; if((rv = strlen(bp->word) - strlen(ap->word)) != 0) return rv; return strcmp(ap->word, bp->word); } /* END OF FILE */