/* * Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd. * * This file is part of Rast. * See the file COPYING for redistribution information. * */ #include "rast/config.h" #include #include #include #include #include #include #include #include #include #include #include "rast/text_index.h" #include "rast/query.h" #if !HAVE_FSEEKO && !defined(fseeko) #define fseeko fseek #endif #if !HAVE_FTELLO && !defined(ftello) #define ftello ftell #endif #define RARE_NGRAM_DB_MAX_VALUE_SIZE (5 * 3) typedef struct free_list_entry_t { APR_RING_ENTRY(free_list_entry_t) link; rast_size_t block_no; rast_size_t block_count; } free_list_entry_t; typedef APR_RING_HEAD(free_list_head_t, free_list_entry_t) free_list_head_t; typedef struct { apr_pool_t *pool; rast_size_t version; free_list_head_t *first; free_list_head_t *added; free_list_head_t *recycled; } free_list_t; static free_list_t * free_list_create(apr_pool_t *pool) { free_list_t *free_list; free_list = (free_list_t *) apr_palloc(pool, sizeof(free_list_t)); free_list->pool = pool; free_list->version = 0; free_list->first = (free_list_head_t *) apr_palloc(pool, sizeof(free_list_head_t)); APR_RING_INIT(free_list->first, free_list_entry_t, link); free_list->added = (free_list_head_t *) apr_palloc(pool, sizeof(free_list_head_t)); APR_RING_INIT(free_list->added, free_list_entry_t, link); free_list->recycled = (free_list_head_t *) apr_palloc(pool, sizeof(free_list_head_t)); APR_RING_INIT(free_list->recycled, free_list_entry_t, link); return free_list; } static void free_list_clear(free_list_t *free_list) { APR_RING_CONCAT(free_list->first, free_list->added, free_list_entry_t, link); APR_RING_CONCAT(free_list->recycled, free_list->first, free_list_entry_t, link); } static void list_prepend(free_list_t *free_list, free_list_head_t *head, rast_size_t block_no, rast_size_t block_count) { free_list_entry_t *entry; if (APR_RING_EMPTY(free_list->recycled, free_list_entry_t, link)) { entry = (free_list_entry_t *) apr_palloc(free_list->pool, sizeof(free_list_entry_t)); } else { entry = APR_RING_FIRST(free_list->recycled); APR_RING_REMOVE(entry, link); } entry->block_no = block_no; entry->block_count = block_count; APR_RING_INSERT_HEAD(head, entry, free_list_entry_t, link); } static void free_list_add(free_list_t *free_list, rast_size_t block_no, rast_size_t block_count) { list_prepend(free_list, free_list->added, block_no, block_count); } #define FREE_AREA_NOT_FOUND ((rast_size_t) -1) static rast_size_t free_list_search(free_list_t *free_list, rast_size_t block_count) { free_list_entry_t *entry; free_list_entry_t *candidate = NULL; for (entry = APR_RING_FIRST(free_list->first); entry != APR_RING_SENTINEL(free_list->first, free_list_entry_t, link); entry = APR_RING_NEXT(entry, link)) { if (entry->block_count == block_count) { APR_RING_REMOVE(entry, link); APR_RING_INSERT_HEAD(free_list->recycled, entry, free_list_entry_t, link); return entry->block_no; } if (entry->block_count > block_count) { if (candidate == NULL || entry->block_count < candidate->block_count) { candidate = entry; } } } if (candidate == NULL) { return FREE_AREA_NOT_FOUND; } else { rast_size_t block_no = candidate->block_no; candidate->block_no += block_count; candidate->block_count -= block_count; return block_no; } } typedef struct pos_list_entry_t { APR_RING_ENTRY(pos_list_entry_t) link; rast_pos_t pos; } pos_list_entry_t; typedef APR_RING_HEAD(pos_list_head_t, pos_list_entry_t) pos_list_head_t; typedef struct cache_list_entry_t { APR_RING_ENTRY(cache_list_entry_t) link; rast_doc_id_t doc_id; pos_list_head_t *positions; } cache_list_entry_t; typedef APR_RING_HEAD(cache_list_head_t, cache_list_entry_t) cache_list_head_t; struct rast_text_index_t { apr_pool_t *pool; int flags; rast_encoding_module_t *encoding_module; DB *ngram_db; DB *rare_ngram_db; FILE *pos_file; rast_size_t pos_block_size; apr_hash_t *cache; apr_pool_t *cache_pool; char *free_list_name; free_list_t *free_list; int is_native; }; struct rast_text_indexer_t { apr_pool_t *pool; rast_text_index_t *index; rast_doc_id_t doc_id; rast_pos_t base_pos; apr_hash_t *position_table; }; static rast_error_t * write_rast_size_value(FILE *fp, rast_size_t i, int is_native) { rast_size_t n; n = rast_fix_byte_order(i, is_native); if (fwrite(&n, sizeof(rast_size_t), 1, fp) < 1) { return os_error_to_rast_error(errno); } return RAST_OK; } static rast_error_t * save_free_list(rast_text_index_t *index) { rast_error_t *error = RAST_OK; free_list_t *free_list = index->free_list; free_list_entry_t *entry; FILE *fp; APR_RING_CONCAT(free_list->first, free_list->added, free_list_entry_t, link); if ((fp = fopen(index->free_list_name, "w")) == NULL) { return os_error_to_rast_error(errno); } free_list->version++; error = write_rast_size_value(fp, free_list->version, index->is_native); if (error != RAST_OK) { fclose(fp); return error; } for (entry = APR_RING_FIRST(free_list->first); entry != APR_RING_SENTINEL(free_list->first, free_list_entry_t, link); entry = APR_RING_NEXT(entry, link)) { error = write_rast_size_value(fp, entry->block_no, index->is_native); if (error != RAST_OK) { break; } error = write_rast_size_value(fp, entry->block_count, index->is_native); if (error != RAST_OK) { break; } } fclose(fp); return error; } static rast_error_t * read_rast_size_value(FILE *fp, rast_size_t *i, int is_native) { rast_size_t n; if (fread(&n, sizeof(rast_size_t), 1, fp) < 1) { if (feof(fp)) { return rast_error(RAST_ERROR_EOF, NULL); } return os_error_to_rast_error(errno); } *i = rast_fix_byte_order(n, is_native); return RAST_OK; } static rast_error_t * load_free_list(rast_text_index_t *index) { rast_error_t *error = RAST_OK; FILE *fp; rast_size_t version; if ((fp = fopen(index->free_list_name, "r")) == NULL) { if (errno == ENOENT) { return RAST_OK; } return os_error_to_rast_error(errno); } error = read_rast_size_value(fp, &version, index->is_native); if (error != RAST_OK) { fclose(fp); return error; } if (index->free_list->version == version) { return RAST_OK; } free_list_clear(index->free_list); index->free_list->version = version; while (1) { rast_size_t block_no, block_count; error = read_rast_size_value(fp, &block_no, index->is_native); if (error != RAST_OK) { if (rast_error_is_eof(error)) { rast_error_destroy(error); error = RAST_OK; } break; } error = read_rast_size_value(fp, &block_count, index->is_native); if (error != RAST_OK) { break; } list_prepend(index->free_list, index->free_list->first, block_no, block_count); } fclose(fp); return error; } static rast_error_t * open_db(DB **dbp, const char *basename, const char *ext, int flags, DB_ENV *db_env, int lorder, apr_pool_t *pool) { DB *db; uint32_t db_flags; char *filename; int dberr; filename = apr_pstrcat(pool, basename, ext, NULL); dberr = db_create(&db, db_env, 0); if (dberr != 0) { return db_error_to_rast_error(dberr); } db_flags = flags & RAST_DB_RDONLY ? DB_RDONLY : DB_CREATE; if (lorder != 0) { db->set_lorder(db, lorder); } dberr = db->open(db, NULL, filename, NULL, DB_BTREE, db_flags, 0666); if (dberr != 0) { db->close(db, 0); return db_error_to_rast_error(dberr); } *dbp = db; return RAST_OK; } rast_error_t * rast_text_index_open(rast_text_index_t **index, const char *filename, int flags, rast_encoding_module_t *encoding_module, DB_ENV *db_env, int lorder, rast_size_t pos_block_size, apr_pool_t *pool) { apr_pool_t *sub_pool; rast_error_t *error; DB *ngram_db, *rare_ngram_db; FILE *pos_file; char *pos_file_path; apr_pool_create(&sub_pool, pool); error = open_db(&ngram_db, filename, ".ngm", flags, db_env, lorder, sub_pool); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } error = open_db(&rare_ngram_db, filename, ".rng", flags, db_env, lorder, sub_pool); if (error != RAST_OK) { ngram_db->close(ngram_db, 0); apr_pool_destroy(sub_pool); return error; } pos_file_path = apr_pstrcat(sub_pool, filename, ".pos", NULL); if (flags & RAST_DB_RDONLY) { pos_file = fopen(pos_file_path, "r"); } else { pos_file = fopen(pos_file_path, "r+"); if (pos_file == NULL && errno == ENOENT) { pos_file = fopen(pos_file_path, "w+"); } } if (pos_file == NULL) { rare_ngram_db->close(rare_ngram_db, 0); ngram_db->close(ngram_db, 0); apr_pool_destroy(sub_pool); return os_error_to_rast_error(errno); } *index = apr_palloc(pool, sizeof(rast_text_index_t)); (*index)->pool = pool; (*index)->flags = flags; (*index)->encoding_module = encoding_module; (*index)->ngram_db = ngram_db; (*index)->rare_ngram_db = rare_ngram_db; (*index)->pos_file = pos_file; (*index)->pos_block_size = pos_block_size; apr_pool_create(&(*index)->cache_pool, pool); (*index)->cache = apr_hash_make((*index)->cache_pool); (*index)->free_list_name = apr_pstrcat(pool, filename, ".pfl", NULL); (*index)->free_list = free_list_create(pool); (*index)->is_native = 1; if ((error = load_free_list(*index)) != RAST_OK) { ngram_db->close(ngram_db, 0); rare_ngram_db->close(rare_ngram_db, 0); fclose(pos_file); apr_pool_destroy(sub_pool); return error; } (*index)->is_native = 1; apr_pool_destroy(sub_pool); return RAST_OK; } rast_error_t * rast_text_index_close(rast_text_index_t *index) { rast_error_t *error = RAST_OK; if (!(index->flags & RAST_DB_RDONLY)) { error = rast_text_index_sync(index); } index->ngram_db->close(index->ngram_db, 0); index->rare_ngram_db->close(index->rare_ngram_db, 0); fclose(index->pos_file); return error; } static int number_length(int n) { int len = 0; if (n == 0) { return 1; } while (n > 0) { n = n / 0x80; len++; } return len; } static int pack_number(char *s, int n) { char *p = s; div_t d; if (n == 0) { *p = 0; return 1; } while (n > 0) { d = div(n, 0x80); n = d.quot; if (n > 0) { *p = d.rem | 0x80; } else { *p = d.rem; } p++; } return p - s; } static inline int unpack_number(const char *s, int len, int *np) { const char *p = s, *pend = s + len; int base = 1, n = 0; while (p < pend) { if (*p & 0x80) { n += base * (*p & 0x7F); base *= 0x80; p++; } else { n += *p * base; p++; break; } } *np = n; return p - s; } rast_error_t * rast_text_index_register(rast_text_index_t *index, rast_doc_id_t doc_id, rast_text_indexer_t **indexer, apr_pool_t *pool) { if (index->flags & RAST_DB_RDONLY) { return rast_error(RAST_ERROR_BAD_DB, "can't register to read-only db"); } *indexer = (rast_text_indexer_t *) apr_palloc(pool, sizeof(rast_text_indexer_t)); (*indexer)->pool = pool; (*indexer)->index = index; (*indexer)->doc_id = doc_id; (*indexer)->base_pos = 0; (*indexer)->position_table = apr_hash_make(pool); return RAST_OK; } rast_error_t * rast_text_indexer_add(rast_text_indexer_t *indexer, const char *s, int nbytes, rast_size_t *registered_chars) { rast_text_index_t *index = indexer->index; apr_hash_t *position_table = indexer->position_table; rast_tokenizer_t *tokenizer; rast_token_t token = { 0 }; rast_error_t *error; tokenizer = rast_register_tokenizer_create(indexer->pool, index->encoding_module, s, nbytes); while (!rast_register_tokenizer_is_done(tokenizer)) { pos_list_head_t *positions; pos_list_entry_t *entry; error = rast_register_tokenizer_get_current(tokenizer, &token); if (error != RAST_OK) { return error; } positions = apr_hash_get(position_table, token.ptr, token.nbytes); if (positions == NULL) { positions = (pos_list_head_t *) apr_palloc(index->cache_pool, sizeof(pos_list_head_t)); APR_RING_INIT(positions, pos_list_entry_t, link); apr_hash_set(position_table, token.ptr, token.nbytes, positions); } entry = (pos_list_entry_t *) apr_palloc(index->cache_pool, sizeof(pos_list_entry_t)); entry->pos = indexer->base_pos + token.pos; APR_RING_INSERT_TAIL(positions, entry, pos_list_entry_t, link); /* TODO: need test */ error = rast_register_tokenizer_next(tokenizer); if (error != RAST_OK) { return error; } } indexer->base_pos += nbytes; if (registered_chars != NULL) { *registered_chars = token.pos + token.nchars; } return RAST_OK; } rast_error_t * rast_text_indexer_commit(rast_text_indexer_t *indexer) { apr_hash_index_t *hi; rast_text_index_t *index = indexer->index; apr_hash_t *position_table = indexer->position_table; for (hi = apr_hash_first(indexer->pool, position_table); hi; hi = apr_hash_next(hi)) { const void *key; apr_ssize_t klen; void *val; cache_list_head_t *entries; cache_list_entry_t *entry; apr_hash_this(hi, &key, &klen, &val); entries = (cache_list_head_t *) apr_hash_get(index->cache, key, klen); if (entries == NULL) { entries = (cache_list_head_t *) apr_palloc(index->cache_pool, sizeof(cache_list_head_t)); APR_RING_INIT(entries, cache_list_entry_t, link); apr_hash_set(index->cache, apr_pmemdup(index->cache_pool, key, klen), klen, entries); } entry = (cache_list_entry_t *) apr_palloc(index->cache_pool, sizeof(cache_list_entry_t)); entry->doc_id = indexer->doc_id; entry->positions = (pos_list_head_t *) val; APR_RING_INSERT_TAIL(entries, entry, cache_list_entry_t, link); } return RAST_OK; } static int position_bytes(pos_list_head_t *positions) { int len = 0; pos_list_entry_t *entry; for (entry = APR_RING_FIRST(positions); entry != APR_RING_SENTINEL(positions, pos_list_entry_t, link); entry = APR_RING_NEXT(entry, link)) { len += number_length(entry->pos); } return len; } static int pack_position_data(char *buf, rast_doc_id_t doc_id, int bytes, pos_list_head_t *positions) { char *p = buf; pos_list_entry_t *entry; p += pack_number(p, doc_id); p += pack_number(p, bytes); for (entry = APR_RING_FIRST(positions); entry != APR_RING_SENTINEL(positions, pos_list_entry_t, link); entry = APR_RING_NEXT(entry, link)) { p += pack_number(p, entry->pos); } return p - buf; } static rast_error_t * db_put(DB *db, const char *key, apr_ssize_t key_len, void *value, int value_len) { DBT db_key = { 0 }, db_value = { 0 }; int dberr; db_key.data = (char *) key; db_key.size = key_len; db_value.data = (char *) value; db_value.size = value_len; dberr = db->put(db, NULL, &db_key, &db_value, 0); return db_error_to_rast_error(dberr); } static int pack_entries(apr_pool_t *pool, cache_list_head_t *entries, char **s, rast_doc_id_t *num_docs) { int len = 0, i; char *p; cache_list_entry_t *entry; apr_array_header_t *nbytes; apr_pool_t *sub_pool; apr_pool_create(&sub_pool, pool); nbytes = apr_array_make(sub_pool, 1, sizeof(rast_size_t)); for (entry = APR_RING_FIRST(entries); entry != APR_RING_SENTINEL(entries, cache_list_entry_t, link); entry = APR_RING_NEXT(entry, link)) { rast_size_t bytes; len += number_length(entry->doc_id); bytes = position_bytes(entry->positions); len += number_length(bytes); len += bytes; *(rast_size_t *) apr_array_push(nbytes) = bytes; } *s = (char *) apr_palloc(pool, len); p = *s; i = 0; for (entry = APR_RING_FIRST(entries); entry != APR_RING_SENTINEL(entries, cache_list_entry_t, link); entry = APR_RING_NEXT(entry, link)) { p += pack_position_data(p, entry->doc_id, ((rast_size_t *) nbytes->elts)[i], entry->positions); i++; } *num_docs = i; apr_pool_destroy(sub_pool); return len; } static rast_error_t * write_new_ngram_to_rare_ngram_db(DB *rare_ngram_db, const char *key, apr_ssize_t key_len, void *data, rast_size_t data_len) { return db_put(rare_ngram_db, key, key_len, data, data_len); } static rast_error_t * allocate_area(rast_text_index_t *index, rast_size_t block_count, rast_size_t *block_no) { off_t pos; *block_no = free_list_search(index->free_list, block_count); if (*block_no == FREE_AREA_NOT_FOUND) { if (fseeko(index->pos_file, 0, SEEK_END) == -1) { return os_error_to_rast_error(errno); } pos = ftello(index->pos_file); if (pos == -1) { return os_error_to_rast_error(errno); } *block_no = pos / index->pos_block_size; if (ftruncate(fileno(index->pos_file), pos + block_count * index->pos_block_size) == -1) { return os_error_to_rast_error(errno); } } else { if (fseeko(index->pos_file, *block_no * index->pos_block_size, SEEK_SET) == -1) { return os_error_to_rast_error(errno); } } return RAST_OK; } static rast_error_t * write_ngram_to_ngram_db(apr_pool_t *pool, rast_text_index_t *index, const char *key, apr_ssize_t key_len, rast_doc_id_t num_docs, const char *new_data, rast_size_t new_data_len, const char *old_data, rast_size_t old_data_len) { rast_size_t block_no, block_count; rast_uint_t buf[4]; rast_size_t new_nbytes; rast_error_t *error; if (old_data == NULL) { new_nbytes = new_data_len; } else { new_nbytes = old_data_len + new_data_len; } block_count = new_nbytes / index->pos_block_size + 1; error = allocate_area(index, block_count, &block_no); if (error != RAST_OK) { return error; } if (old_data != NULL) { if (fwrite(old_data, 1, old_data_len, index->pos_file) < old_data_len) { return os_error_to_rast_error(errno); } } if (fwrite(new_data, 1, new_data_len, index->pos_file) < new_data_len) { return os_error_to_rast_error(errno); } buf[0] = rast_fix_byte_order(block_no, index->is_native); buf[1] = rast_fix_byte_order(block_count, index->is_native); buf[2] = rast_fix_byte_order(new_nbytes, index->is_native); buf[3] = rast_fix_byte_order(num_docs, index->is_native); return db_put(index->ngram_db, key, key_len, buf, sizeof(buf)); } static rast_size_t calc_new_block_count(rast_size_t nbytes, rast_size_t block_count, rast_size_t pos_block_size) { int needed_block_count, new_block_count; needed_block_count = nbytes / pos_block_size + 1; new_block_count = block_count * 2; if (new_block_count < needed_block_count) { new_block_count = needed_block_count; } return new_block_count; } static rast_error_t * write_dup_ngram(apr_pool_t *pool, rast_text_index_t *index, const char *ngram, apr_ssize_t ngram_len, rast_doc_id_t num_docs, const char *db_value, int db_value_len, const char *data, rast_size_t data_len) { rast_size_t block_no, block_count, nbytes; rast_size_t new_block_no, new_block_count, new_nbytes; rast_uint_t buf[4]; rast_size_t *p; rast_doc_id_t new_num_docs; p = (rast_size_t *) db_value; block_no = rast_fix_byte_order(p[0], index->is_native); block_count = rast_fix_byte_order(p[1], index->is_native); nbytes = rast_fix_byte_order(p[2], index->is_native); new_num_docs = num_docs + rast_fix_byte_order(p[3], index->is_native); new_nbytes = nbytes + data_len; if (new_nbytes <= block_count * index->pos_block_size) { if (fseeko(index->pos_file, block_no * index->pos_block_size + nbytes, SEEK_SET) == -1) { return os_error_to_rast_error(errno); } if (fwrite(data, 1, data_len, index->pos_file) < data_len) { return os_error_to_rast_error(errno); } new_block_no = block_no; new_block_count = block_count; } else { rast_error_t *error; char *old_data; if (fseeko(index->pos_file, block_no * index->pos_block_size, SEEK_SET) == -1) { return os_error_to_rast_error(errno); } old_data = (char *) apr_palloc(pool, nbytes); if ((fread(old_data, 1, nbytes, index->pos_file)) < nbytes) { return os_error_to_rast_error(errno); } free_list_add(index->free_list, block_no, block_count); new_block_count = calc_new_block_count(new_nbytes, block_count, index->pos_block_size); error = allocate_area(index, new_block_count, &new_block_no); if (error != RAST_OK) { return error; } if (fwrite(old_data, 1, nbytes, index->pos_file) < nbytes) { return os_error_to_rast_error(errno); } if (fwrite(data, 1, data_len, index->pos_file) < data_len) { return os_error_to_rast_error(errno); } } buf[0] = rast_fix_byte_order(new_block_no, index->is_native); buf[1] = rast_fix_byte_order(new_block_count, index->is_native); buf[2] = rast_fix_byte_order(new_nbytes, index->is_native); buf[3] = rast_fix_byte_order(new_num_docs, index->is_native); return db_put(index->ngram_db, ngram, ngram_len, buf, sizeof(rast_size_t) * 4); } rast_error_t * rast_text_index_sync(rast_text_index_t *index) { apr_pool_t *pool, *sub_pool; apr_hash_index_t *hi; int dberr; rast_error_t *error; if (index->flags & RAST_DB_RDONLY) { return rast_error(RAST_ERROR_BAD_DB, "can't sync read-only db"); } apr_pool_create(&pool, index->pool); apr_pool_create(&sub_pool, pool); for (hi = apr_hash_first(pool, index->cache); hi; hi = apr_hash_next(hi)) { const void *ngram; apr_ssize_t ngram_len; void *value; cache_list_head_t *entries; cache_list_entry_t *entry; DBT db_key = { 0 }, db_value = { 0 }; int dberr, data_len; char *data; rast_doc_id_t num_docs; rast_uint_t buf[4]; char rbuf[RARE_NGRAM_DB_MAX_VALUE_SIZE]; apr_hash_this(hi, &ngram, &ngram_len, &value); entries = (cache_list_head_t *) value; data_len = pack_entries(sub_pool, entries, &data, &num_docs); db_key.data = (void *) ngram; db_key.size = ngram_len; db_value.data = &buf; db_value.ulen = sizeof(buf); db_value.flags = DB_DBT_USERMEM; dberr = index->ngram_db->get(index->ngram_db, NULL, &db_key, &db_value, 0); switch (dberr) { case 0: error = write_dup_ngram(sub_pool, index, ngram, ngram_len, num_docs, db_value.data, db_value.size, data, data_len); if (error != RAST_OK) { apr_pool_destroy(pool); apr_pool_clear(index->cache_pool); index->cache = apr_hash_make(index->cache_pool); return error; } break; case DB_NOTFOUND: db_value.data = &rbuf; db_value.size = sizeof(rbuf); dberr = index->rare_ngram_db->get(index->rare_ngram_db, NULL, &db_key, &db_value, 0); switch (dberr) { case 0: error = write_ngram_to_ngram_db(sub_pool, index, ngram, ngram_len, num_docs + 1, data, data_len, db_value.data, db_value.size); if (error == RAST_OK) { dberr = index->rare_ngram_db->del(index->rare_ngram_db, NULL, &db_key, 0); error = db_error_to_rast_error(dberr); } if (error != RAST_OK) { apr_pool_destroy(pool); apr_pool_clear(index->cache_pool); index->cache = apr_hash_make(index->cache_pool); return error; } break; case DB_NOTFOUND: entry = APR_RING_FIRST(entries); if (APR_RING_FIRST(entries) == APR_RING_LAST(entries) && (APR_RING_FIRST(entry->positions) == APR_RING_LAST(entry->positions))) { DB *rare_ngram_db = index->rare_ngram_db; error = write_new_ngram_to_rare_ngram_db(rare_ngram_db, ngram, ngram_len, data, data_len); } else { error = write_ngram_to_ngram_db(sub_pool, index, ngram, ngram_len, num_docs, data, data_len, NULL, 0); } if (error != RAST_OK) { apr_pool_destroy(pool); apr_pool_clear(index->cache_pool); index->cache = apr_hash_make(index->cache_pool); return error; } break; default: apr_pool_destroy(pool); apr_pool_clear(index->cache_pool); index->cache = apr_hash_make(index->cache_pool); return db_error_to_rast_error(dberr); } break; default: apr_pool_destroy(pool); apr_pool_clear(index->cache_pool); index->cache = apr_hash_make(index->cache_pool); return db_error_to_rast_error(dberr); } apr_pool_clear(sub_pool); } apr_pool_destroy(pool); if ((dberr = index->ngram_db->sync(index->ngram_db, 0)) != 0) { return db_error_to_rast_error(dberr); } if ((dberr = index->rare_ngram_db->sync(index->rare_ngram_db, 0)) != 0) { return db_error_to_rast_error(dberr); } if (fflush(index->pos_file) == EOF) { return os_error_to_rast_error(errno); } if ((error = save_free_list(index)) != RAST_OK) { return error; } apr_pool_clear(index->cache_pool); index->cache = apr_hash_make(index->cache_pool); return RAST_OK; } typedef struct { rast_doc_id_t doc_id; const char *pos_ptr; int pos_bytes; int bytes; } doc_t; static inline rast_error_t * parse_doc(const char *s, const char *s_end, doc_t *result) { const char *p = s; int n; n = unpack_number(p, s_end - p, &result->doc_id); if (n == 0) { return rast_error(RAST_ERROR_GENERAL, "no doc_id found"); } p += n; n = unpack_number(p, s_end - p, &result->pos_bytes); if (n == 0) { return rast_error(RAST_ERROR_GENERAL, "no pos_bytes found"); } p += n; result->pos_ptr = p; p += result->pos_bytes; result->bytes = p - s; return RAST_OK; } typedef struct pos_cursor_t pos_cursor_t; typedef struct { int (*get_current)(pos_cursor_t *base); void (*next)(pos_cursor_t *base); int (*is_done)(pos_cursor_t *base); } pos_cursor_type_t; struct pos_cursor_t { pos_cursor_type_t *type; rast_pos_t offset; }; typedef struct { pos_cursor_t base; int n; int next_offset; const char *ptr; const char *end; } single_pos_cursor_t; static int single_pos_cursor_get_current(pos_cursor_t *base) { single_pos_cursor_t *pos = (single_pos_cursor_t *) base; if (pos->next_offset == 0) { pos->next_offset = unpack_number(pos->ptr, pos->end - pos->ptr, &pos->n); } return pos->n; } static void single_pos_cursor_next(pos_cursor_t *base) { single_pos_cursor_t *pos = (single_pos_cursor_t *) base; if (pos->next_offset == 0) { pos->next_offset = unpack_number(pos->ptr, pos->end - pos->ptr, &pos->n); } pos->ptr += pos->next_offset; pos->next_offset = 0; } static int single_pos_cursor_is_done(pos_cursor_t *base) { single_pos_cursor_t *pos = (single_pos_cursor_t *) base; return pos->ptr >= pos->end; } static pos_cursor_type_t single_pos_cursor_type = { single_pos_cursor_get_current, single_pos_cursor_next, single_pos_cursor_is_done, }; typedef struct { pos_cursor_t base; pos_cursor_t **cursors; int num_cursors; pos_cursor_t *min_cursor; } multi_pos_cursor_t; static rast_pos_t get_min_pos(multi_pos_cursor_t *cursor) { rast_pos_t n, min_pos; pos_cursor_t *c; int i; min_pos = RAST_POS_MAX; for (i = 0; i < cursor->num_cursors; i++) { c = *(cursor->cursors + i); if (!c->type->is_done(c)) { n = c->type->get_current(c); if (n < min_pos) { min_pos = n; cursor->min_cursor = c; } } } return min_pos; } static int multi_pos_cursor_get_current(pos_cursor_t *base) { multi_pos_cursor_t *cursor = (multi_pos_cursor_t *) base; if (cursor->min_cursor == NULL) { return get_min_pos(cursor); } return cursor->min_cursor->type->get_current(cursor->min_cursor); } static void multi_pos_cursor_next(pos_cursor_t *base) { multi_pos_cursor_t *cursor = (multi_pos_cursor_t *) base; if (cursor->min_cursor == NULL) { get_min_pos(cursor); } cursor->min_cursor->type->next(cursor->min_cursor); cursor->min_cursor = NULL; } static int multi_pos_cursor_is_done(pos_cursor_t *base) { multi_pos_cursor_t *cursor = (multi_pos_cursor_t *) base; pos_cursor_t *c; int i; for (i = 0; i < cursor->num_cursors; i++) { c = *(cursor->cursors + i); if (!c->type->is_done(c)) { return 0; } } return 1; } static pos_cursor_type_t multi_pos_cursor_type = { multi_pos_cursor_get_current, multi_pos_cursor_next, multi_pos_cursor_is_done, }; typedef struct ngram_t ngram_t; typedef struct { rast_error_t *(*get_current_doc_id)(ngram_t *ngram, rast_doc_id_t *doc_id); rast_error_t *(*next_doc)(ngram_t *ngram); int (*is_done)(ngram_t *ngram); rast_error_t *(*pos_cursor_create)(ngram_t *base, apr_pool_t *pool, pos_cursor_t **base_pos_cursor); } ngram_type_t; struct ngram_t { ngram_type_t *type; rast_pos_t offset; rast_doc_id_t num_docs; const char *ptr; int nbytes; int nchars; APR_RING_ENTRY(ngram_t) link; }; APR_RING_HEAD(ngram_head_t, ngram_t); typedef struct ngram_head_t ngram_head_t; typedef struct { ngram_t base; apr_pool_t *pool; const char *doc_ptr; const char *doc_end; doc_t current; int parsed; } single_ngram_t; static rast_error_t * single_ngram_get_current_doc_id(ngram_t *base, rast_doc_id_t *doc_id) { single_ngram_t *ngram = (single_ngram_t *) base; if (!ngram->parsed) { rast_error_t *error = parse_doc(ngram->doc_ptr, ngram->doc_end, &ngram->current); if (error != RAST_OK) { return error; } ngram->parsed = 1; } *doc_id = ngram->current.doc_id; return RAST_OK; } static rast_error_t * single_ngram_next_doc(ngram_t *base) { single_ngram_t *ngram = (single_ngram_t *) base; rast_error_t *error; if (!ngram->parsed) { error = parse_doc(ngram->doc_ptr, ngram->doc_end, &ngram->current); if (error != RAST_OK) { return error; } } ngram->doc_ptr += ngram->current.bytes; ngram->parsed = 0; return RAST_OK; } static int single_ngram_is_done(ngram_t *base) { single_ngram_t *ngram = (single_ngram_t *) base; return ngram->doc_ptr >= ngram->doc_end; } static rast_error_t * single_ngram_pos_cursor_create(ngram_t *base, apr_pool_t *pool, pos_cursor_t **base_pos_cursor) { single_ngram_t *ngram = (single_ngram_t *) base; rast_doc_id_t doc_id; single_pos_cursor_t *pos_cursor; single_ngram_get_current_doc_id(base, &doc_id); pos_cursor = (single_pos_cursor_t *) apr_palloc(pool, sizeof(single_pos_cursor_t)); pos_cursor->base.type = &single_pos_cursor_type; pos_cursor->base.offset = base->offset; pos_cursor->n = 0; pos_cursor->next_offset = 0; pos_cursor->ptr = ngram->current.pos_ptr; pos_cursor->end = ngram->current.pos_ptr + ngram->current.pos_bytes; *base_pos_cursor = (pos_cursor_t *) pos_cursor; return RAST_OK; } static ngram_type_t single_ngram_type = { single_ngram_get_current_doc_id, single_ngram_next_doc, single_ngram_is_done, single_ngram_pos_cursor_create, }; static rast_error_t * get_packed_positions_from_pos_file(apr_pool_t *pool, rast_text_index_t *index, rast_size_t *db_values, const char **doc_ptr, rast_size_t *doc_len, rast_doc_id_t *num_docs) { rast_size_t block_no, block_count; off_t offset; block_no = rast_fix_byte_order(db_values[0], index->is_native); block_count = rast_fix_byte_order(db_values[1], index->is_native); *doc_len = rast_fix_byte_order(db_values[2], index->is_native); *num_docs = rast_fix_byte_order(db_values[3], index->is_native); offset = block_no * index->pos_block_size; if (fseeko(index->pos_file, offset, SEEK_SET) == -1) { return os_error_to_rast_error(errno); } *doc_ptr = (const char *) apr_palloc(pool, *doc_len); if (fread((char *) *doc_ptr, 1, *doc_len, index->pos_file) < 1) { if (feof(index->pos_file)) { return rast_error(RAST_ERROR_EOF, NULL); } return os_error_to_rast_error(errno); } return RAST_OK; } static rast_error_t * single_ngram_get_packed_positions(apr_pool_t *pool, rast_text_index_t *index, rast_token_t *token, const char **doc_ptr, rast_size_t *doc_len, rast_doc_id_t *num_docs) { DBT db_key = { 0 }, db_value = { 0 }; rast_error_t *error; int dberr; rast_uint_t buf[4]; char rbuf[RARE_NGRAM_DB_MAX_VALUE_SIZE]; cache_list_head_t *cache_entries; db_key.data = (char *) token->ptr; db_key.size = token->nbytes; db_value.data = &buf; db_value.ulen = sizeof(buf); db_value.flags = DB_DBT_USERMEM; dberr = index->ngram_db->get(index->ngram_db, NULL, &db_key, &db_value, 0); switch (dberr) { case 0: error = get_packed_positions_from_pos_file(pool, index, buf, doc_ptr, doc_len, num_docs); if (error != RAST_OK) { return error; } break; case DB_NOTFOUND: db_value.data = &rbuf; db_value.ulen = sizeof(rbuf); dberr = index->rare_ngram_db->get(index->rare_ngram_db, NULL, &db_key, &db_value, 0); switch (dberr) { case 0: *doc_ptr = (char *) apr_pmemdup(pool, db_value.data, db_value.size); *doc_len = db_value.size; *num_docs = 0; break; case DB_NOTFOUND: /* not found ngram in database */ *doc_ptr = NULL; *doc_len = 0; *num_docs = 0; break; default: return db_error_to_rast_error(dberr); } break; default: return db_error_to_rast_error(dberr); } cache_entries = (cache_list_head_t *) apr_hash_get(index->cache, token->ptr, token->nbytes); if (cache_entries != NULL) { char *s, *new_doc_ptr; int num_docs, nbytes, new_doc_len; nbytes = pack_entries(pool, cache_entries, &s, &num_docs); new_doc_len = *doc_len + nbytes; new_doc_ptr = (char *) apr_palloc(pool, new_doc_len); memcpy(new_doc_ptr, *doc_ptr, *doc_len); memcpy(new_doc_ptr + *doc_len, s, nbytes); *doc_ptr = new_doc_ptr; *doc_len = new_doc_len; } return RAST_OK; } static rast_error_t * single_ngram_allocate(apr_pool_t *pool, rast_text_index_t *index, rast_token_t *token, rast_doc_id_t num_docs, const char *doc_ptr, rast_size_t doc_len, ngram_t **base) { single_ngram_t *ngram; ngram = (single_ngram_t *) apr_palloc(pool, sizeof(single_ngram_t)); ngram->base.type = &single_ngram_type; ngram->base.offset = token->pos; ngram->base.num_docs = num_docs; ngram->base.ptr = token->ptr; ngram->base.nbytes = token->nbytes; ngram->base.nchars = token->nchars; ngram->pool = pool; ngram->doc_ptr = doc_ptr; ngram->doc_end = ngram->doc_ptr + doc_len; ngram->parsed = 0; *base = (ngram_t *) ngram; return RAST_OK; } static rast_error_t * single_ngram_create(apr_pool_t *pool, rast_text_index_t *index, rast_token_t *token, ngram_t **base) { const char *doc_ptr; rast_size_t doc_len; rast_error_t *error; rast_doc_id_t num_docs; error = single_ngram_get_packed_positions(pool, index, token, &doc_ptr, &doc_len, &num_docs); if (error != RAST_OK) { return error; } if (doc_ptr == NULL) { *base = NULL; return RAST_OK; } return single_ngram_allocate(pool, index, token, num_docs, doc_ptr, doc_len, base); } typedef struct { ngram_t base; ngram_head_t *ngrams; rast_doc_id_t min_doc_id; pos_cursor_t **cursors_buffer; } multi_ngram_t; static rast_error_t * multi_ngram_get_current_doc_id(ngram_t *base, rast_doc_id_t *doc_id) { multi_ngram_t *ngram = (multi_ngram_t *) base; ngram_t *n; rast_doc_id_t id; rast_error_t *error; if (ngram->min_doc_id == (rast_doc_id_t) -1) { for (n = APR_RING_FIRST(ngram->ngrams); n != APR_RING_SENTINEL(ngram->ngrams, ngram_t, link); n = APR_RING_NEXT(n, link)) { if (!n->type->is_done(n)) { error = n->type->get_current_doc_id(n, &id); if (error != RAST_OK) { return error; } if (id < ngram->min_doc_id) { ngram->min_doc_id = id; } } } } *doc_id = ngram->min_doc_id; return RAST_OK; } static rast_error_t * multi_ngram_next_doc(ngram_t *base) { multi_ngram_t *ngram = (multi_ngram_t *) base; ngram_t *n; rast_doc_id_t doc_id; rast_error_t *error; if (ngram->min_doc_id == (rast_doc_id_t) -1) { return rast_error(RAST_ERROR_GENERAL, "must be called after get_current_doc_id."); } for (n = APR_RING_FIRST(ngram->ngrams); n != APR_RING_SENTINEL(ngram->ngrams, ngram_t, link); n = APR_RING_NEXT(n, link)) { if (!n->type->is_done(n)) { error = n->type->get_current_doc_id(n, &doc_id); if (error != RAST_OK) { return error; } if (ngram->min_doc_id == doc_id) { error = n->type->next_doc(n); if (error != RAST_OK) { return error; } } } } ngram->min_doc_id = (rast_doc_id_t) -1; return RAST_OK; } static int multi_ngram_is_done(ngram_t *base) { multi_ngram_t *ngram = (multi_ngram_t *) base; ngram_t *n; for (n = APR_RING_FIRST(ngram->ngrams); n != APR_RING_SENTINEL(ngram->ngrams, ngram_t, link); n = APR_RING_NEXT(n, link)) { if (!n->type->is_done(n)) { return 0; } } return 1; } static rast_error_t * multi_ngram_pos_cursor_create(ngram_t *base, apr_pool_t *pool, pos_cursor_t **base_pos_cursor) { multi_ngram_t *ngram = (multi_ngram_t *) base; ngram_t *n; rast_doc_id_t doc_id; multi_pos_cursor_t *pos_cursor; int num_cursors; pos_cursor_t *c; rast_error_t *error; num_cursors = 0; for (n = APR_RING_FIRST(ngram->ngrams); n != APR_RING_SENTINEL(ngram->ngrams, ngram_t, link); n = APR_RING_NEXT(n, link)) { if (!n->type->is_done(n)) { error = n->type->get_current_doc_id(n, &doc_id); if (error != RAST_OK) { return error; } if (ngram->min_doc_id == doc_id) { error = n->type->pos_cursor_create(n, pool, &c); if (error != RAST_OK) { return error; } *(ngram->cursors_buffer + num_cursors) = c; num_cursors++; } } } pos_cursor = (multi_pos_cursor_t *) apr_palloc(pool, sizeof(multi_pos_cursor_t)); pos_cursor->base.type = &multi_pos_cursor_type; pos_cursor->base.offset = base->offset; pos_cursor->cursors = ngram->cursors_buffer; pos_cursor->num_cursors = num_cursors; pos_cursor->min_cursor = NULL; *base_pos_cursor = (pos_cursor_t *) pos_cursor; return RAST_OK; } static ngram_type_t multi_ngram_type = { multi_ngram_get_current_doc_id, multi_ngram_next_doc, multi_ngram_is_done, multi_ngram_pos_cursor_create, }; static rast_error_t * multi_ngram_get_packed_positions(apr_pool_t *pool, rast_text_index_t *index, rast_token_t *token, rast_doc_id_t *num_docs, ngram_head_t *ngrams, int *num_ngrams) { DBC *cursor; DBT db_key = { 0 }, db_value = { 0 }; const char *doc_ptr; rast_size_t doc_len; rast_doc_id_t child_num_docs; ngram_t *ngram; int dberr; rast_error_t *error; rast_token_t t; rast_uint_t buf[4]; apr_hash_index_t *hi; apr_pool_t *sub_pool; dberr = index->ngram_db->cursor(index->ngram_db, NULL, &cursor, 0); if (dberr != 0) { return db_error_to_rast_error(dberr); } apr_pool_create(&sub_pool, pool); db_key.data = (char *) token->ptr; db_key.size = token->nbytes; db_value.data = &buf; db_value.ulen = sizeof(buf); db_value.flags = DB_DBT_USERMEM; dberr = cursor->c_get(cursor, &db_key, &db_value, DB_SET_RANGE); switch (dberr) { case 0: while (dberr != DB_NOTFOUND && memcmp(token->ptr, db_key.data, token->nbytes) == 0) { error = get_packed_positions_from_pos_file(pool, index, buf, &doc_ptr, &doc_len, &child_num_docs); t.ptr = apr_pmemdup(pool, db_key.data, db_key.size); t.nbytes = db_key.size; t.nchars = token->nchars + rast_count_chars(index->encoding_module, db_key.data + token->nbytes, db_key.size - token->nbytes, sub_pool); t.pos = token->pos; if (error != RAST_OK) { cursor->c_close(cursor); return error; } single_ngram_allocate(pool, index, &t, child_num_docs, doc_ptr, doc_len, &ngram); APR_RING_INSERT_TAIL(ngrams, ngram, ngram_t, link); (*num_docs) += child_num_docs; (*num_ngrams)++; dberr = cursor->c_get(cursor, &db_key, &db_value, DB_NEXT); } break; case DB_NOTFOUND: break; default: cursor->c_close(cursor); return db_error_to_rast_error(dberr); } cursor->c_close(cursor); for (hi = apr_hash_first(sub_pool, index->cache); hi; hi = apr_hash_next(hi)) { const void *key; apr_ssize_t klen; void *val; apr_hash_this(hi, &key, &klen, &val); if (memcmp(key, token->ptr, token->nbytes) == 0) { cache_list_head_t *cache_entries = (cache_list_head_t *) val; char *s; doc_len = pack_entries(pool, cache_entries, &s, &child_num_docs); doc_ptr = s; t.ptr = apr_pmemdup(pool, key, klen); t.nbytes = klen; t.nchars = token->nchars + rast_count_chars(index->encoding_module, key + token->nbytes, klen - token->nbytes, sub_pool); t.pos = token->pos; single_ngram_allocate(pool, index, &t, child_num_docs, doc_ptr, doc_len, &ngram); APR_RING_INSERT_TAIL(ngrams, ngram, ngram_t, link); (*num_docs) += child_num_docs; (*num_ngrams)++; } } return RAST_OK; } static rast_error_t * multi_ngram_get_rare_packed_positions(apr_pool_t *pool, rast_text_index_t *index, rast_token_t *token, rast_doc_id_t *num_docs, ngram_head_t *ngrams, int *num_ngrams) { DBC *cursor; DBT db_key = { 0 }, db_value = { 0 }; const char *doc_ptr; rast_size_t doc_len; ngram_t *ngram; int dberr; char buf[RARE_NGRAM_DB_MAX_VALUE_SIZE]; dberr = index->rare_ngram_db->cursor(index->rare_ngram_db, NULL, &cursor, 0); if (dberr != 0) { return db_error_to_rast_error(dberr); } db_key.data = (char *) token->ptr; db_key.size = token->nbytes; db_value.data = &buf; db_value.ulen = sizeof(buf); db_value.flags = DB_DBT_USERMEM; dberr = cursor->c_get(cursor, &db_key, &db_value, DB_SET_RANGE); switch (dberr) { case 0: while (dberr != DB_NOTFOUND && db_key.size >= token->nbytes && memcmp(token->ptr, db_key.data, token->nbytes) == 0) { doc_ptr = (char *) apr_pmemdup(pool, db_value.data, db_value.size); doc_len = db_value.size; single_ngram_allocate(pool, index, token, 1, doc_ptr, doc_len, &ngram); APR_RING_INSERT_TAIL(ngrams, ngram, ngram_t, link); (*num_docs)++; (*num_ngrams)++; dberr = cursor->c_get(cursor, &db_key, &db_value, DB_NEXT); } break; case DB_NOTFOUND: break; default: cursor->c_close(cursor); return db_error_to_rast_error(dberr); } cursor->c_close(cursor); return RAST_OK; } static rast_error_t * multi_ngram_create(apr_pool_t *pool, rast_text_index_t *index, rast_token_t *token, ngram_t **base) { rast_doc_id_t num_docs; multi_ngram_t *ngram; ngram_head_t *ngrams; int num_ngrams; rast_error_t *error; ngrams = (ngram_head_t *) apr_palloc(pool, sizeof(ngram_head_t)); APR_RING_INIT(ngrams, ngram_t, link); num_docs = 0; num_ngrams = 0; error = multi_ngram_get_packed_positions(pool, index, token, &num_docs, ngrams, &num_ngrams); if (error != RAST_OK) { return error; } error = multi_ngram_get_rare_packed_positions(pool, index, token, &num_docs, ngrams, &num_ngrams); if (error != RAST_OK) { return error; } if (num_ngrams == 0) { *base = NULL; return RAST_OK; } ngram = (multi_ngram_t *) apr_palloc(pool, sizeof(multi_ngram_t)); ngram->base.type = &multi_ngram_type; ngram->base.offset = token->pos; ngram->base.num_docs = num_docs; ngram->base.ptr = token->ptr; ngram->base.nbytes = token->nbytes; ngram->base.nchars = token->nchars; ngram->ngrams = ngrams; ngram->min_doc_id = (rast_doc_id_t) -1; ngram->cursors_buffer = (pos_cursor_t **) apr_palloc(pool, sizeof(pos_cursor_t *) * num_ngrams); *base = (ngram_t *) ngram; return RAST_OK; } static rast_error_t * get_ngrams(apr_pool_t *pool, rast_text_index_t *index, const char *term, ngram_head_t **ngrams, int *num_ngrams) { rast_tokenizer_t *tokenizer; rast_token_t token; ngram_t *ngram; rast_error_t *error; *ngrams = (ngram_head_t *) apr_palloc(pool, sizeof(ngram_head_t)); APR_RING_INIT(*ngrams, ngram_t, link); *num_ngrams = 0; tokenizer = rast_search_tokenizer_create(pool, index->encoding_module, term, strlen(term)); while (!rast_search_tokenizer_is_done(tokenizer)) { rast_search_tokenizer_get_current(tokenizer, &token); if (token.is_complete) { error = single_ngram_create(pool, index, &token, &ngram); } else { error = multi_ngram_create(pool, index, &token, &ngram); } if (error != RAST_OK) { return error; } if (ngram == NULL) { *ngrams = NULL; *num_ngrams = 0; return RAST_OK; } APR_RING_INSERT_TAIL(*ngrams, ngram, ngram_t, link); (*num_ngrams)++; rast_search_tokenizer_next(tokenizer); } return RAST_OK; } static void optimize_ngrams(ngram_head_t *ngrams, int *num_ngrams) { ngram_t *ngram, *next; rast_size_t next_offset, min_doc_nums; ngram = APR_RING_FIRST(ngrams); min_doc_nums = ngram->num_docs; while (ngram != APR_RING_SENTINEL(ngrams, ngram_t, link)) { if (ngram->num_docs < min_doc_nums) { min_doc_nums = ngram->num_docs; } ngram = APR_RING_NEXT(ngram, link); } next_offset = 0; ngram = APR_RING_FIRST(ngrams); while (ngram != APR_RING_SENTINEL(ngrams, ngram_t, link)) { next = APR_RING_NEXT(ngram, link); if (ngram->offset == next_offset || next == APR_RING_SENTINEL(ngrams, ngram_t, link) || ngram->num_docs == min_doc_nums) { next_offset = ngram->offset + ngram->nchars; } else { APR_RING_REMOVE(ngram, link); (*num_ngrams)--; } ngram = next; } } static rast_error_t * is_candidate_document(apr_pool_t *pool, rast_doc_id_t base_doc_id, ngram_head_t *ngrams, int num_ngrams, int *is_candidate) { ngram_t *ngram; rast_doc_id_t doc_id; rast_error_t *error; ngram = APR_RING_NEXT(APR_RING_FIRST(ngrams), link); while (ngram != APR_RING_SENTINEL(ngrams, ngram_t, link)) { doc_id = (rast_doc_id_t) -1; while (!ngram->type->is_done(ngram)) { error = ngram->type->get_current_doc_id(ngram, &doc_id); if (error != RAST_OK) { return error; } if (base_doc_id <= doc_id) { break; } ngram->type->next_doc(ngram); } if (doc_id == (rast_doc_id_t) -1 || base_doc_id != doc_id) { *is_candidate = 0; return RAST_OK; } ngram = APR_RING_NEXT(ngram, link); } *is_candidate = 1; return RAST_OK; } static rast_error_t * create_pos_cursors(apr_pool_t *pool, ngram_head_t *ngrams, pos_cursor_t **pos_cursors, int *num_pos_cursors) { rast_error_t *error; int i; ngram_t *ngram; pos_cursor_t *cursor; i = 0; for (ngram = APR_RING_FIRST(ngrams); ngram != APR_RING_SENTINEL(ngrams, ngram_t, link); ngram = APR_RING_NEXT(ngram, link)) { error = ngram->type->pos_cursor_create(ngram, pool, &cursor); if (error != RAST_OK) { return error; } *(pos_cursors + i) = cursor; i++; } *num_pos_cursors = i; return RAST_OK; } static inline int is_valid_ngram(rast_pos_t base, pos_cursor_t *cursor) { rast_pos_t n; while (!cursor->type->is_done(cursor)) { n = cursor->type->get_current(cursor); if (n >= cursor->offset) { if (n - cursor->offset == base) { return 1; } else if (n - cursor->offset > base) { return 0; } } cursor->type->next(cursor); } return 0; } static int check_positions(pos_cursor_t **pos_cursors, rast_size_t num_pos_cursors) { pos_cursor_t *cursor; rast_pos_t n, base; int i; cursor = *(pos_cursors + 0); n = cursor->type->get_current(cursor); if (n < cursor->offset) { return 0; } base = n - cursor->offset; for (i = 1; i < num_pos_cursors; i++) { cursor = *(pos_cursors + i); if (!is_valid_ngram(base, cursor)) { return 0; } } return 1; } static rast_error_t * count_terms(apr_pool_t *pool, ngram_head_t *ngrams, pos_cursor_t **pos_cursors, int need_tf, int *count, rast_pos_t *first_pos) { rast_size_t num_pos_cursors; pos_cursor_t *cursor; apr_pool_t *sub_pool; rast_error_t *error; apr_pool_create(&sub_pool, pool); error = create_pos_cursors(pool, ngrams, pos_cursors, &num_pos_cursors); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } *count = 0; cursor = *(pos_cursors + 0); while (!cursor->type->is_done(cursor)) { if (check_positions(pos_cursors, num_pos_cursors)) { if (*count == 0) { *first_pos = cursor->type->get_current(cursor); } (*count)++; if (!need_tf) { apr_pool_destroy(sub_pool); return RAST_OK; } } cursor->type->next(cursor); } apr_pool_destroy(sub_pool); return RAST_OK; } static rast_candidate_t * create_candidate(apr_pool_t *pool, rast_doc_id_t doc_id, int count, rast_pos_t pos, int need_tf) { rast_candidate_t *result = (rast_candidate_t *) apr_palloc(pool, sizeof(rast_candidate_t)); rast_term_frequency_t *tf; result->doc_id = doc_id; APR_RING_INIT(&result->terms, rast_term_frequency_t, link); if (need_tf) { tf = (rast_term_frequency_t *) apr_palloc(pool, sizeof(rast_term_frequency_t)); tf->count = count; tf->pos = pos; APR_RING_INSERT_TAIL(&result->terms, tf, rast_term_frequency_t, link); } return result; } rast_error_t * rast_text_index_search(rast_text_index_t *index, const char *s, int need_tf, rast_query_result_t **result, apr_pool_t *pool) { rast_error_t *error; apr_pool_t *sub_pool; ngram_head_t *ngrams; int num_ngrams; ngram_t *ngram; rast_doc_id_t doc_id; int is_candidate, count; rast_pos_t pos; rast_term_t *term = NULL; pos_cursor_t **pos_cursors; apr_pool_create(&sub_pool, pool); error = get_ngrams(sub_pool, index, s, &ngrams, &num_ngrams); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } *result = rast_query_result_create(pool); if (need_tf) { term = (rast_term_t *) apr_palloc(pool, sizeof(rast_term_t)); term->term = s; term->doc_count = 0; APR_RING_INSERT_TAIL(&(*result)->terms, term, rast_term_t, link); } if (ngrams == NULL || num_ngrams <= 0) { /* should be error? */ apr_pool_destroy(sub_pool); return RAST_OK; } optimize_ngrams(ngrams, &num_ngrams); pos_cursors = (pos_cursor_t **) apr_palloc(sub_pool, sizeof(pos_cursor_t *) * num_ngrams); ngram = APR_RING_FIRST(ngrams); while (!ngram->type->is_done(ngram)) { error = ngram->type->get_current_doc_id(ngram, &doc_id); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } error = is_candidate_document(sub_pool, doc_id, ngrams, num_ngrams, &is_candidate); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } if (is_candidate) { error = count_terms(pool, ngrams, pos_cursors, need_tf, &count, &pos); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } if (count > 0) { rast_candidate_t *candidate = create_candidate(pool, doc_id, count, pos, need_tf); APR_RING_INSERT_TAIL(&(*result)->candidates, candidate, rast_candidate_t, link); if (need_tf) { term->doc_count++; } } } ngram->type->next_doc(ngram); } apr_pool_destroy(sub_pool); return RAST_OK; } static rast_error_t * pack_doc(doc_t *doc, char *s, off_t *nbytes) { char *p; int n; p = s; n = pack_number(p, doc->doc_id); if (n == 0) { return rast_error(RAST_ERROR_GENERAL, "invalid doc_id"); } p += n; n = pack_number(p, doc->pos_bytes); if (n == 0) { return rast_error(RAST_ERROR_GENERAL, "invalid position size"); } p += n; memcpy(p, doc->pos_ptr, doc->pos_bytes); p += doc->pos_bytes; *nbytes = p - s; return RAST_OK; } struct ngram_db_entry_t { const char *token; int token_nbytes; int num_docs; off_t block_nbytes; rast_size_t block_no; rast_size_t block_count; APR_RING_ENTRY(ngram_db_entry_t) link; }; typedef struct ngram_db_entry_t ngram_db_entry_t; APR_RING_HEAD(ngram_db_head_t, ngram_db_entry_t); typedef struct ngram_db_head_t ngram_db_head_t; static int compare_ngram_db_entry_func(const void *v1, const void *v2) { int result; const ngram_db_entry_t *entry1 = *(const ngram_db_entry_t **) v1; const ngram_db_entry_t *entry2 = *(const ngram_db_entry_t **) v2; result = entry2->block_nbytes - entry1->block_nbytes; if (result != 0) { return result; } return entry2->num_docs - entry1->num_docs; } typedef struct { DBC *cursor; DBT db_key; DBT db_value; int dberr; apr_pool_t *pool; } ngram_db_cursor_t; static rast_error_t * ngram_db_cursor_create(ngram_db_cursor_t **result, DB *ngram_db, DB_TXN *bdb_txn, apr_pool_t *pool) { ngram_db_cursor_t *cursor; int dberr; apr_pool_t *sub_pool; apr_pool_create(&sub_pool, pool); cursor = (ngram_db_cursor_t *) apr_palloc(sub_pool, sizeof(ngram_db_cursor_t)); dberr = ngram_db->cursor(ngram_db, bdb_txn, &cursor->cursor, 0); if (dberr != 0) { apr_pool_destroy(sub_pool); return db_error_to_rast_error(dberr); } memset(&cursor->db_key, 0, sizeof(DBT)); memset(&cursor->db_value, 0, sizeof(DBT)); dberr = cursor->cursor->c_get(cursor->cursor, &cursor->db_key, &cursor->db_value, DB_FIRST); if (dberr != 0 && dberr != DB_NOTFOUND) { cursor->cursor->c_close(cursor->cursor); apr_pool_destroy(sub_pool); return db_error_to_rast_error(dberr); } cursor->dberr = dberr; cursor->pool = sub_pool; *result = cursor; return RAST_OK; } static inline int ngram_db_cursor_is_done(ngram_db_cursor_t *cursor) { return cursor->dberr != 0; } static inline void ngram_db_cursor_get_next(ngram_db_cursor_t *cursor) { cursor->dberr = cursor->cursor->c_get(cursor->cursor, &cursor->db_key, &cursor->db_value, DB_NEXT); } static inline rast_error_t * ngram_db_cursor_close(ngram_db_cursor_t *cursor) { int dberr; apr_pool_t *pool; dberr = cursor->dberr; pool = cursor->pool; cursor->cursor->c_close(cursor->cursor); apr_pool_destroy(pool); if (dberr == DB_NOTFOUND) { return RAST_OK; } return db_error_to_rast_error(dberr); } static rast_error_t * create_new_pos_data(rast_text_index_t *old_index, apr_hash_t *id_map_table, DB_TXN *bdb_txn, ngram_db_head_t **ngram_db_ring, int *num_entries, FILE *tmp_file, apr_pool_t *pool) { off_t offset; ngram_db_cursor_t *cursor; rast_error_t *error; apr_pool_t *sub_pool, *doc_pool; error = ngram_db_cursor_create(&cursor, old_index->ngram_db, bdb_txn, pool); if (error != RAST_OK) { return error; } apr_pool_create(&sub_pool, cursor->pool); apr_pool_create(&doc_pool, cursor->pool); offset = 0; *num_entries = 0; *ngram_db_ring = (ngram_db_head_t *) apr_palloc(pool, sizeof(ngram_db_head_t)); APR_RING_INIT(*ngram_db_ring, ngram_db_entry_t, link); while(!ngram_db_cursor_is_done(cursor)) { rast_size_t doc_nbytes; rast_doc_id_t num_docs; const char *doc_ptr, *doc_ptr_end; doc_t doc; ngram_db_entry_t *entry; char *s; off_t nbytes; error = get_packed_positions_from_pos_file(doc_pool, old_index, (rast_size_t *) cursor->db_value.data, &doc_ptr, &doc_nbytes, &num_docs); if (error != RAST_OK) { ngram_db_cursor_close(cursor); return error; } doc_ptr_end = doc_ptr + doc_nbytes; entry = (ngram_db_entry_t *) apr_palloc(pool, sizeof(ngram_db_entry_t)); entry->token = apr_pmemdup(pool, cursor->db_key.data, cursor->db_key.size); entry->token_nbytes = cursor->db_key.size; entry->num_docs = 0; entry->block_nbytes = 0; while (doc_ptr < doc_ptr_end) { rast_doc_id_t *new_doc_id; error = parse_doc(doc_ptr, doc_ptr_end, &doc); if (error != RAST_OK) { ngram_db_cursor_close(cursor); return error; } doc_ptr += doc.bytes; new_doc_id = (rast_doc_id_t *) apr_hash_get(id_map_table, &doc.doc_id, sizeof(rast_doc_id_t)); if (new_doc_id == NULL) { continue; } doc.doc_id = *new_doc_id; s = (char *) apr_palloc(sub_pool, doc.bytes); error = pack_doc(&doc, s, &nbytes); if (error != RAST_OK) { ngram_db_cursor_close(cursor); return error; } if (fwrite(s, 1, nbytes, tmp_file) < nbytes) { ngram_db_cursor_close(cursor); return os_error_to_rast_error(errno); } entry->num_docs++; entry->block_nbytes += nbytes; apr_pool_clear(sub_pool); } apr_pool_clear(doc_pool); if (entry->num_docs > 0) { off_t new_offset; entry->block_no = offset / old_index->pos_block_size; entry->block_count = entry->block_nbytes / old_index->pos_block_size + 1; new_offset = offset + entry->block_count * old_index->pos_block_size; fseeko(tmp_file, new_offset, SEEK_SET); APR_RING_INSERT_TAIL(*ngram_db_ring, entry, ngram_db_entry_t, link); (*num_entries)++; offset = new_offset; } ngram_db_cursor_get_next(cursor); } ftruncate(fileno(tmp_file), offset); return ngram_db_cursor_close(cursor); } static rast_error_t * create_optimized_pos_file(ngram_db_entry_t **ngram_db_ary, int num_entries, rast_size_t pos_block_size, FILE *tmp_file, FILE *new_pos_file, apr_pool_t *pool) { rast_size_t block_no = 0; off_t offset, nbytes; char *s; apr_pool_t *sub_pool; int i; apr_pool_create(&sub_pool, pool); for (i = 0; i < num_entries; i++) { offset = ngram_db_ary[i]->block_no * pos_block_size; if (fseeko(tmp_file, offset, SEEK_SET) == -1) { apr_pool_destroy(sub_pool); return os_error_to_rast_error(errno); } nbytes = ngram_db_ary[i]->block_count * pos_block_size; s = (char *) apr_palloc(sub_pool, nbytes); if (fread(s, 1, nbytes, tmp_file) < nbytes) { apr_pool_destroy(sub_pool); return os_error_to_rast_error(errno); } if (fwrite(s, 1, nbytes, new_pos_file) < nbytes) { apr_pool_destroy(sub_pool); return os_error_to_rast_error(errno); } ngram_db_ary[i]->block_no = block_no; block_no += ngram_db_ary[i]->block_count; apr_pool_clear(sub_pool); } apr_pool_destroy(sub_pool); return RAST_OK; } static rast_error_t * create_optimized_ngram_db(ngram_db_entry_t **ngram_db_ary, int num_entries, DB *old_ngram_db, DB *new_ngram_db, DB_TXN *bdb_txn, int is_native, apr_pool_t *pool) { ngram_db_cursor_t *cursor; apr_hash_t *ngram_db_hash; apr_pool_t *sub_pool; rast_error_t *error; int i; error = ngram_db_cursor_create(&cursor, old_ngram_db, bdb_txn, pool); if (error != RAST_OK) { return error; } apr_pool_create(&sub_pool, pool); ngram_db_hash = apr_hash_make(sub_pool); for (i = 0; i < num_entries; i++) { apr_hash_set(ngram_db_hash, ngram_db_ary[i]->token, ngram_db_ary[i]->token_nbytes, ngram_db_ary[i]); } while (!ngram_db_cursor_is_done(cursor)) { ngram_db_entry_t *entry; entry = (ngram_db_entry_t *) apr_hash_get(ngram_db_hash, cursor->db_key.data, cursor->db_key.size); if (entry != NULL) { DBT db_value = { 0 }; rast_size_t ngram_db_value[4]; int dberr; ngram_db_value[0] = rast_fix_byte_order(entry->block_no, is_native); ngram_db_value[1] = rast_fix_byte_order(entry->block_count, is_native); ngram_db_value[2] = rast_fix_byte_order(entry->block_nbytes, is_native); ngram_db_value[3] = rast_fix_byte_order(entry->num_docs, is_native); db_value.data = ngram_db_value; db_value.size = sizeof(rast_size_t) * 4; dberr = new_ngram_db->put(new_ngram_db, bdb_txn, &cursor->db_key, &db_value, 0); if (dberr != 0) { apr_pool_destroy(sub_pool); ngram_db_cursor_close(cursor); return db_error_to_rast_error(dberr); } } ngram_db_cursor_get_next(cursor); } apr_pool_destroy(sub_pool); return ngram_db_cursor_close(cursor); } static rast_error_t * create_optimized_rare_ngram_db(apr_hash_t *id_map_table, DB *old_rare_ngram_db, DB *new_rare_ngram_db, DB_TXN *bdb_txn, int is_native, apr_pool_t *pool) { ngram_db_cursor_t *cursor; int dberr; apr_pool_t *sub_pool; rast_error_t *error; error = ngram_db_cursor_create(&cursor, old_rare_ngram_db, bdb_txn, pool); if (error != RAST_OK) { return error; } apr_pool_create(&sub_pool, cursor->pool); while(!ngram_db_cursor_is_done(cursor)) { char *p, *p_end; char *positions; int n; rast_doc_id_t old_doc_id, *new_doc_id; rast_size_t pos_bytes; p = (char *) cursor->db_value.data; p_end = p + cursor->db_value.size; n = unpack_number(p, p_end - p, &old_doc_id); p += n; positions = p; n = unpack_number(p, p_end - p, &pos_bytes); pos_bytes += n; new_doc_id = (rast_doc_id_t *) apr_hash_get(id_map_table, &old_doc_id, sizeof(rast_doc_id_t)); if (new_doc_id != NULL) { char *s; DBT db_value = { 0 }; s = apr_palloc(sub_pool, cursor->db_value.size); p = s; n = pack_number(p, *new_doc_id); p += n; memcpy(p, positions, pos_bytes); db_value.data = s; db_value.size = n + pos_bytes; dberr = new_rare_ngram_db->put(new_rare_ngram_db, bdb_txn, &cursor->db_key, &db_value, 0); if (dberr != 0) { ngram_db_cursor_close(cursor); return db_error_to_rast_error(dberr); } apr_pool_clear(sub_pool); } ngram_db_cursor_get_next(cursor); } return ngram_db_cursor_close(cursor); } static rast_error_t * create_optimized_text_index(rast_text_index_t *old_index, rast_text_index_t *new_index, const char *new_index_name, apr_hash_t *id_map_table, DB_TXN *bdb_txn, apr_pool_t *pool) { char *tmp_filename; FILE *tmp_file; ngram_db_entry_t *entry, **ngram_db_ary; ngram_db_head_t *ngram_db_ring; int num_entries, i; apr_pool_t *sub_pool; rast_error_t *error; tmp_filename = apr_pstrcat(pool, new_index_name, "-1.tmp", NULL); tmp_file = fopen(tmp_filename, "w+"); if (tmp_file == NULL) { return os_error_to_rast_error(errno); } error = create_new_pos_data(old_index, id_map_table, bdb_txn, &ngram_db_ring, &num_entries, tmp_file, pool); if (error != RAST_OK) { fclose(tmp_file); (void) apr_file_remove(tmp_filename, pool); return error; } i = 0; ngram_db_ary = (ngram_db_entry_t **) apr_palloc(pool, sizeof(ngram_db_entry_t *) * num_entries); for (entry = APR_RING_FIRST(ngram_db_ring); entry != APR_RING_SENTINEL(ngram_db_ring, ngram_db_entry_t, link); entry = APR_RING_NEXT(entry, link)) { ngram_db_ary[i] = entry; i++; } qsort(ngram_db_ary, num_entries, sizeof(ngram_db_entry_t *), compare_ngram_db_entry_func); apr_pool_create(&sub_pool, pool); error = create_optimized_pos_file(ngram_db_ary, num_entries, old_index->pos_block_size, tmp_file, new_index->pos_file, sub_pool); fclose(tmp_file); (void) apr_file_remove(tmp_filename, pool); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } apr_pool_clear(sub_pool); error = create_optimized_ngram_db(ngram_db_ary, num_entries, old_index->ngram_db, new_index->ngram_db, bdb_txn, old_index->is_native, sub_pool); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } apr_pool_clear(sub_pool); error = create_optimized_rare_ngram_db(id_map_table, old_index->rare_ngram_db, new_index->rare_ngram_db, bdb_txn, old_index->is_native, sub_pool); apr_pool_destroy(sub_pool); return error; } rast_error_t * rast_text_index_optimize(DB_ENV *bdb_env, DB_TXN *bdb_txn, int lorder, rast_text_index_t *old_index, const char *new_index_name, apr_hash_t *id_map_table) { apr_pool_t *pool; rast_text_index_t *new_index; rast_error_t *error; apr_pool_create(&pool, old_index->pool); error = rast_text_index_open(&new_index, new_index_name, RAST_DB_RDWR, old_index->encoding_module, bdb_env, lorder, old_index->pos_block_size, pool); if (error != RAST_OK) { apr_pool_destroy(pool); return error; } error = create_optimized_text_index(old_index, new_index, new_index_name, id_map_table, bdb_txn, pool); rast_text_index_close(new_index); apr_pool_destroy(pool); return error; } /* vim: set filetype=c sw=4 expandtab : */