/* * Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd. * * This file is part of Rast. * See the file COPYING for redistribution information. * */ #include #include #include #include "rast/merger.h" rast_error_t * rast_merger_open(rast_db_t **base, rast_db_t **merge_dbs, int num_merge_dbs, apr_pool_t *pool) { const static rast_db_t default_base = { NULL, NULL, rast_merger_close, NULL, NULL, rast_merger_search, NULL, NULL, NULL, NULL, rast_merger_encoding, rast_merger_properties, NULL, }; const char *merger_encoding, *encoding; int i; if (num_merge_dbs <= 0) { return rast_error(RAST_ERROR_INVALID_ARGUMENT, "empty indices: %d", num_merge_dbs); } merger_encoding = rast_db_encoding(merge_dbs[0]); for (i = 1; i < num_merge_dbs; i++) { encoding = rast_db_encoding(merge_dbs[i]); if (strcmp(encoding, merger_encoding) != 0) { return rast_error(RAST_ERROR_GENERAL, "merged indices must be same encoding: %s != %s", merger_encoding, encoding); } } { rast_merger_t *db = (rast_merger_t *) apr_palloc(pool, sizeof(rast_merger_t)); *base = (rast_db_t *) db; db->base = default_base; db->base.pool = pool; db->encoding = merger_encoding; db->num_merged_dbs = num_merge_dbs; db->merged_dbs = merge_dbs; db->properties = NULL; db->num_properties = 0; } return RAST_OK; } rast_error_t * rast_merger_close(rast_db_t *base) { return RAST_OK; } typedef struct { rast_result_item_t *item; rast_value_t *sort_property; } merger_sort_data_t; static int score_ascending_sort_compare_func(const void *v1, const void *v2) { rast_result_item_t *item1 = *(rast_result_item_t **) v1; rast_result_item_t *item2 = *(rast_result_item_t **) v2; return item1->score - item2->score; } static int score_descending_sort_compare_func(const void *v1, const void *v2) { return score_ascending_sort_compare_func(v2, v1); } static int property_string_ascending_sort_compare_func(const void *v1, const void *v2) { merger_sort_data_t *data1 = *(merger_sort_data_t **) v1; merger_sort_data_t *data2 = *(merger_sort_data_t **) v2; if (data1->sort_property->type == RAST_TYPE_UINT) { rast_uint_t n1 = rast_value_uint(data1->sort_property); rast_uint_t n2 = rast_value_uint(data2->sort_property); if (n1 < n2) { return -1; } else if (n1 == n2) { return 0; } else { return 1; } } else { return strcmp(rast_value_string(data1->sort_property), rast_value_string(data2->sort_property)); } } static int property_string_descending_sort_compare_func(const void *v1, const void *v2) { return property_string_ascending_sort_compare_func(v2, v1); } rast_error_t * sort_result(rast_search_option_t *options, int sort_property_number, rast_result_t *result, apr_pool_t *pool) { merger_sort_data_t **sort_data; int (*sort_func)(const void *, const void *); if (options->sort_method == RAST_SORT_METHOD_SCORE) { if (options->sort_order == RAST_SORT_ORDER_ASCENDING) { sort_func = score_ascending_sort_compare_func; } else { sort_func = score_descending_sort_compare_func; } qsort(result->items, result->num_items, sizeof(merger_sort_data_t *), sort_func); } else { int i; if (options->sort_order == RAST_SORT_ORDER_DESCENDING) { sort_func = property_string_descending_sort_compare_func; } else { sort_func = property_string_ascending_sort_compare_func; } sort_data = (merger_sort_data_t **) apr_palloc(pool, sizeof(merger_sort_data_t *) * result->num_items); for (i = 0; i < result->num_items; i++) { sort_data[i] = (merger_sort_data_t *) apr_palloc(pool, sizeof(merger_sort_data_t)); sort_data[i]->item = result->items[i]; sort_data[i]->sort_property = &result->items[i]->properties[sort_property_number]; } qsort(sort_data, result->num_items, sizeof(merger_sort_data_t *), sort_func); for (i = 0; i < result->num_items; i++) { result->items[i] = sort_data[i]->item; } } return RAST_OK; } static rast_search_option_t * get_zero_item_options(rast_search_option_t *options, apr_pool_t *pool) { rast_search_option_t *result; result = (rast_search_option_t *) apr_pmemdup(pool, options, sizeof(rast_search_option_t)); result->num_items = 0; return result; } static rast_search_option_t * get_child_options(rast_search_option_t *options, int *sort_property_number, apr_pool_t *pool) { int i; rast_search_option_t *result; result = (rast_search_option_t *) apr_pmemdup(pool, options, sizeof(rast_search_option_t)); result->properties = (const char **) apr_palloc(pool, sizeof(const char *) * (options->num_properties + 1)); memcpy(result->properties, options->properties, sizeof(char *) * options->num_properties); result->start_no = 0; result->num_items = options->start_no + options->num_items; if (options->sort_method == RAST_SORT_METHOD_PROPERTY) { for (i = 0; i < options->num_properties; i++) { if (strcmp(options->properties[i], options->sort_property) == 0) { *sort_property_number = i; return result; } } result->properties[options->num_properties] = options->sort_property; result->num_properties++; *sort_property_number = options->num_properties; } return result; } static rast_error_t * set_scoring_options(rast_merger_t *db, const char *query, rast_search_option_t *original_options, rast_search_option_t *options, apr_pool_t *pool) { int i, j; rast_search_option_t *zero_item_options; rast_result_t *child_results; rast_error_t *error; apr_pool_t *sub_pool; apr_pool_create(&sub_pool, pool); zero_item_options = get_zero_item_options(original_options, sub_pool); options->all_num_docs = 0; for (i = 0; i < db->num_merged_dbs; i++) { error = rast_db_search(db->merged_dbs[i], query, zero_item_options, &child_results, sub_pool); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } options->all_num_docs += child_results->num_docs; if (child_results->hit_count > 0) { options->num_terms = child_results->num_terms; options->terms = (int *) apr_palloc(pool, sizeof(int) * options->num_terms); for (j = 0; j < child_results->num_terms; j++) { options->terms[j] = child_results->terms[j].doc_count; } i++; break; } } for (; i < db->num_merged_dbs; i++) { error = rast_db_search(db->merged_dbs[i], query, zero_item_options, &child_results, sub_pool); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } options->all_num_docs += child_results->num_docs; if (child_results->hit_count > 0) { for (j = 0; j < options->num_terms; j++) { options->terms[j] += child_results->terms[j].doc_count; } } } apr_pool_destroy(sub_pool); return RAST_OK; } static void merge_child_result(rast_result_t *child_result, rast_result_t *result) { int i; result->num_docs += child_result->num_docs; if (child_result->hit_count > 0) { for (i = 0; i < result->num_terms; i++) { result->terms[i].doc_count += child_result->terms[i].doc_count; } memcpy(result->items + result->num_items, child_result->items, sizeof(rast_result_item_t *) * child_result->num_items); result->num_items += child_result->num_items; } } static void set_result_range(rast_search_option_t *options, rast_result_t *result) { if (result->num_items < options->start_no) { result->num_items = 0; } else { result->items += options->start_no; result->num_items -= options->start_no; if (options->num_items != RAST_RESULT_ALL_ITEMS && result->num_items > options->num_items) { result->num_items = options->num_items; } } } rast_error_t * rast_merger_search(rast_db_t *base, const char *query, rast_search_option_t *options, rast_result_t **results, apr_pool_t *pool) { rast_merger_t *db = (rast_merger_t *) base; int i, j, all_hit_count, doc_id_prefix; int sort_property_number; rast_result_t **child_results; rast_error_t *error = RAST_OK; apr_pool_t *sub_pool; rast_search_option_t *child_options; apr_pool_create(&sub_pool, pool); all_hit_count = 0; child_results = (rast_result_t **) apr_palloc(sub_pool, sizeof(rast_result_t *) * db->num_merged_dbs); doc_id_prefix = 0; child_options = get_child_options(options, &sort_property_number, sub_pool); if (options->score_method == RAST_SCORE_METHOD_TFIDF) { error = set_scoring_options(db, query, options, child_options, sub_pool); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } } for (i = 0; i < db->num_merged_dbs; i++) { error = rast_db_search(db->merged_dbs[i], query, child_options, child_results + i, pool); if (error != RAST_OK) { apr_pool_destroy(sub_pool); return error; } all_hit_count += child_results[i]->hit_count; for (j = 0; j < child_results[i]->num_items; j++) { child_results[i]->items[j]->doc_id += doc_id_prefix << 24; child_results[i]->items[j]->db_index = i; } doc_id_prefix += child_results[i]->num_indices; } *results = (rast_result_t *) apr_palloc(pool, sizeof(rast_result_t)); (*results)->num_indices = doc_id_prefix; (*results)->hit_count = all_hit_count; (*results)->terms = (rast_result_term_t *) apr_palloc(pool, sizeof(rast_result_term_t) * child_results[0]->num_terms); (*results)->items = (rast_result_item_t **) apr_palloc(pool, sizeof(rast_result_item_t *) * all_hit_count); (*results)->num_docs = child_results[0]->num_docs; memcpy((*results)->terms, child_results[0]->terms, sizeof(rast_result_term_t) * child_results[0]->num_terms); (*results)->num_terms = child_results[0]->num_terms; memcpy((*results)->items, child_results[0]->items, sizeof(rast_result_item_t *) * child_results[0]->num_items); (*results)->num_items = child_results[0]->num_items; for (i = 1; i < db->num_merged_dbs; i++) { merge_child_result(child_results[i], *results); } sort_result(child_options, sort_property_number, *results, sub_pool); set_result_range(options, *results); apr_pool_destroy(sub_pool); return RAST_OK; } const char * rast_merger_encoding(rast_db_t *base) { rast_merger_t *db = (rast_merger_t *) base; return db->encoding; } const rast_property_t * rast_merger_properties(rast_db_t *base, int *num_properties) { rast_merger_t *db = (rast_merger_t *) base; typedef struct property_list_entry_t { APR_RING_ENTRY(property_list_entry_t) link; const rast_property_t *property; } property_list_entry_t; typedef APR_RING_HEAD(property_list_head_t, property_list_entry_t) property_list_head_t; property_list_head_t *property_list; if (db->properties == NULL) { int i, j; const rast_property_t *child_properties; int child_num_properties; apr_pool_t *pool; property_list_entry_t *entry, *next_entry; apr_pool_create(&pool, db->base.pool); property_list = (property_list_head_t *) apr_palloc(pool, sizeof(property_list_head_t)); APR_RING_INIT(property_list, property_list_entry_t, link); child_properties = rast_db_properties(db->merged_dbs[0], &db->num_properties); for (i = 0; i < db->num_properties; i++) { entry = (property_list_entry_t *) apr_palloc(pool, sizeof(property_list_entry_t)); entry->property = &child_properties[i]; APR_RING_INSERT_TAIL(property_list, entry, property_list_entry_t, link); } for (i = 1; i < db->num_merged_dbs; i++) { child_properties = rast_db_properties(db->merged_dbs[i], &child_num_properties); entry = APR_RING_FIRST(property_list); while (entry != APR_RING_SENTINEL(property_list, property_list_entry_t, link)) { for (j = 0; j < child_num_properties; j++) { if (strcmp(child_properties[j].name, entry->property->name) == 0) { break; } } next_entry = APR_RING_NEXT(entry, link); if (j == child_num_properties) { APR_RING_REMOVE(entry, link); db->num_properties--; } entry = next_entry; } } db->properties = (const rast_property_t *) apr_palloc(db->base.pool, sizeof(const rast_property_t) * db->num_properties); i = 0; for (entry = APR_RING_FIRST(property_list); entry != APR_RING_SENTINEL(property_list, property_list_entry_t, link); entry = APR_RING_NEXT(entry, link)) { rast_property_t *property = (rast_property_t *) &db->properties[i]; property->name = entry->property->name; property->type = entry->property->type; property->flags = entry->property->flags; i++; } apr_pool_destroy(pool); } *num_properties = db->num_properties; return db->properties; }