/*
 * Layer violations are intentionally left in place here, because this
 * file needs to use the conversion engine's internal information.
 *
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <anthy/anthy.h>
#include <anthy/convdb.h>
#include <anthy/segment.h>
#include <anthy/feature_set.h>
/**/
#include "../src-main/main.h"
#include "../src-splitter/wordborder.h"
#include "../src-worddic/dic_ent.h"
/* Kind of word: independent word part or dependent (attached) word part */
#define WORD_INDEP 0
#define WORD_DEP 1
/* A single word (either an independent word or a dependent word) */
struct word {
/* One of the WORD_* constants */
int type;
/* For WORD_DEP: hash of the element's string (the dependent word).
 * For WORD_INDEP: hash of the converted string, or 0 when no
 * dictionary entry could be selected for the element. */
int hash;
/* Hash of the reading (pre-conversion) string */
int yomi_hash;
/* String before conversion (owned; released by free_word) */
xstr *raw_xs;
/* String after conversion (owned; NULL when not available) */
xstr *conv_xs;
/* Part-of-speech name of the converted word (borrowed from the
 * dictionary entry; NULL when not available) */
const char *wt;
};
/* Return the candidate currently chosen for SEG: the committed candidate
 * when seg->committed is non-negative, otherwise the first candidate. */
static struct cand_ent *
selected_candidate(struct seg_ent *seg)
{
  int idx = (seg->committed > -1) ? seg->committed : 0;

  return seg->cands[idx];
}
/* Append SRC to the NUL-terminated string held in DST.
 * USED is the current length of DST and CAP its total capacity in bytes
 * (terminator included); USED must be smaller than CAP.  Text that does
 * not fit is dropped instead of overrunning the buffer.
 * Returns the new length of DST. */
static size_t
res_append(char *dst, size_t used, size_t cap, const char *src)
{
  size_t room = cap - used - 1;
  size_t n = strlen(src);

  if (n > room) {
    n = room;
  }
  memcpy(dst + used, src, n);
  used += n;
  dst[used] = '\0';
  return used;
}

/* Build the textual conversion result of context AC into RES_BUF.
 *
 * conv != 0: the first candidate of every segment, concatenated.
 * conv == 0: the unconverted text of every segment, with '|' before the
 *            first segment and after each segment (e.g. "|aa|bb|").
 *
 * RES_BUF must provide at least 1024 bytes (the size used by the caller in
 * this file).  The result is always NUL-terminated and is truncated at that
 * size rather than written past the end of the buffer.
 */
static void
get_res(anthy_context_t ac, char *res_buf, int conv)
{
  enum { RES_BUF_SIZE = 1024, SEG_BUF_SIZE = 1024 };
  struct anthy_conv_stat acs;
  size_t used = 0;
  int i;

  /* Keep the loop below well-defined even if the stat query fails to
   * fill in the structure. */
  acs.nr_segment = 0;
  anthy_get_stat(ac, &acs);
  res_buf[0] = '\0';
  if (!conv) {
    used = res_append(res_buf, used, RES_BUF_SIZE, "|");
  }
  for (i = 0; i < acs.nr_segment; i++) {
    char buf[SEG_BUF_SIZE];

    /* Start from an empty string so a failed lookup appends nothing
     * instead of reading uninitialized stack memory. */
    buf[0] = '\0';
    if (conv) {
      anthy_get_segment(ac, i, 0, buf, SEG_BUF_SIZE);
      used = res_append(res_buf, used, RES_BUF_SIZE, buf);
    } else {
      anthy_get_segment(ac, i, NTH_UNCONVERTED_CANDIDATE, buf, SEG_BUF_SIZE);
      used = res_append(res_buf, used, RES_BUF_SIZE, buf);
      used = res_append(res_buf, used, RES_BUF_SIZE, "|");
    }
  }
}
/* Look up the entry keyed by (SRC, RES) in DB, creating it if absent.
 *
 * RES may be NULL; a NULL result string only matches an entry whose
 * res_str is also NULL.  A newly created entry is appended to the tail of
 * the list with check == CHK_UNKNOWN, used == 0 and no candidate data.
 *
 * Returns the entry, or NULL when memory for a new entry cannot be
 * allocated (the database is left unchanged in that case).
 */
static struct conv_res *
do_find_conv_res(struct res_db *db, const char *src, const char *res)
{
  struct conv_res *cr;

  for (cr = db->res_list.next; cr; cr = cr->next) {
    int same_res;

    /* Never hand a NULL pointer to strcmp(): compare the strings only
     * when both exist, otherwise both must be absent. */
    if (cr->res_str && res) {
      same_res = !strcmp(cr->res_str, res);
    } else {
      same_res = (!cr->res_str && !res);
    }
    if (same_res && !strcmp(cr->src_str, src)) {
      return cr;
    }
  }

  cr = malloc(sizeof *cr);
  if (!cr) {
    return NULL;
  }
  cr->src_str = strdup(src);
  cr->res_str = res ? strdup(res) : NULL;
  if (!cr->src_str || (res && !cr->res_str)) {
    /* Partial allocation: release whatever was obtained. */
    free(cr->src_str);
    free(cr->res_str);
    free(cr);
    return NULL;
  }
  cr->cand_str = NULL;
  cr->check = CHK_UNKNOWN;
  cr->used = 0;
  cr->cand_check = NULL;
  /* Link the new entry at the end of the list. */
  cr->next = NULL;
  db->tail->next = cr;
  db->tail = cr;
  return cr;
}
/* Find (or create) the database entry for source string SRC whose result
 * key is derived from the current state of context AC.  CONV selects
 * which textual form get_res() produces for the key. */
struct conv_res *
find_conv_res(struct res_db *db, anthy_context_t ac,
              const char *src, int conv)
{
  char key[1024];

  get_res(ac, key, conv);

  return do_find_conv_res(db, src, key);
}
/* Remove one trailing '\n' from BUF, if present.
 * An empty string is left untouched (no access before the start of BUF). */
static void
chomp_line(char *buf)
{
  size_t len = strlen(buf);

  if (len > 0 && buf[len - 1] == '\n') {
    buf[len - 1] = '\0';
  }
}
/* Allocate an empty result database.
 *
 * The list starts with only the embedded sentinel head (res_list), the
 * tail pointing at it, and all counters at zero.
 *
 * Returns the new database, or NULL if the allocation fails.
 */
struct res_db *
create_db(void)
{
  /* calloc so that the unused fields of the sentinel head node do not
   * hold indeterminate values. */
  struct res_db *db = calloc(1, sizeof *db);

  if (!db) {
    return NULL;
  }
  db->res_list.next = NULL;
  db->tail = &db->res_list;
  db->total = 0;
  db->res.unknown = 0;
  db->res.ok = 0;
  db->res.miss = 0;
  db->res.dontcare = 0;
  db->split.unknown = 0;
  db->split.ok = 0;
  db->split.miss = 0;
  db->split.dontcare = 0;
  return db;
}
/* Copy STR into BUF, leaving out every '|' and '~' character.
 * BUF must be able to hold strlen(STR) + 1 bytes. */
static void
strip_separator_vbar(char *buf, const char *str)
{
  size_t in;
  size_t out = 0;

  for (in = 0; str[in] != '\0'; in++) {
    char c = str[in];

    if (c == '|' || c == '~') {
      continue;
    }
    buf[out++] = c;
  }
  buf[out] = '\0';
}
/* Parse one line of a result database file and record it in DB.
 *
 * LINE is modified in place (its trailing newline is removed).  Empty
 * lines and lines starting with '#' are ignored.  The line is split into
 * up to three whitespace-separated fields:
 *
 *   one field:  a source string with no result; stored as CHK_UNKNOWN.
 *   first field does not start with '|':
 *       plain-text  segmented-text [third-field]
 *   first field starts with '|':
 *       segmented-text  second-field
 *     the source key is the first field with '|' and '~' removed; a third
 *     field, if any, is not used.
 *
 * The field used as the check marker selects the stored state: 'O' -> OK,
 * 'X' -> MISS, '*' -> DONTCARE, a field starting with '|' is kept as the
 * candidate string with state UNKNOWN, anything else -> UNKNOWN.  A
 * two-field line whose marker does not start with '|' is stored as OK.
 */
static void
parse_line(struct res_db *db, char *line)
{
  char buf1[1024], buf2[1024], buf3[1024], buf4[1024];
  const char *src, *res;
  const char *check;
  struct conv_res *cr;
  int nr;

  chomp_line(line);
  if (line[0] == '#' || line[0] == '\0') {
    return;
  }
  /* Field widths keep every token inside its 1024-byte buffer no matter
   * how long LINE is. */
  nr = sscanf(line, "%1023s %1023s %1023s", buf1, buf2, buf3);
  if (nr == 1) {
    cr = do_find_conv_res(db, buf1, NULL);
    if (cr) {
      cr->check = CHK_UNKNOWN;
    }
    return;
  }
  if (nr < 2) {
    return;
  }
  if (buf1[0] != '|') {
    /* buf1        buf2            buf3
     * plain-text  segmented-text
     * plain-text  segmented-text  converted-text
     * plain-text  segmented-text  check
     */
    src = buf1;
    res = buf2;
    if (nr == 3) {
      check = buf3;
    } else {
      check = "?";
    }
  } else {
    /* buf1            buf2            (buf3)
     * segmented-text
     * segmented-text  converted-text
     * segmented-text  check
     */
    strip_separator_vbar(buf4, buf1);
    src = buf4;
    res = buf1;
    check = buf2;
  }
  cr = do_find_conv_res(db, src, res);
  if (!cr) {
    /* The entry could not be created; nothing to record. */
    return;
  }
  if (nr == 2 && check[0] != '|') {
    cr->check = CHK_OK;
    return;
  }
  if (check[0] == 'O') {
    cr->check = CHK_OK;
  } else if (check[0] == 'X') {
    cr->check = CHK_MISS;
  } else if (check[0] == '*') {
    cr->check = CHK_DONTCARE;
  } else if (check[0] == '|') {
    cr->check = CHK_UNKNOWN;
    /* NOTE(review): an existing entry's previous cand_str is overwritten
     * without being released - confirm who owns that string. */
    cr->cand_str = strdup(check);
  } else {
    cr->check = CHK_UNKNOWN;
  }
}
/* Load the result database file FN into DB, one entry per line.
 *
 * Does nothing when FN is NULL or the file cannot be opened.  Lines are
 * read in chunks of at most 1023 characters and handed to parse_line().
 */
void
read_db(struct res_db *db, const char *fn)
{
  FILE *fp;
  char line[1024];

  if (!fn) {
    return;
  }
  fp = fopen(fn, "r");
  if (!fp) {
    return;
  }
  while (fgets(line, sizeof line, fp)) {
    parse_line(db, line);
  }
  /* Release the stream; it was previously leaked on every call. */
  fclose(fp);
}
/* Fill in the conversion-side fields of W (conv_xs, wt, hash) from the
 * dictionary entry selected by ELM.
 *
 * When ELM selects no entry (nth == -1), the index is past the entry
 * count, or the entry table is missing, conv_xs and wt are set to NULL
 * and hash is left as it was. */
static void
fill_conv_info(struct word *w, struct cand_elm *elm)
{
  struct dic_ent *ent;
  int has_entry = elm->nth != -1 &&
                  elm->nth < elm->se->nr_dic_ents &&
                  elm->se->dic_ents;

  if (!has_entry) {
    w->conv_xs = NULL;
    w->wt = NULL;
    return ;
  }

  ent = elm->se->dic_ents[elm->nth];
  w->conv_xs = anthy_xstr_dup(&ent->str);
  w->wt = ent->wt_name;
  w->hash = anthy_xstr_hash(w->conv_xs);
}
/* Reset W to a word of kind TYPE with no strings and no part of speech.
 * The hash fields are not touched here. */
static void
init_word(struct word *w, int type)
{
  w->raw_xs = w->conv_xs = NULL;
  w->wt = NULL;
  w->type = type;
}
static void
free_word(struct word *w)
{
anthy_free_xstr(w->raw_xs);
anthy_free_xstr(w->conv_xs);
}
/* Build an independent word in W from the candidate element ELM. */
static void
fill_indep_word(struct word *w, struct cand_elm *elm)
{
  init_word(w, WORD_INDEP);

  /* Default hash; fill_conv_info() replaces it when a dictionary entry
   * is available. */
  w->hash = 0;

  /* Keep a copy of the reading (the text before conversion). */
  w->raw_xs = anthy_xstr_dup(&elm->str);
  w->yomi_hash = anthy_xstr_hash(w->raw_xs);

  fill_conv_info(w, elm);
}
/* Build a dependent (attached) word in W from the candidate element ELM.
 * Both hash and yomi_hash are the hash of the element's string. */
static void
fill_dep_word(struct word *w, struct cand_elm *elm)
{
  int str_hash;

  init_word(w, WORD_DEP);

  str_hash = anthy_xstr_hash(&elm->str);
  w->hash = str_hash;
  w->yomi_hash = str_hash;
  w->raw_xs = anthy_xstr_dup(&elm->str);
}
/* Print " features=" followed by the comma-separated feature numbers of
 * FL.  Prints nothing when FL is NULL or reports zero features. */
static void
print_features(struct feature_list *fl)
{
  int count;
  int idx = 0;

  if (!fl) {
    return ;
  }
  count = anthy_feature_list_nr(fl);
  if (count == 0) {
    return ;
  }

  printf(" features=");
  while (idx < count) {
    if (idx != 0) {
      printf(",");
    }
    printf("%d", anthy_feature_list_nth(fl, idx));
    idx++;
  }
}
/* Print one line describing W, preceded by PREFIX.
 *
 * Dependent word:   "dep_word hash=H RAW"
 * Independent word: "indep_word hash=H[ features=...] WT CONV RAW"
 *   where WT and CONV are printed as "null" when absent.  FL supplies the
 *   feature list and may be NULL. */
static void
print_word(const char *prefix, struct word *w, struct feature_list *fl)
{
  printf("%s", prefix);

  if (w->type != WORD_DEP) {
    /* Independent word */
    printf("indep_word hash=%d", w->hash);

    /* print_features() itself ignores a NULL list. */
    print_features(fl);

    /* Part of speech */
    if (!w->wt) {
      printf(" null");
    } else {
      printf(" %s", w->wt);
    }

    /* Converted string */
    if (!w->conv_xs) {
      printf(" null");
    } else {
      printf(" ");
      anthy_putxstr(w->conv_xs);
    }

    printf(" ");
    anthy_putxstrln(w->raw_xs);
  } else {
    /* Dependent word */
    printf("dep_word hash=%d ", w->hash);
    anthy_putxstrln(w->raw_xs);
  }
}
/** Return the segment class of SEG.
 * When SEG is NULL, CL is returned as the class.  When the selected
 * candidate has no meta word, SEG_BUNSETSU is returned.
 */
static int
get_seg_class(struct seg_ent *seg, int cl)
{
  struct cand_ent *cand;

  if (seg == NULL) {
    return cl;
  }

  cand = selected_candidate(seg);

  return cand->mw ? cand->mw->seg_class : SEG_BUNSETSU;
}
/* Populate FL with the features of CUR_SEG in the context of PREV_SEG and
 * sort the list.
 *
 * A NULL CUR_SEG is treated as class SEG_TAIL and a NULL PREV_SEG as
 * class SEG_HEAD.  Dependent-word and meta-word features are only added
 * when CUR_SEG is given. */
static void
set_features(struct feature_list *fl,
             struct seg_ent *prev_seg,
             struct seg_ent *cur_seg)
{
  int cur_class = get_seg_class(cur_seg, SEG_TAIL);
  int prev_class = get_seg_class(prev_seg, SEG_HEAD);

  anthy_feature_list_set_cur_class(fl, cur_class);

  if (cur_seg != NULL) {
    struct cand_ent *cand = selected_candidate(cur_seg);

    anthy_feature_list_set_dep_word(fl, cand->dep_word_hash);
    if (cand->mw != NULL) {
      anthy_feature_list_set_dep_class(fl, cand->mw->dep_class);
      anthy_feature_list_set_mw_features(fl, cand->mw->mw_features);
      anthy_feature_list_set_noun_cos(fl, cand->mw->core_wt);
    }
  }

  anthy_feature_list_set_class_trans(fl, prev_class, cur_class);

  anthy_feature_list_sort(fl);
}
/* Print the candidate element ELM as a word line preceded by PREFIX.
 *
 * Elements with an empty string are skipped.  An element whose id is not
 * -1 is printed as an independent word together with the features in FL;
 * any other element is printed as a dependent word without features. */
static void
print_element(const char *prefix,
              struct cand_elm *elm, struct feature_list *fl)
{
  struct word w;
  int is_indep;

  if (elm->str.len == 0) {
    return ;
  }

  is_indep = (elm->id != -1);
  if (is_indep) {
    fill_indep_word(&w, elm);
  } else {
    fill_dep_word(&w, elm);
  }
  print_word(prefix, &w, is_indep ? fl : NULL);

  free_word(&w);
}
static void
print_unconverted(struct cand_ent *ce)
{
printf("unknown ");
anthy_putxstrln(&ce->str);
}
static void
print_eos(struct seg_ent *prev_seg)
{
struct feature_list fl;
anthy_feature_list_init(&fl);
set_features(&fl, prev_seg, NULL);
printf("eos ");
print_features(&fl);
printf("\n");
anthy_feature_list_free(&fl);
}
/* Map the CONV_* bits in FLAG to a line prefix:
 *   "^" for CONV_INVALID (second or later independent word in a segment),
 *   "!" for CONV_SIZE_MISS (wrong segment length),
 *   "~" for CONV_CAND_MISS (wrong candidate),
 *   ""  when none of these bits is set.
 * When several bits are set, the first match in the order above wins.
 */
static const char *
get_prefix(int flag)
{
  const char *mark = "";

  if (flag & CONV_INVALID) {
    mark = "^";
  } else if (flag & CONV_SIZE_MISS) {
    mark = "!";
  } else if (flag & CONV_CAND_MISS) {
    mark = "~";
  }
  return mark;
}
static void
print_segment_info(int is_negative,
struct seg_ent *prev_seg,
struct seg_ent *seg)
{
int i;
struct feature_list fl;
struct cand_ent *ce = selected_candidate(seg);
int nr_indep = 0;
const char *prefix = get_prefix(is_negative);
anthy_feature_list_init(&fl);
set_features(&fl, prev_seg, seg);
for (i = 0; i < ce->nr_words; i++) {
struct cand_elm *elm = &ce->elm[i];
prefix = get_prefix(is_negative);
if (nr_indep > 0 && elm->id != -1) {
prefix = get_prefix(is_negative | CONV_INVALID);
}
/* 出力する */
print_element(prefix, elm, &fl);
/* 自立語を数える */
if (elm->id != -1) {
nr_indep ++;
}
}
anthy_feature_list_free(&fl);
}
/* Print the NTH segment of AC with its lines marked as a segment-size
 * miss (CONV_SIZE_MISS).  The preceding segment, if any, supplies the
 * context for the features. */
void
print_size_miss_segment_info(anthy_context_t ac, int nth)
{
  struct seg_ent *cur = anthy_get_nth_segment(&ac->seg_list, nth);
  struct seg_ent *before =
    (nth > 0) ? anthy_get_nth_segment(&ac->seg_list, nth - 1) : NULL;

  print_segment_info(CONV_SIZE_MISS, before, cur);
}
/* Print the NTH segment of AC with its lines marked as a candidate miss
 * (CONV_CAND_MISS).  The preceding segment, if any, supplies the context
 * for the features. */
void
print_cand_miss_segment_info(anthy_context_t ac, int nth)
{
  struct seg_ent *cur = anthy_get_nth_segment(&ac->seg_list, nth);
  struct seg_ent *before =
    (nth > 0) ? anthy_get_nth_segment(&ac->seg_list, nth - 1) : NULL;

  print_segment_info(CONV_CAND_MISS, before, cur);
}
/* Dump the whole conversion state of AC.
 *
 * Prints the segment count, then each segment in order, then an "eos"
 * line and a blank line.  CR may be NULL; when it carries a cand_check
 * array, a non-zero entry marks the matching segment as a candidate
 * miss. */
void
print_context_info(anthy_context_t ac, struct conv_res *cr)
{
  struct seg_ent *prev_seg = NULL;
  int i;

  printf("segments: %d\n", ac->seg_list.nr_segments);

  /* For every segment */
  for (i = 0; i < ac->seg_list.nr_segments; i++) {
    struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, i);
    struct cand_ent *cand = selected_candidate(seg);
    int miss_flag =
      (cr && cr->cand_check && cr->cand_check[i]) ? CONV_CAND_MISS : 0;

    if (cand->nr_words == 0) {
      /* A candidate without elements is printed as is. */
      print_unconverted(cand);
    } else {
      if (seg->committed > 0) {
        /* The candidate was changed: also print the first candidate as a
         * candidate miss, selecting it temporarily. */
        int saved = seg->committed;

        seg->committed = 0;
        print_cand_miss_segment_info(ac, i);
        seg->committed = saved;
      }
      /* Print the structure of the segment. */
      print_segment_info(miss_flag, prev_seg, seg);
    }

    prev_seg = seg;
  }

  print_eos(prev_seg);
  printf("\n");
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */