/*
* Copyright (C) 2000-2007 TABATA Yusuke
* Copyright (C) 2004-2006 YOSHIDA Yuichi
*/
/*
* 付属語グラフをバイナリ化する
* init_word_seq_tab()
* 付属語テーブル中のノードへのポインタの初期化
*/
/*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <anthy/alloc.h>
#include <anthy/conf.h>
#include <anthy/ruleparser.h>
#include <anthy/xstr.h>
#include <anthy/logger.h>
#include <anthy/splitter.h>
#include <anthy/anthy.h>
#include <anthy/depgraph.h>
#include <anthy/diclib.h>
#ifndef SRCDIR
#define SRCDIR "."
#endif
static int verbose;
static struct dep_node* gNodes;
static char** gNodeNames;
static int nrNodes;
/* 単語接続ルール */
static struct wordseq_rule *gRules;
static int nrRules;
static int
get_node_id_by_name(const char *name)
{
int i;
/* 登録済みのものから探す */
for (i = 0; i < nrNodes; i++) {
if (!strcmp(name,gNodeNames[i])) {
return i;
}
}
/* なかったので作る */
gNodes = realloc(gNodes, sizeof(struct dep_node)*(nrNodes+1));
gNodeNames = realloc(gNodeNames, sizeof(char*)*(nrNodes+1));
gNodes[nrNodes].nr_branch = 0;
gNodes[nrNodes].branch = 0;
gNodeNames[nrNodes] = strdup(name);
nrNodes++;
return nrNodes-1;
}
/* 遷移条件からbranchを捜し出す */
static struct dep_branch *
find_branch(struct dep_node *node, xstr **strs, int nr_strs)
{
struct dep_branch *db;
int i, j;
/* 同じ遷移条件のブランチを探す */
for (i = 0; i < node->nr_branch; i++) {
db = &node->branch[i];
if (nr_strs != db->nr_strs) {
continue ;
}
for (j = 0; j < nr_strs; j++) {
if (anthy_xstrcmp(db->str[j], strs[j])) {
goto fail;
}
}
/**/
return db;
fail:;
}
/* 新しいブランチを確保する */
node->branch = realloc(node->branch,
sizeof(struct dep_branch)*(node->nr_branch+1));
db = &node->branch[node->nr_branch];
node->nr_branch++;
db->str = malloc(sizeof(xstr*)*nr_strs);
for (i = 0; i < nr_strs; i++) {
db->str[i] = strs[i];
}
db->nr_strs = nr_strs;
db->nr_transitions = 0;
db->transition = 0;
return db;
}
/*
* 遷移をparseする
* doc/SPLITTER参照
*/
static void
parse_transition(char *token, struct dep_transition *tr)
{
int ct = CT_NONE;
int pos = POS_NONE;
enum dep_class dc = DEP_NONE;
char *str = token;
tr->head_pos = POS_NONE;
tr->weak = 0;
/* 遷移の属性を解析*/
while (*token != '@') {
switch(*token){
case ':':
case '.':
tr->weak = 1;
break;
case 'C':
/* 活用形 */
switch (token[1]) {
case 'z': ct = CT_MIZEN; break;
case 'y': ct = CT_RENYOU; break;
case 's': ct = CT_SYUSI; break;
case 't': ct = CT_RENTAI; break;
case 'k': ct = CT_KATEI; break;
case 'm': ct = CT_MEIREI; break;
case 'g': ct = CT_HEAD; break;
}
token ++;
break;
case 'H':
/* 自立語部の品詞 */
switch (token[1]) {
case 'n': tr->head_pos = POS_NOUN; break;
case 'v': tr->head_pos = POS_V; break;
case 'j': tr->head_pos = POS_AJV; break;
}
token ++;
break;
case 'S':
/* 文節の属性 */
switch (token[1]) {
/* case 'n': sc = DEP_NO; break;*/
case 'f': dc = DEP_FUZOKUGO; break;
case 'k': dc = DEP_KAKUJOSHI; break;
case 'y': dc = DEP_RENYOU; break;
case 't': dc = DEP_RENTAI; break;
case 'e': dc = DEP_END; break;
case 'r': dc = DEP_RAW; break;
default: printf("unknown (S%c)\n", token[1]);
}
token ++;
break;
default:
printf("Unknown (%c) %s\n", *token, str);
break;
}
token ++;
}
/* @から後はノードの名前 */
tr->next_node = get_node_id_by_name(token);
/**/
tr->pos = pos;
tr->ct = ct;
tr->dc = dc;
}
/*
* ノード名 遷移条件+ 遷移先+
*/
static void
parse_dep(char **tokens, int nr)
{
int id, row = 0;
struct dep_branch *db;
struct dep_node *dn;
int nr_strs;
xstr **strs = alloca(sizeof(xstr*) * nr);
/* ノードとそのidを確保 */
id = get_node_id_by_name(tokens[row]);
dn = &gNodes[id];
row ++;
nr_strs = 0;
/* 遷移条件の付属語の配列を作る */
for (; row < nr && tokens[row][0] == '\"'; row++) {
char *s;
s = strdup(&tokens[row][1]);
s[strlen(s)-1] =0;
strs[nr_strs] = anthy_cstr_to_xstr(s, ANTHY_EUC_JP_ENCODING);
nr_strs ++;
free(s);
}
/* 遷移条件がない時は警告を出して、空の遷移条件を追加する */
if (nr_strs == 0) {
char *s;
anthy_log(0, "node %s has a branch without any transition condition.\n",
tokens[0]);
s = strdup("");
strs[0] = anthy_cstr_to_xstr(s, ANTHY_EUC_JP_ENCODING);
nr_strs = 1;
free(s);
}
/* ブランチに遷移先のノードを追加する */
db = find_branch(dn, strs, nr_strs);
for ( ; row < nr; row++){
struct dep_transition *tr;
db->transition = realloc(db->transition,
sizeof(struct dep_transition)*
(db->nr_transitions+1));
tr = &db->transition[db->nr_transitions];
parse_transition(tokens[row], tr);
db->nr_transitions ++;
}
}
/* 文法定義ファイル中に空のノードがあるかチェックする */
static void
check_nodes(void)
{
int i;
for (i = 1; i < nrNodes; i++) {
if (gNodes[i].nr_branch == 0) {
anthy_log(0, "node %s has no branch.\n", gNodeNames);
}
}
}
static int
init_depword_tab(void)
{
const char *fn;
char **tokens;
int nr;
/* id 0 を空ノードに割当てる */
get_node_id_by_name("@");
/**/
fn = anthy_conf_get_str("DEPWORD");
if (!fn) {
anthy_log(0, "Dependent word dictionary is unspecified.\n");
return -1;
}
if (anthy_open_file(fn) == -1) {
anthy_log(0, "Failed to open dep word dict (%s).\n", fn);
return -1;
}
/* 一行ずつ付属語グラフを読む */
while (!anthy_read_line(&tokens, &nr)) {
parse_dep(tokens, nr);
anthy_free_line();
}
anthy_close_file();
check_nodes();
return 0;
}
static void
parse_indep(char **tokens, int nr)
{
if (nr < 2) {
printf("Syntex error in indepword defs"
" :%d.\n", anthy_get_line_number());
return ;
}
gRules = realloc(gRules, sizeof(struct wordseq_rule)*(nrRules+1));
/* 行の先頭には品詞の名前が入っている */
gRules[nrRules].wt = anthy_init_wtype_by_name(tokens[0]);
/* その次にはノード名が入っている */
gRules[nrRules].node_id = get_node_id_by_name(tokens[1]);
if (verbose) {
printf("%d (%s)\n", nrRules, tokens[0]);
}
nrRules ++;
}
/** 自立語からの遷移表 */
static int
init_indep_word_seq_tab(void)
{
const char *fn;
char **tokens;
int nr;
fn = anthy_conf_get_str("INDEPWORD");
if (!fn){
printf("independent word dict unspecified.\n");
return -1;
}
if (anthy_open_file(fn) == -1) {
printf("Failed to open indep word dict (%s).\n", fn);
return -1;
}
/* ファイルを一行ずつ読む */
while (!anthy_read_line(&tokens, &nr)) {
parse_indep(tokens, nr);
anthy_free_line();
}
anthy_close_file();
return 0;
}
/*
ネットワークバイトオーダーで4byte書き出す
*/
static void
write_nl(FILE* fp, int i)
{
i = anthy_dic_htonl(i);
fwrite(&i, sizeof(int), 1, fp);
}
static void
write_transition(FILE* fp, struct dep_transition* transition)
{
write_nl(fp, transition->next_node);
write_nl(fp, transition->pos);
write_nl(fp, transition->ct);
write_nl(fp, transition->dc);
write_nl(fp, transition->head_pos);
write_nl(fp, transition->weak);
}
static void
write_xstr(FILE* fp, xstr* str)
{
int i;
xchar c;
write_nl(fp, str->len);
for (i = 0; i < str->len; i++) {
c = anthy_dic_htonl(str->str[i]);
fwrite(&c, sizeof(xchar), 1, fp);
}
}
static void
write_branch(FILE* fp, struct dep_branch* branch)
{
int i;
write_nl(fp, branch->nr_strs);
for (i = 0; i < branch->nr_strs; ++i) {
write_xstr(fp, branch->str[i]);
}
write_nl(fp, branch->nr_transitions);
for (i = 0; i < branch->nr_transitions; ++i) {
write_transition(fp, &branch->transition[i]);
}
}
static void
write_node(FILE* fp, struct dep_node* node)
{
int i;
write_nl(fp, node->nr_branch);
for (i = 0; i < node->nr_branch; ++i) {
write_branch(fp, &node->branch[i]);
}
}
static void
write_wtype(FILE *fp, wtype_t wt)
{
fputc(anthy_wtype_get_pos(wt), fp);
fputc(anthy_wtype_get_cos(wt), fp);
fputc(anthy_wtype_get_scos(wt), fp);
fputc(anthy_wtype_get_cc(wt), fp);
fputc(anthy_wtype_get_ct(wt), fp);
fputc(anthy_wtype_get_wf(wt), fp);
fputc(0, fp);
fputc(0, fp);
}
static void
write_file(const char* file_name)
{
int i;
FILE* fp = fopen(file_name, "w");
int* node_offset = malloc(sizeof(int) * nrNodes); /* gNodesのファイル上の位置 */
/* 各ルール */
write_nl(fp, nrRules);
for (i = 0; i < nrRules; ++i) {
write_wtype(fp, gRules[i].wt);
write_nl(fp, gRules[i].node_id);
}
write_nl(fp, nrNodes);
for (i = 0; i < nrNodes; ++i) {
write_node(fp, &gNodes[i]);
}
free(node_offset);
fclose(fp);
}
int
main(int argc, char* argv[])
{
/* 付属語辞書を読み込んでファイルに書き出す */
anthy_conf_override("CONFFILE", "../anthy-conf");
anthy_conf_override("ANTHYDIR", SRCDIR "/../depgraph/");
anthy_init_wtypes();
anthy_do_conf_init();
/* 付属語グラフ */
init_depword_tab();
/* 自立語からの遷移表 */
init_indep_word_seq_tab();
write_file("anthy.dep");
return 0;
}
syntax highlighted by Code2HTML, v. 0.9.1