/* ports//japanese/anthy/work/anthy-9100d/mkworddic/mkdic.c */

/*
 * cannadic形式のファイルから辞書ファイルを作る
 *
 * Funded by IPA未踏ソフトウェア創造事業 2002 1/1
 *
 * Copyright (C) 2000-2007 TABATA Yusuke
 * Copyright (C) 2005 YOSHIDA Yuichi
 * Copyright (C) 2001-2002 TAKAI Kousuke
 */
/*
 * 辞書は読みをindexとし、品詞や変換後の単語(=entry)を検索
 * する構造になっている。
 *
 * 読み -> 単語、単語、、
 *
 * 辞書ファイルはネットワークバイトオーダーを用いる。
 *
 * 辞書ファイルは複数のセクションから構成されている
 *  0 ヘッダ 16*4 bytes
 *  2 読みのインデックス (読み512個ごと)
 *  3 読み
 *  4 ページ
 *  5 ページのインデックス
 *  6 用例辞書(?)
 *  7 読み hash
 *
 * source 元の辞書ファイル
 * file_dic 生成するファイル
 *
 * yomi_hash 辞書ファイルに出力されるhashのbitmap
 * index_hash このソース中でstruct yomi_entryを検索するためのhash
 *
 */

#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>

#include <config.h>

#include <anthy/anthy.h>
#include <anthy/xstr.h>
#include <anthy/wtype.h>
#include <anthy/ruleparser.h>
#include <anthy/word_dic.h>
#include <anthy/diclib.h>
#include "mkdic.h"

/* Size of the line buffer handed to fgets() when reading source
 * dictionaries (the terminating NUL is included in this size). */
#define MAX_LINE_LEN 10240
/* Number of int-sized slots in the header of the generated file. */
#define NR_HEADER_SECTIONS 16
/* Every section of the generated file starts on a multiple of this. */
#define SECTION_ALIGNMENT 8
/* Size of the buffer holding a part-of-speech name, NUL included. */
#define MAX_WTYPE_LEN 20

/* Default output file name (assigned to mkdic_stat.output_fn by init_mds). */
#define DEFAULT_FN "anthy.wdic"

/* Program name used as a prefix in error messages. */
static const char *progname;

/* Global (non-static) variables so that writewords.c can access them. */
FILE *yomi_entry_index_out, *yomi_entry_out;
FILE *page_out, *page_index_out;
/* Streams private to this file: use-case dictionary and yomi hash bitmap. */
static FILE *uc_out;
static FILE *yomi_hash_out;
/* Number of hash collisions; statistics only. */
static int yomi_hash_collision;

/*
 * One entry per section of the generated dictionary.  The entries are
 * listed in the order in which the sections are laid out in the file.
 */
struct file_section {
  FILE **fpp;   /* variable that receives the temporary stream */
  char *fn;     /* name of the temporary file, NULL when unnamed */
};

struct file_section file_array[] = {
  { &yomi_entry_index_out, NULL },
  { &yomi_entry_out,       NULL },
  { &page_out,             NULL },
  { &page_index_out,       NULL },
  { &uc_out,               NULL },
  { &yomi_hash_out,        NULL },
  { NULL,                  NULL },   /* end marker */
};

/* State of one dictionary generation run */
struct mkdic_stat {
  /* list of words, indexed by reading */
  struct yomi_entry_list yl;
  /* head of the list of frequency adjust commands read from the sources */
  struct adjust_command ac_list;
  /* use-case dictionary; NULL until the first read_uc command */
  struct uc_dict *ud;
  /* name of the file written by link_dics() */
  const char *output_fn;
  /* encoding of the source dictionaries (ANTHY_*_ENCODING) */
  int input_encoding;
  /* number of elements in excluded_wtypes */
  int nr_excluded;
  /* part-of-speech names whose words are not added to the dictionary */
  char **excluded_wtypes;
};

/*
 * Open one temporary stream for every section listed in file_array.
 *
 * tmpfile() does not look at TMPDIR, so when TMPDIR is set the files are
 * created there with mkstemp() and their names are remembered in fs->fn
 * (link_dics() unlinks them).  Without TMPDIR tmpfile() is used and
 * fs->fn stays NULL.
 *
 * The path buffer is sized from TMPDIR, so a long directory name can no
 * longer be truncated into a wrong template.  If fdopen() fails the
 * descriptor is closed and the file removed instead of being leaked.
 * Any failure is fatal: a message is printed and the program exits
 * with status 2.
 */
static void
open_output_files(void)
{
  static const char suffix[] = "/mkanthydic.XXXXXX";
  const char *tmpdir = getenv("TMPDIR");
  struct file_section *fs;

  for (fs = file_array; fs->fpp; fs ++) {
    *(fs->fpp) = NULL;
    fs->fn = NULL;

    if (!tmpdir) {
      *(fs->fpp) = tmpfile();
    } else {
      size_t dir_len = strlen(tmpdir);
      /* sizeof(suffix) already counts the terminating NUL */
      char *path = malloc(dir_len + sizeof(suffix));
      if (path) {
        int fd;
        memcpy(path, tmpdir, dir_len);
        memcpy(path + dir_len, suffix, sizeof(suffix));
        fd = mkstemp(path);
        if (fd != -1) {
          *(fs->fpp) = fdopen(fd, "w+");
          if (!(*(fs->fpp))) {
            /* keep errno of fdopen() for the message below */
            int saved_errno = errno;
            close(fd);
            unlink(path);
            errno = saved_errno;
          }
        }
        if (*(fs->fpp)) {
          fs->fn = path;
        } else {
          int saved_errno = errno;
          free(path);
          errno = saved_errno;
        }
      }
    }

    if (!(*(fs->fpp))) {
      fprintf (stderr, "%s: cannot open temporary file: %s\n",
               progname, strerror (errno));
      exit (2);
    }
  }
}

/*
 * Flush every temporary section stream.
 *
 * All streams are first checked for a pending error, then all of them
 * are flushed; the first problem found terminates the program with
 * status 1.
 */
static void
flush_output_files (void)
{
  struct file_section *sec;

  /* pass 1: an earlier write may already have failed */
  sec = file_array;
  while (sec->fpp) {
    if (ferror(*(sec->fpp)) != 0) {
      fprintf (stderr, "%s: write error\n", progname);
      exit (1);
    }
    sec ++;
  }

  /* pass 2: push the buffered data out */
  sec = file_array;
  while (sec->fpp) {
    if (fflush(*(sec->fpp)) != 0) {
      fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno));
      exit (1);
    }
    sec ++;
  }
}

/* Write the int I to FP after converting it with anthy_dic_htonl()
 * (network byte order, sizeof(int) bytes). */
void
write_nl(FILE *fp, int i)
{
  int converted = anthy_dic_htonl(i);
  fwrite(&converted, sizeof(int), 1, fp);
}

/* Tell the user that this command is not meant to be run by hand and
 * exit successfully. */
static void
print_usage(void)
{
  fputs("please do not use mkanthydic command directly.\n", stdout);
  exit(0);
}

/*
 * Read the next usable line of FP into BUF (MAX_LINE_LEN bytes).
 *
 * Lines starting with '#' are comments and are skipped.  Lines that do
 * not fit into BUF are ignored as a whole, whatever their remaining
 * pieces look like.  The trailing newline is removed.
 *
 * Compared with the earlier version:
 *  - an empty string from fgets() (line starting with a NUL byte) no
 *    longer makes the code read buf[-1];
 *  - the tail of an over-long comment line is no longer returned as
 *    data, and a tail that happens to start with '#' no longer makes
 *    the following good line disappear;
 *  - a final line without a newline is accepted instead of being
 *    dropped.
 *
 * Returns BUF, or NULL at end of file.
 */
static char *
read_line(FILE *fp, char *buf)
{
  /* set while the pieces of an over-long line are being discarded */
  int in_long_line = 0;

  while (fgets(buf, MAX_LINE_LEN, fp)) {
    size_t len = strlen(buf);
    int terminated = (len > 0 && buf[len - 1] == '\n');

    if (terminated) {
      buf[len - 1] = 0;
    } else if (!feof(fp)) {
      /* the buffer filled up before the newline: line is too long */
      in_long_line = 1;
      continue ;
    }
    /* here BUF holds the end of a line (or the unterminated last line) */
    if (in_long_line) {
      /* last piece of a line that is being ignored */
      in_long_line = 0;
      continue ;
    }
    if (buf[0] == '#') {
      continue ;
    }
    return buf;
  }
  return NULL;
}

/**
 * Extract the index (the reading, i.e. everything before the first
 * space) from a cannadic format line.
 *
 * BUF is modified only temporarily and is intact on return.  The text is
 * converted with the input encoding of MDS.  Returns NULL when the line
 * contains no space (malformed line).
 */
static xstr *
get_index_from_line(struct mkdic_stat *mds, char *buf)
{
  xstr *index = NULL;
  char *sep = strchr(buf, ' ');

  if (sep) {
    /* cut the line at the separator just for the conversion */
    *sep = '\0';
    index = anthy_cstr_to_xstr(buf, mds->input_encoding);
    *sep = ' ';
  }
  return index;
}

/**
 * Return a newly allocated copy of the part of a cannadic format line
 * that follows the index, i.e. the text after the first run of spaces.
 *
 * A line without any space yields an empty string; the earlier code
 * dereferenced the NULL returned by strchr() in that case.  The caller
 * owns the result and releases it with free().  NULL is returned only
 * when memory cannot be allocated.
 */
static char *
get_entry_from_line(char *buf)
{
  const char *rest = strchr(buf, ' ');
  size_t size;
  char *copy;

  if (!rest) {
    /* no separator: there is no entry part at all */
    rest = "";
  }
  while (*rest == ' ') {
    rest ++;
  }
  size = strlen(rest) + 1;
  copy = malloc(size);
  if (copy) {
    memcpy(copy, rest, size);
  }
  return copy;
}

/* Hash of a reading, used to pick a chain in yomi_entry_list.hash.
 * The result is in the range 0 .. YOMI_HASH - 1. */
static int
index_hash(xstr *xs)
{
  unsigned int sum = 0;
  int pos = 0;

  while (pos < xs->len) {
    sum += xs->str[pos] * 11;
    pos++;
  }
  return (int)(sum % YOMI_HASH);
}

/*
 * Map a part-of-speech name found in a source dictionary to the name
 * stored in word entries: "#T35" becomes "#T", names rejected by
 * anthy_type_to_wtype() become "unknown", and everything else is
 * whatever anthy_type_to_wtype() returns.
 */
const char *
get_wt_name(const char *name)
{
  wtype_t ignored;
  const char *known;

  if (strcmp(name, "#T35") == 0) {
    return "#T";
  }
  known = anthy_type_to_wtype(name, &ignored);
  return known ? known : "unknown";
}

/** 読みに対して、単語を一つを追加する */
static void
push_back_word_entry(struct mkdic_stat *mds,
		     struct yomi_entry *ye, const char *wt_name,
		     int freq, const char *word, int order)
{
  wtype_t wt;
  char *s;
  if (freq == 0) {
    return ;
  }
  if (!anthy_type_to_wtype(wt_name, &wt)) {
    /* anthyの知らない品詞 */
    return ;
  }
  ye->entries = realloc(ye->entries,
			sizeof(struct word_entry) *
			(ye->nr_entries + 1));
  ye->entries[ye->nr_entries].ye = ye;
  ye->entries[ye->nr_entries].wt_name = get_wt_name(wt_name);
  ye->entries[ye->nr_entries].raw_freq = freq;
  ye->entries[ye->nr_entries].feature = 0;
  ye->entries[ye->nr_entries].source_order = order;
  if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) {
    s = anthy_conv_euc_to_utf8(word);
  } else {
    s = strdup(word);
  }
  ye->entries[ye->nr_entries].word_utf8 = s;
  ye->nr_entries ++;
}

/*
 * Parse a "#XX*NN" token: copy the part of speech ("#XX") into WTBUF
 * (MAX_WTYPE_LEN bytes) and return the frequency NN.
 *
 * The frequency is 1 when there is no "*NN" part or when NN evaluates
 * to 0.  A token that does not fit into WTBUF leaves WTBUF untouched
 * and returns 0.
 */
static int
parse_wtype(char *wtbuf, char *cur)
{
  size_t len = strlen(cur);
  char *star;
  int parsed;

  if (len >= MAX_WTYPE_LEN) {
    return 0;
  }
  /* part of speech, still followed by the frequency at this point */
  memcpy(wtbuf, cur, len + 1);

  star = strchr(wtbuf, '*');
  if (!star) {
    return 1;
  }
  /* cut the frequency off the name, then read it */
  *star = 0;
  parsed = atoi(star + 1);
  return parsed ? parsed : 1;
}

/* Decode the length character of a compound word element:
 * '1'..'9' mean 1..9 and 'a'..'z' mean 10..35.  Any other character
 * (including '0') gives 0. */
static int
get_element_len(xchar xc)
{
  if (xc >= '1' && xc <= '9') {
    return xc - '0';
  }
  if (xc >= 'a' && xc <= 'z') {
    return 10 + (xc - 'a');
  }
  return 0;
}

/**
 * Check the format of a compound candidate.
 *
 * The element lengths that follow each '_' in CUR are added up and the
 * sum must equal the number of characters of the reading INDEX.
 * Returns 1 when they match; otherwise prints a diagnostic to stderr
 * and returns 0.
 */
static int
check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur)
{
  xstr *cand = anthy_cstr_to_xstr(cur, mds->input_encoding);
  int sum = 0;
  int pos;

  /* look at every character that directly follows a '_' */
  for (pos = 1; pos < cand->len; pos++) {
    if (cand->str[pos - 1] == '_') {
      sum += get_element_len(cand->str[pos]);
    }
  }
  anthy_free_xstr(cand);

  if (sum == index->len) {
    return 1;
  }
  fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n",
          cur, sum);
  return 0;
}

/* Return 1 when the part-of-speech name WT is on the exclusion list of
 * MDS, 0 otherwise. */
static int
is_excluded_wtype(struct mkdic_stat *mds, char *wt)
{
  int remaining = mds->nr_excluded;
  char **name = mds->excluded_wtypes;

  for (; remaining > 0; remaining--, name++) {
    if (strcmp(*name, wt) == 0) {
      return 1;
    }
  }
  return 0;
}

/*
 * Find the end of the token starting at CUR.
 *
 * The token ends at the first space that is not escaped with a
 * backslash, or at the end of the string.  Returns a pointer to that
 * space or NUL, or NULL when the string ends right after a backslash.
 */
static char *
find_token_end(char *cur)
{
  char *p = cur;

  while (*p != '\0' && *p != ' ') {
    if (*p == '\\') {
      /* step onto the escaped character; it must exist */
      p++;
      if (*p == '\0') {
        return NULL;
      }
    }
    p++;
  }
  return p;
}

/**
 * Split the entry part of a dictionary line into tokens and add the
 * words it describes to the reading YE.
 *
 * Tokens are separated by unescaped spaces.  A token "#XX*NN" (a '#'
 * followed by a letter) selects the part of speech and frequency used
 * for the tokens after it.  A token starting with "#_" is a compound
 * candidate and is added when its format is valid.  Every other token
 * is a plain word and is added unless the current part of speech is on
 * the exclusion list.
 */
static void
push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye,
                          const char *ent)
{
  char *line = alloca(strlen(ent) + 1);
  char *tok;
  char *end;
  char wtbuf[MAX_WTYPE_LEN];
  int freq = 0;
  int order = 0;
  int is_last;

  strcpy(line, ent);
  wtbuf[0] = 0;

  for (tok = line; ; tok = end + 1) {
    /* terminate the token in place */
    end = find_token_end(tok);
    if (!end) {
      fprintf(stderr, "invalid \\ at the end of line (%s).\n",
              ent);
      return ;
    }
    is_last = (*end == '\0');
    *end = 0;

    if (tok[0] != '#') {
      /* plain word: add it unless its part of speech is excluded */
      if (!is_excluded_wtype(mds, wtbuf)) {
        push_back_word_entry(mds, ye, wtbuf, freq, tok, order);
        order ++;
      }
    } else if (isalpha((unsigned char)tok[1])) {
      /* "#XX*NN": part of speech and frequency */
      freq = parse_wtype(wtbuf, tok);
    } else if (tok[1] == '_' &&
               check_compound_candidate(mds, ye->index_xstr, &tok[1])) {
      /* "#_...": compound candidate */
      push_back_word_entry(mds, ye, wtbuf, freq, tok, order);
      order ++;
    }

    if (is_last) {
      /* end of line */
      return ;
    }
  }
}

/** 同じ単語が無いかチェック */
static int
check_same_word(struct yomi_entry *ye, int idx)
{
  struct word_entry *base = &ye->entries[idx];
  int i;
  for (i = idx -1; i >= 0; i--) {
    struct word_entry *cur = &ye->entries[i];
    if (base->raw_freq != cur->raw_freq) {
      return 0;
    }
    if (strcmp(base->wt_name, cur->wt_name)) {
      return 0;
    }
    if (strcmp(base->word_utf8, cur->word_utf8)) {
      return 0;
    }
    /* 同じだった */
    return 1;
  }
  return 0;
}

/**
 * qsort() comparison function: orders word entries by descending
 * raw_freq.
 *
 * The result is built from two comparisons rather than from a
 * subtraction, which overflows (undefined behaviour, possibly a wrong
 * sign) when the two frequencies are far apart, e.g. a large positive
 * one and a negative reverse-dictionary one.
 */
static int
compare_word_entry_by_freq(const void *p1, const void *p2)
{
  const struct word_entry *e1 = p1;
  const struct word_entry *e2 = p2;
  return (e2->raw_freq > e1->raw_freq) - (e2->raw_freq < e1->raw_freq);
}

/**
 * qsort() comparison function: orders word entries by part-of-speech
 * name and, for equal names, the way compare_word_entry_by_freq() does.
 */
static int
compare_word_entry_by_wtype(const void *p1, const void *p2)
{
  const struct word_entry *lhs = p1;
  const struct word_entry *rhs = p2;
  int by_name = strcmp(lhs->wt_name, rhs->wt_name);

  if (by_name == 0) {
    return compare_word_entry_by_freq(p1, p2);
  }
  return by_name;
}

/**
 * Put the words of one reading in order and neutralise unwanted ones.
 *
 * The entries are sorted by frequency, entries reported by
 * check_same_word() get frequency 0, and the entries are finally sorted
 * by part of speech (then frequency).  Returns the number of entries
 * that were not marked as duplicates; 0 for a NULL YE.
 */
static int
normalize_word_entry(struct yomi_entry *ye)
{
  int dup_count = 0;
  int pos;

  if (ye == NULL) {
    return 0;
  }

  /* order by frequency */
  qsort(ye->entries, ye->nr_entries,
        sizeof(struct word_entry),
        compare_word_entry_by_freq);

  /* a duplicated word gets a score of 0 */
  pos = 0;
  while (pos < ye->nr_entries) {
    if (check_same_word(ye, pos)) {
      ye->entries[pos].raw_freq = 0;
      dup_count ++;
    }
    pos ++;
  }

  /* sort again, this time by part of speech */
  qsort(ye->entries, ye->nr_entries,
        sizeof(struct word_entry),
        compare_word_entry_by_wtype);

  return ye->nr_entries - dup_count;
}

/*
 * Return the yomi_entry of the reading INDEX.
 *
 * The hash chain of INDEX is searched first.  When nothing is found the
 * result is NULL if CREATE is 0; otherwise a new empty entry holding a
 * copy of INDEX is added to the hash chain and to the head of the list,
 * yl->nr_entries is incremented and the new entry is returned.
 *
 * Allocation failures end the program with a message (callers use the
 * returned entry without checking it, so the unchecked malloc() result
 * used before led to a NULL dereference).  The unused search counter
 * was removed.
 */
struct yomi_entry *
find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create)
{
  struct yomi_entry *ye;
  int hash = index_hash(index);

  /* look it up in the hash chain */
  for (ye = yl->hash[hash]; ye; ye = ye->hash_next) {
    if (!anthy_xstrcmp(ye->index_xstr, index)) {
      return ye;
    }
  }
  if (!create) {
    return NULL;
  }

  /* not found: allocate a new entry */
  ye = malloc(sizeof(struct yomi_entry));
  if (!ye) {
    fprintf(stderr, "mkanthydic: out of memory\n");
    exit(1);
  }
  ye->nr_entries = 0;
  ye->entries = NULL;
  ye->index_str = NULL;
  ye->index_xstr = anthy_xstr_dup(index);
  if (!ye->index_xstr) {
    fprintf(stderr, "mkanthydic: out of memory\n");
    exit(1);
  }

  /* link it into the hash chain */
  ye->hash_next = yl->hash[hash];
  yl->hash[hash] = ye;

  /* link it into the list */
  ye->next = yl->head;
  yl->head = ye;

  yl->nr_entries ++;

  return ye;
}

/* Set the bit of the reading XS in the hash bitmap that goes into the
 * dictionary file.  yomi_hash_collision is incremented when the bit was
 * already set. */
static void
mark_hash_array(unsigned char *hash_array, xstr *xs)
{
  int hashed, byte_idx, bit_idx, bit_mask;

  hashed = anthy_xstr_hash(xs);
  hashed &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);

  /* upper part selects the byte, lower part the bit inside it */
  byte_idx = (hashed >> YOMI_HASH_ARRAY_SHIFT) & (YOMI_HASH_ARRAY_SIZE - 1);
  bit_idx = hashed & ((1 << YOMI_HASH_ARRAY_SHIFT) - 1);
  bit_mask = (1 << bit_idx);

  if ((hash_array[byte_idx] & bit_mask) != 0) {
    yomi_hash_collision ++;
  }
  hash_array[byte_idx] |= bit_mask;
}

/*
 * Build the bitmap of reading hashes and write it to YOMI_HASH_OUT.
 *
 * One bit is set for every entry of yl->ye_array (the readings that
 * have words), then the YOMI_HASH_ARRAY_SIZE bytes are written and a
 * collision summary is printed.
 *
 * The bitmap is now allocated zeroed with calloc(), the allocation is
 * checked, and the buffer is released before returning (it used to be
 * leaked).
 */
static void
mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl)
{
  unsigned char *hash_array;
  int i;

  hash_array = calloc(YOMI_HASH_ARRAY_SIZE, 1);
  if (!hash_array) {
    fprintf(stderr, "mkanthydic: out of memory\n");
    exit(1);
  }
  for (i = 0; i < yl->nr_valid_entries; i++) {
    mark_hash_array(hash_array, yl->ye_array[i]->index_xstr);
  }
  /* a failed write is detected later by flush_output_files() */
  fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out);
  printf("generated yomi hash bitmap (%d collisions/%d entries)\n",
         yomi_hash_collision, yl->nr_valid_entries);

  free(hash_array);
}

/*
 * Parse the arguments of a "\modify_freq" command:
 *
 *   <yomi> <wtype> <word> up|down|kill
 *
 * Returns a newly allocated command, or NULL when a field is missing,
 * the last field is not one of the three keywords, or memory runs out.
 * The yomi and the word are interpreted as EUC-JP.
 *
 * The malloc() result and the converted strings are now checked, and
 * the next pointer of the new command is initialised.
 */
static struct adjust_command *
parse_modify_freq_command(const char *buf)
{
  char *line = alloca(strlen(buf) + 1);
  char *yomi, *wt, *word, *type_str;
  struct adjust_command *cmd;
  int type;

  strcpy(line, buf);
  yomi = strtok(line, " ");
  wt = strtok(NULL, " ");
  word = strtok(NULL, " ");
  type_str = strtok(NULL, " ");
  if (!yomi || !wt || !word || !type_str) {
    return NULL;
  }

  if (!strcmp(type_str, "up")) {
    type = ADJUST_FREQ_UP;
  } else if (!strcmp(type_str, "down")) {
    type = ADJUST_FREQ_DOWN;
  } else if (!strcmp(type_str, "kill")) {
    type = ADJUST_FREQ_KILL;
  } else {
    return NULL;
  }

  cmd = malloc(sizeof(struct adjust_command));
  if (!cmd) {
    return NULL;
  }
  cmd->next = NULL;
  cmd->type = type;
  cmd->yomi = anthy_cstr_to_xstr(yomi, ANTHY_EUC_JP_ENCODING);
  cmd->wt = get_wt_name(wt);
  cmd->word = anthy_conv_euc_to_utf8(word);
  if (!cmd->yomi || !cmd->word) {
    /* a half-built command would crash when it is applied */
    if (cmd->yomi) {
      anthy_free_xstr(cmd->yomi);
    }
    free(cmd->word);
    free(cmd);
    return NULL;
  }
  return cmd;
}

/* Parse one adjust command line.  Only "\modify_freq " is recognised;
 * a successfully parsed command is inserted right after the list head
 * AC_LIST.  Anything else is ignored. */
static void
parse_adjust_command(const char *buf, struct adjust_command *ac_list)
{
  static const char keyword[] = "\\modify_freq ";
  const size_t keyword_len = sizeof(keyword) - 1;
  struct adjust_command *parsed;

  if (strncmp(keyword, buf, keyword_len) != 0) {
    return ;
  }
  parsed = parse_modify_freq_command(buf + keyword_len);
  if (!parsed) {
    return ;
  }
  parsed->next = ac_list->next;
  ac_list->next = parsed;
}

/**
 * Read a source dictionary line by line and fill the word list of MDS.
 * This is the core of the command.
 *
 * Lines starting with a backslash not followed by a space are adjust
 * commands.  Every other line is "<reading> <entries...>"; readings
 * longer than 30 characters are ignored.  Parsing of the file stops at
 * the first line from which no reading can be extracted.
 */
static void
parse_dict_file(FILE *fin, struct mkdic_stat *mds)
{
  char line[MAX_LINE_LEN];
  struct yomi_entry *ye = NULL;

  /* handle one line at a time */
  while (read_line(fin, line)) {
    xstr *reading;
    char *entries;

    if (line[0] == '\\' && line[1] != ' ') {
      parse_adjust_command(line, &mds->ac_list);
      continue ;
    }

    reading = get_index_from_line(mds, line);
    if (!reading) {
      break;
    }
    entries = get_entry_from_line(line);

    /* readings of more than 30 characters are ignored */
    if (reading->len <= 30) {
      ye = find_yomi_entry(&mds->yl, reading, 1);
      push_back_word_entry_line(mds, ye, entries);
    }

    free(entries);
    anthy_free_xstr(reading);
  }
}

/* Look up the word entry identified by the triple (reading, part of
 * speech, word).  Returns NULL when the reading or the word is not in
 * the list. */
static struct word_entry *
find_word_entry(struct yomi_entry_list *yl, xstr *yomi,
                const char *wt, char *word)
{
  struct yomi_entry *ye = find_yomi_entry(yl, yomi, 0);
  struct word_entry *we;
  struct word_entry *end;

  if (ye == NULL) {
    return NULL;
  }
  end = ye->entries + ye->nr_entries;
  for (we = ye->entries; we < end; we++) {
    if (strcmp(we->wt_name, wt) == 0 &&
        strcmp(we->word_utf8, word) == 0) {
      return we;
    }
  }
  return NULL;
}
		
/*
 * Apply the frequency adjust commands chained after AC_LIST.
 *
 * "up" multiplies the raw frequency of the target word by 4, "down"
 * divides it by 4 (but keeps it at least 1 when the division gives 0)
 * and "kill" sets it to 0.  A command whose target cannot be found is
 * reported on stdout and skipped.
 */
static void
apply_adjust_command(struct yomi_entry_list *yl,
                     struct adjust_command *ac_list)
{
  struct adjust_command *cmd = ac_list->next;

  for (; cmd; cmd = cmd->next) {
    struct word_entry *target = find_word_entry(yl, cmd->yomi,
                                                cmd->wt, cmd->word);
    if (target == NULL) {
      char *yomi = anthy_xstr_to_cstr(cmd->yomi, ANTHY_UTF8_ENCODING);
      printf("failed to find target of adjust command (%s, %s, %s)\n",
             yomi, cmd->wt, cmd->word);
      free(yomi);
      continue;
    }

    if (cmd->type == ADJUST_FREQ_UP) {
      target->raw_freq *= 4;
    } else if (cmd->type == ADJUST_FREQ_DOWN) {
      target->raw_freq /= 4;
      if (target->raw_freq == 0) {
        target->raw_freq = 1;
      }
    } else if (cmd->type == ADJUST_FREQ_KILL) {
      target->raw_freq = 0;
    }
  }
}

/* qsort() comparison function for an array of yomi_entry pointers:
 * orders them by their index_str with strcmp(). */
static int
compare_yomi_entry(const void *p1, const void *p2)
{
  const struct yomi_entry *lhs = *(const struct yomi_entry *const *)p1;
  const struct yomi_entry *rhs = *(const struct yomi_entry *const *)p2;
  return strcmp(lhs->index_str, rhs->index_str);
}

/*
 * Build yl->ye_array, the sorted array of readings that have words.
 *
 * Readings without any word are left out.  Each remaining reading gets
 * its index_str (the reading in yl->index_encoding), the array is
 * sorted by that string, and the words of every reading are normalised.
 * yl->nr_valid_entries and yl->nr_words are updated.
 *
 * The array allocation and the string conversions are now checked;
 * running out of memory ends the program with a message instead of
 * crashing inside qsort().  At least one slot is always allocated so
 * that a NULL array unambiguously means failure.
 */
static void
sort_word_dict(struct yomi_entry_list *yl)
{
  int i;
  struct yomi_entry *ye;
  size_t slots = (yl->nr_entries > 0) ? (size_t)yl->nr_entries : 1;

  yl->nr_valid_entries = 0;
  yl->nr_words = 0;

  /* collect only the readings that have words */
  yl->ye_array = malloc(sizeof(struct yomi_entry *) * slots);
  if (!yl->ye_array) {
    fprintf(stderr, "mkanthydic: out of memory\n");
    exit(1);
  }
  for (i = 0, ye = yl->head; ye && i < yl->nr_entries; i++, ye = ye->next) {
    if (ye->nr_entries <= 0) {
      continue;
    }
    /* the sort key */
    ye->index_str = anthy_xstr_to_cstr(ye->index_xstr, yl->index_encoding);
    if (!ye->index_str) {
      fprintf(stderr, "mkanthydic: out of memory\n");
      exit(1);
    }
    yl->ye_array[yl->nr_valid_entries] = ye;
    yl->nr_valid_entries ++;
  }

  /* sort the readings */
  qsort(yl->ye_array, yl->nr_valid_entries,
        sizeof(struct yomi_entry *),
        compare_yomi_entry);

  /* drop the unwanted words */
  for (i = 0; i < yl->nr_valid_entries; i++) {
    yl->nr_words += normalize_word_entry(yl->ye_array[i]);
  }
}

/**
 * Return the current position of FP rounded up to a multiple of
 * SECTION_ALIGNMENT, i.e. the space the section occupies in the
 * generated file.  A NULL stream counts as 0.
 */
static int
get_file_size(FILE *fp)
{
  long pos;

  if (fp == NULL) {
    return 0;
  }
  pos = ftell (fp);
  return (pos + SECTION_ALIGNMENT - 1) & (-SECTION_ALIGNMENT);
}

/*
 * Append the whole contents of IN to OUT.
 *
 * OUT is first padded with zero bytes up to the next multiple of
 * SECTION_ALIGNMENT.  IN is rewound before it is read.
 *
 * Besides short writes, a failed ftell() on OUT, a failed padding write
 * and a read error on IN are now detected; previously a read error
 * silently produced a truncated section.  Every failure prints a
 * message and exits with status 1.
 */
static void
copy_file(struct mkdic_stat *mds, FILE *in, FILE *out)
{
  long pos;
  size_t nread;
  char buf[BUFSIZ];

  /* Pad OUT to the next aligned offset.  */
  pos = ftell (out);
  if (pos < 0) {
    fprintf (stderr, "%s: %s: cannot get file position: %s\n",
             progname, mds->output_fn, strerror (errno));
    exit (1);
  }
  for (; pos & (SECTION_ALIGNMENT - 1); pos++) {
    if (fputc (0, out) == EOF) {
      fprintf (stderr, "%s: %s: write error: %s\n",
               progname, mds->output_fn, strerror (errno));
      exit (1);
    }
  }

  /* Copy the contents.  */
  rewind (in);
  while ((nread = fread (buf, 1, sizeof buf, in)) > 0) {
    if (fwrite (buf, 1, nread, out) < nread) {
      /* Handle short write (maybe disk full).  */
      fprintf (stderr, "%s: %s: write error: %s\n",
               progname, mds->output_fn, strerror (errno));
      exit (1);
    }
  }
  /* fread() returns 0 both at end of file and on error */
  if (ferror (in)) {
    fprintf (stderr, "%s: read error on temporary file: %s\n",
             progname, strerror (errno));
    exit (1);
  }
}

/*
 * Write the header of the dictionary file to FP.
 *
 * The header has NR_HEADER_SECTIONS int slots written with write_nl().
 * Slot 0 holds the size of the header itself, slot 1 is 0, and slots 2
 * onwards hold the offsets of the sections of file_array, each section
 * taking the aligned size reported by get_file_size().  Remaining slots
 * are 0.
 */
static void
generate_header(FILE *fp)
{
  int header[NR_HEADER_SECTIONS];
  struct file_section *sec;
  int slot;
  int offset;

  /* everything not set below stays zero */
  memset(header, 0, sizeof(header));

  /* the header itself */
  header[0] = NR_HEADER_SECTIONS * sizeof(int);

  /* offset of every section */
  offset = header[0];
  slot = 2;
  for (sec = file_array; sec->fpp; sec ++) {
    header[slot] = offset;
    slot ++;
    offset += get_file_size(*(sec->fpp));
  }

  /* write it out */
  for (slot = 0; slot < NR_HEADER_SECTIONS; slot++) {
    write_nl(fp, header[slot]);
  }
}

/*
 * Merge the files of all sections into the single dictionary file
 * mds->output_fn: first the header, then every section of file_array in
 * order.  Named temporary files are unlinked once copied.  Failing to
 * create or to close the output file is fatal (exit status 1).
 */
static void
link_dics(struct mkdic_stat *mds)
{
  struct file_section *sec;
  FILE *out = fopen (mds->output_fn, "w");

  if (out == NULL) {
    fprintf (stderr, "%s: %s: cannot create: %s\n",
             progname, mds->output_fn, strerror (errno));
    exit (1);
  }

  /* the header comes first */
  generate_header(out);

  /* then the sections, one after another */
  sec = file_array;
  while (sec->fpp) {
    copy_file(mds, *(sec->fpp), out);
    if (sec->fn != NULL) {
      unlink(sec->fn);
    }
    sec ++;
  }

  if (fclose (out) != 0) {
    fprintf (stderr, "%s: %s: write error: %s\n",
             progname, mds->output_fn, strerror (errno));
    exit (1);
  }
}

/* Handler of the "read" command: parse the source dictionary FN into
 * MDS.  The outcome (file name, or failure to open it) is reported on
 * stdout. */
static void
read_dict_file(struct mkdic_stat *mds, const char *fn)
{
  FILE *src = fopen(fn, "r");

  if (src == NULL) {
    printf("failed file = %s\n", fn);
    return ;
  }
  printf("file = %s\n", fn);
  parse_dict_file(src, mds);
  fclose(src);
}

/*
 * Finish the word list and write the word dictionary sections to the
 * temporary files.  Called once, from read_udict_file(), when the first
 * use-case dictionary is read.
 */
static void
complete_words(struct mkdic_stat *mds)
{
  /* apply the frequency adjust commands */
  apply_adjust_command(&mds->yl, &mds->ac_list);

  /* calc_freq() is defined outside this file */
  calc_freq(&mds->yl);

  /* sort by reading */
  sort_word_dict(&mds->yl);

  /* prepare the temporary files */
  open_output_files();
  /* write the word dictionary */
  output_word_dict(&mds->yl);

  /* build the reading hash */
  mk_yomi_hash(yomi_hash_out, &mds->yl);
}

/*
 * Handler of the "read_uc" command: read the use-case file FN into the
 * use-case dictionary of MDS.
 */
static void
read_udict_file(struct mkdic_stat *mds, const char *fn)
{
  if (!mds->ud) {
    /* first use-case file: create the dictionary and, at this point,
     * finish and write out the word dictionary */
    mds->ud = create_uc_dict();
    complete_words(mds);
  }
  read_uc_file(mds->ud, fn);
  printf("uc = %s\n", fn);
}

/*
 * Append the N characters at SRC to XS and return XS.  SRC must not
 * point into the storage of XS.
 *
 * The result of realloc() is now checked (out of memory ends the
 * program with a message) and N <= 0 is a no-op; a negative N used to
 * shrink the buffer and decrement the length.
 */
static xstr*
xstr_strncat(xstr* xs, xchar* src, int n)
{
  xchar *grown;

  if (n <= 0) {
    return xs;
  }
  grown = realloc(xs->str, sizeof(xchar) * (xs->len + n + 1));
  if (!grown) {
    fprintf(stderr, "mkanthydic: out of memory\n");
    exit(1);
  }
  xs->str = grown;
  memcpy(&grown[xs->len], src, sizeof(xchar) * n);
  xs->len += n;
  return xs;
}

static void
reverse_multi_segment_word(struct mkdic_stat *mds, struct word_entry *we)
{
  /*
    「かなかんじへんかんえんじん #T35 #_2仮名_3漢字_4変換_4エンジン」
    から
    「仮名漢字変換エンジン #T35 #_2かな_2かんじ_2へんかん_4えんじん」
    を作る
  */
  int j;
  /* yomiは仮名漢字混じり wordは平仮名のみからなる */
  int yomi_seg_start = 0;
  int word_seg_start = 0;
  int word_seg_len = 0;
  xstr *yomibuf = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
  xstr *wordbuf = we->ye->index_xstr;
  xstr *yomi_xs = anthy_cstr_to_xstr("", 0);
  xstr *word_xs = anthy_cstr_to_xstr("#", 0);
  char *word;
  char ch[256];
  struct yomi_entry *target_ye;

  for (j = 0; j <= yomibuf->len; ++j) {
    if (j == yomibuf->len || yomibuf->str[j] == '_') {
      if (yomi_seg_start != 0) {
	anthy_xstrappend(word_xs, '_');
	snprintf(ch, 256, "%x", j - yomi_seg_start);
	anthy_xstrappend(word_xs, (xchar)ch[0]);
	xstr_strncat(word_xs, &wordbuf->str[word_seg_start], word_seg_len);
	xstr_strncat(yomi_xs, &yomibuf->str[yomi_seg_start], j - yomi_seg_start);
      }
      if (j == yomibuf->len) {
	break;
      }
      yomi_seg_start = j + 2;
      word_seg_start += word_seg_len;
      word_seg_len = get_element_len(yomibuf->str[j + 1]);
    }
  }

  target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
  word = anthy_xstr_to_cstr(word_xs, mds->input_encoding);

  /* 逆変換用の辞書はfreqが負 */
  push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
		       word, we->source_order);

  free(word);
  anthy_free_xstr(yomibuf);
  anthy_free_xstr(yomi_xs);
  anthy_free_xstr(word_xs);
}

/*
 * Handler of the "build_reverse_dict" command: add, for every word
 * currently in the list, the entry used for reverse conversion (word ->
 * reading).  Reverse entries carry the negated frequency of the
 * original.  Words starting with '#' are skipped unless they are
 * compound candidates ("#_"), which go through
 * reverse_multi_segment_word().
 *
 * The snapshot array is now checked after allocation, and an empty word
 * list returns early instead of calling malloc(0).
 */
static void
build_reverse_dict(struct mkdic_stat *mds)
{
  struct yomi_entry *ye;
  int i, n;
  struct word_entry *we_array;
  printf("building reverse index\n");

  /* count the words */
  n = 0;
  for (ye = mds->yl.head; ye; ye = ye->next) {
    n += ye->nr_entries;
  }
  if (n <= 0) {
    return ;
  }

  /* Take a copy: adding entries below calls realloc() on the entry
   * arrays, so pointers into them would not stay valid. */
  we_array = malloc(sizeof(struct word_entry) * n);
  if (!we_array) {
    fprintf(stderr, "mkanthydic: out of memory\n");
    exit(1);
  }
  n = 0;
  for (ye = mds->yl.head; ye; ye = ye->next) {
    for (i = 0; i < ye->nr_entries; i++) {
      we_array[n] = ye->entries[i];
      n++;
    }
  }

  /* add the reverse entries */
  for (i = 0; i < n; i++) {
    struct word_entry *we = &we_array[i];

    if (we->word_utf8[0] == '#') {
      if (we->word_utf8[1] == '_') {
        reverse_multi_segment_word(mds, we);
      }
    } else {
      /* the new reading is the converted word, the new word is the
       * original reading */
      xstr *yomi_xs = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
      struct yomi_entry *target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
      char *word = anthy_xstr_to_cstr(we->ye->index_xstr,
                                      mds->input_encoding);

      /* entries for reverse conversion have a negative frequency */
      push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
                           word, we->source_order);

      anthy_free_xstr(yomi_xs);
      free(word);
    }
  }

  free(we_array);
}

/* Handler of the "clear_exclude_wtypes" command: release the list of
 * excluded part-of-speech names and leave it empty. */
static void
clear_exclude_wtypes(struct mkdic_stat *mds)
{
  int idx = 0;

  while (idx < mds->nr_excluded) {
    free(mds->excluded_wtypes[idx]);
    idx++;
  }
  free(mds->excluded_wtypes);

  mds->nr_excluded = 0;
  mds->excluded_wtypes = NULL;
}

/*
 * Handler of the "set_exclude_wtypes" command: make TOKENS[1] ..
 * TOKENS[NR-1] the list of excluded part-of-speech names (TOKENS[0] is
 * the command name).
 *
 * A list left by an earlier command is released first (it used to be
 * leaked), and the allocations are checked; running out of memory ends
 * the program with a message.
 */
static void
set_exclude_wtypes(struct mkdic_stat *mds, int nr, char **tokens)
{
  int i;
  int count = nr - 1;

  clear_exclude_wtypes(mds);
  if (count <= 0) {
    return ;
  }

  mds->excluded_wtypes = malloc(sizeof(char *) * count);
  if (!mds->excluded_wtypes) {
    fprintf(stderr, "mkanthydic: out of memory\n");
    exit(1);
  }
  for (i = 0; i < count; i++) {
    mds->excluded_wtypes[i] = strdup(tokens[i + 1]);
    if (!mds->excluded_wtypes[i]) {
      fprintf(stderr, "mkanthydic: out of memory\n");
      exit(1);
    }
    /* keep the count in step with the initialised elements */
    mds->nr_excluded = i + 1;
  }
}

/* Handler of the "set_dict_encoding" command: "utf8" switches the body
 * encoding of the word list to UTF-8; any other name is ignored. */
static void
set_dict_encoding(struct mkdic_stat *mds, const char *enc)
{
  if (strcmp(enc, "utf8") != 0) {
    return ;
  }
  mds->yl.body_encoding = ANTHY_UTF8_ENCODING;
}

/* Handler of the "set_input_encoding" command: select the encoding of
 * the source dictionaries, "utf8" or "eucjp"; any other name is
 * ignored. */
static void
set_input_encoding(struct mkdic_stat *mds, const char *enc)
{
  if (strcmp(enc, "utf8") == 0) {
    mds->input_encoding = ANTHY_UTF8_ENCODING;
  } else if (strcmp(enc, "eucjp") == 0) {
    mds->input_encoding = ANTHY_EUC_JP_ENCODING;
  }
}

/*
 * Handler of the "write" command: produce the final dictionary file.
 * Exits with status 1 when no use-case dictionary has been read, since
 * the temporary files are only opened when the first one is read.
 */
static void
write_dict_file(struct mkdic_stat *mds)
{
  if (!mds->ud) {
    printf("can not build without use case dict\n");
    exit(1);
  }

  /* build the use-case dictionary section */
  make_ucdict(uc_out, mds->ud);

  /* combine everything into the dictionary file */
  flush_output_files();
  link_dics(mds);
}

/* Echo a batch command on stdout as "cmd: tok0 tok1 ...". */
static void
show_command(char **tokens, int nr)
{
  char **tok = tokens;
  int left = nr;

  fputs("cmd:", stdout);
  while (left > 0) {
    printf(" %s", *tok);
    tok++;
    left--;
  }
  fputs("\n", stdout);
}

/*
 * Run the batch script FN: every line is a command followed by its
 * arguments.  Each command is echoed, then dispatched; unknown commands
 * (or known ones with a wrong number of arguments where that is
 * checked) are reported.  "done" stops the script.
 *
 * Returns 1 when the script cannot be opened, 0 otherwise.
 */
static int
execute_batch(struct mkdic_stat *mds, const char *fn)
{
  int nr;
  char **tokens;
  int finished = 0;

  if (anthy_open_file(fn)) {
    printf("mkanthydic: failed to open %s\n", fn);
    return 1;
  }

  /* FINISHED is tested first so that no line is read after "done" */
  while (!finished && !anthy_read_line(&tokens, &nr)) {
    const char *cmd = tokens[0];
    int one_arg = (nr == 2);

    show_command(tokens, nr);

    if (one_arg && !strcmp(cmd, "read")) {
      read_dict_file(mds, tokens[1]);
    } else if (one_arg && !strcmp(cmd, "read_uc")) {
      read_udict_file(mds, tokens[1]);
    } else if (!strcmp(cmd, "build_reverse_dict")) {
      build_reverse_dict(mds);
    } else if (!strcmp(cmd, "write")) {
      write_dict_file(mds);
    } else if (!strcmp(cmd, "set_exclude_wtypes")) {
      set_exclude_wtypes(mds, nr, tokens);
    } else if (!strcmp(cmd, "clear_exclude_wtypes")) {
      clear_exclude_wtypes(mds);
    } else if (one_arg && !strcmp(cmd, "set_dict_encoding")) {
      set_dict_encoding(mds, tokens[1]);
    } else if (one_arg && !strcmp(cmd, "set_input_encoding")) {
      set_input_encoding(mds, tokens[1]);
    } else if (!strcmp(cmd, "done")) {
      finished = 1;
    } else {
      printf("Unknown command(%s).\n", cmd);
    }
    anthy_free_line();
  }

  anthy_close_file();
  return 0;
}

/* 辞書生成のための変数の初期化 */
static void
init_mds(struct mkdic_stat *mds)
{
  int i;
  mds->output_fn = DEFAULT_FN;
  mds->ud = NULL;

  /* 単語辞書を初期化する */
  mds->yl.head = NULL;
  mds->yl.nr_entries = 0;
  for (i = 0; i < YOMI_HASH; i++) {
    mds->yl.hash[i] = NULL;
  }
  mds->yl.index_encoding = ANTHY_UTF8_ENCODING;
  mds->yl.body_encoding = ANTHY_EUC_JP_ENCODING;
  /**/
  mds->ac_list.next = NULL;
  /**/
  mds->input_encoding = ANTHY_EUC_JP_ENCODING;
  /**/
  mds->nr_excluded = 0;
  mds->excluded_wtypes = NULL;
}

/* Initialise only the part of libanthy this command uses (the xstr
 * module); exits with status 1 when that fails. */
static void
init_libs(void)
{
  if (anthy_init_xstr() != -1) {
    return ;
  }
  fprintf (stderr, "failed to init dic lib\n");
  exit(1);
}

/**/
int
main(int argc, char **argv)
{
  struct mkdic_stat mds;
  int i;
  char *script_fn = NULL;
  int help_mode = 0;

  anthy_init_wtypes();
  init_libs();
  init_mds(&mds);

  for (i = 1; i < argc; i++) {
    char *arg = argv[i];
    char *prev_arg = argv[i-1];
    if (!strcmp(arg, "--help")) {
      help_mode = 1;
    }
    if (!strcmp(prev_arg, "-f")) {
      script_fn = arg;
    }
  }

  if (help_mode || !script_fn) {
    print_usage();
  }

  return execute_batch(&mds, script_fn);
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */