# suikyo.rb: Romaji-Hiragana conversion library for Ruby
# $Id: suikyo.rb,v 1.19 2005/03/29 02:07:09 komatsu Exp $
#
# Copyright (C) 2002 - 2004 Hiroyuki Komatsu <komatsu@taiyaki.org>
#     All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
# You can redistribute it and/or modify it under the terms of 
# the GNU General Public License version 2.
#

$KCODE = 'e'
require 'jcode'
require 'kconv'
require 'suikyo/suikyo-config'

class File
  # Variant of File.join for a list of path components.
  #
  # Every component except the last is broken up on File::Separator before
  # joining (a nil component is replaced by the empty string); the last
  # component is handed to File.join untouched.
  def self.join2(*paths)
    leading  = paths[0..-2]
    segments = []
    leading.each do |component|
      if component
        segments.push(component.split(File::Separator))
      else
        segments.push("")
      end
    end
    join(segments, paths[-1])
  end
end

# Converts romaji-style input strings by walking a conversion table.
#
# The table is any object whose get_word(chars) returns nodes exposing
# +result+, +cont+ and +subtable+; subtables must additionally respond to
# allresults_uniq for #expand and #valid?.
class Suikyo
  attr_reader :table

  # table:: a table file name (String) which is loaded into a new
  #         SuikyoTable2, an already-built table object, or nil for an
  #         empty SuikyoTable2.
  def initialize(table = nil)
    if table.kind_of?(String)
      @table = SuikyoTable2.new
      @table.loadfile(table)
    elsif table
      @table = table
    else
      @table = SuikyoTable2.new
    end
  end

  # Returns the conversion of +string+, with any still-pending (not yet
  # converted) trailing input appended as is.
  def convert(string, table = @table)
    conversion, pending = convert_internal(string, table)
    conversion + pending
  end

  # Returns [plain_conversion, candidates].
  #
  # plain_conversion is the same value #convert returns.  When the input ends
  # in the middle of a table entry, candidates lists the converted part
  # followed by every completion reachable from that point (plus the raw
  # pending input); otherwise candidates is just [plain_conversion].
  def expand(string, table = @table)
    conversion, pending, last_node = convert_internal(string, table)
    plain = conversion + pending

    if last_node && last_node.subtable
      # compact drops nil results that an empty subtable may report.
      suffixes = expand_table(last_node.subtable, table).push(pending).compact.uniq
      conversions = suffixes.map { |suffix| conversion + suffix }
    else
      conversions = [plain]
    end
    [plain, conversions]
  end

  # Core conversion loop.  Returns [conversion, pending, last_node]:
  # * input fully consumed at the end of a table path (or off the table):
  #   [converted_text, "", nil]
  # * input ended while a deeper table level was still available:
  #   [converted_text, unconverted_tail, node_reached]
  def convert_internal(string, table = @table)
    chars = string.split(//)
    orig_table = table
    conversion = ""

    loop {
      # Each round restarts at the root table.
      pending = ""
      table   = orig_table
      node    = nil

      while table && chars.length > 0
        head = chars[0]
        tmp_node = table.get_word(head)
        table = tmp_node && tmp_node.subtable
        # Consume the character when it matched, or when it is the first
        # character of this round (so unknown characters pass through).
        # An unmatched character after a partial match stays in chars and is
        # retried from the root table in the next round.
        if tmp_node || pending == ""
          pending += head unless head == " "
          node = tmp_node
          chars.shift
        end
      end

      # The walk stopped on a node that yields output: emit its result and
      # push its continuation back onto the input.
      if table.nil? && node && (node.result || node.cont)
        pending = ""
        conversion += node.result if node.result
        chars.unshift(node.cont) if node.cont
      end

      if chars.length == 0
        if table.nil?
          return [conversion + pending, "", nil]
        else
          return [conversion, pending, node]
        end
      else
        conversion += pending
      end
    }
  end

  # Check the validity of a string conversion.
  #   valid: "ringo" -> "りんご"
  # invalid: "apple" -> "あっplえ"
  def valid?(string, table = @table)
    conversion, conversions = expand(string, table)

    # Checking "appl -> あっpl" (invalid)
    if conversions.length == 1 && conversion !~ /^[a-zA-Z]*[^a-zA-Z]+$/
      return false
    end

    conversions.each do |word|
      next unless word =~ /^[^a-zA-Z]+([a-zA-Z]*)$/
      rest = $1
      return true if rest.empty?

      # The candidate still ends in ASCII letters: accept it when any
      # expansion of that tail converts completely.  (This previously
      # iterated the plain conversion String instead of the candidate list.)
      conversions2 = expand(rest, table)[1]
      conversions2.each do |word2|
        return true if word2 =~ /^[^a-zA-Z]+$/
      end
    end
    false
  end

  private

  # Lists the outputs reachable from +table+ (a subtable).  An entry with a
  # continuation is combined with the outputs found below that continuation
  # in +root+; when the continuation has no entry or no subtable there, the
  # raw continuation is appended instead.
  def expand_table(table, root = @table)
    return [] unless table

    results = []
    table.allresults_uniq.each do |result, cont|
      if cont
        cont_node = root.get_word(cont)
        subtable  = cont_node && cont_node.subtable
        if subtable
          subtable.allresults_uniq.each do |subresult, subcont|
            results.push(result + subresult)
          end
        else
          results.push(result + cont)
        end
      else
        results.push(result)
      end
    end
    results.uniq
  end
end

# Conversion table stored as a tree: each level maps one character to a
# SuikyoNode holding an optional result, an optional continuation string and
# an optional subtable for longer keys.
class SuikyoTable
  # Paths of the table files loaded so far via #loadfile.
  attr_reader :table_files

  def initialize
    @word = Hash.new()
    @table_files = []
  end

  # Registers the entry string -> result (with optional continuation cont).
  # Backslash escapes in all three strings are processed unless +unescape+
  # is false.  An empty key is ignored.
  def set(string, result, cont = nil, unescape = true)
    if unescape then
      string = unescape(string)
      result = unescape(result)
      cont   = (cont and unescape(cont))
    end

    chars = string.split(//)
    return nil if chars.empty?   # nothing to index under

    head = chars[0]
    rest = chars[1..-1].join
    @word[head] = SuikyoNode.new if @word[head].nil?
    node = @word[head]

    if rest == ""
      node.result = result
      node.cont   = cont
    else
      # self.class keeps subtables of the same (sub)class as the root.
      node.subtable = self.class.new unless node.subtable
      node.subtable.set(rest, result, cont, false)
    end
  end

  ## This removes the string entry from the Suikyo table tree.
  ## It returns true when no node is left for the first character of string
  ## at this level (it was absent or has been removed), false otherwise.
  def unset(string)
    chars = string.split(//)
    return true if chars.empty?

    head = chars[0]
    rest = chars[1..-1].join
    node = @word[head]
    return true if node.nil?

    if rest == ""
      if node.subtable.nil? or node.subtable.allword.empty?
        @word.delete(head)
        return true
      end

      # Longer entries still hang below this node: only clear its own output.
      node.result = nil
      node.cont   = nil
    elsif node.subtable
      node.subtable.unset(rest)
      if node.subtable.allword.empty?
        # Drop the now-empty subtable, but keep the node when it still
        # carries an entry of its own (e.g. unsetting "ka" must keep "k").
        node.subtable = nil
        if node.result.nil? and node.cont.nil?
          @word.delete(head)
          return true
        end
      end
    end
    return false
  end

  # Loads a tab-separated table file (key, result, optional continuation).
  # Lines starting with '#', blank lines and /* ... */ blocks are skipped; a
  # line without a result column removes the key.  Returns true on success,
  # false (after a message on stderr) when the file does not exist.
  def loadfile(filename, tablepath = nil)
    filepath = SuikyoTable::loadpath(filename, tablepath)
    unless FileTest::exist?(filepath)
      $stderr.puts "Suikyo.rb: conv-table '#{filepath}' is not found."
      return false
    end
    @table_files.push(filepath)

    in_comment = false
    # File.readlines closes the file once it has been read.
    File.readlines(filepath).each do |line|
      line.chomp!
      in_comment = true if line =~ /^\/\*/
      unless in_comment or line =~ /^\#|^\s*$/
        (string, result, cont) = line.sub(/^ /, "").split(/\t/)
        if result.nil?
          unset(string)
        else
          set(string, result, cont)
        end
      end
      in_comment = false if line =~ /\*\//
    end
    return true
  end

  # Resolves a table file name: names starting with '/' are returned as is,
  # anything else is joined onto tablepath, ENV['SUIKYO_TABLE_PATH'] or the
  # SUIKYO_TABLE_PATH constant (first one that is set).
  def self.loadpath(filename, tablepath = nil)
    if filename =~ /^\//
      return filename
    else
      prefix = (tablepath or ENV['SUIKYO_TABLE_PATH'] or SUIKYO_TABLE_PATH)
      return File::join2(prefix, filename)
    end
  end

  # Follows chars one character at a time through the tree and returns the
  # node reached.  The walk stops at the first character without a node
  # (returning nil) or at the first node without a subtable (returning it).
  def get_word(chars)
    word  = nil
    words = allword()
    chars.split(//).each do |char|
      word = words[char]
      break if word.nil? or word.subtable.nil?
      words = word.subtable.allword
    end
    return word
  end

  # The character => node hash of this table level.
  def allword
    return @word
  end

  # All [result, cont] pairs stored at this level and below, without
  # duplicates.
  # c => [ち, ちゃ, ちゅ, ちょ]
  def allresults
    results = []
    allword.each do |char, node|
      results.push([node.result, node.cont]) if node.result
      results += node.subtable.allresults if node.subtable
    end
    return results.uniq
  end

  # Like allresults, sorted by result, dropping every pair whose result
  # starts with the previously kept result and has the same continuation.
  # Returns [] for a table without results.
  # c => [ち]
  def allresults_uniq
    results = allresults.sort { |pair1, pair2| pair1[0] <=> pair2[0] }
    return [] if results.empty?

    (base_result, base_cont) = results[0]
    uniq_results = [results[0]]

    results.each do |result, cont|
      unless result.index(base_result) == 0 and cont == base_cont
        uniq_results.push([result, cont])
        base_result = result
        base_cont   = cont
      end
    end
    return uniq_results
  end

  private

  # Expands backslash escapes: \xHH becomes the character with hex code HH,
  # \0 becomes nothing, and any other \c becomes c.  A malformed \x sequence
  # is reported on stderr and its backslash is kept literally.
  def unescape(string)
    unescaped_string = ""
    while (index = string.index('\\')) do
      next_char = string[index + 1,1]
      case next_char
      when "x" then
        hex_string = string[index + 2,2]
        # Exactly two hexadecimal digits are required.
        if hex_string =~ /\A[a-fA-F0-9][a-fA-F0-9]\z/ then
          unescaped_string += string[0,index] + hex_string.hex.chr
          string = (string[index + 4..-1] or "")
        else
          $stderr.puts "Suikyo: Unescape error from \"#{string}\"."
          unescaped_string += string[0,index] + '\\'
          string = (string[index + 1..-1] or "")
        end
      when "0" then
        unescaped_string += string[0,index]
        string = (string[index + 2..-1] or "")
      else
        unescaped_string += string[0,index] + next_char
        string = (string[index + 2..-1] or "")
      end
    end
    return unescaped_string + string
  end

  # One tree node: the output for the key ending here (result), input to be
  # pushed back after emitting it (cont) and the table of longer keys
  # (subtable).
  class SuikyoNode
    attr_accessor :subtable, :cont, :result
    def initialize(result = nil, cont = nil, subtable = nil)
      @result   = result
      @cont     = cont
      @subtable = subtable
    end
  end
end

# SuikyoTable whose lookup falls back to the opposite letter case when a
# character has no entry of its own.
class SuikyoTable2 < SuikyoTable
  # Walks chars through the tree one character at a time and returns the node
  # reached; each character is tried as given first and then with its case
  # swapped.  The walk ends at the first miss (nil) or at the first node
  # without a subtable.
  def get_word(chars)
    found = nil
    level = allword
    chars.split(//).each do |ch|
      found = level[ch]
      found = level[ch.swapcase] if found.nil?
      break if found.nil? || found.subtable.nil?
      level = found.subtable.allword
    end
    found
  end
end



# syntax highlighted by Code2HTML, v. 0.9.1