# suikyo.rb: Romaji-Hiragana conversion library for Ruby
# $Id: suikyo.rb,v 1.19 2005/03/29 02:07:09 komatsu Exp $
#
# Copyright (C) 2002 - 2004 Hiroyuki Komatsu <komatsu@taiyaki.org>
# All rights reserved.
# This is free software with ABSOLUTELY NO WARRANTY.
#
# You can redistribute it and/or modify it under the terms of
# the GNU General Public License version 2.
#
# Ruby 1.8 interpreter setting: 'e' selects EUC as the multibyte character
# code, which affects how regexps such as // split strings into characters.
$KCODE = 'e'
# Ruby 1.8 standard libraries for multibyte-aware strings and kanji code
# conversion.
require 'jcode'
require 'kconv'
# Project-local configuration; presumably defines SUIKYO_TABLE_PATH, which
# SuikyoTable.loadpath falls back to -- TODO confirm.
require 'suikyo/suikyo-config'
# Reopens the core File class to add a Suikyo-specific path joiner.
class File
  # Joins path components the way File.join does, after first breaking
  # every component except the last one into its separator-delimited
  # segments. A nil (or false) leading component contributes an empty
  # string instead of segments.
  #
  # paths:: the components to join; the last one is handed to File.join
  #         untouched.
  #
  # Returns the joined path string.
  def self.join2(*paths)
    tail = paths[-1]
    segments = []
    paths[0..-2].each do |component|
      segments.push(component ? component.split(File::Separator) : "")
    end
    join(segments, tail)
  end
end
# Converter that rewrites an input string by walking a conversion table
# one character at a time (e.g. "ringo" -> "りんご" with a romaji table).
#
# A table object must respond to get_word(chars) and return nodes that
# respond to result, cont and subtable (or nil when nothing matches).
# Subtables must additionally respond to allresults_uniq.
class Suikyo
  attr_reader :table

  # table:: a String naming a table file to load into a new SuikyoTable2,
  #         an already built table object, or nil for an empty SuikyoTable2.
  def initialize(table = nil)
    if table.kind_of?(String) then
      @table = SuikyoTable2.new()
      @table.loadfile(table)
    elsif table then
      @table = table
    else
      @table = SuikyoTable2.new
    end
  end

  # Converts +string+ with +table+ and returns the result as one String.
  # Characters that are still waiting for more input are appended as is.
  def convert(string, table = @table)
    conversion, pending = convert_internal(string, table)
    return conversion + pending
  end

  # Converts +string+ and also enumerates how the unfinished tail could
  # still be completed.
  #
  # Returns [converted_string, candidates]. candidates always contains
  # converted_string itself; when the input stopped in the middle of a
  # table entry it also contains the conversion followed by each result
  # reachable from that point.
  def expand(string, table = @table)
    conversion, pending, last_node = convert_internal(string, table)
    if last_node and last_node.subtable then
      # Continuations are resolved against the same table that produced the
      # conversion, not unconditionally against @table.
      suffixes = expand_table(last_node.subtable, table).push(pending).compact.uniq
      conversions = suffixes.map {|suffix| conversion + suffix }
    else
      conversions = [conversion + pending]
    end
    return [conversion + pending, conversions]
  end

  # Core conversion loop.
  #
  # Returns a three element array [conversion, pending, last_node]:
  # * When the input ends exactly on a finished (or unmatched) sequence,
  #   pending is "" and last_node is nil.
  # * When the input ends in the middle of a table entry, pending holds the
  #   unconverted tail and last_node is the node reached so far.
  def convert_internal(string, table = @table)
    chars = string.split(//)
    orig_table = table
    conversion = ""
    loop {
      pending = ""
      table = orig_table
      node = nil
      # Descend through the subtables for as long as the input keeps
      # matching.
      while table and chars.length > 0 do
        head = chars[0]
        tmp_node = table.get_word(head)
        table = (tmp_node and tmp_node.subtable)
        # Consume the character if it matched, or if nothing is pending yet
        # (an unknown leading character must not stall the loop). A
        # non-matching character after a partial match is left in place so
        # the next round retries it from the top of the table.
        if tmp_node or pending == "" then
          pending += head unless head == " "
          node = tmp_node
          chars.shift
        end
      end
      if table.nil? and node and (node.result or node.cont) then
        # A complete entry was reached: emit its result and push the
        # continuation back so it is converted as fresh input.
        pending = ""
        conversion += node.result if node.result
        chars.unshift(node.cont) if node.cont
      end
      if chars.length == 0 then
        if table.nil? then
          return [conversion + pending, "", nil]
        else
          return [conversion, pending, node]
        end
      else
        conversion += pending
      end
    }
  end

  # Checks the validity of a string conversion.
  #   valid:   "ringo" -> "りんご"
  #   invalid: "apple" -> "あっplえ"
  #
  # An input is valid when one of its expansions consists only of
  # non-alphabet characters, or ends in alphabet characters that can
  # themselves be expanded to non-alphabet characters only.
  def valid?(string, table = @table)
    (conversion, conversions) = expand(string, table)
    # Checking "appl -> あっpl" (invalid)
    if conversions.length == 1 and conversion !~ /^[a-zA-Z]*[^a-zA-Z]+$/ then
      return false
    end
    conversions.each {|word|
      next unless word =~ /^[^a-zA-Z]+([a-zA-Z]*)$/
      rest = $1
      return true if rest.empty?
      # Iterate over the expanded candidates. The original iterated over the
      # converted String itself, which raises NoMethodError on Ruby 1.9+
      # and ignored the expansions on 1.8. The candidate list always
      # includes that converted string.
      conversions2 = expand(rest, table)[1]
      conversions2.each {|word2|
        return true if word2 =~ /^[^a-zA-Z]+$/
      }
    }
    return false
  end

  private

  # Lists the result strings reachable from +table+ (a subtable).
  #
  # table:: subtable to enumerate; nil yields [].
  # root::  table used to look up continuation strings (defaults to @table).
  #
  # A result with a continuation is combined with every result reachable
  # from the continuation's subtable, or with the continuation text itself
  # when the continuation has no entry or no subtable.
  def expand_table(table, root = @table)
    return [] unless table
    results = []
    table.allresults_uniq.each {|result, cont|
      # allresults_uniq may yield a nil entry for a table without results.
      next if result.nil?
      if cont then
        # get_word returns nil for an unknown continuation; guard it
        # instead of calling subtable on nil.
        node = root.get_word(cont)
        subtable = (node and node.subtable)
        if subtable then
          subtable.allresults_uniq.each {|subresult, subcont|
            results.push(result + subresult) if subresult
          }
        else
          results.push(result + cont)
        end
      else
        results.push(result)
      end
    }
    return results.uniq
  end
end
# Conversion table stored as a tree keyed by one character per level.
#
# Each level is a Hash from a character to a SuikyoNode. A node can carry
# a result (the converted text), a cont (text to feed back into the
# conversion) and a subtable (the next level, an instance of the same
# class as the receiver).
class SuikyoTable
  attr_reader :table_files

  def initialize
    @word = Hash.new()
    @table_files = []
  end

  # Registers the entry +string+ -> +result+ (with optional +cont+).
  #
  # unescape:: when true, backslash escapes in all three strings are
  #            decoded first (see the private unescape method).
  #
  # An empty +string+ is ignored; the original code raised NoMethodError
  # for it.
  def set(string, result, cont = nil, unescape = true)
    if unescape then
      string = unescape(string)
      result = unescape(result)
      cont = (cont and unescape(cont))
    end
    chars = string.split(//)
    return if chars.empty?
    head = chars[0]
    rest = chars[1..-1].join
    node = (@word[head] ||= SuikyoNode.new)
    if rest == "" then
      node.result = result
      node.cont = cont
    else
      node.subtable ||= self.class.new
      node.subtable.set(rest, result, cont, false)
    end
  end

  ## This removes the string entry from the Suikyo table tree.
  ## It returns true if the entry for the first character does not exist
  ## (or was removed by this call), false otherwise. An empty string names
  ## no entry and returns true.
  def unset(string)
    chars = string.split(//)
    return true if chars.empty?
    head = chars[0]
    rest = chars[1..-1].join()
    node = @word[head]
    return true if node.nil?
    if rest == "" then
      if node.subtable.nil? or node.subtable.allword.empty? then
        @word.delete(head)
        return true
      end
      # Longer entries still hang below this node; only clear its own data.
      node.result = nil
      node.cont = nil
    elsif node.subtable then
      node.subtable.unset(rest)
      if node.subtable.allword.empty? then
        # Drop the now empty subtable so lookups treat the node as a leaf
        # again, but keep the node when it still carries its own entry
        # (e.g. unsetting "na" must not remove "n").
        node.subtable = nil
        if node.result.nil? and node.cont.nil? then
          @word.delete(head)
          return true
        end
      end
    end
    return false
  end

  # Loads the table file +filename+ (resolved with SuikyoTable.loadpath).
  #
  # File format, one entry per line:
  # * lines starting with "#" and blank lines are skipped;
  # * everything from a line starting with "/*" through a line containing
  #   "*/" is skipped;
  # * otherwise one leading space is dropped and the line is split on tabs
  #   into string, result and cont; a line without a result removes the
  #   entry instead.
  #
  # Returns true on success, false (after a message on stderr) when the
  # file does not exist.
  def loadfile(filename, tablepath = nil)
    filepath = SuikyoTable::loadpath(filename, tablepath)
    if FileTest::exist?(filepath) then
      @table_files.push(filepath)
    else
      $stderr.puts "Suikyo.rb: conv-table '#{filepath}' is not found."
      return false
    end
    comment_flag = false
    # File.open with a block closes the handle, and unlike Kernel#open it
    # never interprets a leading "|" in the path as a command.
    lines = File.open(filepath, "r") {|file| file.readlines }
    lines.each {|line|
      line = line.chomp
      ## The function 'toeuc' converts half-width Katakana to full-width.
      # line = line.toeuc.chomp
      if line =~ /^\/\*/ then
        comment_flag = true
      end
      unless line =~ /^\#|^\s*$/ or comment_flag then
        (string, result, cont) = line.sub(/^ /, "").split(/\t/)
        if result.nil? then
          self.unset(string)
        else
          self.set(string, result, cont)
        end
      end
      if line =~ /\*\// then
        comment_flag = false
      end
    }
    return true
  end

  # Resolves a table file name to a path. A name starting with "/" is
  # returned unchanged; otherwise it is joined onto +tablepath+, the
  # SUIKYO_TABLE_PATH environment variable or the SUIKYO_TABLE_PATH
  # constant, in that order of preference.
  def SuikyoTable::loadpath(filename, tablepath = nil)
    if filename =~ /^\// then
      return filename
    else
      prefix = (tablepath or ENV['SUIKYO_TABLE_PATH'] or SUIKYO_TABLE_PATH)
      return File::join2(prefix, filename)
    end
  end

  # Follows +chars+ character by character through the tree and returns
  # the last node looked up: the matching node, a node without a subtable
  # that ended the walk early, or nil when a character was not found.
  def get_word(chars)
    word = nil
    words = allword()
    chars.split(//).each {|char|
      word = words[char]
      break if word.nil? or word.subtable.nil?
      words = word.subtable.allword
    }
    return word
  end

  # The Hash of this level: character => SuikyoNode.
  def allword
    return @word
  end

  # All distinct [result, cont] pairs stored at this level and below.
  def allresults
    # c => [ち, ちゃ, ちゅ, ちょ]
    results = []
    allword.each_value {|node|
      if node.result then
        results.push([node.result, node.cont])
      end
      if node.subtable then
        results += node.subtable.allresults
      end
    }
    return results.uniq
  end

  # Like allresults, sorted by result, but a pair is dropped when its
  # result starts with the previously kept result and has the same cont.
  # Returns [] for a table without results (the original returned [nil]).
  def allresults_uniq
    # c => [ち]
    results = allresults.sort {|pair1, pair2|
      pair1[0] <=> pair2[0]
    }
    return [] if results.empty?
    (base_result, base_cont) = results[0]
    uniq_results = [results[0]]
    results.each {|result, cont|
      unless result.index(base_result) == 0 and cont == base_cont then
        uniq_results.push([result, cont])
        base_result = result
        base_cont = cont
      end
    }
    return uniq_results
  end

  private

  # Decodes backslash escapes in +string+:
  # * "\xHH" (two hex digits) becomes the character with that code;
  # * "\0" is removed;
  # * a backslash before any other character yields that character.
  # A "\x" not followed by two hex digits is reported on stderr and the
  # backslash is kept literally.
  def unescape(string)
    unescaped_string = ""
    while (index = string.index('\\')) do
      next_char = string[index + 1,1]
      case next_char
      when "x" then
        hex_string = string[index + 2,2]
        # Only real hex digits: the original class [a-zA-F0-9] also accepted
        # g-z, turning e.g. "\xzz" into a NUL character.
        if hex_string =~ /\A[a-fA-F0-9]{2}\z/ then
          unescaped_string += string[0,index] + hex_string.hex.chr
          string = (string[index + 4..-1] or "")
        else
          $stderr.puts "Suikyo: Unescape error from \"#{string}\"."
          unescaped_string += string[0,index] + '\\'
          string = (string[index + 1..-1] or "")
        end
      when "0" then
        unescaped_string += string[0,index]
        string = (string[index + 2..-1] or "")
      else
        unescaped_string += string[0,index] + next_char
        string = (string[index + 2..-1] or "")
      end
    end
    return unescaped_string + string
  end

  # One node of the tree. (The "private" above does not apply to constants,
  # so the class stays reachable as SuikyoTable::SuikyoNode.)
  class SuikyoNode
    attr_accessor :subtable, :cont, :result

    def initialize(result = nil, cont = nil, subtable = nil)
      @result = result
      @cont = cont
      @subtable = subtable
    end
  end
end
# SuikyoTable variant whose lookup falls back to the opposite letter case
# when a character has no entry of its own.
class SuikyoTable2 < SuikyoTable
  # Walks +chars+ one character at a time through the nested tables.
  # At each level the character itself is tried first and its swapcase
  # form second. Returns the last node looked up, or nil when neither
  # form was found.
  def get_word(chars)
    found = nil
    level = allword
    chars.split(//).each do |char|
      found = level[char]
      found = level[char.swapcase] if found.nil?
      # Stop on a miss, or when the node has no deeper level to descend into.
      break if found.nil? || found.subtable.nil?
      level = found.subtable.allword
    end
    found
  end
end
# syntax highlighted by Code2HTML, v. 0.9.1