# Tamito KAJIYAMA <12 June 2001>
# $Id: hgen.py,v 1.3 2002/10/07 16:49:24 kajiyama Exp $

import sys
import os

# Version of this generator; main() embeds it in the header comment of the
# generated file.
__version__ = "1.0"

# Name this script was invoked under; main() embeds it in the same header.
progname = os.path.basename(sys.argv[0])

# Number of hash buckets per table.  A code is stored in bucket (code % N)
# and only its quotient (code / N) is written out, as a single byte, so for
# codes up to 0xffff N must be >= 256.  The particular value was determined
# heuristically.
N = 523

def read(filename, jis_column, ucs_column):
    """Load a mapping table and hash it into N buckets in both directions.

    Lines whose first character is '#' and lines that contain only
    whitespace are skipped.  Every other line is split on whitespace and
    the fields at jis_column and ucs_column are parsed as hexadecimal
    numbers (a leading "0x" is accepted by int(..., 16)).

    Arguments:
        filename   -- path of the mapping table to read
        jis_column -- index of the field holding the JIS code
        ucs_column -- index of the field holding the UCS code

    Returns a pair (jis_map, ucs_map), each a list of N buckets:
        jis_map[jis % N] holds (jis, ucs) tuples
        ucs_map[ucs % N] holds (ucs, jis) tuples
    where jis is the parsed JIS code with 0x8080 ORed in.

    Raises IndexError if a data line has too few fields and ValueError if
    a selected field is not a hexadecimal number.
    """
    jis_map = [[] for _ in range(N)]
    ucs_map = [[] for _ in range(N)]
    table = open(filename)
    # try/finally rather than "with" so the file is always closed without
    # requiring a newer interpreter than the rest of this script does.
    try:
        for line in table:
            if line[0] == '#':
                continue
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line: nothing to parse.
                continue
            # Set the high bit of both bytes of the JIS code.
            jis = int(tokens[jis_column], 16) | 0x8080
            ucs = int(tokens[ucs_column], 16)
            jis_map[jis % N].append((jis, ucs))
            ucs_map[ucs % N].append((ucs, jis))
    finally:
        table.close()
    return jis_map, ucs_map

def _dump_buckets(prefix, kind, buckets):
    """Write one hashed table to stdout as C source.

    For every bucket n an array "<prefix>_<kind>_map_<n>" is written whose
    first byte is the number of entries, followed by three bytes per
    (key, value) entry: key // N, then the high and low byte of value.
    An array "<prefix>_<kind>_map" of pointers to the N bucket arrays
    follows.  Entries are written in the order they appear in each bucket.
    """
    write = sys.stdout.write
    for n in range(N):
        bucket = buckets[n]
        write("static unsigned char %s_%s_map_%d[] = {\n" % (prefix, kind, n))
        write("    0x%02x,\n" % len(bucket))
        for key, value in bucket:
            # Explicit floor division: these must stay integers for "%02x".
            write("    0x%02x, 0x%02x, 0x%02x,\n"
                  % (key // N, value // 256, value % 256))
        write("};\n")
    write("static unsigned char *%s_%s_map[] = {\n" % (prefix, kind))
    for n in range(N):
        write("    %s_%s_map_%d,\n" % (prefix, kind, n))
    write("};\n")


def dump(prefix, jis_map, ucs_map):
    """Write the JIS->UCS and UCS->JIS hash tables to stdout as C source.

    Arguments:
        prefix  -- identifier prefix for the generated C arrays
        jis_map -- list of N buckets of (jis, ucs) tuples
        ucs_map -- list of N buckets of (ucs, jis) tuples

    The buckets of both maps are sorted in place, in both cases by JIS code
    first and UCS code second.  The "<prefix>_jis_map" table is written
    first, then a blank line, then the "<prefix>_ucs_map" table.

    Raises ValueError, before anything is written, if a bucket holds more
    than 255 entries: the entry count is emitted as a single unsigned char
    and would otherwise be silently truncated by the C compiler.
    """
    for n in range(N):
        jis_map[n].sort()
        # Order the (ucs, jis) pairs by (jis, ucs): sort swapped copies and
        # store them back swapped, so the caller's list is reordered in
        # place without needing a comparison function.
        by_jis = [(jis, ucs) for ucs, jis in ucs_map[n]]
        by_jis.sort()
        ucs_map[n][:] = [(ucs, jis) for jis, ucs in by_jis]
        for kind, bucket in (("jis", jis_map[n]), ("ucs", ucs_map[n])):
            if len(bucket) > 0xff:
                raise ValueError(
                    "%s_%s_map_%d has %d entries; at most 255 fit in the "
                    "one-byte entry count (use a larger N)"
                    % (prefix, kind, n, len(bucket)))
    _dump_buckets(prefix, "jis", jis_map)
    sys.stdout.write("\n")
    _dump_buckets(prefix, "ucs", ucs_map)

# Extra Unicode -> Shift_JIS encodings that are not listed in the CP932
# table but were collected from the Win32 API.  dump_ms932() seeds its
# encode map with a copy of this dict; these pairs are never added to its
# decode map.  Every value below is non-zero and less than 0x8800, so
# dump_ms932() never replaces one of these entries with a mapping read
# from the table file.
ms932_enc_appendix = {
#    key: UCS code point, value: Shift_JIS code
#    U+00A1..U+00BB: Latin-1 punctuation and symbols
    0xa1: 0x21,
    0xa6: 0x7c,
    0xa9: 0x63,
    0xaa: 0x61,
    0xab: 0x81e1,
    0xad: 0x2d,
    0xae: 0x52,
    0xaf: 0x8150,
    0xb2: 0x32,
    0xb3: 0x33,
    0xb5: 0x83ca,
    0xb7: 0x8145,
    0xb8: 0x8143,
    0xb9: 0x31,
    0xba: 0x6f,
    0xbb: 0x81e2,
#    U+00C0..U+00DF: Latin-1 capital letters (and U+00DF) -> an ASCII letter
    0xc0: 0x41,
    0xc1: 0x41,
    0xc2: 0x41,
    0xc3: 0x41,
    0xc4: 0x41,
    0xc5: 0x41,
    0xc6: 0x41,
    0xc7: 0x43,
    0xc8: 0x45,
    0xc9: 0x45,
    0xca: 0x45,
    0xcb: 0x45,
    0xcc: 0x49,
    0xcd: 0x49,
    0xce: 0x49,
    0xcf: 0x49,
    0xd0: 0x44,
    0xd1: 0x4e,
    0xd2: 0x4f,
    0xd3: 0x4f,
    0xd4: 0x4f,
    0xd5: 0x4f,
    0xd6: 0x4f,
    0xd8: 0x4f,
    0xd9: 0x55,
    0xda: 0x55,
    0xdb: 0x55,
    0xdc: 0x55,
    0xdd: 0x59,
    0xde: 0x54,
    0xdf: 0x73,
#    U+00E0..U+00FF: Latin-1 small letters -> an ASCII small letter
    0xe0: 0x61,
    0xe1: 0x61,
    0xe2: 0x61,
    0xe3: 0x61,
    0xe4: 0x61,
    0xe5: 0x61,
    0xe6: 0x61,
    0xe7: 0x63,
    0xe8: 0x65,
    0xe9: 0x65,
    0xea: 0x65,
    0xeb: 0x65,
    0xec: 0x69,
    0xed: 0x69,
    0xee: 0x69,
    0xef: 0x69,
    0xf0: 0x64,
    0xf1: 0x6e,
    0xf2: 0x6f,
    0xf3: 0x6f,
    0xf4: 0x6f,
    0xf5: 0x6f,
    0xf6: 0x6f,
    0xf8: 0x6f,
    0xf9: 0x75,
    0xfa: 0x75,
    0xfb: 0x75,
    0xfc: 0x75,
    0xfd: 0x79,
    0xfe: 0x74,
    0xff: 0x79,
#    U+3094, and U+F8F0..U+F8F3 -> the single bytes 0xa0, 0xfd, 0xfe, 0xff
    0x3094: 0x8394,
    0xf8f0: 0xa0,
    0xf8f1: 0xfd,
    0xf8f2: 0xfe,
    0xf8f3: 0xff,
}

def _read_code_pairs(filename, ucs_column):
    """Return the (mbcs, ucs) pairs listed in a mapping table file.

    Lines starting with '#' and blank lines are skipped, as are lines
    whose field at ucs_column starts with '#' (the code has no mapping).
    Field 0 and the field at ucs_column are parsed as hexadecimal numbers.
    The file is closed before returning, also when parsing fails.
    """
    pairs = []
    table = open(filename)
    try:
        for line in table:
            if line[0] == '#':
                continue
            fields = line.split()
            if not fields or fields[ucs_column][0] == '#':
                continue
            pairs.append((int(fields[0], 16), int(fields[ucs_column], 16)))
    finally:
        table.close()
    return pairs


def dump_ms932(sjisfile, j0208file):
    """Write the "ms932" tables: MS932 mappings that differ from JIS0208.

    Arguments:
        sjisfile  -- MS932 mapping table; field 0 is the multibyte code and
                     field 1 the UCS code
        j0208file -- JIS0208 mapping table; field 0 is used as the
                     multibyte code and field 2 as the UCS code

    Only mappings that are absent from, or different in, the JIS0208 table
    are emitted, and only for codes >= 0x80.  UCS codes 0xff61..0xff9f are
    left out of the UCS->SJIS table and codes 0xa1..0xdf are left out of
    the SJIS->UCS table.  The result is written by dump() with the prefix
    "ms932".
    """
    # build ms932 encode/decode map
    ms932_dec = {}
    ms932_enc = ms932_enc_appendix.copy()
    for mbcs, ucs in _read_code_pairs(sjisfile, 1):
        ms932_dec[mbcs] = ucs
        cur = ms932_enc.get(ucs, 0)
        # A code point may be listed under several multibyte codes.  Keep
        # the code already chosen unless it is >= 0x8800, in which case the
        # code seen later replaces it.  The intent, per the original note,
        # is to encode to JIS 2-ku and 13-ku rather than to IBMNEC/IBM
        # gaiji.
        if not cur or cur >= 0x8800:
            ms932_enc[ucs] = mbcs

    # build JIS0208 encode/decode map, keyed the same way as the MS932 one
    j0208_dec = {}
    j0208_enc = {}
    for mbcs, ucs in _read_code_pairs(j0208file, 2):
        j0208_dec[mbcs] = ucs
        j0208_enc[ucs] = mbcs

    jis_map = [[] for _ in range(N)]
    ucs_map = [[] for _ in range(N)]

    # build ucs->sjis map (dump() sorts each bucket, so the dict iteration
    # order does not affect the output)
    for ucs, sjis in ms932_enc.items():
        if ucs >= 0x80 and not (0xff61 <= ucs <= 0xff9f):
            if j0208_enc.get(ucs) != sjis:
                ucs_map[ucs % N].append((ucs, sjis))

    # build sjis->ucs map
    for sjis, ucs in ms932_dec.items():
        if sjis >= 0x80 and not (0xa1 <= sjis <= 0xdf):
            if j0208_dec.get(sjis) != ucs:
                jis_map[sjis % N].append((sjis, ucs))

    dump("ms932", jis_map, ucs_map)

def main():
    """Write the generated C tables for all three encodings to stdout.

    Reads JIS0208.TXT and JIS0212.TXT from the current directory, writes a
    header comment and "#define N <N>", then the "jisx0208", "jisx0212"
    and "ms932" tables in that order.
    """
    # Both JIS tables are parsed before the first byte is written, so a
    # missing or malformed input file produces no partial output.
    x0208_jis, x0208_ucs = read("JIS0208.TXT", 1, 2)
    x0212_jis, x0212_ucs = read("JIS0212.TXT", 0, 1)

    banner = "/* This is an auto-generated file (by %s %s) */" % (
        progname, __version__)
    header = [
        banner,
        "/* Do not edit!! */",
        "",
        "%s %s" % ("#define N", N),
        "",
    ]
    sys.stdout.write("\n".join(header) + "\n")

    dump("jisx0208", x0208_jis, x0208_ucs)
    dump("jisx0212", x0212_jis, x0212_ucs)
    dump_ms932("MS932.TXT", "JIS0208.TXT")


if __name__ == "__main__":
    main()


# syntax highlighted by Code2HTML, v. 0.9.1