/* convert.c
* This file is split off to contain the kana<->romaji routines.
*/
#include <stdio.h>
#include <string.h>
#include <X11/Xlib.h>
#include <X11/Intrinsic.h>
#include <X11/keysym.h>
#include "defs.h"
#include "convert.h"
#include "searchwidgets.h"
/*
 * Parser state indicators, meant for parsing romaji from RIGHT to LEFT:
 * "state" records what was seen to the right of the current position.
 * Values are the default 0..12 sequence, spelled out explicitly.
 */
enum {
	STATE_NONE  = 0,
	STATE_A     = 1,
	STATE_I     = 2,
	STATE_U     = 3,
	STATE_E     = 4,
	STATE_O     = 5,
	/* original note: "y_state" is actually used instead of the
	 * three _YX states below (no y_state is visible in this file) */
	STATE_YA    = 6,
	STATE_YU    = 7,
	STATE_YO    = 8,
	STATE_HI    = 9,	/* for chi or shi */
	STATE_SU    = 10,	/* for tsu */
	STATE_SPACE = 11,	/* for lone 'n' at end */
	STATE_OTHER = 12
};
/* Order of these is very important. DO NOT CHANGE.
 * The values are written out so the vowel order a,i,u,e,o -> 1..5
 * cannot shift by accident.
 */
enum {
	ADD_NONE = 0,
	ADD_A    = 1,
	ADD_I    = 2,
	ADD_U    = 3,
	ADD_E    = 4,
	ADD_O    = 5
};
/*enum {ADD_NONE, ADD_YA, ADD_FILL1, ADD_YU, ADD_FILL2, ADD_YO};*/
/* Map for converting kana to romaji.
 * Indexed by the second byte (byte2) of a kana XChar2b whose byte1 is
 * 0x24 or 0x25; kanatoromaji() uses the same index for both.
 * Valid indices are 0..127.  An empty string means "prints nothing":
 * this covers unused codes, small tsu (0x43) and the small ya/yu/yo
 * (0x63/0x65/0x67), which kanatoromaji() handles itself.
 * The table is read-only, so both the pointers and the strings are const.
 */
static const char *const kanamap[128] =
{
	/* 0x00 */ "",    "",    "",    "",    "",    "",    "",    "",
	/* 0x08 */ "",    "",    "",    "",    "",    "",    "",    "",
	/* 0x10 */ "",    "",    "",    "",    "",    "",    "",    "",
	/* 0x18 */ "",    "",    "",    "",    "",    "",    "",    "",
	/* 0x20 */ "",    "",    "a",   "",    "i",   "",    "u",   "",
	/* 0x28 */ "e",   "",    "o",   "ka",  "ga",  "ki",  "gi",  "ku",
	/* 0x30 */ "gu",  "ke",  "ge",  "ko",  "go",  "sa",  "za",  "shi",
	/* 0x38 */ "zhi", "su",  "zu",  "se",  "ze",  "so",  "zo",  "ta",
	/* 0x40 */ "da",  "chi", "ji",  "",    "tsu", "du",  "te",  "de",
	/* 0x48 */ "to",  "do",  "na",  "ni",  "nu",  "ne",  "no",  "ha",
	/* 0x50 */ "ba",  "pa",  "hi",  "bi",  "pi",  "fu",  "bu",  "pu",
	/* 0x58 */ "he",  "be",  "pe",  "ho",  "bo",  "po",  "ma",  "mi",
	/* 0x60 */ "mu",  "me",  "mo",  "",    "ya",  "",    "yu",  "",
	/* 0x68 */ "yo",  "ra",  "ri",  "ru",  "re",  "ro",  "",    "wa",
	/* 0x70 */ "",    "",    "o",   "n",   "",    "",    "",    "",
	/* 0x78 */ "",    "",    "",    "",    "",    "",    "",    "",
};
/* Take a kana string as input. Skip along, converting kana to
* romaji. Assume 0x24 or 0x25 means kana or kanji start.
* Assume estring buffer is "long enough".
* Note that "long enough" should probably be
* kstrlen(kstring) *2 * 2;
*
* Sigh. better put a limit on it: MAXROMAJI
*
* Note that we also have to special-case punctuation. sigh.
* "(", ")"
* 0x214a, 0x214b
*
* All this, is so we can print out the "reading" entries, for folks
* who can't read kana. WIMPS! :-)
*
* Currently, this is a state engine.
* It would be much more consistent if I just used romajimap1/2[] and worked
* backwards. Oh well.
*/
void kanatoromaji(XChar2b *kstring, char *estring)
{
char *eptr = estring;
XChar2b *kptr = kstring;
int lastchar=0; /* for state machine */
eptr[0]='\0';
do{
if(strlen(estring)> (MAXROMAJI-3) ) {
puts("WARNING: MAXROMAJI overflow");
estring[MAXROMAJI-2]='X';
estring[MAXROMAJI-1]='\0';
return;
}
/* how boring.. punctuation */
if(kptr->byte1 == 0x21) {
int val=kptr->byte2;
/* print character before this punc */
strcat(eptr, kanamap[lastchar]);
lastchar=0;
kptr++;
switch(val) {
case 0x21:
strcat(eptr," ");
continue;
case 0x3c:
case 0x41:
strcat(eptr,"-");
continue;
case 0x4a:
case 0x5a:
strcat(eptr,"(");
continue;
case 0x4b:
case 0x5b:
strcat(eptr,")");
continue;
}
printf("warning.. kanatoromaji found 0x21,0x%x\n",
val);
continue;
}
if((kptr->byte1 != 0x24 ) && (kptr->byte1 != 0x25 )
&& (kptr->byte1 != 0x0))
{
printf("warning... 0x%x%x found in kanatoromaji\n",
kptr->byte1,kptr->byte2);
kptr++;
continue;
}
/******************************************************/
/* This is a wierd parsing routine.. Try to ignore it!*/
if(kptr->byte2>128){
strcat(eptr, "[bad byte2 found]");
kptr++;
continue;
}
switch(kptr->byte2){
case 0x63:
case 0x65:
case 0x67:
/* small ya,yu,yo. Need to backtrack */
/* fall out of switch, and do so, below */
/* This is a "goto", in essence */
break;
case 0:
default:
/* print previous char!... */
/* then save THIS char,
* and continue through loop..
*/
strcat(eptr, kanamap[lastchar]);
lastchar = kptr->byte2;
kptr++;
continue;
/* hopefully, will continue through while loop */
}
/* Must be on ya,yu,yo */
/* Put the appropriate prefix */
/* Later, will append "a", "u", or "o" */
switch(lastchar){
case 0x2d: /*ki*/
strcat(eptr,"ky");
break;
case 0x2e: /*gi*/
strcat(eptr,"gy");
break;
case 0x37: /*shi*/
strcat(eptr,"sh");
break;
case 0x38: /* shi''*/
strcat(eptr,"j");
break;
case 0x41: /*chi*/
strcat(eptr,"ch");
break;
case 0x4b: /*ni*/
strcat(eptr,"ny");
break;
case 0x52: /*hi*/
strcat(eptr,"hy");
break;
case 0x53: /*bi*/
strcat(eptr,"by");
break;
case 0x54: /*pi*/
strcat(eptr,"py");
break;
case 0x5f: /*mi*/
strcat(eptr,"my");
break;
case 0x6a: /*ri*/
strcat(eptr,"ry");
break;
default:
/*strcat(eptr, "[bad placement of ya/yu/yo]");*/
/* oh well.. just print it as-is */
/* but mark it as small */
strcat(eptr, "_y");
break;
}
switch(kptr->byte2){
case 0x63:
strcat(eptr, "a");
break;
case 0x65:
strcat(eptr, "u");
break;
case 0x67:
strcat(eptr, "o");
break;
/* must be ya,yu,yo */
}
lastchar = kptr->byte2;
kptr++;
} while (kptr->byte1 != 0);
strcat(eptr, kanamap[lastchar]);
}
#define ROMAJIMAPSIZE 150 /* there is some empty space for growth */
/* romajimap1 has to match romajimap2, in exact order!! */
/* If we were nice, this would also be XChar2b encoded, but..*/
static char *romajimap1[ROMAJIMAPSIZE]=
{
"shya", "shyu", "shyo",
"chya","chyu","chyo",
"sya","syu","syo",
"sha","shi", "shu","sho",
"jya","jyu","jyo",
"cha","chu","cho",
"gya","gyu","gyo",
"dzu","tzu", /* same as "du" */
"hya","hyu","hyo",
"bya","byu","byo",
"pya","pyu","pyo",
"mya","myu","myo",
"nya","nyu","nyo",
"kya","kyu","kyo",
"rya","ryu","ryo",
"chi","tsu",
"ka","ga","ki","gi","ku","gu","ke","ge","ko","go",
"sa","za","si","zi","su","zu","se","ze","so","zo",
"ja", "ji", "ju", "jo",
"ta","da","ti","di","tu","du","te","de","to","do",
"na","ni","nu","ne","no",
"n ","nn",
"ha","ba","pa",
"hi","bi","pi",
"hu","fu","bu","pu",
"he","be","pe",
"ho","bo","po",
"ma","mi","mu","me","mo",
"ra","ri","ru","re","ro",
"wa","wo",
"ya","yu","yo",
"_ya","_yu","_yo", /* Small letters, the hard way. For romaji input*/
"a","i","u","e","o",
"-", "'","(",")", /* These are here not so much for romaji input,
* as to match dictionary search patterns,
* and translate kana to romaji.
*/
""
/* See also hardcodes for special keys in Handle_romajikana()
* and matchromaji()
*/
};
/* must match order of romajimap1[]!!*/
/* Kana replacement for each romaji spelling in romajimap1[].
 * Each entry holds one or two {byte1, byte2} kana codes followed by a
 * {0, 0} terminator; romajitokana() copies elements until it reaches
 * one whose byte1 is 0.  The trailing comment on each line names the
 * romajimap1[] spelling the entry belongs to.
 */
static XChar2b romajimap2[ROMAJIMAPSIZE][3]=
{
{{0x24, 0x37}, {0x24, 0x63}, {0, 0}},/*shya*/
{{0x24, 0x37}, {0x24, 0x65}, {0, 0}},/*shyu*/
{{0x24, 0x37}, {0x24, 0x67}, {0, 0}},/*shyo*/
{{0x24, 0x41}, {0x24, 0x63}, {0, 0}},/*chya*/
{{0x24, 0x41}, {0x24, 0x65}, {0, 0}},/*chyu*/
{{0x24, 0x41}, {0x24, 0x67}, {0, 0}},/*chyo*/
{{0x24, 0x37}, {0x24, 0x63}, {0, 0}},/*sya*/
{{0x24, 0x37}, {0x24, 0x65}, {0, 0}},/*syu*/
{{0x24, 0x37}, {0x24, 0x67}, {0, 0}},/*syo*/
{{0x24, 0x37}, {0x24, 0x63}, {0, 0}},/*sha*/
{{0x24, 0x37}, {0, 0}}, /*shi*/
{{0x24, 0x37}, {0x24, 0x65}, {0, 0}},/*shu*/
/*{{0x24, 0x37}, {0x24, 0x67}, {0, 0}}, ??she??*/
{{0x24, 0x37}, {0x24, 0x67}, {0, 0}},/*sho*/
{{0x24, 0x38}, {0x24, 0x63}, {0, 0}},/*jya*/
{{0x24, 0x38}, {0x24, 0x65}, {0, 0}},/*jyu*/
{{0x24, 0x38}, {0x24, 0x67}, {0, 0}},/*jyo*/
{{0x24, 0x41}, {0x24, 0x63}, {0, 0}},/*cha*/
{{0x24, 0x41}, {0x24, 0x65}, {0, 0}},/*chu*/
{{0x24, 0x41}, {0x24, 0x67}, {0, 0}},/*cho*/
{{0x24, 0x2e}, {0x24, 0x63}, {0, 0}},/*gya*/
{{0x24, 0x2e}, {0x24, 0x65}, {0, 0}},/*gyu*/
{{0x24, 0x2e}, {0x24, 0x67}, {0, 0}},/*gyo*/
{{0x24, 0x45}, {0, 0}},/*dzu*/
{{0x24, 0x45}, {0, 0}},/*tzu*/
{{0x24, 0x52}, {0x24, 0x63}, {0, 0}},/*hya*/
{{0x24, 0x52}, {0x24, 0x65}, {0, 0}},/*hyu*/
{{0x24, 0x52}, {0x24, 0x67}, {0, 0}},/*hyo*/
{{0x24, 0x53}, {0x24, 0x63}, {0, 0}},/*bya*/
{{0x24, 0x53}, {0x24, 0x65}, {0, 0}},/*byu*/
{{0x24, 0x53}, {0x24, 0x67}, {0, 0}},/*byo*/
{{0x24, 0x54}, {0x24, 0x63}, {0, 0}},/*pya*/
{{0x24, 0x54}, {0x24, 0x65}, {0, 0}},/*pyu*/
{{0x24, 0x54}, {0x24, 0x67}, {0, 0}},/*pyo*/
{{0x24, 0x5f}, {0x24, 0x63}, {0, 0}},/*mya*/
{{0x24, 0x5f}, {0x24, 0x65}, {0, 0}},/*myu*/
{{0x24, 0x5f}, {0x24, 0x67}, {0, 0}},/*myo*/
{{0x24, 0x4b}, {0x24, 0x63}, {0, 0}},/*nya*/
{{0x24, 0x4b}, {0x24, 0x65}, {0, 0}},/*nyu*/
{{0x24, 0x4b}, {0x24, 0x67}, {0, 0}},/*nyo*/
{{0x24, 0x2d}, {0x24, 0x63}, {0, 0}},/*kya*/
{{0x24, 0x2d}, {0x24, 0x65}, {0, 0}},/*kyu*/
{{0x24, 0x2d}, {0x24, 0x67}, {0, 0}},/*kyo*/
{{0x24, 0x6a}, {0x24, 0x63}, {0, 0}},/*rya*/
{{0x24, 0x6a}, {0x24, 0x65}, {0, 0}},/*ryu*/
{{0x24, 0x6a}, {0x24, 0x67}, {0, 0}},/*ryo*/
{{0x24, 0x41}, {0, 0}},/*chi*/
{{0x24, 0x44}, {0, 0}},/*tsu*/
{{0x24, 0x2b}, {0, 0}},/*ka*/
{{0x24, 0x2c}, {0, 0}},/*ga*/
{{0x24, 0x2d}, {0, 0}},/*ki*/
{{0x24, 0x2e}, {0, 0}},/*gi*/
{{0x24, 0x2f}, {0, 0}},/*ku*/
{{0x24, 0x30}, {0, 0}},/*gu*/
{{0x24, 0x31}, {0, 0}},/*ke*/
{{0x24, 0x32}, {0, 0}},/*ge*/
{{0x24, 0x33}, {0, 0}},/*ko*/
{{0x24, 0x34}, {0, 0}},/*go*/
{{0x24, 0x35}, {0, 0}},/*sa*/
{{0x24, 0x36}, {0, 0}},/*za*/
{{0x24, 0x37}, {0, 0}},/*si*/
{{0x24, 0x38}, {0, 0}},/*zi*/
{{0x24, 0x39}, {0, 0}},/*su*/
{{0x24, 0x3a}, {0, 0}},/*zu*/
{{0x24, 0x3b}, {0, 0}},/*se*/
{{0x24, 0x3c}, {0, 0}},/*ze*/
{{0x24, 0x3d}, {0, 0}},/*so*/
{{0x24, 0x3e}, {0, 0}},/*zo*/
{{0x24, 0x38}, {0x24, 0x63}, {0, 0}},/*ja*/
{{0x24, 0x38}, {0, 0}},/*ji*/
{{0x24, 0x38}, {0x24, 0x65}, {0, 0}},/*ju*/
{{0x24, 0x38}, {0x24, 0x67}, {0, 0}},/*jo*/
{{0x24, 0x3f}, {0, 0}},/*ta*/
{{0x24, 0x40}, {0, 0}},/*da*/
{{0x24, 0x41}, {0, 0}},/*ti*/
{{0x24, 0x42}, {0, 0}},/*di*/
{{0x24, 0x44}, {0, 0}},/*tu*/
{{0x24, 0x45}, {0, 0}},/*du*/
{{0x24, 0x46}, {0, 0}},/*te*/
{{0x24, 0x47}, {0, 0}},/*de*/
{{0x24, 0x48}, {0, 0}},/*to*/
{{0x24, 0x49}, {0, 0}},/*do*/
{{0x24, 0x4a}, {0, 0}},/*na*/
{{0x24, 0x4b}, {0, 0}},/*ni*/
{{0x24, 0x4c}, {0, 0}},/*nu*/
{{0x24, 0x4d}, {0, 0}},/*ne*/
{{0x24, 0x4e}, {0, 0}},/*no*/
{{0x24, 0x73}, {0, 0}},/*'n '*/
{{0x24, 0x73}, {0, 0}},/*'nn'*/
{{0x24, 0x4f}, {0, 0}},/*ha*/
{{0x24, 0x50}, {0, 0}},/*ba*/
{{0x24, 0x51}, {0, 0}},/*pa*/
{{0x24, 0x52}, {0, 0}},/*hi*/
{{0x24, 0x53}, {0, 0}},/*bi*/
{{0x24, 0x54}, {0, 0}},/*pi*/
{{0x24, 0x55}, {0, 0}},/*hu*/
{{0x24, 0x55}, {0, 0}},/*fu*/
{{0x24, 0x56}, {0, 0}},/*bu*/
{{0x24, 0x57}, {0, 0}},/*pu*/
{{0x24, 0x58}, {0, 0}},/*he*/
{{0x24, 0x59}, {0, 0}},/*be*/
{{0x24, 0x5a}, {0, 0}},/*pe*/
{{0x24, 0x5b}, {0, 0}},/*ho*/
{{0x24, 0x5c}, {0, 0}},/*bo*/
{{0x24, 0x5d}, {0, 0}},/*po*/
{{0x24, 0x5e}, {0, 0}},/*ma*/
{{0x24, 0x5f}, {0, 0}},/*mi*/
{{0x24, 0x60}, {0, 0}},/*mu*/
{{0x24, 0x61}, {0, 0}},/*me*/
{{0x24, 0x62}, {0, 0}},/*mo*/
{{0x24, 0x69}, {0, 0}},/*ra*/
{{0x24, 0x6a}, {0, 0}},/*ri*/
{{0x24, 0x6b}, {0, 0}},/*ru*/
{{0x24, 0x6c}, {0, 0}},/*re*/
{{0x24, 0x6d}, {0, 0}},/*ro*/
{{0x24, 0x6f}, {0, 0}},/*wa*/
{{0x24, 0x72}, {0, 0}},/*wo*/
{{0x24, 0x64}, {0, 0}},/*ya*/
{{0x24, 0x66}, {0, 0}},/*yu*/
{{0x24, 0x68}, {0, 0}},/*yo*/
{{0x24, 0x63}, {0, 0}},/*_ya*/
{{0x24, 0x65}, {0, 0}},/*_yu*/
{{0x24, 0x67}, {0, 0}},/*_yo*/
{{0x24, 0x22}, {0, 0}},/*a*/
{{0x24, 0x24}, {0, 0}},/*i*/
{{0x24, 0x26}, {0, 0}},/*u*/
{{0x24, 0x28}, {0, 0}},/*e*/
{{0x24, 0x2a}, {0, 0}},/*o*/
/* These are here really for kana-to-romaji only */
/* But make sure Handle_romajikana uses same values! */
{{0x21, 0x3c}, {0, 0}},/*- for elongation*/
{{0x24, 0x43}, {0, 0}},/*"'" for small-tsu*/
{{0x21, 0x4a}, {0, 0}},/*"(" */
{{0x21, 0x4b}, {0, 0}},/*")" */
{{0}} /* pairs with the "" end marker of romajimap1[] */
};
/* Helper for romajitokana().
 *
 * romaji:  one of the known romaji spellings (a romajimap1[] entry).
 * kstring: user input, positioned at the first romaji character.
 *          Romaji letters are expected as {0x23, ascii}; the one
 *          exception is JIS underscore {0x21, 0x32}, which is accepted
 *          only where the spelling itself has '_' (the "_ya" family).
 *
 * Returns 1 if kstring begins with the whole of romaji, 0 otherwise.
 * kstring may be longer than romaji; only the prefix is compared.
 */
static int matchromaji(char *romaji, XChar2b *kstring)
{
	for( ; *romaji != '\0'; romaji++, kstring++){
		/* input ended before the spelling did */
		if(kstring->byte1 == 0)
			return 0;

		/* durn non-ascii punctuation codes: JIS '_' can only
		 * stand for the '_' of a "_yX" spelling
		 */
		if((kstring->byte1 == 0x21) && (kstring->byte2 == 0x32)){
			if(*romaji != '_')
				return 0;
			continue;	/* forced match, step both */
		}

		if(*romaji != kstring->byte2)
			return 0;

		/* Theoretically redundant: every match should be ASCII
		 * against pseudo-ASCII by now.
		 */
		if(kstring->byte1 != 0x23){
			printf("WARNING: matchromaji hit nonascii:%x:%x\n",
			       kstring->byte1,kstring->byte2);
			return 0;
		}
	}

	/* match found! */
	return 1;
}
/*
 * Convert the romaji at the tail of an input string to kana, in place.
 *
 * Per the original note, process_kinput calls this on the "raw" string
 * before it is displayed.
 *
 * kstart must point to a writable XChar2b array, terminated by an
 * element with byte1 == 0, laid out as zero or more kana followed by
 * zero or more romaji characters (encoded as 0x23,0x??).  Anything more
 * mixed than that is not handled well.  Which characters can turn up is
 * decided by Handle_romajikana and by whatever process_kinput filters.
 *
 * Only the first romaji run is examined.  If it starts with a known
 * spelling, that spelling is overwritten with its kana and the string
 * is cut off right after it.  Failing that, a leading 'n' that has a
 * following character other than 'y' or 'n' becomes the standalone
 * 'n' kana.
 */
void romajitokana(XChar2b *kstart)
{
	XChar2b *kana;
	int idx;

	/* Step over the leading kana; stop on the first pseudo-8-bit
	 * (romaji) character or on JIS '_', which opens a "_ya" combo.
	 */
	for( ; kstart->byte1 != 0; kstart++){
		if(kstart->byte1 == 0x23)
			break;
		if((kstart->byte1 == 0x21) && (kstart->byte2 == 0x32)){
#ifdef DEBUG
			puts("romajitokana: found __");
#endif
			break;
		}
	}

	/* hit end of string: nothing here to convert */
	if(kstart->byte1 == 0)
		return;

	/* kstart now sits on the first romaji character */
	for(idx = 0; romajimap1[idx][0] != '\0'; idx++){
		if(matchromaji(romajimap1[idx], kstart) != 1)
			continue;

		/* overwrite the romaji with the matching kana */
		for(kana = romajimap2[idx]; kana->byte1 != 0; kana++, kstart++){
			kstart->byte1 = kana->byte1;
			kstart->byte2 = kana->byte2;
		}
		/* yes, force truncation */
		kstart->byte1 = kstart->byte2 = 0;
		return;
	}

	/* Special case for 'n' in the middle of things: make it a
	 * standalone 'n' ourselves, unless it may still grow into "nya"
	 * or a similar known compound.
	 */
	if((kstart[0].byte1 != 0x23) || (kstart[0].byte2 != 'n'))
		return;
	if((kstart[1].byte1 == 0) || (kstart[1].byte2 == 'y') ||
	   (kstart[1].byte2 == 'n'))
		return;

	kstart[0].byte1 = 0x24;
	kstart[0].byte2 = 0x73;
}
/************************************************************/
/* KeyPress event handler for the romajikana window.
 * Called for every key event there; also handles "go search now".
 *
 * Each accepted key is turned into one XChar2b and handed to
 * process_kinput():
 *   BackSpace / Delete  -> 0x22,0x2b  (erase)
 *   Return              -> paragraphglyph[0]  ("do search now")
 *   space               -> 0x23,' '   (lets a trailing "n " resolve)
 *   - ' ( ) _           -> fixed codes, see the switch below
 *   a..z                -> 0x23,letter; byte1 == 0x23 is the signal to
 *                          romajitokana() that "this is romaji"
 * Every other key is ignored.  Events that are not KeyPress are
 * ignored too (anything but KeyRelease is also reported on stdout).
 *
 * w, closure and cont are not used.
 */
void Handle_romajikana(Widget w, XtPointer closure, XEvent *e, Boolean *cont)
{
	KeySym inbetweenK;
	char *charpressed;
	XChar2b addchar;

	if(e->type != KeyPress){
		if(e->type == KeyRelease){
#ifdef DEBUG
			puts("key released");
#endif
			return;
		}
		printf("Some other strange event found in for romaji: %d\n",
		       e->type);
		return;
	}

	/* need XtGetActionKeysym (not a plain keycode lookup) to detect
	 * shifted keys such as Shift-9 == '('
	 */
	inbetweenK = XtGetActionKeysym(e, NULL);
	if(inbetweenK == (KeySym)NULL){
		puts("NULL keysym on kana input???");
		return;
	}

	/* Punctuation gets special handling, because JIS punctuation
	 * conflicts with the 8-bit values of ASCII punctuation.  The
	 * codes for - ' ( ) must stay equal to the ones in romajimap2[].
	 */
	switch(inbetweenK){
	case XK_BackSpace:
	case XK_Delete:
		addchar.byte1 = 0x22;
		addchar.byte2 = 0x2b;
		break;
	case XK_Return:
		/* our strange "accept" char, meaning "do search now" */
		addchar.byte1 = paragraphglyph[0].byte1;
		addchar.byte2 = paragraphglyph[0].byte2;
		break;
	case XK_space:
		/* " " in the romaji range: a nasty hack to get "n" right */
		addchar.byte1 = 0x23;
		addchar.byte2 = ' ';
		break;
	case XK_minus:
		addchar.byte1 = 0x21;
		addchar.byte2 = 0x3c;	/* -- in JIS */
		break;
	case XK_apostrophe:
		addchar.byte1 = 0x24;
		addchar.byte2 = 0x43;	/* "'" stands for small tsu */
		break;
	case XK_parenleft:
		addchar.byte1 = 0x21;
		addchar.byte2 = 0x4a;	/* JIS 0x214a */
		break;
	case XK_parenright:
		/* Per the original note, these must match whatever
		 * ReadPronunciation() converts '()' in the dictionary to.
		 */
		addchar.byte1 = 0x21;
		addchar.byte2 = 0x4b;	/* JIS 0x214b */
		break;
	case XK_underscore:
		addchar.byte1 = 0x21;
		addchar.byte2 = 0x32;	/* 0x2132 == '_' in JIS */
		break;
	default:
		charpressed = XKeysymToString(inbetweenK);
		if(charpressed == NULL)
			return;
#ifdef DEBUG
		printf("got string \"%s\"\n", charpressed);
#endif
		/* XKeysymToString gives the keysym NAME.  That is the
		 * typed character only when the name is one character
		 * long ("a"); names such as "semicolon" or "period" must
		 * not be taken for the letters 's' or 'p'.
		 */
		if((charpressed[0] < 'a') || (charpressed[0] > 'z') ||
		   (charpressed[1] != '\0')){
#ifdef DEBUG
			puts("ignoring.. not in normal ascii range");
#endif
			return;
		}
		addchar.byte1 = 0x23;
		addchar.byte2 = charpressed[0];
		break;
	}

	process_kinput(addchar);
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */