/* convert.c * This file is split off to contain the kana<->romaji routines. */ #include #include #include #include #include "defs.h" #include "convert.h" #include "searchwidgets.h" /* * Note.. these state indicators, are for parsing * romaji from RIGHT to LEFT * "state" stores the state to the right of where we are. */ enum { STATE_NONE, STATE_A, STATE_I, STATE_U, STATE_E, STATE_O, STATE_YA, STATE_YU, STATE_YO, /* we actualy use "y_state" instead of the above _YX states*/ STATE_HI, /* for chi or shi */ STATE_SU, /* for tsu */ STATE_SPACE, /* for lone 'n' at end */ STATE_OTHER } ; /* Order of these is very important. DO NOT CHANGE */ enum {ADD_NONE, ADD_A, ADD_I, ADD_U, ADD_E, ADD_O}; /*enum {ADD_NONE, ADD_YA, ADD_FILL1, ADD_YU, ADD_FILL2, ADD_YO};*/ /* map for converting kana to romaji */ static char * kanamap[128]= { "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "a", "", "i", "", "u", "", "e", "", "o", "ka", "ga", "ki", "gi", "ku", "gu", "ke", "ge", "ko", "go", "sa", "za", "shi", "zhi", "su", "zu", "se", "ze", "so","zo", "ta", "da", "chi","ji", "", "tsu","du", "te", "de", "to", "do", "na", "ni", "nu", "ne", "no", "ha", "ba", "pa", "hi", "bi", "pi", "fu", "bu", "pu", "he", "be", "pe", "ho", "bo", "po", "ma", "mi", "mu", "me", "mo","", "ya", "", "yu", "", "yo", "ra", "ri","ru","re", "ro", "", "wa", "", "", "o", "n", "", "", "", "", "", "", "", "", "", "", "", "", }; /* Take a kana string as input. Skip along, converting kana to * romaji. Assume 0x24 or 0x25 means kana or kanji start. * Assume estring buffer is "long enough". * Note that "long enough" should probably be * kstrlen(kstring) *2 * 2; * * Sigh. better put a limit on it: MAXROMAJI * * Note that we also have to special-case punctuation. sigh. * "(", ")" * 0x214a, 0x214b * * All this, is so we can print out the "reading" entries, for folks * who can't read kana. WIMPS! :-) * * Currently, this is a state engine. * It would be much more consistent if I just used romajimap1/2[] and worked * backwards. Oh well. */ void kanatoromaji(XChar2b *kstring, char *estring) { char *eptr = estring; XChar2b *kptr = kstring; int lastchar=0; /* for state machine */ eptr[0]='\0'; do{ if(strlen(estring)> (MAXROMAJI-3) ) { puts("WARNING: MAXROMAJI overflow"); estring[MAXROMAJI-2]='X'; estring[MAXROMAJI-1]='\0'; return; } /* how boring.. punctuation */ if(kptr->byte1 == 0x21) { int val=kptr->byte2; /* print character before this punc */ strcat(eptr, kanamap[lastchar]); lastchar=0; kptr++; switch(val) { case 0x21: strcat(eptr," "); continue; case 0x3c: case 0x41: strcat(eptr,"-"); continue; case 0x4a: case 0x5a: strcat(eptr,"("); continue; case 0x4b: case 0x5b: strcat(eptr,")"); continue; } printf("warning.. kanatoromaji found 0x21,0x%x\n", val); continue; } if((kptr->byte1 != 0x24 ) && (kptr->byte1 != 0x25 ) && (kptr->byte1 != 0x0)) { printf("warning... 0x%x%x found in kanatoromaji\n", kptr->byte1,kptr->byte2); kptr++; continue; } /******************************************************/ /* This is a wierd parsing routine.. Try to ignore it!*/ if(kptr->byte2>128){ strcat(eptr, "[bad byte2 found]"); kptr++; continue; } switch(kptr->byte2){ case 0x63: case 0x65: case 0x67: /* small ya,yu,yo. Need to backtrack */ /* fall out of switch, and do so, below */ /* This is a "goto", in essence */ break; case 0: default: /* print previous char!... */ /* then save THIS char, * and continue through loop.. */ strcat(eptr, kanamap[lastchar]); lastchar = kptr->byte2; kptr++; continue; /* hopefully, will continue through while loop */ } /* Must be on ya,yu,yo */ /* Put the appropriate prefix */ /* Later, will append "a", "u", or "o" */ switch(lastchar){ case 0x2d: /*ki*/ strcat(eptr,"ky"); break; case 0x2e: /*gi*/ strcat(eptr,"gy"); break; case 0x37: /*shi*/ strcat(eptr,"sh"); break; case 0x38: /* shi''*/ strcat(eptr,"j"); break; case 0x41: /*chi*/ strcat(eptr,"ch"); break; case 0x4b: /*ni*/ strcat(eptr,"ny"); break; case 0x52: /*hi*/ strcat(eptr,"hy"); break; case 0x53: /*bi*/ strcat(eptr,"by"); break; case 0x54: /*pi*/ strcat(eptr,"py"); break; case 0x5f: /*mi*/ strcat(eptr,"my"); break; case 0x6a: /*ri*/ strcat(eptr,"ry"); break; default: /*strcat(eptr, "[bad placement of ya/yu/yo]");*/ /* oh well.. just print it as-is */ /* but mark it as small */ strcat(eptr, "_y"); break; } switch(kptr->byte2){ case 0x63: strcat(eptr, "a"); break; case 0x65: strcat(eptr, "u"); break; case 0x67: strcat(eptr, "o"); break; /* must be ya,yu,yo */ } lastchar = kptr->byte2; kptr++; } while (kptr->byte1 != 0); strcat(eptr, kanamap[lastchar]); } #define ROMAJIMAPSIZE 150 /* there is some empty space for growth */ /* romajimap1 has to match romajimap2, in exact order!! */ /* If we were nice, this would also be XChar2b encoded, but..*/ static char *romajimap1[ROMAJIMAPSIZE]= { "shya", "shyu", "shyo", "chya","chyu","chyo", "sya","syu","syo", "sha","shi", "shu","sho", "jya","jyu","jyo", "cha","chu","cho", "gya","gyu","gyo", "dzu","tzu", /* same as "du" */ "hya","hyu","hyo", "bya","byu","byo", "pya","pyu","pyo", "mya","myu","myo", "nya","nyu","nyo", "kya","kyu","kyo", "rya","ryu","ryo", "chi","tsu", "ka","ga","ki","gi","ku","gu","ke","ge","ko","go", "sa","za","si","zi","su","zu","se","ze","so","zo", "ja", "ji", "ju", "jo", "ta","da","ti","di","tu","du","te","de","to","do", "na","ni","nu","ne","no", "n ","nn", "ha","ba","pa", "hi","bi","pi", "hu","fu","bu","pu", "he","be","pe", "ho","bo","po", "ma","mi","mu","me","mo", "ra","ri","ru","re","ro", "wa","wo", "ya","yu","yo", "_ya","_yu","_yo", /* Small letters, the hard way. For romaji input*/ "a","i","u","e","o", "-", "'","(",")", /* These are here not so much for romaji input, * as to match dictionary search patterns, * and translate kana to romaji. */ "" /* See also hardcodes for special keys in Handle_romajikana() * and matchromaji() */ }; /* must match order of romajimap1[]!!*/ static XChar2b romajimap2[ROMAJIMAPSIZE][3]= { {{0x24, 0x37}, {0x24, 0x63}, {0, 0}},/*shya*/ {{0x24, 0x37}, {0x24, 0x65}, {0, 0}},/*shyu*/ {{0x24, 0x37}, {0x24, 0x67}, {0, 0}},/*shyo*/ {{0x24, 0x41}, {0x24, 0x63}, {0, 0}},/*chya*/ {{0x24, 0x41}, {0x24, 0x65}, {0, 0}},/*chyu*/ {{0x24, 0x41}, {0x24, 0x67}, {0, 0}},/*chyo*/ {{0x24, 0x37}, {0x24, 0x63}, {0, 0}},/*sya*/ {{0x24, 0x37}, {0x24, 0x65}, {0, 0}},/*syu*/ {{0x24, 0x37}, {0x24, 0x67}, {0, 0}},/*syo*/ {{0x24, 0x37}, {0x24, 0x63}, {0, 0}},/*sha*/ {{0x24, 0x37}, {0, 0}}, /*shi*/ {{0x24, 0x37}, {0x24, 0x65}, {0, 0}},/*shu*/ /*{{0x24, 0x37}, {0x24, 0x67}, {0, 0}}, ??she??*/ {{0x24, 0x37}, {0x24, 0x67}, {0, 0}},/*sho*/ {{0x24, 0x38}, {0x24, 0x63}, {0, 0}},/*jya*/ {{0x24, 0x38}, {0x24, 0x65}, {0, 0}},/*jyu*/ {{0x24, 0x38}, {0x24, 0x67}, {0, 0}},/*jyo*/ {{0x24, 0x41}, {0x24, 0x63}, {0, 0}},/*cha*/ {{0x24, 0x41}, {0x24, 0x65}, {0, 0}},/*chu*/ {{0x24, 0x41}, {0x24, 0x67}, {0, 0}},/*cho*/ {{0x24, 0x2e}, {0x24, 0x63}, {0, 0}},/*gya*/ {{0x24, 0x2e}, {0x24, 0x65}, {0, 0}},/*gyu*/ {{0x24, 0x2e}, {0x24, 0x67}, {0, 0}},/*gyo*/ {{0x24, 0x45}, {0, 0}},/*dzu*/ {{0x24, 0x45}, {0, 0}},/*tzu*/ {{0x24, 0x52}, {0x24, 0x63}, {0, 0}},/*hya*/ {{0x24, 0x52}, {0x24, 0x65}, {0, 0}},/*hyu*/ {{0x24, 0x52}, {0x24, 0x67}, {0, 0}},/*hyo*/ {{0x24, 0x53}, {0x24, 0x63}, {0, 0}},/*bya*/ {{0x24, 0x53}, {0x24, 0x65}, {0, 0}},/*byu*/ {{0x24, 0x53}, {0x24, 0x67}, {0, 0}},/*byo*/ {{0x24, 0x54}, {0x24, 0x63}, {0, 0}},/*pya*/ {{0x24, 0x54}, {0x24, 0x65}, {0, 0}},/*pyu*/ {{0x24, 0x54}, {0x24, 0x67}, {0, 0}},/*pyo*/ {{0x24, 0x5f}, {0x24, 0x63}, {0, 0}},/*mya*/ {{0x24, 0x5f}, {0x24, 0x65}, {0, 0}},/*myu*/ {{0x24, 0x5f}, {0x24, 0x67}, {0, 0}},/*myo*/ {{0x24, 0x4b}, {0x24, 0x63}, {0, 0}},/*nya*/ {{0x24, 0x4b}, {0x24, 0x65}, {0, 0}},/*nyu*/ {{0x24, 0x4b}, {0x24, 0x67}, {0, 0}},/*nyo*/ {{0x24, 0x2d}, {0x24, 0x63}, {0, 0}},/*kya*/ {{0x24, 0x2d}, {0x24, 0x65}, {0, 0}},/*kyu*/ {{0x24, 0x2d}, {0x24, 0x67}, {0, 0}},/*kyo*/ {{0x24, 0x6a}, {0x24, 0x63}, {0, 0}},/*rya*/ {{0x24, 0x6a}, {0x24, 0x65}, {0, 0}},/*ryu*/ {{0x24, 0x6a}, {0x24, 0x67}, {0, 0}},/*ryo*/ {{0x24, 0x41}, {0, 0}},/*chi*/ {{0x24, 0x44}, {0, 0}},/*tsu*/ {{0x24, 0x2b}, {0, 0}},/*ka*/ {{0x24, 0x2c}, {0, 0}},/*ga*/ {{0x24, 0x2d}, {0, 0}},/*ki*/ {{0x24, 0x2e}, {0, 0}},/*gi*/ {{0x24, 0x2f}, {0, 0}},/*ku*/ {{0x24, 0x30}, {0, 0}},/*gu*/ {{0x24, 0x31}, {0, 0}},/*ke*/ {{0x24, 0x32}, {0, 0}},/*ge*/ {{0x24, 0x33}, {0, 0}},/*ko*/ {{0x24, 0x34}, {0, 0}},/*go*/ {{0x24, 0x35}, {0, 0}},/*sa*/ {{0x24, 0x36}, {0, 0}},/*za*/ {{0x24, 0x37}, {0, 0}},/*si*/ {{0x24, 0x38}, {0, 0}},/*zi*/ {{0x24, 0x39}, {0, 0}},/*su*/ {{0x24, 0x3a}, {0, 0}},/*zu*/ {{0x24, 0x3b}, {0, 0}},/*se*/ {{0x24, 0x3c}, {0, 0}},/*ze*/ {{0x24, 0x3d}, {0, 0}},/*so*/ {{0x24, 0x3e}, {0, 0}},/*zo*/ {{0x24, 0x38}, {0x24, 0x63}, {0, 0}},/*ja*/ {{0x24, 0x38}, {0, 0}},/*ji*/ {{0x24, 0x38}, {0x24, 0x65}, {0, 0}},/*ju*/ {{0x24, 0x38}, {0x24, 0x67}, {0, 0}},/*jo*/ {{0x24, 0x3f}, {0, 0}},/*ta*/ {{0x24, 0x40}, {0, 0}},/*da*/ {{0x24, 0x41}, {0, 0}},/*ti*/ {{0x24, 0x42}, {0, 0}},/*di*/ {{0x24, 0x44}, {0, 0}},/*tu*/ {{0x24, 0x45}, {0, 0}},/*du*/ {{0x24, 0x46}, {0, 0}},/*te*/ {{0x24, 0x47}, {0, 0}},/*de*/ {{0x24, 0x48}, {0, 0}},/*to*/ {{0x24, 0x49}, {0, 0}},/*do*/ {{0x24, 0x4a}, {0, 0}},/*na*/ {{0x24, 0x4b}, {0, 0}},/*ni*/ {{0x24, 0x4c}, {0, 0}},/*nu*/ {{0x24, 0x4d}, {0, 0}},/*ne*/ {{0x24, 0x4e}, {0, 0}},/*no*/ {{0x24, 0x73}, {0, 0}},/*'n '*/ {{0x24, 0x73}, {0, 0}},/*'nn'*/ {{0x24, 0x4f}, {0, 0}},/*ha*/ {{0x24, 0x50}, {0, 0}},/*ba*/ {{0x24, 0x51}, {0, 0}},/*pa*/ {{0x24, 0x52}, {0, 0}},/*hi*/ {{0x24, 0x53}, {0, 0}},/*bi*/ {{0x24, 0x54}, {0, 0}},/*pi*/ {{0x24, 0x55}, {0, 0}},/*hu*/ {{0x24, 0x55}, {0, 0}},/*fu*/ {{0x24, 0x56}, {0, 0}},/*bu*/ {{0x24, 0x57}, {0, 0}},/*pu*/ {{0x24, 0x58}, {0, 0}},/*he*/ {{0x24, 0x59}, {0, 0}},/*be*/ {{0x24, 0x5a}, {0, 0}},/*pe*/ {{0x24, 0x5b}, {0, 0}},/*ho*/ {{0x24, 0x5c}, {0, 0}},/*bo*/ {{0x24, 0x5d}, {0, 0}},/*po*/ {{0x24, 0x5e}, {0, 0}},/*ma*/ {{0x24, 0x5f}, {0, 0}},/*mi*/ {{0x24, 0x60}, {0, 0}},/*mu*/ {{0x24, 0x61}, {0, 0}},/*me*/ {{0x24, 0x62}, {0, 0}},/*mo*/ {{0x24, 0x69}, {0, 0}},/*ra*/ {{0x24, 0x6a}, {0, 0}},/*ri*/ {{0x24, 0x6b}, {0, 0}},/*ru*/ {{0x24, 0x6c}, {0, 0}},/*re*/ {{0x24, 0x6d}, {0, 0}},/*ro*/ {{0x24, 0x6f}, {0, 0}},/*wa*/ {{0x24, 0x72}, {0, 0}},/*wo*/ {{0x24, 0x64}, {0, 0}},/*ya*/ {{0x24, 0x66}, {0, 0}},/*yu*/ {{0x24, 0x68}, {0, 0}},/*yo*/ {{0x24, 0x63}, {0, 0}},/*_ya*/ {{0x24, 0x65}, {0, 0}},/*_yu*/ {{0x24, 0x67}, {0, 0}},/*_yo*/ {{0x24, 0x22}, {0, 0}},/*a*/ {{0x24, 0x24}, {0, 0}},/*i*/ {{0x24, 0x26}, {0, 0}},/*u*/ {{0x24, 0x28}, {0, 0}},/*e*/ {{0x24, 0x2a}, {0, 0}},/*o*/ /* These are here really for kana-to-romaji only */ /* But make sure Handle_romajikana uses same values! */ {{0x21, 0x3c}, {0, 0}},/*- for elongation*/ {{0x24, 0x43}, {0, 0}},/*"'" for small-tsu*/ {{0x21, 0x4a}, {0, 0}},/*"(" */ {{0x21, 0x4b}, {0, 0}},/*")" */ {{0}} }; /* subroutine for romajitokana() * romaji is one of the standard romaji tokens. * kstring is user input. * * return 1 if kstring matches the known romaji combination. */ static int matchromaji(char *romaji, XChar2b *kstring) { while(*romaji!='\0'){ if(kstring->byte1==0){ return 0; } /*durn non-ascii punctuation codes...*/ /* IFF first char is JIS '_', then try to match * with "_ya" romaji combinations. * If we dont HAVE a _yXX combo to compare with, * then we dont have a match! */ if(kstring->byte1==0x21){ if(kstring->byte2 == 0x32){ if(*romaji == '_'){ /* force match */ romaji++; kstring++; continue; } else { return 0; } } } if(*romaji != kstring->byte2){ return 0; } if(kstring->byte1!=0x23) { /* Theoretically, this should be redundant. */ /* All matches should be ASCII to pseudo-ASCII now */ printf("WARNING: matchromaji hit nonascii:%x:%x\n", kstring->byte1,kstring->byte2); return 0; } romaji++; kstring++; } /* match found! */ return 1; } /* * We get called by process_kinput, on the "raw" string, * before the string actually gets displayed. * * We assume kstart points to an array that we can modify in-place. * We also assume array is fully XChar2b encoded, as a "mix" of * kana and romaji. But we assume that it always starts with 0 or more * kana, and followed by 0 or more romaji. (encoded as 0x23,0x??) * We never should be more mixed than that. If we are, we don't * handle it well :-) * * Expected range of characters is determined by whatever * Handle_romajikana puts in the string, minus whatever process_kinput * has filtered out. */ void romajitokana(XChar2b *kstart) { XChar2b *kparse; int matchcount; /* start off by looking for pseudo-8-bit chars, aka romaji. * look for first possible romaji string match, or end-of-string. */ while(kstart->byte1!=0) { if(kstart->byte1==0x23){ break; } /* special handling just for _ya combo */ if(kstart->byte1==0x21){ if(kstart->byte2==0x32){ #ifdef DEBUG puts("romajitokana: found __"); #endif break; } } kstart++; } if(kstart->byte1==0){ /* no match! */ return; } /* kstart now has first ascii-like 8-bit char * embedded in the XChar2b string */ kparse = kstart; for(matchcount=0; romajimap1[matchcount][0]!='\0' ; matchcount++){ if(matchromaji(romajimap1[matchcount], kstart)==1){ kparse=&romajimap2[matchcount][0]; while(kparse->byte1 != 0){ kstart->byte1=kparse->byte1; kstart->byte2=kparse->byte2; kstart++; kparse++; } /* yes, force truncation */ kstart->byte1=kstart->byte2=0; return; } } /* Special case for 'n' in the middle of things. * Convert it ourselves to standalone 'n', unless potentially part * of "nya" or similar known compound */ if((kstart[0].byte1==0x23) && (kstart[0].byte2=='n')) if((kstart[1].byte1!=0) && (kstart[1].byte2!='y') && (kstart[1].byte2!='n')) { kstart[0].byte1=0x24; kstart[0].byte2=0x73; } return; } /************************************************************/ /* This is a KeyPress event handler * Gets called every time a key is pressed in the romajikana window. * ALso handles "go search now". */ void Handle_romajikana(Widget w, XtPointer closure, XEvent *e, Boolean *cont) { XKeyEvent *key_event; KeySym inbetweenK; char *charpressed; XChar2b addchar; if(e->type != KeyPress){ if(e->type == KeyRelease){ #ifdef DEBUG puts("key released"); #endif return; } printf("Some other strange event found in for romaji: %d\n", e->type); return; } key_event = (XKeyEvent *) e; /*inbetweenK = XKeycodeToKeysym(XtDisplay(w), key_event->keycode,0);*/ /* need XtGetActionkeysym to detect Shift-9=='(' */ inbetweenK = XtGetActionKeysym(e, NULL); if(inbetweenK == (KeySym)NULL){ puts("NULL keysym on kana input???"); return; } /* we switch based on what character has just been * pressed on an ASCII-based keyboard. * gets translated to "go search now" * Making byte1== 0x23, is a special signal to romajitokana, * that says "Hey, this is romaji" */ switch(inbetweenK){ case XK_BackSpace: case XK_Delete: addchar.byte1 = 0x22; addchar.byte2 = 0x2b; process_kinput(addchar); return; case XK_Return: /* pass our strange "accept" char, that means "do search now"*/ addchar.byte1 = paragraphglyph[0].byte1; addchar.byte2 = paragraphglyph[0].byte2; process_kinput(addchar); return; case XK_space: /* add " ", but in kana range */ /* This is a nasty hack to get "n" right */ addchar.byte1 = 0x23; addchar.byte2 = ' '; process_kinput(addchar); return; /* Special handling for punctuation, because JIS punctuation * conflicts with 8-bit numbers of ASCII punctuation */ case XK_minus: addchar.byte1 = 0x21; addchar.byte2 = 0x3c; /* -- in JIS */ process_kinput(addchar); return; case XK_apostrophe: addchar.byte1 = 0x24; addchar.byte2 = 0x43; /* "'" in JIS */ process_kinput(addchar); return; case XK_parenleft: /* * Note that this "char" is invisible in JIS space * It is pseudo-ASCII. See note on XK_minus */ addchar.byte1 = 0x21; addchar.byte2 = 0x4a; /* in JIS it would be 214a */ process_kinput(addchar); return; case XK_parenright: /* * directly convert to JIS ourselves. * These must match whatever ReadPronunciation() * converts '()' in the dictionary to. */ addchar.byte1 = 0x21; addchar.byte2 = 0x4b; /* in JIS it would be 214b */ process_kinput(addchar); return; case XK_underscore: addchar.byte1 = 0x21; addchar.byte2 = 0x32; /* 0x2132 == '_' in JIS */ process_kinput(addchar); return; } charpressed = XKeysymToString(inbetweenK); if(charpressed == NULL) return; #ifdef DEBUG printf("got string \"%s\"\n", charpressed); #endif /* now use process_kinput, 222b is erase */ if((*charpressed <0x61) || (*charpressed >0x7a)){ /* outside range of ascii chars we like */ if((*charpressed!='(') && *charpressed != ')'){ #ifdef DEBUG puts("ignoring.. not in normal ascii range"); #endif return; } } addchar.byte1 = 0x23; addchar.byte2 = *charpressed; process_kinput(addchar); }