/*
* This file is for just setting up the structs, etc
*/
/* stdio SHOULD get included by Xos.h or something */
/* but it doesn't with sunos, at least */
#include <errno.h>
#include <stdio.h> /* for popen, and other things */
#include <stdlib.h>
#include <ctype.h>
#include <Xfuncs.h> /* handles bzero redefine stuff */
#include <Xlib.h>
#include <Xatom.h>
#include <Xutil.h>
#include <Intrinsic.h>
#include <StringDefs.h>
#include <Xos.h>
#include "defs.h"
#include "utils.h"
#include "externs.h"
/* translations[] keeps track of which kanji it is okay to test the
 * user on. Likewise with numberofkanji, highestkanji, and lowestkanji.
 * YES, it is best to keep this in one large array, otherwise
 * it would be difficult to switch between grade levels.
 *
 * The array is indexed by an entry's kdrill_index: readstructs() stores
 * kanjidic entries at their kanji code, readedict() stores edict
 * entries from MAXKANJIALLOWED+1 upwards. Unused slots stay NULL.
 */
struct translationstruct *translations[MAXTRANSLATIONSALLOWED];
/* numberofkanji: count of kanjidic entries loaded by readstructs().
 * highestkanji / lowestkanji: largest / smallest index in use in
 * translations[]; both stay 0 until a dictionary has been read.
 */
int numberofkanji=0, highestkanji=0, lowestkanji=0;
/* File name of the kanjidic dictionary, set by readstructs() */
static char *dictname=NULL;
/* File name of the edict dictionary, set by readedict() */
static char *edictname=NULL;
/* getline:
 *  Reads one line (from a dictionary stream) into a caller buffer and
 *  strips the trailing line terminator ("\n", "\r" or "\r\n").
 *
 *  fp  stream to read from
 *  s   destination buffer; must have room for MAXLINELEN bytes.
 *      Declared unsigned because the dictionaries contain 8-bit data.
 *
 *  We do NOT use malloc. There is a hard limit: a physical line longer
 *  than MAXLINELEN-1 bytes is handed back in several pieces by
 *  successive calls.
 *
 *  Returns true (1) if something was read
 *  Returns false (0) on EOF or read error
 *
 *  Used in "readstructs()", below. Also "readedict()"
 *
 *  Note: this has nothing to do with POSIX getline(); the signature
 *  and the semantics are different.
 */
int getline(FILE *fp,unsigned char *s)
{
    char *line = (char *)s;
    size_t len;

    if(fgets(line, MAXLINELEN, fp) == NULL)
        return 0;   /* probably EOF. We dont care */

    /* fgets() can hand back an empty string if the data starts with a
     * NUL byte, so never index [len-1] without checking len first.
     */
    len = strlen(line);
    while((len > 0) && ((line[len-1] == '\n') || (line[len-1] == '\r')))
        line[--len] = '\0';

    return 1;
}
/* StripBrackets:
 *  Gets rid of those annoying {english}{english2} brackets.
 *
 *  source  text that should start with '{'. Parsing stops at the
 *          first newline or at the end of the string.
 *  dest    receives the cleaned-up, NUL-terminated text:
 *            - the leading '{' is dropped
 *            - every later '{' becomes ": "
 *            - every '}' is dropped
 *          so "{one}{two}" becomes "one: two".
 *
 *  If source does not start with '{', dest is set to the empty string.
 *
 *  Space: with balanced brackets the result is never longer than the
 *  input. Each stray extra '{' costs one additional byte, so
 *  2*strlen(source) bytes are enough for any input.
 *
 *  See also StripSlash, below
 */
void StripBrackets(char *dest,unsigned char *source)
{
    unsigned char *parse;

    if(source[0] != '{'){
        dest[0] = '\0';
        return;
    }

    /* Test each byte BEFORE using it, so that a bare "{" cannot make
     * us walk off the end of the string.
     */
    for(parse = &source[1]; (*parse != '\0') && (*parse != '\n'); parse++){
        switch(*parse){
        case '{':
            *dest++ = ':';
            *dest++ = ' ';
            break;
        case '}':
            break;
        default:
            *dest++ = (char)*parse;
            break;
        }
    }
    *dest = '\0';
}
/* StripSlash
 *  Gets rid of /english/english2/ slashes.
 *  Copies the cleaned up version of source to topdest.
 *
 *  This is for readedict. Probably nothing else should use it.
 *  Modeled directly after StripBrackets.
 *
 *  source   must start with '/'. Everything between that first '/'
 *           and the LAST '/' is the english part; anything after the
 *           last '/' is ignored.
 *  topdest  receives the english part, with the '/' separators in the
 *           middle turned into ':' ("/one/two/" gives "one:two").
 *           Needs at most strlen(source) bytes.
 *
 *  On any failure topdest is set to the empty string, so the caller
 *  always ends up with a valid string.
 *
 *  Source is actually assumed to be regular ascii signed char,
 *  but declared as unsigned to stop compiler warnings.
 *
 *  return 0 OKAY, 1 bad line
 */
int StripSlash(char *topdest,unsigned char *source)
{
    const char *src = (const char *)source;
    const char *last;
    char *slash;
    size_t englen;

    /* default result for every error path below */
    topdest[0] = '\0';

    if(src[0] != '/'){
        return 1;
    }

    /* cannot be NULL: we already know src[0] is a '/' */
    last = strrchr(src, '/');
    if((last - src) < 2){
        fprintf(stderr,"Error: english part too short\n");
        fprintf(stderr,"%s\n", src);
        return 1;
    }

    englen = (size_t)(last - src) - 1;
    memcpy(topdest, &src[1], englen);
    topdest[englen] = '\0';

    /* we've copied the relevant part over to topdest. Now rewrite
     * the remaining separators in-place
     */
    for(slash = strchr(topdest, '/'); slash != NULL;
        slash = strchr(slash + 1, '/')){
        *slash = ':';
    }
    return 0;
}
/* trans_to_index:
 *  Map a translation back to the slot of translations[] it sits at.
 *  Every entry records its own slot number in kdrill_index when it is
 *  loaded, so this is just a field read.
 */
int trans_to_index(TRANSLATION trans)
{
    return (*trans).kdrill_index;
}
/* ReadEdictPron:
 *  Read in the kanji/kana part of an edictfile line.
 *  format is:
 *
 *    KANA /english_1/.../
 *
 *  or
 *
 *    KANJI [KANA] /english_1/english_2/.../
 *
 *  Pstring  in:  points at the start of the line
 *           out: points at the first '/' (or at the terminating NUL
 *                if the line has no '/')
 *  trans    trans->pronunciation is always set from the kana part.
 *           trans->kanji is only touched when a "[KANA]" section is
 *           present; otherwise whatever the caller put there stays.
 *
 *  Both strings are produced by dup_16() (defined elsewhere;
 *  presumably a heap copy of a zero-terminated XChar2b string).
 *  NOTE(review): a second '[' on one line would overwrite trans->kanji
 *  without releasing the first copy.
 */
void ReadEdictPron(unsigned char **Pstring, struct translationstruct *trans)
{
    /* Every element stored below consumes at least one input byte, so
     * for a line shorter than MAXLINELEN bytes (which is all getline()
     * ever returns) we cannot possibly run out of space.
     */
    XChar2b kbuff[MAXLINELEN];
    XChar2b *kptr = kbuff;
    unsigned char *parse = *Pstring;

    /* Read in a 16-bit string.
     * We dont know if its kana or kanji yet
     */
    while(*parse && (*parse != '/'))
    {
        switch(*parse)
        {
        case ' ':
            /* 0x2121 is ' ' */
            kptr->byte1 = 0x21;
            kptr->byte2 = 0x21;
            kptr++;
            parse++;
            break;
        case '[':
            /* oops.. the kanji/kana switch */
            /* save what must be kanji, then start
             * on kana
             */
            kptr->byte1 = 0;
            kptr->byte2 = 0;
            trans->kanji = dup_16(kbuff);
            /* now reset buffer, and read in another char16
             * string
             */
            kptr = kbuff;
            parse++;
            break;
        case ']':
            parse++;
            while(*parse && (*parse != '/'))
                parse++;
            /* and then we will fall out of the top loop */
            break;
        default:
            if(parse[1] == '\0'){
                /* Truncated line: only half of a two-byte
                 * char is left. Drop it, so that we stop
                 * ON the terminator instead of stepping
                 * over it.
                 */
                parse++;
                break;
            }
            /* two input bytes make one 16-bit char; the high
             * bit of each byte is stripped
             */
            kptr->byte1= (*parse++ & 0x7f);
            kptr->byte2= (*parse++ & 0x7f);
            kptr++;
            break;
        }
    }
    /* when we come out here, we will ALWAYS have kana in
     * the kbuff
     */
    kptr->byte1 = 0;
    kptr->byte2 = 0;
    trans->pronunciation = dup_16(kbuff);
    *Pstring = parse;
}
/* ReadPronunciation:
 *  Okay, it's not actually pronunciation we're reading in.
 *  We are reading the "on-yomi" and "kun-yomi" readings
 *  in kanjidic. Also, the optional okurigana.
 *
 *  Format:
 *    reading{.oku} [reading{.oku}] ...
 *
 *  The line jumps between 8-bit chars (' ', '.', '-', other ASCII)
 *  and 16-bit chars (two bytes with the high bit set), which is what
 *  makes this yeuky. The result is a pure 16-bit string in which
 *    - readings are copied with the high bits stripped
 *    - okurigana (the part after '.') is wrapped in 0x214a/0x214b
 *      (open/close paren)
 *    - '-' becomes 0x2141 and ' ' becomes 0x2121
 *    - any other ASCII char is skipped
 *
 *  Pstring  in:  points at the readings part of a kanjidic line
 *           out: points at the char that ended the readings
 *                ('{', newline, CR or NUL). Left untouched if we
 *                return 0.
 *
 *  Returns the dup_16() copy of the string that was built, or 0 if
 *  the line has no readings at all (it starts with '{'), or if the
 *  line is too long for our buffer.
 */
XChar2b * ReadPronunciation(unsigned char **Pstring)
{
    XChar2b kbuff[MAXLINELEN];
    XChar2b *kptr = kbuff;
    unsigned char *parse = *Pstring;
    enum {ERROR,READING, OKURIGANA,BLANK, DONE};
    /* state has to live outside of the loop: it carries over from
     * one char to the next
     */
    int state=BLANK;

    if(*parse == '{'){
        /* only english exists,
         * (no kanji, even)
         * so set character to be unusable.
         */
        return NULL;
    }
    while(*parse == ' ')
        parse++;

    /* okay, bad practice... you tell me what would be better :-/ */
    while(1){
        /* One trip through the loop stores at most two chars, and
         * the terminator needs one more. So insist on three free
         * slots BEFORE writing anything.
         */
        if((&kbuff[MAXLINELEN] - kptr) < 3){
            fprintf(stderr,"ERROR! overflow reading in kanjidic\n");
            fprintf(stderr,"%s\n",*Pstring);
            return NULL;
        }
        switch(*parse){
        case '.':
            parse++;
            /* we ALWAYS need to close this off later */
            state = OKURIGANA;
            /* open paren */
            kptr->byte1 = 0x21;
            kptr->byte2 = 0x4a;
            kptr++;
            break;
        case '-':
            parse++;
#ifdef USEEXTRABLANKS
            if(state == BLANK){
                kptr->byte1 = 0x21;
                kptr->byte2 = 0x21;
                kptr++;
            }
#endif
            kptr->byte1 = 0x21;
            kptr->byte2 = 0x41;
            kptr++;
#ifdef USEEXTRABLANKS
            if(state != BLANK){
                kptr->byte1 = 0x21;
                kptr->byte2 = 0x21;
                kptr++;
            }
#endif
            /* start at top of while again; state is unchanged */
            continue;
        case '\0':
        case '\n':
        case '\r':
        case '{':
            if(state == OKURIGANA){
                /* close paren */
                kptr->byte1 = 0x21;
                kptr->byte2 = 0x4b;
                kptr++;
            }
            state = DONE;
            break;
        case ' ':
            if(state == OKURIGANA){
                /* close paren */
                kptr->byte1 = 0x21;
                kptr->byte2 = 0x4b;
                kptr++;
            }
            state = BLANK;
            parse++;
            kptr->byte1 = 0x21;
            kptr->byte2 = 0x21;
            kptr++;
            break;
        default:
            if(*parse <127){
                /* stray ASCII char: skip over it */
                if(state == OKURIGANA){
                    /* close paren */
                    kptr->byte1 = 0x21;
                    kptr->byte2 = 0x4b;
                    kptr++;
                    puts("Kdrill.. error on kana read-in... ");
                    puts("Expecting high bit char to start after '.'");
                    printf("%s\n",*Pstring);
                }
                state = BLANK;
                parse++;
            } else {
                if(state != OKURIGANA)
                    state = READING;
            }
            break;
        }
        if(state == DONE){
            break;
        }
        if(state == BLANK)
            continue;

        /* else read in another (two byte) char. But never step
         * over the end of the line to do so.
         */
        if(parse[0] == '\0'){
            /* line ended right after a '.': go round again and
             * let the '\0' case close things off
             */
            continue;
        }
        if(parse[1] == '\0'){
            /* only half a char left: drop it */
            parse++;
            continue;
        }
        kptr->byte1= (*parse++ & 0x7f);
        kptr->byte2= (*parse++ & 0x7f);
        kptr++;
    } /* while(1) */

    /* copy out, and exit */
    kptr->byte1 = 0;
    kptr->byte2 = 0;
    *Pstring = parse;
    return dup_16(kbuff);
}
/* readedict()
*
* Read in "edict.gz" if it exists
* [readstructs handles kanjidic reading]
*
* We only make very partial entries for edict entries
* We just fill out "english" and "pronunciation" entries.
*
* If we cannot extract a kanji entry, the kanji pointer of a
* translation will be set to a shared string of '8' on its side
*
* Note that we always start entries at index
* translations[MAXKANJIALLOWED+1]. This is to attempt to keep
* usefiles working
*
*/
void readedict()
{
unsigned char instring[MAXLINELEN];
unsigned char *parse, *slashparse;
int slashcount;
char edict[MAXLINELEN]; /*PATH to dictionary */
FILE *fp;
TRANSLATION newk=NULL,lastk;
int nextindex = MAXKANJIALLOWED+1;
int linecount=0;
static XChar2b no_kanji[2]=
{
{0x0, 0x0},
{0x0, 0x0}
};
no_kanji[0].byte1 = (NOKANJI >> 8);
no_kanji[0].byte2 = (NOKANJI & 0xff);
/* the following will be NULL if kanjidic not read in */
lastk = translations[highestkanji];
GetXtrmString("edictfile","Edictfile",edict);
edictname = edict;
if(strncmp(edictname,"none",4)==0){
fprintf(stderr,"edictfile set to 'none'. Skippping.\n");
return;
}
edictname=malloc(strlen(edict)+1);
strcpy(edictname, edict);
fp = open_compressed(edictname);
if(fp == NULL)
{
fprintf(stderr,"Cannot open edict file %s. Skipping.\n",
edictname);
return;
}
printf("Opened dictionary %s \n",edictname);
if(highestkanji == 0)
{
lowestkanji = nextindex;
}
while(getline(fp, instring) != 0)
{
int instrlen;
linecount++;
if(linecount%1000 == 0)
{
putchar('.');
fflush(stdout);
}
if(newk == NULL)
{
newk = (struct translationstruct *)
malloc(sizeof(struct translationstruct));
if(newk == NULL)
{
fprintf(stderr,"OUT OF MEMORY!!\n");
exit(errno);
}
}
bzero(newk, sizeof(*newk));
/* 1- read first part
* 2- read optional [part]
* 3- read english part
*/
parse = instring;
newk->kanji = no_kanji;
ReadEdictPron(&parse, newk);
if(newk->pronunciation == NULL)
{
fprintf(stderr,"Error reading edict\n");
newk = NULL;
continue;
}
while((*parse != '/') && *parse)
{
parse++;
}
slashcount=1;
slashparse = parse;
while(*slashparse++)
{
if(*slashparse =='/')
slashcount++;
}
/* need extra space for expansion */
instrlen = strlen((char *)parse)+1+ slashcount*4;
newk->english = (char *) malloc(instrlen);
if(newk->english == NULL){
perror("Cannot allocate memory for translation table\n");
exit(errno);
}
if(StripSlash(newk->english, parse)!=0){
fprintf(stderr, "bad line: %s\n", instring);
}
/* Success! Set pointers appropriately */
newk->kdrill_index=nextindex;
translations[nextindex++] = newk;
if(lastk != NULL)
{
lastk->nextk = newk;
}
lastk = newk;
newk = NULL;
}
if(isapipe(fp)){
pclose(fp);
} else {
fclose(fp);
}
if(nextindex != MAXKANJIALLOWED+1)
{
highestkanji = nextindex-1;
}
puts("");
puts("NOTE: an \"infinity\" sign means there is no kanji.");
puts(" Switch to \"show meaning\" option to show alternates.");
return;
}
/* skipfromthree:
 *  lets make sure we have one single unified skip encoding here!
 *
 *  Packs the three numbers of a SKIP code ("Pone-two-three" in
 *  kanjidic) into one short, in nibble positions [1][2][3][3]:
 *
 *    one    0..0xf   top nibble
 *    two    0..0xf   second nibble
 *    three  0..0xff  low byte
 *
 *  Returns the packed value, or 0 if any number is out of range.
 *  (With one >= 8 the packed value has bit 15 set, so it comes back
 *  as a negative short on the usual 16-bit-short platforms.)
 */
short skipfromthree(int one, int two, int three){
    /* Range check BEFORE any shifting: left-shifting a negative int
     * is undefined.
     */
    if((one < 0) || (one > 0xf) ||
       (two < 0) || (two > 0xf) ||
       (three < 0) || (three > 0xff))
    {
#ifdef DEBUG
        printf("corrupted SKIP ('Px-x-x') entry: %d-%d-%d\n",
            one, two, three);
#endif
        return 0;
    }
    return (short)(((one<<12) | (two<<8) | three) & 0xffff);
}
/* skip_read_field:
 *  Helper for parseskip(): reads one unsigned decimal number at
 *  *cursor, stores it in *value and leaves *cursor on the first char
 *  after the digits.
 *  Returns 1 on success, 0 (nothing changed) if there is no digit.
 */
static int skip_read_field(char **cursor, int *value)
{
    char *p = *cursor;
    int n = 0;

    if((*p < '0') || (*p > '9'))
        return 0;
    while((*p >= '0') && (*p <= '9')){
        /* Stop accumulating once the number is hopelessly big:
         * that avoids int overflow, and skipfromthree() rejects
         * anything above 0xff anyway.
         */
        if(n < 1000)
            n = n*10 + (*p - '0');
        p++;
    }
    *value = n;
    *cursor = p;
    return 1;
}

/* parseskip
 *  Take a string pointing to the first char AFTER the "P", in
 *  kanjidic.
 *  So we expect a string like "4-5-11 xxx xxx xxx"
 *
 *  We will then convert the three numbers into single byte values,
 *  and put them in the short we return.
 *  In hex, with a full short being [f][f][f][f], that would look like
 *  [1][2][3][3], in nibble positions.
 *  Although you really shouldn't care what we do with it, just
 *  remember that it is a short. We call skipfromthree(), and so
 *  should anything else!
 *
 *  Returns 0 if the string is not three numbers separated by '-',
 *  or if skipfromthree() rejects the numbers. We never look beyond
 *  the end of a string that is too short.
 */
short parseskip(char *input)
{
    int one, two, three;
    int ok;

    ok = skip_read_field(&input, &one) && (*input == '-');
    if(ok){
        input++;
        ok = skip_read_field(&input, &two) && (*input == '-');
    }
    if(ok){
        input++;
        ok = skip_read_field(&input, &three);
    }
    if(!ok)
    {
#ifdef DEBUG
        puts("corrupted SKIP ('Px-x-x') entry");
#endif
        return 0;
    }
    return skipfromthree(one, two, three);
}
/* readstructs:
* the main dictionary reading routine for "kanjidic".
* Fills in the global translationstruct with
* all that is available for each selected kanji, in
* Grade, "pronunciation", english translation, and
* frequency of use (by native speakers)
*/
void readstructs(){
unsigned char instring[MAXLINELEN];
char dict[200];
FILE *fp;
TRANSLATION newk=NULL,lastk=NULL;
GetXtrmString("kdictfile","Kdictfile",dict);
dictname = dict;
#ifdef DEBUG
printf("kdictfile from resources is\" %s\"\n",dictname);
#endif
if(strncmp(dictname,"none",4)==0){
fprintf(stderr,"kdictfile set to 'none'. Skippping.\n");
return;
}
dictname=malloc(strlen(dict)+1);
strcpy(dictname, dict);
fp = open_compressed(dictname);
if(fp == NULL)
{
fprintf(stderr,"Cannot open kanjidic file %s. Skipping.\n",
dictname);
return;
}
printf("Opened dictionary %s \n",dictname);
if(fp ==NULL){
fprintf(stderr,"Dictionary not found\n");
exit(-1);
}
while (getline(fp,instring) != 0) {
int Kanji;
int freq,grade,N,U,H,Q,SKIP;
unsigned char *parse;
BYTE strokes;
int instrlen; /* length of pronunciation */
if(strlen((char *)instring) <10) continue;
/*try to get kanji Index right away */
#define BROKENFONTS 0
Kanji = xtoi((char *)&instring[2]) + (BROKENFONTS);
/* skip comments, kanji not specified in
* the usefile, and invalid single kanji
*/
if(Kanji < MINKANJIALLOWED) {
continue;
}
if(Kanji >MAXKANJIALLOWED) {
continue;
}
parse = &instring[2];
if(parse == NULL){
continue;
}
/* now parse for grade level, frequency, and english */
freq = grade = N = U = H = SKIP=0;
strokes=0; Q = -1; /* remember, "0000" IS a valid Qval!*/
nextword(&parse);
/* Check for high bit set, which means
* start of kana definition of kana.
* We cheat a bit, and let this loop skip over
* numbers by the fact that they don't match
* the case statements.
*/
while ( (*parse < 127) && (*parse != '{') ) {
switch(*parse){
case 'F':
freq = atoi((char *)&parse[1]);
break;
case 'G':
grade = atoi((char *)&parse[1]);
break;
case 'H':
H = atoi((char *)&parse[1]);
break;
case 'N':
N = atoi((char *)&parse[1]);
break;
case 'P':
SKIP = parseskip((char *) &parse[1]);
break;
case 'Q':
Q = atoi((char *)&parse[1]);
break;
case 'S':
strokes= atoi((char *)&parse[1]);
break;
case 'U':
U = xtoi((char *)&parse[1]);
if(U&0xffff0000)
{
printf("got hi U: %x\n",
U);
}
break;
default:
parse++;
break;
}
nextword(&parse);
} /* while != '{' */
/**********************************************
* Now we know that we have a useable/wanted *
* dictionary definition *
*********************************************/
if((lowestkanji==highestkanji) && (highestkanji==0)){
lowestkanji = highestkanji = Kanji;
} else{
if(Kanji < lowestkanji) lowestkanji = Kanji;
if (Kanji > highestkanji) highestkanji = Kanji;
}
lastk = newk;
newk = (struct translationstruct *)
malloc(sizeof(struct translationstruct));
if (newk == NULL){
perror("Cannot allocate memory for translation table\n");
exit(errno);
}
newk->Sindex=SKIP;
newk->Qindex=Q;
newk->Uindex=U;
newk->Hindex=H;
newk->Nindex=N;
newk->frequency = freq;
newk->grade_level = grade;
newk->Strokecount=strokes;
newk->incorrect=0;
newk->kanji=0;
newk->pronunciation=0;
newk->nextk = NULL;
#ifdef DEBUG
printf("Q=%d, U=%d, freq=%d\n", Q, U, freq);
#endif
newk->pronunciation = ReadPronunciation(&parse);
if(newk->pronunciation == 0){
free(newk);
newk = lastk;
continue;
} else {
XChar2b buff[2];
buff[0].byte1 = (Kanji & 0xff00) >> 8;
buff[0].byte2 = (Kanji & 0xff);
buff[1].byte1 = 0;
buff[1].byte2 = 0;
newk->kanji = dup_16(buff);
}
if(lastk != NULL)
lastk->nextk = newk;
instrlen = strlen((char *)parse)+1;
newk->english = (char *) malloc(instrlen);
if(newk->english == NULL){
perror("Cannot allocate memory for translation table\n");
exit(errno);
}
StripBrackets(newk->english, parse);
newk->kdrill_index=Kanji;
translations[Kanji] = newk;
numberofkanji++;
if(numberofkanji%1000 == 0)
{
putchar('.');
fflush(stdout);
}
} /* and repeat until end of file */
puts("");
if(isapipe(fp)){
pclose(fp);
} else {
fclose(fp);
}
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */