/* ports//japanese/kdrill/work/kdrill6.4/readfile.c */

/*
 * This file is for just setting up the structs, etc
 */
/* stdio SHOULD get included by Xos.h or something */
/* but it doesn't with sunos, at least */

#include <errno.h>
#include <stdio.h>	 /* for popen, and other things */
#include <stdlib.h>
#include <ctype.h>
#include <Xfuncs.h>	/* handles bzero redefine stuff */
#include <Xlib.h>
#include <Xatom.h>
#include <Xutil.h>
#include <Intrinsic.h>
#include <StringDefs.h>
#include <Xos.h>

#include "defs.h"
#include "utils.h"
#include "externs.h"

/* translations[] keeps track of which kanji it is okay to test the
 *	user on. Likewise with numberofkanji, highest, and lowest.
 *	YES, it is best to keep in a large array, otherwise
 *	it would be difficult to switch between grade levels.
 *
 *	In this file, readstructs() stores each kanjidic entry at
 *	translations[<kanji code>], and readedict() appends its entries
 *	starting at translations[MAXKANJIALLOWED+1].
 *
 *	numberofkanji is incremented once per entry stored by readstructs().
 *	lowestkanji / highestkanji are the smallest and largest indices
 *	recorded by the two readers.
 */
struct translationstruct *translations[MAXTRANSLATIONSALLOWED];
int numberofkanji=0, highestkanji=0, lowestkanji=0;

/* Dictionary file names as obtained from the "kdictfile" and "edictfile"
 * resources; set by readstructs() and readedict() respectively.
 */
static char *dictname=NULL;
static char *edictname=NULL;

/* getline:
 *	Reads one line (from a dictionary) into the caller's buffer and
 *	removes the trailing line terminator.
 *
 *	fp	stream to read from
 *	s	destination buffer; must have room for MAXLINELEN bytes.
 *		Declared unsigned because the dictionaries contain 8-bit
 *		characters.
 *
 *	We DO have a line length limit (MAXLINELEN) and do NOT use malloc:
 *	a longer input line is simply returned in several pieces by
 *	successive calls, which is how fgets() behaves.
 *
 *	Returns true (1) if a line was read
 *	Returns false (0) on EOF or read error
 *
 *	Used in "readstructs()", below. Also "readedict()"
 *
 *	NOTE(review): the name clashes with the POSIX getline() that some
 *	<stdio.h> versions declare; renaming would change the interface,
 *	so that is left alone here.
 */

int getline(FILE *fp,unsigned char *s)
{
	size_t len;

	if(fgets((char *)s,MAXLINELEN,fp)==NULL)
		return 0; /* probably EOF. We dont care */

	/* Drop trailing LF and/or CR so that "\n", "\r" and "\r\n" line
	 * endings all give the same result.  The len>0 test matters: a
	 * line that starts with a NUL byte has length 0, and s[len-1]
	 * would then be outside the buffer.
	 */
	len=strlen((char *)s);
	while((len>0) && ((s[len-1]=='\n') || (s[len-1]=='\r')))
		s[--len]='\0';

	return 1;

}

/* StripBrackets:
 *	Gets rid of those annoying {english}{english2} brackets.
 *	"{one}{two}" becomes "one: two".
 *
 *	dest	output buffer; must hold at least strlen(source)+1 bytes.
 *		Always left NUL terminated.
 *	source	text that should start with '{'.  If it does not, dest
 *		becomes the empty string.
 *
 *	Copying stops at the first newline or at the end of the string.
 *	See also StripSlash, below
 */
void StripBrackets(char *dest,unsigned char *source)
{
	unsigned char *parse;
	size_t room;

	if(source[0] != '{'){
		dest[0] = '\0';
		return;
	}

	/* Well-formed input never grows: every "}{" pair (2 bytes) turns
	 * into ": " (2 bytes) and the outermost braces vanish.  Only
	 * unbalanced input such as "{{{" could expand, so cap the output at
	 * strlen(source) characters to stay inside the documented buffer.
	 */
	room = strlen((char *)source);

	/* Test for the terminator BEFORE looking at each byte, so that a
	 * bare "{" yields "" instead of walking past the end of source.
	 */
	for(parse = &source[1]; (*parse != '\0') && (*parse != '\n'); parse++){
		if(*parse == '}')
			continue;	/* closing brace: nothing to emit */

		if(*parse == '{'){
			/* start of the next meaning: emit a separator */
			if(room < 2)
				break;
			*dest++ = ':';
			*dest++ = ' ';
			room -= 2;
			continue;
		}

		if(room < 1)
			break;
		*dest++ = (char)*parse;
		room--;
	}
	*dest = '\0';
	return;
}
/* StripSlash
 *	Gets rid of /english/english2/ slashes.
 *	Copies the cleaned up version of source to topdest:
 *	"/one/two/" becomes "one:two".
 *
 *	This is for readedict. Probably nothing else should use it
 *	Modeled directly after StripBrackets
 *
 *	topdest	output buffer, at least strlen(source) bytes.  It is
 *		ALWAYS left holding a valid string; on failure that is
 *		the empty string.
 *	source	text that must start with '/'.  Everything between that
 *		first '/' and the LAST '/' is copied; inner '/' become ':'.
 *
 *	Source is actually assumed to be regular ascii signed char,
 *	but declared as unsigned to stop compiler warnings.
 *
 * return 0 OKAY, 1 bad line
 */
int StripSlash(char *topdest,unsigned char *source)
{
	char *src = (char *)source;
	char *lastslash;
	char *dest;
	size_t englen;

	/* Make the output well defined on every return path.  (Assigning
	 * '\0' to the pointer itself would only null our local copy and
	 * leave the caller's buffer untouched.)
	 */
	topdest[0] = '\0';

	if(src[0] != '/'){
		return 1;
	}

	/* Need at least "/x/": the closing slash must sit at index 2 or
	 * later, otherwise there is no english text in between.
	 */
	lastslash = strrchr(src, '/');
	if(lastslash < &src[2]){
		fprintf(stderr,"Error: english part too short\n");
		fprintf(stderr,"%s\n", src);
		return 1;
	}

	englen = (size_t)(lastslash - src) - 1;
	strncpy(topdest, &src[1], englen);
	topdest[englen] = '\0';

	/* we've copied the relevant part over to topdest. Now rewrite
	 * the separators in-place
	 */
	for(dest = strchr(topdest, '/'); dest != NULL; dest = strchr(dest, '/')){
		*dest = ':';
	}

	return 0;
}

/* trans_to_index:
 *	Report which slot of translations[] an entry lives in.
 *	The slot number is simply read back from the entry's own
 *	kdrill_index field; no search of the table is done.
 */
int trans_to_index(TRANSLATION trans)
{
	int slot = trans->kdrill_index;

	return slot;
}


/* read in kanji/kana part of edictfile line.
 * format is:
 *
 * KANA /english_1/.../
 *
 * or
 *
 * KANJI [KANA] /english_1/english_2/.../
 *
 * Pstring	in: start of the line.  out: where parsing stopped, which
 *		is the first '/' or the end of the string.
 * trans	entry to fill in.  trans->pronunciation always receives a
 *		dup_16() copy of the kana.  trans->kanji is only written
 *		when a '[' is found, i.e. when the line had a kanji part.
 */
void ReadEdictPron(unsigned char **Pstring, struct translationstruct *trans)
{
	/* One cell is produced per input byte at most, so a line that came
	 * from a MAXLINELEN buffer cannot overflow this.  The loop still
	 * checks, in case a caller passes something longer.
	 */
	XChar2b kbuff[MAXLINELEN];
	XChar2b *kptr = kbuff;
	unsigned char *parse = *Pstring;

	/* Read in a 16-bit string.
	 * We dont know if its kana or kanji yet
	 */
	while(*parse && (*parse != '/') && (kptr < &kbuff[MAXLINELEN-1]))
	{
		switch(*parse)
		{
		   case ' ':
			/* 0x2121 is ' ' */
			kptr->byte1 = 0x21;
			kptr->byte2 = 0x21;
			kptr++;
			parse++;
			break;

		   case '[':
		        /* oops.. the kanji/kana switch */
			/* save what must be kanji, then start
			 * on kana
			 */
			kptr->byte1 = 0;
			kptr->byte2 = 0;

			trans->kanji =  dup_16(kbuff);
			kptr = kbuff;
			/* now reset buffer, and read in another char16
			 * string
			 */
			parse++;
			break;
		   case ']':
			parse++;
			while(*parse && (*parse != '/'))
				parse++;
			/* and then we will fall out of the top loop  */
			break;

		   default:
			if(parse[1] == '\0'){
				/* Half a character at the end of the line.
				 * Drop it; reading a second byte here
				 * would step over the terminator.
				 */
				parse++;
				break;
			}
			/* two bytes make one character; strip high bits */
			kptr->byte1= (*parse++ & 0x7f);
			kptr->byte2= (*parse++ & 0x7f);
			kptr++;
		}
	}

	/* when we come out here, we will ALWAYS have kana in
	 * the kbuff
	 */

	kptr->byte1 = 0;
	kptr->byte2 = 0;

	trans->pronunciation =  dup_16(kbuff);


	*Pstring = parse;
}


/* Okay, it's not actually pronunciation we're reading in
 * We are reading the "on-yomi" and "kun-yomi" readings
 * in kanjidic. Also, the optional okurigana.
 *
 * Format:
 *     reading{.oku} [reading{.oku}] ...
 *
 * The result is a 16-bit string in which:
 *   - each reading character is the two input bytes with the high
 *     bits stripped,
 *   - okurigana (the part after '.') is wrapped in 0x214a / 0x214b
 *     (open / close paren),
 *   - blanks become 0x2121,
 *   - '-' becomes 0x2141.
 *
 * Pstring	in: start of the readings.  out: where parsing stopped
 *		(the '{' that starts the english part, or end of line).
 *		Not updated when 0 is returned.
 *
 * Returns a dup_16() copy of the converted string, or 0 if the text
 * starts with '{' (english only) or would not fit in the work buffer.
 */


/* 0x2500 stuff is katakana? (ON?)
 * 0x2400 is hiragana?  (KUN?)
 */
XChar2b * ReadPronunciation(unsigned char **Pstring)
{

	XChar2b kbuff[MAXLINELEN];
	XChar2b *kptr = kbuff;
	unsigned char *parse = *Pstring;
	enum {ERROR,READING, OKURIGANA,BLANK, DONE};
	int state=BLANK;

	if(*parse == '{'){
		/* only english exists,
		 *  (no kanji, even)
		 *   so set character to be unusable.
		 */
		return NULL;
	}
	while(*parse == ' ')
		parse++;

	/* THIS is going to get yeuky.
	 *  We are going to parse a line segment which has
	 *  reading.oku  pairs.
	 * This is REALLY annoying, because the line jumps between
	 * 8 -bit and 16-bit chars
	 */

	/* okay, bad practice... you tell me what would be better :-/ */

	while(1){

		/* One pass through this loop stores at most two cells, and
		 * the terminator after the loop needs one more.  Check
		 * BEFORE storing, so that we never write outside kbuff.
		 */
		if(kptr >= &kbuff[MAXLINELEN-3]){
			fprintf(stderr,"ERROR! overflow reading in kanjidic\n");
			fprintf(stderr,"%s\n",*Pstring);
			return NULL;
		}

		switch(*parse){
			case '.':
				parse++;

				/* we ALWAYS need to close this off later */
				state = OKURIGANA;
				/* open paren */
				kptr->byte1 = 0x21;
				kptr->byte2 = 0x4a;
				kptr++;

				break;
			case '-':
				parse++;
#ifdef USEEXTRABLANKS
				if(state == BLANK){
					kptr->byte1 = 0x21;
					kptr->byte2 = 0x21;
					kptr++;
				}
#endif
				kptr->byte1 = 0x21;
				kptr->byte2 = 0x41;
				kptr++;
#ifdef USEEXTRABLANKS
				if(state != BLANK){
					kptr->byte1 = 0x21;
					kptr->byte2 = 0x21;
					kptr++;
				}
#endif
				continue;
				/* start at top of while again */

			case '\0':
			case '\n':
			case '\r':
			case '{':
				if(state == OKURIGANA){
					/* close paren */
					kptr->byte1 = 0x21;
					kptr->byte2 = 0x4b;
					kptr++;
				}
				state = DONE;
				break;

			case ' ':
				if(state == OKURIGANA){
					/* close paren */
					kptr->byte1 = 0x21;
					kptr->byte2 = 0x4b;
					kptr++;
				}
				state = BLANK;

				parse++;
				kptr->byte1 = 0x21;
				kptr->byte2 = 0x21;
				kptr++;
				break;

			default:
				if(*parse <127){
					/* stray 7-bit char: skip over it */
					if(state == OKURIGANA){
						/* close paren */
						kptr->byte1 = 0x21;
						kptr->byte2 = 0x4b;
						kptr++;
						puts("Kdrill.. error on kana read-in... ");
						puts("Expecting high bit char to start after '.'");
						printf("%s\n",*Pstring);

					}
					state = BLANK;
					parse++;
				} else {
					if(state != OKURIGANA)
						state = READING;
				}
				break;
		}

		if(state == DONE){
			break;
		}
		if(state == BLANK)
			continue;

		/* else read in another char -- but only if both of its
		 * bytes are really there.  A line ending in '.' or in half
		 * a character would otherwise make us step over the
		 * terminator.  Drop the stray byte (if any) and let the
		 * '\0' case above finish things off on the next pass.
		 */
		if((parse[0] == '\0') || (parse[1] == '\0')){
			if(parse[0] != '\0')
				parse++;
			continue;
		}
		kptr->byte1= (*parse++ & 0x7f);
		kptr->byte2= (*parse++ & 0x7f);
		kptr++;

	} /* while(1) */

	/* copy out to struct, and exit */
	kptr->byte1 = 0;
	kptr->byte2 = 0;

	*Pstring = parse;
	return dup_16(kbuff);

}


/* readedict()
 * 
 *	Read in "edict.gz" if it exists
 *	[readstructs handles kanjidic reading]
 *	
 *	We only make very partial entries for edict entries
 *	We just fill out "english" and "pronunciation" entries.
 *
 *	If we cannot extract a kanji entry, the kanji pointer of a
 *	translation will be set to a shared string of '8' on its side
 *
 *	Note that we always start entries at index
 *	translations[MAXKANJIALLOWED+1]. This is to attempt to keep
 *	usefiles working
 *	
 */
void readedict()
{
	unsigned char instring[MAXLINELEN];
	unsigned char *parse, *slashparse;
	int slashcount;
	char edict[MAXLINELEN]; /*PATH to dictionary */
	FILE *fp;
	TRANSLATION newk=NULL,lastk;
	int nextindex = MAXKANJIALLOWED+1;
	int linecount=0;
	static XChar2b no_kanji[2]=
	{
		{0x0, 0x0},
		{0x0,  0x0}
	};

	no_kanji[0].byte1 = (NOKANJI >> 8);
	no_kanji[0].byte2 = (NOKANJI & 0xff);


	/* the following will be NULL if kanjidic not read in */
	lastk = translations[highestkanji];

	GetXtrmString("edictfile","Edictfile",edict);
	edictname = edict;

	if(strncmp(edictname,"none",4)==0){
		fprintf(stderr,"edictfile set to 'none'. Skippping.\n");
		return;
	}

	edictname=malloc(strlen(edict)+1);
	strcpy(edictname, edict);
	fp = open_compressed(edictname);
	if(fp == NULL)
	{
		fprintf(stderr,"Cannot open edict file %s. Skipping.\n",
			edictname);
		return;
	}
	printf("Opened dictionary %s \n",edictname);
	if(highestkanji == 0)
	{
		lowestkanji = nextindex;
	}

	while(getline(fp, instring) != 0)
	{
		int instrlen;

		linecount++;
		if(linecount%1000 == 0)
		{
			putchar('.');
			fflush(stdout);
		}


		if(newk == NULL)
		{
			newk =  (struct translationstruct *)
				malloc(sizeof(struct translationstruct));
			if(newk == NULL)
			{
				fprintf(stderr,"OUT OF MEMORY!!\n");
				exit(errno);
			}
		}

		bzero(newk, sizeof(*newk));

		/* 1- read first part
		 * 2- read optional [part]
		 * 3- read english part
		 */

		parse = instring;

		newk->kanji = no_kanji;

		ReadEdictPron(&parse, newk);
		if(newk->pronunciation == NULL)
		{
			fprintf(stderr,"Error reading edict\n");
			newk = NULL;
			continue;
		}

		while((*parse != '/') && *parse)
		{
			parse++;
		}
		slashcount=1;
		slashparse = parse;
		while(*slashparse++)
		{
			if(*slashparse =='/')
				slashcount++;
		}
		/* need extra space for expansion */
		instrlen = strlen((char *)parse)+1+ slashcount*4;
		
		newk->english = (char *) malloc(instrlen);
		if(newk->english == NULL){
			perror("Cannot allocate memory for translation table\n");
			exit(errno);		
		}
		
		if(StripSlash(newk->english, parse)!=0){
			fprintf(stderr, "bad line: %s\n", instring);
		}

		/* Success! Set pointers appropriately */
		newk->kdrill_index=nextindex;
		translations[nextindex++] = newk;
		if(lastk != NULL)
		{
			lastk->nextk = newk;
		}
		lastk = newk;
		newk = NULL;
		
	}

	if(isapipe(fp)){
		pclose(fp);
	} else {
		fclose(fp);
	}
	if(nextindex != MAXKANJIALLOWED+1)
	{
		highestkanji = nextindex-1;
	}

	puts("");

	puts("NOTE: an \"infinity\" sign means there is no kanji.");
	puts("  Switch to \"show meaning\" option to show alternates.");

	return;

}

/* lets make sure we have one single unified skip encoding here!
 *
 * Packs the three numbers of a SKIP code into 16 bits:
 *   one   -> top nibble         (allowed 0..0xf)
 *   two   -> next nibble        (allowed 0..0xf)
 *   three -> low byte           (allowed 0..0xff)
 *
 * Returns the packed value, or 0 if any number is out of range.
 */
short skipfromthree(int one, int two, int three){

      int SKIPnum;

      /* Validate first: shifting a negative int is undefined, so the
       * value must not be built until all three parts are known good.
       */
      if((one<0) || (one>0xf) ||
         (two<0) || (two>0xf) ||
         (three<0) || (three>0xff))
      {
#ifdef DEBUG
              printf("corrupted SKIP ('Px-x-x') entry: %d-%d-%d\n",
                     one, two, three);
#endif
              return 0;
      }

      SKIPnum = (one<<12) | (two<<8) | three;

      return (short)(SKIPnum&0xffff);
}


/* parseskip_number
 * Helper for parseskip: read an unsigned decimal number at *cursor and
 * move *cursor past its digits.
 * Returns the value, or -1 (cursor untouched) if there is no digit.
 */
static int parseskip_number(char **cursor)
{
	char *p = *cursor;
	int value = 0;

	if((*p < '0') || (*p > '9'))
		return -1;

	while((*p >= '0') && (*p <= '9'))
	{
		/* stop growing once clearly out of range; this keeps the
		 * int from overflowing on absurdly long digit strings
		 */
		if(value < 10000)
			value = value*10 + (*p - '0');
		p++;
	}
	*cursor = p;
	return value;
}

/* parseskip
 * Take a string pointing to the first char AFTER the "P", in
 * kanjidic.
 * So we expect a string like "4-5-11 xxx xxx xxx"
 *
 * We will then convert the three numbers into single byte values,
 * and put them in the short we return.
 * In hex, with a full short being [f][f][f][f], that would look like
 * [1][2][3][3], in nibble positions.
 * Although you really shouldn't care what we do with it, just remember that it
 * is a short. We call skipfromthree(), and so should anything else!
 *
 * Returns 0 if the text is not three decimal numbers separated by '-',
 * or if skipfromthree() rejects the numbers.  Anything after the third
 * number is ignored.
 */
short parseskip(char *input)
{
	int one, two, three;

	/* Each number is located by scanning its digits rather than by
	 * assuming a fixed width, so short or empty input can never make
	 * us look beyond the end of the string.
	 */
	one = parseskip_number(&input);
	if((one < 0) || (*input != '-'))
	{
#ifdef DEBUG
		puts("corrupted SKIP ('Px-x-x') entry");
#endif
		return 0;
	}
	input++;

	two = parseskip_number(&input);
	if((two < 0) || (*input != '-'))
	{
#ifdef DEBUG
		puts("corrupted SKIP ('Px-x-x') entry");
#endif
		return 0;
	}
	input++;

	three = parseskip_number(&input);
	if(three < 0)
	{
#ifdef DEBUG
		puts("corrupted SKIP ('Px-x-x') entry");
#endif
		return 0;
	}

	return skipfromthree(one, two, three);
}

/* readstructs:
 *	the main dictionary reading routine for "kanjidic".
 *	Fills in the global translationstruct with
 *	all that is available for each selected kanji, in
 *	Grade, "pronunciation", english translation, and
 *	frequency of use (by native speakers)
 */
void readstructs(){
	unsigned char instring[MAXLINELEN];
	char dict[200];
	FILE *fp;
	TRANSLATION newk=NULL,lastk=NULL;

	GetXtrmString("kdictfile","Kdictfile",dict);
	dictname = dict;
#ifdef DEBUG
	printf("kdictfile from resources is\" %s\"\n",dictname);
#endif

	if(strncmp(dictname,"none",4)==0){
		fprintf(stderr,"kdictfile set to 'none'. Skippping.\n");
		return;
	}
	dictname=malloc(strlen(dict)+1);
	strcpy(dictname, dict);
	fp = open_compressed(dictname);
	if(fp == NULL)
	{
		fprintf(stderr,"Cannot open kanjidic file %s. Skipping.\n",
			dictname);
		return;
	}
	
	printf("Opened dictionary %s \n",dictname);


	if(fp ==NULL){
		fprintf(stderr,"Dictionary  not found\n");
		exit(-1);
	}

	while (getline(fp,instring) != 0) {
		int Kanji;
		int freq,grade,N,U,H,Q,SKIP;
		unsigned char *parse;
		BYTE strokes;
		int instrlen;	/* length of pronunciation */

		if(strlen((char *)instring) <10) continue;

		/*try to get kanji Index right away */

#define BROKENFONTS 0
		
		Kanji = xtoi((char *)&instring[2]) + (BROKENFONTS);


		/* skip comments, kanji not specified in
		 * the usefile, and invalid single kanji
		 */	
		if(Kanji < MINKANJIALLOWED) {
			continue;
		}
		if(Kanji >MAXKANJIALLOWED) {
			continue;
		}

		parse = &instring[2];
		if(parse == NULL){
			continue;
		}
		/* now parse for grade level, frequency, and english */
		freq = grade = N = U = H = SKIP=0;
		strokes=0; Q = -1; /* remember, "0000" IS a valid Qval!*/

		nextword(&parse);

		/* Check for high bit set, which means
		 * start of kana definition of kana.
		 * We cheat a bit, and let this loop skip over
		 * numbers by the fact that they don't match
		 * the case statements.
		 */
		while ( (*parse < 127)  && (*parse != '{') ) {
			switch(*parse){
				case 'F':
					freq = atoi((char *)&parse[1]);
					break;
				case 'G':
					grade = atoi((char *)&parse[1]);
					break;
				case 'H':
					H = atoi((char *)&parse[1]);
					break;
				case 'N':
					N = atoi((char *)&parse[1]);
					break;
				case 'P':
					SKIP = parseskip((char *) &parse[1]);
					break;
				case 'Q':
					Q = atoi((char *)&parse[1]);
					break;
				case 'S':
					strokes= atoi((char *)&parse[1]);
					break;
				case 'U':
					U = xtoi((char *)&parse[1]);
					if(U&0xffff0000)
					{
						printf("got hi U: %x\n",
						       U);
					}
					break;
				default:
					parse++;
					break;
			}
			nextword(&parse);
		} /* while != '{' */
		
		
		/**********************************************
		 *  Now we know that we have a useable/wanted *
		 *  dictionary definition                     *
		 *********************************************/
		if((lowestkanji==highestkanji) && (highestkanji==0)){
 			lowestkanji = highestkanji = Kanji;
		} else{
			if(Kanji < lowestkanji) lowestkanji = Kanji;
			if (Kanji > highestkanji) highestkanji = Kanji;
		}

		lastk = newk;
		
		newk = (struct translationstruct *)
			malloc(sizeof(struct translationstruct));
		if (newk == NULL){
			perror("Cannot allocate memory for translation table\n");
			exit(errno);
		}
		newk->Sindex=SKIP;
		newk->Qindex=Q;
		newk->Uindex=U;
		newk->Hindex=H;
		newk->Nindex=N;
		newk->frequency = freq;
		newk->grade_level = grade;
		newk->Strokecount=strokes;
		newk->incorrect=0;
		newk->kanji=0;
		newk->pronunciation=0;
		newk->nextk = NULL;
#ifdef DEBUG
		printf("Q=%d, U=%d, freq=%d\n", Q, U, freq);
#endif
		
		newk->pronunciation = ReadPronunciation(&parse);
		if(newk->pronunciation == 0){
			free(newk);
			newk = lastk;
			continue;
		} else {
			XChar2b buff[2];

			buff[0].byte1 = (Kanji & 0xff00) >> 8;
			buff[0].byte2 = (Kanji & 0xff);
			buff[1].byte1 = 0;
			buff[1].byte2 = 0;
			newk->kanji = dup_16(buff);
		}
		if(lastk != NULL)
			lastk->nextk = newk;

		instrlen = strlen((char *)parse)+1;
		newk->english = (char *) malloc(instrlen);
		if(newk->english == NULL){
			perror("Cannot allocate memory for translation table\n");
			exit(errno);		
		}

		StripBrackets(newk->english, parse);
		newk->kdrill_index=Kanji;
		translations[Kanji] = newk;
		numberofkanji++;
		if(numberofkanji%1000 == 0)
		{
			putchar('.');
			fflush(stdout);
		}

	} /* and repeat until end of file */
	puts("");

	if(isapipe(fp)){
		pclose(fp);
	} else {
		fclose(fp);
	}

}
/* syntax highlighted by Code2HTML, v. 0.9.1 */