/* * This file is for just setting up the structs, etc */ /* stdio SHOULD get included by Xos.h or something */ /* but it doesn't with sunos, at least */ #include #include /* for popen, and other things */ #include #include #include /* handles bzero redefine stuff */ #include #include #include #include #include #include #include "defs.h" #include "utils.h" #include "externs.h" /* translations[] keeps track of which kanji it is okay to test the * user on. Likewise with numberofkanji, highest, and lowest. * YES, it is best to keep in a large array, otherwise * it would be difficult to switch between grade levels. */ struct translationstruct *translations[MAXTRANSLATIONSALLOWED]; int numberofkanji=0, highestkanji=0, lowestkanji=0; static char *dictname=NULL; static char *edictname=NULL; /* getline: * reads a line (from dictionary). * Deals with 8-bit char reads (or attempts to) * Also attempts to deal with the problem of variable line length. * Reads in chunks, looking for newline. * Note the MAXLINELEN define. We DO have a limit. We do NOT use * malloc * * Copies a line from FILE * to passed (unsigned char *) * * Returns true (1) if read a line * Returns false (0) if fail; * * Used in "readstructs()", below. Also "readedict()" * * Note that this expects static global getline_inptr to be NULL * if we have just started reading a stream; */ int getline(FILE *fp,unsigned char *s) { char * val; char *endparse; val=fgets(s,MAXLINELEN,fp); if(val==NULL) return 0; /* probably EOF. We dont care */ endparse=& s[strlen(s)-1]; switch(*endparse){ case 10: case 13: *endparse='\0'; } return 1; } /* StripBrackets: * Gets rid of those annoying {enlish}{english2} brackets. * PRESUMES first char of source is '{'!! * Well, actually, it nicely sets a null string if otherwise. 
* See also StripSlash, below */
/* More on StripBrackets:
 *	Copies source to dest, dropping the leading '{' and every '}',
 *	and turning each later '{' into ": ", so that
 *	"{one}{two}" becomes "one: two".
 *	Copying stops at the first newline or at the end of the string.
 *
 *	dest must be big enough for the result. Every '{' after the
 *	first one GROWS the text by one char, so for arbitrary input
 *	the safe size is 2*strlen(source)+1.
 */
void StripBrackets(char *dest,unsigned char *source)
{
	unsigned char *parse;

	if(source[0] != '{'){
		dest[0] = '\0';
		return;
	}

	/* Test each char BEFORE consuming it, so that a bare "{" (or
	 * "{" followed directly by a newline) can never make us step
	 * over the terminator.
	 */
	for(parse = &source[1]; (*parse != '\n') && (*parse != '\0'); parse++){
		switch(*parse){
			case '{':
				*dest++ = ':';
				*dest++ = ' ';
				break;
			case '}':
				break;
			default:
				*dest++ = (char)*parse;
				break;
		}
	}
	*dest = '\0';
}

/* StripSlash
 *	Gets rid of /english/english2/  Slashes.
 *	Copies the cleaned up version of source, to topdest
 *
 *	This is for readedict. Probably nothing else should use it
 *	Modeled directly after StripBrackets
 *	PRESUMES first char of source is '/'!!
 *	Then looks for LAST '/', and copies what lies between the two.
 *	Any '/' in the middle of the copied text is turned into ':'.
 *
 *	On a bad line, topdest[0] is set to '\0', so the caller always
 *	gets back a valid (possibly empty) string.
 *
 *	topdest must have room for strlen(source) bytes.
 *
 *	Source is actually assumed to be regular ascii signed char,
 *	but declared as unsigned to stop compiler warnings.
 *
 * return 0 OKAY, 1 bad line
 */
int StripSlash(char *topdest,unsigned char *source)
{
	char *src = (char *)source;
	char *lastslash;
	char *dest;
	size_t englen;

	/* default result, for every "bad line" exit below */
	topdest[0] = '\0';

	if(src[0] != '/'){
		return 1;
	}

	/* cannot be NULL: we know src[0] is a '/' */
	lastslash = strrchr(src, '/');
	if((lastslash - src) < 2){
		fprintf(stderr,"Error: english part too short\n");
		fprintf(stderr,"%s\n", src);
		return 1;
	}

	/* text strictly between the first and the last slash */
	englen = (size_t)(lastslash - src) - 1;
	memcpy(topdest, &src[1], englen);
	topdest[englen] = '\0';

	/* we've copied the relevant part over to topdest. Now rewrite
	 * the separators in-place
	 */
	for(dest = strchr(topdest, '/'); dest != NULL; dest = strchr(dest, '/')){
		*dest = ':';
	}

	return 0;
}

/* Given a translation, return the index into translations[] that it
 * sits at (as recorded in its kdrill_index field).
 */
int trans_to_index(TRANSLATION trans)
{
	return trans->kdrill_index;
}

/* read in kanji/kana part of edictfile line.
* format is: * * KANA /english_1/.../ * * or * * KANJI [KANA] /english_1/english_2/.../ * * */ void ReadEdictPron(unsigned char **Pstring, struct translationstruct *trans) { /* note that MAXLINELEN means we canot possibly run out of space */ XChar2b kbuff[MAXLINELEN]; XChar2b *kptr = kbuff; unsigned char *parse = *Pstring; /* Read in a 16-bit string. * We dont know if its kana or kanji yet */ while(*parse && (*parse != '/')) { switch(*parse) { case ' ': /* 0x2121 is ' ' */ kptr->byte1 = 0x21; kptr->byte2 = 0x21; kptr++; parse++; break; case '[': /* oops.. the kanji/kana switch */ /* save what must be kanji, then start * on kana */ kptr->byte1 = 0; kptr->byte2 = 0; trans->kanji = dup_16(kbuff); kptr = kbuff; /* now reset buffer, and read in another char16 * string */ parse++; break; case ']': parse++; while(*parse && (*parse != '/')) parse++; /* and then we will fall out of the top loop */ break; default: kptr->byte1= (*parse++ & 0x7f); kptr->byte2= (*parse++ & 0x7f); kptr++; } } /* when we come out here, we will ALWAYS have kana in * the kbuff */ kptr->byte1 = 0; kptr->byte2 = 0; trans->pronunciation = dup_16(kbuff); *Pstring = parse; } /* Okay, it's not actually pronunciation we're reading in * We are reading the "on-yoni" and "kun-yoni" readings * in kanjidic. Also, the optional okurigami. * * Format: * reading{.oku} [reading{.oku}] ... */ /* 0x2500 stuff is kanakana? (ON?) * 0x2400 is hiragana? (KUN?) * * We need to assume * */ XChar2b * ReadPronunciation(unsigned char **Pstring) { XChar2b kbuff[MAXLINELEN]; XChar2b *kptr = kbuff; unsigned char *parse = *Pstring; enum {ERROR,READING, OKURIGANA,BLANK, DONE}; int state=BLANK; if(*parse == '{'){ /* only english exists, * (no kanji, even) * so set character to be unusable. */ return 0; } while(*parse == ' ') parse++; /* THIS is going to get yeuky. * We are going to parse a line segment which has * reading.oku pairs. 
* This is REALLY annoying, because the line jumps between * 8 -bit and 16-bit chars */ /* okay, bad practice... you tell me what would be better :-/ */ while(1){ /* bug in gcc? If we put * int state=BLANK; * here, it gets reset each time through the while loop */ if(kptr >&kbuff[MAXLINELEN]){ fprintf(stderr,"ERROR! overflow reading in kanjidic\n"); fprintf(stderr,"%s\n",*Pstring); return 0; } switch(*parse){ case '.': parse++; /* we ALWAYS need to close this off later */ state = OKURIGANA; /* open paren */ kptr->byte1 = 0x21; kptr->byte2 = 0x4a; kptr++; break; case '-': parse++; #ifdef USEEXTRABLANKS if(state == BLANK){ kptr->byte1 = 0x21; kptr->byte2 = 0x21; kptr++; } #endif kptr->byte1 = 0x21; kptr->byte2 = 0x41; kptr++; #ifdef USEEXTRABLANKS if(state != BLANK){ kptr->byte1 = 0x21; kptr->byte2 = 0x21; kptr++; } #endif continue; /* start at top of while again */ case '\0': case '\n': case '\r': case '{': if(state == OKURIGANA){ /* close paren */ kptr->byte1 = 0x21; kptr->byte2 = 0x4b; kptr++; } state = DONE; break; case ' ': if(state == OKURIGANA){ /* close paren */ kptr->byte1 = 0x21; kptr->byte2 = 0x4b; kptr++; } state = BLANK; parse++; kptr->byte1 = 0x21; kptr->byte2 = 0x21; kptr++; break; default: if(*parse <127){ if(state == OKURIGANA){ /* close paren */ kptr->byte1 = 0x21; kptr->byte2 = 0x4b; kptr++; puts("Kdrill.. error on kana read-in... 
"); puts("Expecting high bit char to start after '.'"); printf("%s\n",*Pstring); } state = BLANK; parse++; } else { if(state != OKURIGANA) state = READING; } break; } if(state == DONE){ break; } if(state == BLANK) continue; /* else read in another char */ kptr->byte1= (*parse++ & 0x7f); kptr->byte2= (*parse++ & 0x7f); kptr++; } /* while(1) */ /* copy out to struct, and exit */ kptr->byte1 = 0; kptr->byte2 = 0; *Pstring = parse; return dup_16(kbuff); } /* readedict() * * Read in "edict.gz" if it exists * [readstructs handles kanjidic reading] * * We only make very partial entries for edict entries * We just fill out "english" and "pronunciation" entries. * * If we cannot extract a kanji entry, the kanji pointer of a * translation will be set to a shared string of '8' on its side * * Note that we always start entries at index * translations[MAXKANJIALLOWED+1]. This is to attempt to keep * usefiles working * */ void readedict() { unsigned char instring[MAXLINELEN]; unsigned char *parse, *slashparse; int slashcount; char edict[MAXLINELEN]; /*PATH to dictionary */ FILE *fp; TRANSLATION newk=NULL,lastk; int nextindex = MAXKANJIALLOWED+1; int linecount=0; static XChar2b no_kanji[2]= { {0x0, 0x0}, {0x0, 0x0} }; no_kanji[0].byte1 = (NOKANJI >> 8); no_kanji[0].byte2 = (NOKANJI & 0xff); /* the following will be NULL if kanjidic not read in */ lastk = translations[highestkanji]; GetXtrmString("edictfile","Edictfile",edict); edictname = edict; if(strncmp(edictname,"none",4)==0){ fprintf(stderr,"edictfile set to 'none'. Skippping.\n"); return; } edictname=malloc(strlen(edict)+1); strcpy(edictname, edict); fp = open_compressed(edictname); if(fp == NULL) { fprintf(stderr,"Cannot open edict file %s. 
Skipping.\n", edictname); return; } printf("Opened dictionary %s \n",edictname); if(highestkanji == 0) { lowestkanji = nextindex; } while(getline(fp, instring) != 0) { int instrlen; linecount++; if(linecount%1000 == 0) { putchar('.'); fflush(stdout); } if(newk == NULL) { newk = (struct translationstruct *) malloc(sizeof(struct translationstruct)); if(newk == NULL) { fprintf(stderr,"OUT OF MEMORY!!\n"); exit(errno); } } bzero(newk, sizeof(*newk)); /* 1- read first part * 2- read optional [part] * 3- read english part */ parse = instring; newk->kanji = no_kanji; ReadEdictPron(&parse, newk); if(newk->pronunciation == NULL) { fprintf(stderr,"Error reading edict\n"); newk = NULL; continue; } while((*parse != '/') && *parse) { parse++; } slashcount=1; slashparse = parse; while(*slashparse++) { if(*slashparse =='/') slashcount++; } /* need extra space for expansion */ instrlen = strlen((char *)parse)+1+ slashcount*4; newk->english = (char *) malloc(instrlen); if(newk->english == NULL){ perror("Cannot allocate memory for translation table\n"); exit(errno); } if(StripSlash(newk->english, parse)!=0){ fprintf(stderr, "bad line: %s\n", instring); } /* Success! Set pointers appropriately */ newk->kdrill_index=nextindex; translations[nextindex++] = newk; if(lastk != NULL) { lastk->nextk = newk; } lastk = newk; newk = NULL; } if(isapipe(fp)){ pclose(fp); } else { fclose(fp); } if(nextindex != MAXKANJIALLOWED+1) { highestkanji = nextindex-1; } puts(""); puts("NOTE: an \"infinity\" sign means there is no kanji."); puts(" Switch to \"show meaning\" option to show alternates."); return; } /* lets make sure we have one single unified skip encoding here! 
*/ short skipfromthree(int one, int two, int three){ int SKIPnum = (one<<12) | (two<<8) | three; if((one>0xf) | (two>0xf) | (three>0xff) | (SKIPnum <0) ) { #ifdef DEBUG printf("corrupted SKIP ('Px-x-x') entry: %d-%d-%d\n", one, two, three); #endif return 0; } return (short)(SKIPnum&0xffff); } /* parseskip * Take a string pointing to the first char AFTER the "P", in * kanjidic. * So we expect a string like "4-5-11 xxx xxx xxx" * * We will then convert the three numbers into single byte values, * and put them in the short we return. * In hex, with a full short being [f][f][f][f], that would look like * [1][2][3][3], in nibble positions. * Although you really shouldn't care what we do with it, just remember that it * is a short. We call skipfromthree(), and so should anything else! * * */ short parseskip(char *input) { int one, two, three; one = atoi(input); input++; if(*input != '-') { #ifdef DEBUG puts("corrupted SKIP ('Px-x-x') entry"); #endif return 0; } input++; two = atoi(input); input++; if(*input != '-') input++; if(*input != '-') { #ifdef DEBUG puts("corrupted SKIP ('Px-x-x') entry"); #endif return 0; } input++; three = atoi(input); return skipfromthree(one, two, three); } /* readstructs: * the main dictionary reading routine for "kanjidic". * Fills in the global translationstruct with * all that is available for each selected kanji, in * Grade, "pronunciation", english translation, and * frequency of use (by native speakers) */ void readstructs(){ unsigned char instring[MAXLINELEN]; char dict[200]; FILE *fp; TRANSLATION newk=NULL,lastk=NULL; GetXtrmString("kdictfile","Kdictfile",dict); dictname = dict; #ifdef DEBUG printf("kdictfile from resources is\" %s\"\n",dictname); #endif if(strncmp(dictname,"none",4)==0){ fprintf(stderr,"kdictfile set to 'none'. Skippping.\n"); return; } dictname=malloc(strlen(dict)+1); strcpy(dictname, dict); fp = open_compressed(dictname); if(fp == NULL) { fprintf(stderr,"Cannot open kanjidic file %s. 
Skipping.\n", dictname); return; } printf("Opened dictionary %s \n",dictname); if(fp ==NULL){ fprintf(stderr,"Dictionary not found\n"); exit(-1); } while (getline(fp,instring) != 0) { int Kanji; int freq,grade,N,U,H,Q,SKIP; unsigned char *parse; BYTE strokes; int instrlen; /* length of pronunciation */ if(strlen((char *)instring) <10) continue; /*try to get kanji Index right away */ #define BROKENFONTS 0 Kanji = xtoi((char *)&instring[2]) + (BROKENFONTS); /* skip comments, kanji not specified in * the usefile, and invalid single kanji */ if(Kanji < MINKANJIALLOWED) { continue; } if(Kanji >MAXKANJIALLOWED) { continue; } parse = &instring[2]; if(parse == NULL){ continue; } /* now parse for grade level, frequency, and english */ freq = grade = N = U = H = SKIP=0; strokes=0; Q = -1; /* remember, "0000" IS a valid Qval!*/ nextword(&parse); /* Check for high bit set, which means * start of kana definition of kana. * We cheat a bit, and let this loop skip over * numbers by the fact that they don't match * the case statements. 
*/ while ( (*parse < 127) && (*parse != '{') ) { switch(*parse){ case 'F': freq = atoi((char *)&parse[1]); break; case 'G': grade = atoi((char *)&parse[1]); break; case 'H': H = atoi((char *)&parse[1]); break; case 'N': N = atoi((char *)&parse[1]); break; case 'P': SKIP = parseskip((char *) &parse[1]); break; case 'Q': Q = atoi((char *)&parse[1]); break; case 'S': strokes= atoi((char *)&parse[1]); break; case 'U': U = xtoi((char *)&parse[1]); if(U&0xffff0000) { printf("got hi U: %x\n", U); } break; default: parse++; break; } nextword(&parse); } /* while != '{' */ /********************************************** * Now we know that we have a useable/wanted * * dictionary definition * *********************************************/ if((lowestkanji==highestkanji) && (highestkanji==0)){ lowestkanji = highestkanji = Kanji; } else{ if(Kanji < lowestkanji) lowestkanji = Kanji; if (Kanji > highestkanji) highestkanji = Kanji; } lastk = newk; newk = (struct translationstruct *) malloc(sizeof(struct translationstruct)); if (newk == NULL){ perror("Cannot allocate memory for translation table\n"); exit(errno); } newk->Sindex=SKIP; newk->Qindex=Q; newk->Uindex=U; newk->Hindex=H; newk->Nindex=N; newk->frequency = freq; newk->grade_level = grade; newk->Strokecount=strokes; newk->incorrect=0; newk->kanji=0; newk->pronunciation=0; newk->nextk = NULL; #ifdef DEBUG printf("Q=%d, U=%d, freq=%d\n", Q, U, freq); #endif newk->pronunciation = ReadPronunciation(&parse); if(newk->pronunciation == 0){ free(newk); newk = lastk; continue; } else { XChar2b buff[2]; buff[0].byte1 = (Kanji & 0xff00) >> 8; buff[0].byte2 = (Kanji & 0xff); buff[1].byte1 = 0; buff[1].byte2 = 0; newk->kanji = dup_16(buff); } if(lastk != NULL) lastk->nextk = newk; instrlen = strlen((char *)parse)+1; newk->english = (char *) malloc(instrlen); if(newk->english == NULL){ perror("Cannot allocate memory for translation table\n"); exit(errno); } StripBrackets(newk->english, parse); newk->kdrill_index=Kanji; translations[Kanji] = 
newk; numberofkanji++; if(numberofkanji%1000 == 0) { putchar('.'); fflush(stdout); } } /* and repeat until end of file */ puts(""); if(isapipe(fp)){ pclose(fp); } else { fclose(fp); } }