/* Copyright 2004 Nadav Har'El and Dan Kenigsberg */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include "prefixes.c"
#include "hspell.h"

/* This little program creates aspell affix information for Hebrew according to
 * the hebrew.wgz* files. This version creates a single rule for each of
 * hspell's "word specifiers". Each rule expands to all the prefixes that
 * provide that specifier (excluding the null prefix, which is currently
 * implied). */

/*
 * Generate aspell affix data and an aspell-style word list from hspell's
 * compressed dictionary files.
 *
 * Pass 1 reads "hebrew.wgz.prefixes" one byte at a time. Every distinct byte
 * value (a "specifier", tested as a bit mask against masks_noH[]) gets one
 * PFX rule in "he_affix.dat"; rules are named 'A', 'B', ... in order of first
 * appearance.
 *
 * Pass 2 re-reads the specifiers in lock-step with the lines produced by
 * "gzip -dc hebrew.wgz|./wunzip" and prints "word/RULE" to stdout for each
 * pair, followed by every prefix that did not already appear as a word.
 *
 * Returns 0 on success and EXIT_FAILURE if a file or pipe cannot be opened,
 * memory cannot be allocated, or the input is inconsistent.
 */
int main(void) {
  /* Rule flags are single bytes starting at 'A', so at most this many
     distinct specifiers can be represented. */
  enum { MAX_RULES = 255 - 'A' + 1 };
  /* unsigned char: fgetc() yields 0..255, and a plain (possibly signed) char
     would never compare equal to a specifier >= 0x80. */
  unsigned char seen_specifiers[MAX_RULES];
  int already_seen = 0;
  int i, specifier, status;
  int prefixes_size;
  int rc = EXIT_FAILURE;
  FILE *hefp = NULL, *prefixfp = NULL, *wordsfp = NULL;
  char *prefix_is_word = NULL;

  /* Size the "prefix already appears as a word" table up front, so it is
     well defined even when the prefixes stream turns out to be empty.
     Index 0 is skipped throughout, as in the loops below. */
  for (prefixes_size = 1; prefixes_noH[prefixes_size] != NULL; prefixes_size++)
    ;
  prefix_is_word = calloc((size_t)prefixes_size, sizeof *prefix_is_word);
  if (prefix_is_word == NULL) {
    fprintf(stderr, "out of memory\n");
    goto done;
  }

  hefp = fopen("he_affix.dat", "w");
  if (hefp == NULL) {
    perror("he_affix.dat");
    goto done;
  }
  fprintf(hefp, "# This file was generated automatically from data prepared\n"
                "# by the Hspell project (http://ivrix.org.il/projects/spell-"
                "checker).\n# Hspell version %d.%d%s was used.\n"
                "# The conversion was carried out in %s\n",
          HSPELL_VERSION_MAJOR,HSPELL_VERSION_MINOR,HSPELL_VERSION_EXTRA,
          __DATE__);
  fprintf(hefp, "# Copyright 2004, Nadav Har'El and Dan Kenigsberg\n");

  prefixfp = popen("gzip -dc hebrew.wgz.prefixes", "r");
  if (prefixfp == NULL) {
    perror("gzip -dc hebrew.wgz.prefixes");
    goto done;
  }
  while ((specifier = fgetc(prefixfp)) != EOF) {
    int seen = 0, rules = 0, rulechar;

    for (i = 0; i < already_seen; i++) {
      if (seen_specifiers[i] == specifier) {
        seen = 1;
        break;
      }
    }
    if (seen)
      continue;
    if (already_seen >= MAX_RULES) {
      fprintf(stderr, "too many distinct specifiers (limit %d)\n",
              (int)MAX_RULES);
      goto done;
    }
    seen_specifiers[already_seen++] = (unsigned char)specifier;

    /* Count the rule lines that will be emitted below: the lone waw prefix
       yields one line, every other matching prefix yields two. */
    for (i = 1; prefixes_noH[i] != NULL; i++) {
      if (masks_noH[i] & specifier)
        rules += strcmp("ו", prefixes_noH[i]) ? 2 : 1;
    }
    rulechar = already_seen + 'A' - 1;
    fprintf(hefp, "PFX %c N %d\n", rulechar, rules);

    /* Print one rule for each legal prefix, and remember to double an
       initial waw if a prefix is prepended. */

    /* The empty string is 0 in aspell. Currently it is implied, and cannot
       be removed, so no "PFX <rule> 0 0 ." line is written. In one condition
       this causes what can be called a bug - hspell accepts the maqor natuy
       such as בהמשיכם only with a prefix, while aspell also accepts המשיכם.
       Note that this could be considered a feature, since it is a perfectly
       legal, though outdated, form. */
    for (i = 1; prefixes_noH[i] != NULL; i++) {
      if (!(masks_noH[i] & specifier))
        continue;
      if (!strcmp("ו",prefixes_noH[i])) {
        fprintf(hefp, "PFX %c   0 %s .\n",rulechar,prefixes_noH[i]);
      } else {
        fprintf(hefp, "PFX %c   0 %s [^ו]\n",rulechar,prefixes_noH[i]);
        fprintf(hefp, "PFX %c   0 %sו ו\n",rulechar,prefixes_noH[i]);
      }
    }
    fprintf(hefp, "\n");
  }
  /* The stream was read to EOF, so a non-zero status means the decompressor
     itself failed (e.g. the input file is missing). */
  status = pclose(prefixfp);
  prefixfp = NULL;
  if (status != 0) {
    fprintf(stderr, "gzip -dc hebrew.wgz.prefixes failed\n");
    goto done;
  }
  status = fclose(hefp);
  hefp = NULL;
  if (status != 0) {
    perror("he_affix.dat");
    goto done;
  }

  /* And now, translate hebrew.wgz + hebrew.wgz.prefixes into an aspell-style
     word list. */
  prefixfp = popen("gzip -dc hebrew.wgz.prefixes", "r");
  if (prefixfp == NULL) {
    perror("gzip -dc hebrew.wgz.prefixes");
    goto done;
  }
  wordsfp = popen("gzip -dc hebrew.wgz|./wunzip", "r");
  if (wordsfp == NULL) {
    perror("gzip -dc hebrew.wgz|./wunzip");
    goto done;
  }

  while ((specifier = fgetc(prefixfp)) != EOF) {
    char word[100];
    size_t len;
    int j;

    /* Find the specifier's slot; it determines which aspell rule applies to
       the word. */
    for (i = 0; i < already_seen && seen_specifiers[i] != specifier; i++)
      ;
    if (i == already_seen) {
      fprintf(stderr, "specifier %d was not seen in the first pass\n",
              specifier);
      goto done;
    }
    if (fgets(word, sizeof word, wordsfp) == NULL) {
      fprintf(stderr, "word list ended before the specifier list\n");
      goto done;
    }
    /* fgets() keeps the newline; drop it so the word can be compared against
       the prefix table and so an empty line cannot index word[-1]. */
    len = strlen(word);
    if (len > 0 && word[len - 1] == '\n')
      word[len - 1] = '\0';

    /* Write down whether this word is also a legal prefix (and therefore
       should not be written again later). */
    for (j = 1; prefixes_noH[j] != NULL; j++) {
      if (!strcmp(word, prefixes_noH[j])) {
        prefix_is_word[j] = 1;
        break;
      }
    }

    printf("%s/%c\n", word, i + 'A');
  }

  /* Accept "dangling" prefixes, which many times precede numbers and Latin
     text, but make sure not to repeat words that already appear in the
     dictionary, as this may cause unwanted warnings. */
  /* BUG: in weeding out prefixes that already appeared, we assume that the
     blank prefix is always allowed. When this ceases to be the case, we
     would need to do something more complicated. */
  for (i = 1; prefixes_noH[i] != NULL; i++) {
    if (!prefix_is_word[i])
      printf("%s\n", prefixes_noH[i]);
  }
  rc = 0;

done:
  if (wordsfp != NULL)
    pclose(wordsfp);
  if (prefixfp != NULL)
    pclose(prefixfp);
  if (hefp != NULL)
    fclose(hefp);
  free(prefix_is_word);
  return rc;
}



/* syntax highlighted by Code2HTML, v. 0.9.1 */