/////////////////////////////////////////////////////////////////////////// /* Copyright 2001-2002 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: MatchWordlists.c Purpose: Sets various fields in an existing in-memory wordlist against wordlists/namelists read from the filesystem. Mods: 11/21/01 RSB Began. 11/29/01 RSB Appending to globs, which had been used, worked in Linux but not FreeBSD or MacOS-X. Therefore, appending to the globs has been eliminated. 12/01/01 RSB Globbing doesn't appear at all in mingw32. Rats! Fortunately, the DOS-like functions findfirst() and findnext() appear, and they can be used to produce a similar effect. I provide a globlike equivalent in a separate file (winglob.c), but must include winglob.h rather than glob.h. 12/05/01 RSB Now check for programname+".cfg" if GutenMark.cfg isn't found. This allows a uniform installation procedure to place the configuration file and the wordlists in the same directory as the executable, but for a user's configuration file to override the global file. 12/23/01 RSB Began adding the ability to determine the specific set of languages for each word, rather than just relying on the native vs. foreign dichotomy. 06/30/02 RSB Added much more explicit error messages. 
Found (and hopefully fixed) several cases in which the program might abort or not process subsequent wordlists when some intermediate wordlist pattern wasn't matched. 07/13/02 RSB Added some stuff to account for the fact that Win32 adds ".exe" to argv[0]. Added ability to specify an alternate configuration file on the command line. 07/14/02 RSB Added a bunch of new log messages reporting results of wordlist globbing. Added *.places.gz to the list of default wordlists in the absence of a config file. In the newest version of BSD I find, sadly, that the glob function no longer uses the GLOB_NOMATCH and GLOB_ABORTED constants as in earlier versions or as in Linux. *Sigh!* Note that although the documentation (as of 11/21/01) only refers to gzipped wordlists, the wordlists can also be unzipped ASCII as well. This takes a LOT more disk space, but might be faster under some circumstances. (But maybe not!) */ /////////////////////////////////////////////////////////////////////////// #include #include #include #include #ifdef WIN32 #include "winglob.h" #else /* */ #include #endif /* */ #include #include "libGutenSpell.h" // Take care of some discrepancies in the constants used by the glob // function on different platforms. #ifndef GLOB_NOMATCH #define GLOB_NOMATCH 0x666 #endif #ifndef GLOB_ABORTED #define GLOB_ABORTED 0x667 #endif #ifndef GLOB_ABEND #define GLOB_ABEND 0x668 #endif #define FIRSTGLOB (GLOB_NOSORT | GLOB_TILDE) //#define NEXTGLOB (GLOB_APPEND | FIRSTGLOB) #define MAX_WORDLISTS 256 // In this list, we keep a marker for each globbed filename, // indicating if it is supposed to be SPELL_NATIVE or SPELL_FOREIGN // language. If there are more globbed files than entries in // this table, they are assumed to be foreign. 
typedef struct { int Count; unsigned char List[MAX_WORDLISTS]; char *Names[MAX_WORDLISTS]; char *Languages[MAX_WORDLISTS]; unsigned long LanguageMasks[MAX_WORDLISTS]; } NativeList; //------------------------------------------------------------------------ // Reads and processes an on-disk wordlist. Returns 0 on success or // on file not found. Returns non-zero only on actual error. // Note that "gz" file operations (like gzopen rather than fopen) // are used, because the wordlists are usually gzipped. // The Type parameter is either SPELL_NATIVE or SPELL_FOREIGN. // // The strings read from the wordlist must contain ONLY the characters // identified by IsWordChar as candidates for being in words, and must // not begin with punctuation. static int ReadWordlist (Wordlist * Words, const char *Filename, unsigned char LanguageType, unsigned long LanguageMask, FILE *LogFile) { char Full[MAXWORDLENGTH], Normalized[MAXWORDLENGTH], TestFull[MAXWORDLENGTH], TestNormalized[MAXWORDLENGTH], *ss; int i, NumLower, NumUpper, FirstUpper, AnyDiacritical, ReturnValue = 1, Matched; unsigned char Capitalization; gzFile fp; fp = gzopen (Filename, "rb"); if (fp == NULL) { fprintf (stderr, "Note: Wordlist \"%s\" not found (or corrupt).\n", Filename); if (LogFile != NULL) fprintf (LogFile, "ReadWordlist: Wordlist \"%s\" not found (or corrupt).\n", Filename); goto Okay; } // The file is open! Let's read and process it. while (Z_NULL != gzgets (fp, Full, sizeof (Full))) { // Make sure the string is an actual word (not a comment), // and trim off garbage like end of line. for (ss = Full; *ss; ss++) if (*ss == '\n') *ss = 0; if (!Full[0] || Full[0] == '#' || isspace (Full[0])) continue; // Analyze the capitalization of the word. 
for (NumLower = NumUpper = FirstUpper = AnyDiacritical = 0, ss = Full; *ss; ss++) { i = IsWordChar (*ss); if (i & WORD_LOWER) NumLower++; else if (i & WORD_UPPER) { NumUpper++; if (!NumLower) FirstUpper = 1; } if (i & WORD_DIACRITICAL) AnyDiacritical = 1; } if (!NumUpper) Capitalization = SPELL_LOWERCASE; else if (!NumLower) Capitalization = SPELL_UPPERCASE; else if (NumUpper == 1 && FirstUpper == 1) Capitalization = SPELL_CAPITALIZED; else Capitalization = SPELL_CUSTOMCAP; // Compute the normalized form of the word. if (0 == DiacriticalNormalize (Full, Normalized, sizeof (Normalized))) Normalized[0] = 0; // First, see if this word has been used, as-is. i = SearchWordlist (Words, Normalized, Full, &Matched); if (Matched) { Words->Words[i].Languages |= LanguageMask; // If the word is already marked, we don't have to do // anything more with it. But if it hasn't been ... if (!Words->Words[i].WordlistStatus) Words->Words[i].WordlistStatus = LanguageType | Capitalization; // If foreign, though, we set a flag regardless: if (LanguageType == SPELL_FOREIGN) Words->Words[i].WordlistStatus |= SPELL_NONNATIVE; } // If all lower-case, check if the capitalized version was used. This is // necessary to prevent short foreign words from matching it. if (Capitalization == SPELL_LOWERCASE) { strcpy (TestFull, Full); strcpy (TestNormalized, Normalized); TestFull[0] = DiacriticalToupper (TestFull[0]); TestNormalized[0] = DiacriticalToupper (TestNormalized[0]); i = SearchWordlist (Words, TestNormalized, TestFull, &Matched); if (Matched) { Words->Words[i].Languages |= LanguageMask; // If the word is already marked, we don't have to do // anything more with it. But if it hasn't been ... 
if (!Words->Words[i].WordlistStatus) Words->Words[i].WordlistStatus = LanguageType | Capitalization; // If foreign, though, we set a flag regardless: if (LanguageType == SPELL_FOREIGN) Words->Words[i].WordlistStatus |= SPELL_NONNATIVE; } } // Next, test the same thing, but for the case of the word having // been used as all-caps. if (Capitalization != SPELL_UPPERCASE) { strcpy (TestFull, Full); strcpy (TestNormalized, Normalized); DiacriticalStrupr (TestFull); DiacriticalStrupr (TestNormalized); i = SearchWordlist (Words, TestNormalized, TestFull, &Matched); if (Matched) { Words->Words[i].Languages |= LanguageMask; // If the word is already marked, we don't have to do // anything more with it. But if it hasn't been ... if (!Words->Words[i].WordlistStatus) Words->Words[i].WordlistStatus = LanguageType | Capitalization; // If foreign, though, we set a flag regardless: if (LanguageType == SPELL_FOREIGN) Words->Words[i].WordlistStatus |= SPELL_NONNATIVE; } } // The tests above assume a 7-bit to 7-bit or 8-bit to 8-bit // match of the word in the etext with the word in the wordlist. // Another possibility, though, is that a 7-bit word in the etext // corresponds to an 8-bit word in the wordlist. These tests // are more complex, because we're interested not merely in // using them for handling ALL-CAPS italicizing, but also for // restoration of diacritical marks. So we also need to check // for the merely-capitalized case. if (!AnyDiacritical) // wordlist word not 8-bit anyhow. continue; // Check 7-bit to 8-bit with unchanged capitalization. i = SearchWordlist (Words, Normalized, Normalized, &Matched); if (Matched) { Words->Words[i].Languages |= LanguageMask; // If the word is already marked, we don't have to do // anything more with it. But if it hasn't been ... 
if (!Words->Words[i].WordlistStatus) { if (NULL != (Words->Words[i].Match = AllocSpellString (Words, Full))) Words->Words[i].WordlistStatus = SPELL_NORMALIZED | LanguageType | Capitalization; } // If foreign, though, we set a flag regardless: if (LanguageType == SPELL_FOREIGN) Words->Words[i].WordlistStatus |= SPELL_NONNATIVE; } // Check 7-bit to 8-bit with initial capitalization. Normalized[0] = DiacriticalToupper (Normalized[0]); i = SearchWordlist (Words, Normalized, Normalized, &Matched); if (Matched) { Words->Words[i].Languages |= LanguageMask; // If the word is already marked, we don't have to do // anything more with it. But if it hasn't been ... if (!Words->Words[i].WordlistStatus) { if (NULL != (Words->Words[i].Match = AllocSpellString (Words, Full))) Words->Words[i].WordlistStatus = SPELL_NORMALIZED | LanguageType | Capitalization; } // If foreign, though, we set a flag regardless: if (LanguageType == SPELL_FOREIGN) Words->Words[i].WordlistStatus |= SPELL_NONNATIVE; } // Check 7-bit to 8-bit with all caps. i = SearchWordlist (Words, TestNormalized, TestNormalized, &Matched); if (Matched) { Words->Words[i].Languages |= LanguageMask; // If the word is already marked, we don't have to do // anything more with it. But if it hasn't been ... if (!Words->Words[i].WordlistStatus) { if (NULL != (Words->Words[i].Match = AllocSpellString (Words, Full))) Words->Words[i].WordlistStatus = SPELL_NORMALIZED | LanguageType | Capitalization; } // If foreign, though, we set a flag regardless: if (LanguageType == SPELL_FOREIGN) Words->Words[i].WordlistStatus |= SPELL_NONNATIVE; } } Okay:ReturnValue = 0; //Done: if (fp != NULL) gzclose (fp); return (ReturnValue); } //------------------------------------------------------------------------ // Update the NativeList array after globbing. The glob_t contains // the globbed filenames derived from an entry in the configuration // file. 
//
// List is the table being extended, and Glob holds the filenames just
// produced by glob.  Type is the marker stored for each file
// (SPELL_NATIVE or SPELL_FOREIGN).  Language is the language name; if
// it is an empty string, "native" or "foreign" is substituted.
//
// Each filename and language name is copied to the heap.  Entries are
// appended until either the glob results or the MAX_WORDLISTS slots run
// out, or until an allocation fails; anything not appended is silently
// dropped.

void
Mark (NativeList * List, glob_t * Glob, char Type, char *Language)
{
  int i, j, k;
  unsigned long MaxMask;

  if (*Language == 0)
    {
      if (List->Count == 0 || Type == SPELL_NATIVE)
	Language = "native";
      else
	Language = "foreign";
    }

  for (i = 0, j = List->Count;
       i < Glob->gl_pathc && j < MAX_WORDLISTS; i++, j++)
    {
      List->List[j] = Type;

      List->Names[j] = calloc (1, strlen (Glob->gl_pathv[i]) + 1);
      if (List->Names[j] == NULL)
	break;
      strcpy (List->Names[j], Glob->gl_pathv[i]);

      List->Languages[j] = calloc (1, strlen (Language) + 1);
      if (List->Languages[j] == NULL)
	{
	  // Slot j is not going to be counted, so don't leak the
	  // filename that was just copied into it.
	  free (List->Names[j]);
	  List->Names[j] = NULL;
	  break;
	}
      strcpy (List->Languages[j], Language);

      // Figure out the associated language mask.  This is a value
      // 1, 2, 4, 8, ... uniquely associated with the language.
      // Only the entries that existed before this call are searched
      // (List->Count is not updated until the loop is finished), so
      // every file added by this call computes the same mask.
      MaxMask = 0;
      for (k = 0; k < List->Count; k++)
	{
	  if (!strcmp (List->Languages[k], Language))
	    {
	      // Language seen before:  reuse its mask.
	      List->LanguageMasks[j] = List->LanguageMasks[k];
	      break;
	    }
	  if (List->LanguageMasks[k] > MaxMask)
	    MaxMask = List->LanguageMasks[k];
	}
      if (k == List->Count)
	{
	  // New language.  The very first one gets bit 0.  After that,
	  // the next higher bit is used, unless the bits have run out,
	  // in which case the highest existing mask is shared.
	  if (k == 0)
	    List->LanguageMasks[j] = 1;
	  else
	    {
	      if (0 != (MaxMask << 1))
		MaxMask = MaxMask << 1;
	      List->LanguageMasks[j] = MaxMask;
	    }
	}
    }
  List->Count = j;
}

//------------------------------------------------------------------------
// Convert a string to lower case, in place.

static void
StrLwr (char *s)
{
  // tolower is only defined for EOF and values representable as
  // unsigned char, so 8-bit characters must not be passed as negative
  // values of a signed char.
  for (; *s != '\0'; s++)
    *s = (char) tolower ((unsigned char) *s);
}

//------------------------------------------------------------------------
// Displays an error message related to globbing.
// ReturnValue is the value returned by glob, and ss is the pattern that
// was globbed.  Nothing is printed if ReturnValue is zero.  Otherwise a
// short message goes to stderr, and a more specific one goes to LogFile
// unless LogFile is NULL.

static void
GlobErrorMessage (int ReturnValue, FILE *LogFile, const char *ss)
{
  if (!ReturnValue)
    return;

  switch (ReturnValue)
    {
    case GLOB_NOSPACE:
      fprintf (stderr, "Out of memory.\n");
      if (LogFile != NULL)
	fprintf (LogFile, "MatchWordlists: GLOB_NOSPACE for \"%s\"\n", ss);
      break;
    case GLOB_ABORTED:
      fprintf (stderr, "Directory-read error.\n");
      if (LogFile != NULL)
	fprintf (LogFile, "MatchWordlists: GLOB_ABORTED for \"%s\"\n", ss);
      break;
    case GLOB_NOMATCH:
      fprintf (stderr, "Note: No wordlists matched \"%s\".\n", ss);
      // This log message used to say GLOB_NOSPACE, which was wrong.
      if (LogFile != NULL)
	fprintf (LogFile, "MatchWordlists: GLOB_NOMATCH for \"%s\"\n", ss);
      break;
      // Some platforms define GLOB_ABEND as an alias of GLOB_ABORTED,
      // and two case labels with the same value won't compile.  The
      // case is therefore only included where the values differ.
#if defined(GLOB_ABEND) && !defined(__FreeBSD__) && (GLOB_ABEND != GLOB_ABORTED)
    case GLOB_ABEND:
      fprintf (stderr, "Possible disk-read error.\n");
      if (LogFile != NULL)
	fprintf (LogFile, "MatchWordlists: GLOB_ABEND for \"%s\"\n", ss);
      break;
#endif
    default:
      fprintf (stderr, "Unknown globbing error.\n");
      if (LogFile != NULL)
	fprintf (LogFile,
		 "MatchWordlists: glob for \"%s\" returned %d\n", ss,
		 ReturnValue);
      break;
    }
}

//------------------------------------------------------------------------
// libGutenSpell operates in reverse from the way a normal spell-checker
// works.  A normal spell-checker takes a selected word and tries to 
// find it in on-disk wordlists.  This forces the on-disk wordlists to 
// be highly organized for fast searching.  With libGutenSpell, however,
// it is the in-memory words that are highly organized, and the on-disk
// wordlists need no particular organization (i.e., they don't have to 
// be sorted), because EVERY word in the on-disk wordlists is going to 
// be read and checked against the in-memory list.  This has the advantage
// of allowing the on-disk wordlists to be highly compressed.  In fact,
// they are read with zlib rather than stdio.
// // The result of running this function is merely to set certain fields // (mostly flags) in the in-memory wordlist, representing properties of // the various words: i.e., whether they were found or not, whether // native language or foreign, whether 7-bit ASCII or 8-bit ASCII, etc. // // The wordlists to be used are taken from the GutenMark.cfg file. // They may contain wildcards or other regular-expression-type stuff, // and so the full list of wordlists needs to be constructed by globbing, // and this is done before any of the wordlists are actually read. // // Returns zero on success. int MatchWordlists (FILE * LogFile, Wordlist * Words, const char *Language, const char *ProgName, const char *AltCfg) { NativeList Natives = { 0 }; glob_t Glob; char s[256], *ss, Filename[256], ListLanguage[256]; char SpellType; FILE *cfg; int i, j, ReturnValue, Found = 0; Glob.gl_offs = Glob.gl_pathc = 0; Glob.gl_pathv = NULL; cfg = NULL; if (AltCfg != NULL) { cfg = fopen (ss = (char *) AltCfg, "r"); if (cfg == NULL) { fprintf (stderr, "Note: Configuration file \"%s\" not found.\n", ss); if (LogFile != NULL) fprintf (LogFile, "Configuration file \"%s\" not found.\n", ss); } else { fprintf (stderr, "Using configuration file \"%s\".\n", ss); if (LogFile != NULL) fprintf (LogFile, "Using configuration file \"%s\".\n", ss); } } if (cfg == NULL) { cfg = fopen (ss = "./GutenMark.cfg", "r"); if (cfg == NULL) { fprintf (stderr, "Note: Configuration file \"%s\" not found.\n", ss); if (LogFile != NULL) fprintf (LogFile, "Configuration file \"%s\" not found.\n", ss); } else { fprintf (stderr, "Using configuration file \"%s\".\n", ss); if (LogFile != NULL) fprintf (LogFile, "Using configuration file \"%s\".\n", ss); } } if (cfg == NULL) { strcpy (s, ProgName); #ifdef WIN32 // Remove ".exe" from the end of the executable's name. 
i = strlen (s); if (i > 4) { ss = &s[i - 4]; if (!strcasecmp (ss, ".exe")) *ss = '\0'; } #endif // WIN32 strcat (s, ".cfg"); cfg = fopen (ss = s, "r"); if (cfg == NULL) { fprintf (stderr, "Note: Configuration file \"%s\" not found.\n", ss); if (LogFile != NULL) fprintf (LogFile, "Configuration file \"%s\" not found.\n", ss); } else { fprintf (stderr, "Using configuration file \"%s\".\n", ss); if (LogFile != NULL) fprintf (LogFile, "Using configuration file \"%s\".\n", ss); } } // Loop on the lines in the configuration file. (If the configuration // file wasn't found, we simply do all of the namelists, followed // by all of the wordlists, in the current directory.) if (cfg == NULL) { fprintf (stderr, "Note: Working without a configuration file.\n"); if (LogFile != NULL) fprintf (LogFile, "Note: Working without a configuration file.\n"); Default: sprintf (s, "%s.names.gz", Language); ReturnValue = glob (ss = s, FIRSTGLOB, NULL, &Glob); if (ReturnValue == GLOB_NOMATCH) GlobErrorMessage (ReturnValue, LogFile, ss); else { if (ReturnValue != 0) goto GlobErrorTrap; Mark (&Natives, &Glob, SPELL_NATIVE, "native"); } ReturnValue = glob (ss = "*.names.gz", FIRSTGLOB, NULL, &Glob); if (ReturnValue == GLOB_NOMATCH) GlobErrorMessage (ReturnValue, LogFile, ss); else { if (ReturnValue != 0) goto GlobErrorTrap; Mark (&Natives, &Glob, SPELL_NATIVE, "native"); } sprintf (s, "%s.places.gz", Language); ReturnValue = glob (ss = s, FIRSTGLOB, NULL, &Glob); if (ReturnValue == GLOB_NOMATCH) GlobErrorMessage (ReturnValue, LogFile, ss); else { if (ReturnValue != 0) goto GlobErrorTrap; Mark (&Natives, &Glob, SPELL_NATIVE, "native"); } ReturnValue = glob (ss = "*.places.gz", FIRSTGLOB, NULL, &Glob); if (ReturnValue == GLOB_NOMATCH) GlobErrorMessage (ReturnValue, LogFile, ss); else { if (ReturnValue != 0) goto GlobErrorTrap; Mark (&Natives, &Glob, SPELL_NATIVE, "native"); } sprintf (s, "%s.words.gz", Language); ReturnValue = glob (ss = s, FIRSTGLOB, NULL, &Glob); if (ReturnValue == 
GLOB_NOMATCH) GlobErrorMessage (ReturnValue, LogFile, ss); else { if (ReturnValue != 0) goto GlobErrorTrap; Mark (&Natives, &Glob, SPELL_NATIVE, "native"); } ReturnValue = glob (ss = "*.words.gz", FIRSTGLOB, NULL, &Glob); if (ReturnValue == GLOB_NOMATCH) GlobErrorMessage (ReturnValue, LogFile, ss); else { if (ReturnValue != 0) goto GlobErrorTrap; Mark (&Natives, &Glob, SPELL_FOREIGN, "foreign"); } GlobErrorTrap: GlobErrorMessage (ReturnValue, LogFile, ss); if (ReturnValue) goto Done; } else { // Search the cfg file for the appropriate language profile. // These are headed by lines like "[language]". while (NULL != fgets (s, sizeof (s) - 1, cfg)) if (s[0] == '[') { for (ss = &s[1]; *ss; ss++) if (*ss == ']') break; if (*ss == ']') { *ss = 0; // Language profile found? if (!strcasecmp (&s[1], Language)) { Found = 1; // We now use all lines between now and the // next profile (or the end of file) beginning // with "native=" or "foreign=". while (NULL != fgets (s, sizeof (s) - 1, cfg)) { if (s[0] == '[') break; ss = strstr (s, "="); if (ss == NULL) continue; *ss = 0; StrLwr (s); *ss = '='; //for (ss = s; *ss; ss++) // if (*ss == '\n') // *ss = 0; ListLanguage[0] = 0; if (sscanf (s, "native=%s%s", Filename, ListLanguage) > 0) SpellType = SPELL_NATIVE; else if (sscanf (s, "foreign=%s%s", Filename, ListLanguage) > 0) SpellType = SPELL_FOREIGN; else continue; ReturnValue = glob (Filename, FIRSTGLOB, NULL, &Glob); GlobErrorMessage (ReturnValue, LogFile, Filename); if (ReturnValue == GLOB_NOMATCH) { if (LogFile != NULL) fprintf (LogFile, "Note: No matches were found " "for wordlist \"%s\".\n", Filename); } else { if (ReturnValue) { fprintf (LogFile, "Note: An error code of %d " "was returned for globbing " "\"%s\".\n", ReturnValue, Filename); break; } if (LogFile != NULL) { fprintf (LogFile, "Note: Globbing \"%s\" " "matched %d files.\n", Filename, Glob.gl_pathc); for (i = 0; i < Glob.gl_pathc; i++) fprintf (LogFile, "\t\"%s\"\n", Glob.gl_pathv[i]); } StrLwr (ListLanguage); 
Mark (&Natives, &Glob, SpellType, ListLanguage); } } break; } } } fclose (cfg); if (!Found) { fprintf (stderr, "Note: Language section found in configuration.\n"); if (LogFile != NULL) fprintf (LogFile, "Language section not found in configuration.\n"); goto Default; } } // Now the list of all wordslists and namelists has been totally // expanded, though possibly with duplicates. We therefore // process, them one-by-one. for (i = 0; i < Natives.Count; i++) { // Make sure one isn't a duplicate. for (j = 0; j < i; j++) if (!strcmp (Natives.Names[i], Natives.Names[j])) break; if (j == i) { if (LogFile != NULL) fprintf (LogFile, "Wordlist = %s (%s %ld)\n", Natives.Names[i], Natives.Languages[i], Natives.LanguageMasks[i]); fprintf (stderr, "Checking wordlist/namelist %s (%s)\n", Natives.Names[i], Natives.Languages[i]); // Not a duplicate. Process it! ReturnValue = ReadWordlist (Words, Natives.Names[i], Natives.List[i], Natives.LanguageMasks[i], LogFile); if (ReturnValue) { fprintf (stderr, "Error processing wordlist.\n"); if (LogFile != NULL) fprintf (LogFile, "Error %d processing wordlist.\n", ReturnValue); // Removed 06/30/02. goto Done; } } else { if (LogFile != NULL) fprintf (LogFile, "Duplicate wordlist = %s\n", Natives.Names[i]); } } // All done! ReturnValue = 0; Done: // For some reason, the following can cause a segmentation fault. //globfree (&Glob); return (ReturnValue); }