/////////////////////////////////////////////////////////////////////////// /* Copyright 2001 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: DiacriticalNormalize.c Purpose: A library function which converts a full PG "8-bit ASCII" word into the likely PG "7-bit ASCII" form. Mods: 11/17/01 RSB Began. */ /////////////////////////////////////////////////////////////////////////// /* This code is used to "normalize" an 8-bit ASCII string (representing HTML 4.0 characters) by removing diacritical marks and replacing ligatures. This is needed in order to perform inexact searches of words from 7-bit ASCII PG etexts against the exact 8-bit forms found in the wordlists. */ #include #include "libGutenSpell.h" //------------------------------------------------------------ // Table of the probable standard-ASCII replacements used // In the PG etexts. When the wordlists are checked, we // match against both the exact forms, and against the // replacement forms. // // Note that the sz and ae ligatures, though present in this // table, are not handled by this table. (I.e., they are // marked with 0.) The reason for this is that they are // handled specifically by the software. 
// Indexed by (character code - 192).  A 0 entry means there is no
// single-character replacement: the ligatures (198, 223, 230) are
// expanded by DiacriticalNormalize itself, and the multiplication and
// division signs (215, 247) have no replacement at all.
static const char ReplacementChars[] = {
  'A', 'A', 'A', 'A', 'A', 'A', 0, 'C',
  /*192:À 193:Á 194:Â 195:Ã 196:Ä 197:Å 198:Æ 199:Ç */
  'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N',
  /*200:È 201:É 202:Ê 203:Ë 204:Ì 205:Í 206:Î 207:Ï 208:Ð 209:Ñ */
  'O', 'O', 'O', 'O', 'O', 0, 'O', 'U', 'U', 'U',
  /*210:Ò 211:Ó 212:Ô 213:Õ 214:Ö 215:× 216:Ø 217:Ù 218:Ú 219:Û */
  'U', 'Y', 'P', 0, 'a', 'a', 'a', 'a', 'a', 'a',
  /*220:Ü 221:Ý 222:Þ 223:ß 224:à 225:á 226:â 227:ã 228:ä 229:å */
  0, 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  /*230:æ 231:ç 232:è 233:é 234:ê 235:ë 236:ì 237:í 238:î 239:ï */
  'd', 'n', 'o', 'o', 'o', 'o', 'o', 0, 'o', 'u',
  /*240:ð 241:ñ 242:ò 243:ó 244:ô 245:õ 246:ö 247:÷ 248:ø 249:ù */
  'u', 'u', 'u', 'y', 'p', 'y'
  /*250:ú 251:û 252:ü 253:ý 254:þ 255:ÿ */
};

//------------------------------------------------------------
// Converts a character to lower case.  'A'..'Z' and the codes
// 192..222 (except 215, the multiplication sign) have bit 0x20
// set; every other code is returned unchanged.
char
DiacriticalTolower (unsigned char c)
{
  if ((c >= 'A' && c <= 'Z') || (c >= 192 && c < 223 && c != 215))
    {
      return (c | 0x20);
    }
  return (c);
}

//------------------------------------------------------------
// Converts a character to upper case.  'a'..'z' and the codes
// 224..254 (except 247, the division sign) have bit 0x20
// cleared; every other code, including 223 and 255, is
// returned unchanged.
char
DiacriticalToupper (unsigned char c)
{
  if ((c >= 'a' && c <= 'z') || (c >= 224 && c < 255 && c != 247))
    {
      return (c & ~0x20);
    }
  return (c);
}

//----------------------------------------------------------
// Determine if a string is all-caps, taking diacritical
// marks into account.  Returns 1 when DiacriticalToupper
// would leave every character unchanged (so the empty
// string and caseless characters count as upper case),
// and 0 otherwise.
int
IsStrupr (const char *s)
{
  for (; *s; s++)
    {
      if (*s != DiacriticalToupper (*s))
        {
          return (0);
        }
    }
  return (1);
}

//----------------------------------------------------------
// Converts a string to lower case, in place.
void
DiacriticalStrlwr (char *s)
{
  for (; *s; s++)
    {
      *s = DiacriticalTolower (*s);
    }
}

//----------------------------------------------------------
// Converts a string to upper case, in place.
void
DiacriticalStrupr (char *s)
{
  for (; *s; s++)
    {
      *s = DiacriticalToupper (*s);
    }
}

//------------------------------------------------------------
// Stores ch at *pos and advances *pos, provided *pos is still
// before end.  Returns 1 on success, 0 if there is no room.
static int
NormalizeAppend (char **pos, const char *end, char ch)
{
  if (*pos >= end)
    {
      return (0);
    }
  *(*pos)++ = ch;
  return (1);
}

//------------------------------------------------------------
// Here's the normalization function itself.  It copies sin to
// sout, removing diacritical marks and expanding ligatures:
//
//   - soft hyphens (173) are dropped;
//   - the sz ligature (223) becomes "ss", the ae ligature (230)
//     becomes "ae", and the AE ligature (198) becomes "Ae";
//   - ASCII letters, the apostrophe and the hyphen are copied;
//   - other codes >= 192 are replaced using ReplacementChars.
//
// It returns the length of the normalized string (not counting
// the terminating 0), or 0 on error.  Errors are: a character
// not covered above (digits, spaces, other punctuation, codes
// 128..191 other than 173, or a table entry of 0), an output
// that does not fit (terminator included) in the buffer, a null
// sin or sout, or a non-positive outlen.  Note that an input
// which normalizes to the empty string also yields 0.  When 0 is
// returned for an error, sout may hold a partial, unterminated
// result.
//
// The outlen parameter gives the amount of storage that has
// been allocated for the output string.
int
DiacriticalNormalize (const char *sin, char *sout, int outlen)
{
  enum
  {
    SOFT_HYPHEN = 173,
    TABLE_BASE = 192,
    AE_UPPER = 198,
    SHARP_S = 223,
    AE_LOWER = 230
  };
  char *send, *start;
  unsigned char c;

  // Reject arguments for which sout + outlen is not a usable
  // buffer end (a negative outlen would point before the buffer).
  if (!sin || !sout || outlen <= 0)
    {
      return (0);
    }

  start = sout;
  send = sout + outlen;
  for (; *sin; sin++)
    {
      c = (unsigned char) *sin;
      // Remove soft hyphens.
      if (c == SOFT_HYPHEN)
        {
          continue;
        }
      // Take care of sz ligature.
      else if (c == SHARP_S)
        {
          if (!NormalizeAppend (&sout, send, 's')
              || !NormalizeAppend (&sout, send, 's'))
            {
              return (0);
            }
        }
      // Take care of ae ligature.
      else if (c == AE_LOWER)
        {
          if (!NormalizeAppend (&sout, send, 'a')
              || !NormalizeAppend (&sout, send, 'e'))
            {
              return (0);
            }
        }
      // Take care of AE ligature.  It makes equal sense to
      // convert this as AE or as Ae.  I believe the latter
      // is what more people would do.
      else if (c == AE_UPPER)
        {
          if (!NormalizeAppend (&sout, send, 'A')
              || !NormalizeAppend (&sout, send, 'e'))
            {
              return (0);
            }
        }
      // Take care of regular characters.
      else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
               || c == '\'' || c == '-')
        {
          if (!NormalizeAppend (&sout, send, (char) c))
            {
              return (0);
            }
        }
      // Take care of diacriticals.
      else if (c >= TABLE_BASE)
        {
          c = (unsigned char) ReplacementChars[c - TABLE_BASE];
          // A 0 entry marks a character with no replacement.
          if (!c || !NormalizeAppend (&sout, send, (char) c))
            {
              return (0);
            }
        }
      // Unknown character!
      else
        {
          return (0);
        }
    }

  // All done.  Terminate and return length.  The terminator
  // needs its own byte, so a completely full buffer is an error.
  if (sout >= send)
    {
      return (0);
    }
  *sout = 0;
  return ((int) (sout - start));
}

//-----------------------------------------------------------------
// A test main program.  Reads whitespace-separated words from
// standard input and prints each one with its normalized form.
#ifdef TESTMAIN_NORMALIZE
#include <stdio.h>
char s[1000], ss[1000];
int i;
int
main (void)
{
  // The field width keeps scanf within s (999 characters + 0).
  while (1 == scanf ("%999s", s))
    {
      i = DiacriticalNormalize (s, ss, sizeof (ss));
      if (i)
        printf ("%d \"%s\" -> \"%s\"\n", i, s, ss);
      else
        printf ("%d \"%s\"\n", i, s);
    }
  return (0);
}
#endif