/* * stemmer.c * * Copyright (C) 2003 Nadav Rotem * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include "libots.h" #define MAX_PREFIX_SIZE 256 OtsStemRule * new_stem_rule () { OtsStemRule *rule = g_new0 (OtsStemRule, 1); return rule; } void free_stem_rule (OtsStemRule *rule) { if (rule != NULL) { g_list_foreach (rule->RemovePre, (GFunc) g_free, NULL); g_list_free (rule->RemovePre); g_list_foreach (rule->RemovePost, (GFunc) g_free, NULL); g_list_free (rule->RemovePost); g_list_foreach (rule->step1_pre, (GFunc) g_free, NULL); g_list_free (rule->step1_pre); g_list_foreach (rule->step1_post, (GFunc) g_free, NULL); g_list_free (rule->step1_post); g_list_foreach (rule->synonyms, (GFunc) g_free, NULL); g_list_free (rule->synonyms); g_list_foreach (rule->manual, (GFunc) g_free, NULL); g_list_free (rule->manual); g_list_foreach (rule->ParserBreak, (GFunc) g_free, NULL); g_list_free (rule->ParserBreak); g_list_foreach (rule->ParserDontBreak, (GFunc) g_free, NULL); g_list_free (rule->ParserDontBreak); g_list_foreach (rule->ReplaceChars, (GFunc) g_free, NULL); g_list_free (rule->ReplaceChars); g_free (rule); } return; } static void ots_stem_break (unsigned const char *comp,unsigned char *part_a,unsigned char *part_b) /*given already alocated part_a and b */ { /*example "red|blue" */ int i, j, clen; i = 0; j = 0; if (comp==NULL) return; if (part_a==NULL) return; if (part_b==NULL) return; clen = strlen (comp); part_a[0] = 0; part_b[0] = 0; while ((i < clen) && (i < MAX_PREFIX_SIZE) && (comp[i] != '|')) { part_a[i] = comp[i]; i++; } part_a[i] = 0; i++; /*skip the | mark */ while (i < clen && (j < MAX_PREFIX_SIZE)) { part_b[j] = comp[i]; i++; j++; } part_b[j] = 0; return; } static unsigned char * ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigned const char *new) { int i, plen, wlen, nlen; unsigned char *new_str = NULL; if (aWord==NULL) return NULL; plen = strlen (pre); wlen = strlen (aWord); nlen = strlen (new); for (i = 0; i < plen; i++) if (aWord[i] != pre[i]) return NULL; /*no match */ new_str = g_new0 (char, wlen + nlen +5); for (i = 0; i <= nlen; i++) new_str[i] = new[i]; for (i = nlen; i <= nlen + wlen - plen; i++) new_str[i] = aWord[i + plen - nlen]; new_str[i + 1] = 0; return new_str; } static unsigned char * ots_stem_remove_post (unsigned const char *aWord,unsigned const char *post,unsigned const char *new) { unsigned int i, wlen, plen, nlen; unsigned char *new_str = NULL; if ((NULL==aWord)||(NULL==post)||(NULL==new)) return NULL; wlen = strlen (aWord); plen = strlen (post); nlen = strlen (new); if (plen>wlen) return NULL; for (i = 0; i < plen; i++) if (aWord[wlen - plen + i]!= post[i]) return NULL; /* no match */ new_str = g_new0 (char, wlen + nlen +5); for (i = 0; i <= wlen - plen; i++) /*place word */ new_str[i] = aWord[i]; for (i = 0; i <= nlen; i++) /*place newfix */ new_str[wlen - plen + i] = new[i]; return new_str; /*word replaced */ } static unsigned char * ots_stem_replace_word (unsigned const char *aWord,unsigned const char *old,unsigned const char *new) { if (aWord==NULL) return NULL; if ((aWord)&&(0 == strcmp (aWord, old))) { return g_strdup (new); } else { return NULL; } } unsigned char * ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule) { GList *li; unsigned char *rep = NULL; unsigned char *normWord = NULL; if (aWord==NULL) return NULL; normWord = g_utf8_strdown (aWord, -1); /*lowercase the word */ char *prefix; char *newfix; prefix = g_new0 (char, MAX_PREFIX_SIZE); newfix = g_new0 (char, MAX_PREFIX_SIZE); for (li = (GList *) rule->step1_pre; li != NULL; li = li->next) { ots_stem_break (li->data, prefix, newfix); rep = ots_stem_remove_pre (normWord, prefix, newfix); if (NULL != rep) { g_free (normWord); normWord = rep; rep = NULL; } } for (li = (GList *) rule->step1_post; li != NULL; li = li->next) { ots_stem_break (li->data, prefix, newfix); rep = ots_stem_remove_post(normWord, prefix, newfix); if (NULL != rep) { g_free (normWord); normWord = rep; rep = NULL; } } g_free (prefix); g_free (newfix); return normWord; } unsigned char * ots_stem_strip (unsigned const char *aWord,const OtsStemRule * rule) { GList *li; unsigned char *rep = NULL; unsigned char *prefix; unsigned char *newfix; unsigned char *normWord=NULL; prefix = g_new0 (char, MAX_PREFIX_SIZE); newfix = g_new0 (char, MAX_PREFIX_SIZE); if (aWord==NULL) return NULL; normWord = ots_stem_format (aWord,rule); for (li = (GList *) rule->manual; li != NULL; li = li->next) { ots_stem_break (li->data, prefix, newfix); rep = ots_stem_replace_word (normWord, prefix, newfix); if (NULL != rep) { g_free (normWord); normWord = rep; rep = NULL; break; } } for (li = (GList *) rule->RemovePre; li != NULL; li = li->next) { ots_stem_break (li->data, prefix, newfix); rep = ots_stem_remove_pre (normWord, prefix, newfix); if (NULL != rep) { g_free (normWord); normWord = rep; rep = NULL; break; } } for (li = (GList *) rule->RemovePost; li != NULL; li = li->next) { ots_stem_break (li->data, prefix, newfix); rep = ots_stem_remove_post (normWord, prefix, newfix); if (NULL != rep) { g_free (normWord); normWord = rep; rep = NULL; break; } } for (li = (GList *) rule->synonyms; li != NULL; li = li->next) { ots_stem_break (li->data, prefix, newfix); rep = ots_stem_replace_word (normWord, prefix, newfix); if (NULL != rep) { g_free (normWord); normWord = rep; rep = NULL; break; } } g_free (prefix); g_free (newfix); if (strlen(normWord)<3) /*stem is two letter long. thats not right. N(eed)==N(ation) ?*/ { g_free(normWord); normWord = ots_stem_format (aWord,rule); /*lowercase the word */ } return normWord; }