/*************************************************************************************************
 * Morphological analyzers for Hyper Estraier
 *                                                      Copyright (C) 2004-2007 Mikio Hirabayashi
 * This file is part of Hyper Estraier.
 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License or any later version.  Hyper Estraier is distributed in the hope
 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 * License for more details.
 * You should have received a copy of the GNU Lesser General Public License along with Hyper
 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307 USA.
 *************************************************************************************************/


#include "mymorph.h"
#include "estraier.h"
#include "myconf.h"

/* The MeCab C API (mecab_t, mecab_new, mecab_sparse_tostr) is needed only when the build enables
   the MeCab based analyzer. */
#if defined(_MYMECAB)
#include <mecab.h>
#endif

#define ESTNUMBUFSIZ   32                /* size of a buffer for a number */
#define ESTSCOREUNIT   100               /* unit of score of each word */
#define ESTWORDMAXLEN  48                /* maximum length of a word */

typedef struct {                         /* type of structure for a keyword and its score */
  const char *word;                      /* face of keyword */
  int pt;                                /* accumulated score of the keyword */
} ESTKEYSC;


/* private function prototypes */
static void est_uwords_add(CBMAP *uwords, const char *word, int wsiz, int pt);
static int est_keysc_compare(const void *ap, const void *bp);
static int est_mecab_analyze(const char *text, CBMAP *uwords);



/*************************************************************************************************
 * pseudo API
 *************************************************************************************************/


/* Extract keywords of a document object using morphological analyzers.
*/ CBMAP *est_morph_etch_doc(ESTDOC *doc, int max){ ESTKEYSC *scores; CBMAP *kwords, *uwords; const CBLIST *texts; CBLIST *words; const char *text, *word; const unsigned char *uword; char numbuf[ESTNUMBUFSIZ]; char *enc; int i, j, all, miss, wsiz, pt, snum; double dval; assert(doc && max >= 0); kwords = cbmapopenex(max * 2 + 1); uwords = cbmapopen(); all = 0; texts = est_doc_texts(doc); for(i = -1; i < cblistnum(texts); i++){ text = i < 0 ? est_doc_hidden_texts(doc) : cblistval(texts, i, NULL); if(text[0] == '\0') continue; if(ESTUSEMECAB && (enc = est_iconv(text, -1, "UTF-8", "EUC-JP", NULL, &miss)) != NULL){ if(miss > 0){ free(enc); } else { all += est_mecab_analyze(enc, uwords); free(enc); continue; } } words = cblistopen(); est_break_text_chrcat(text, words, FALSE); for(j = 0; j < cblistnum(words); j++){ uword = (unsigned char *)cblistval(words, j, &wsiz); if(uword[0] >= 0xe3){ if(wsiz <= 3){ pt = ESTSCOREUNIT / 4; if((uword[0] == 0xe3 && (uword[1] == 0x80 || uword[1] == 0x81 || (uword[1] == 0x82 && uword[2] <= 0x9f))) || (uword[0] == 0xef && uword[1] >= 0xbc)) pt /= 10; } else { pt = ESTSCOREUNIT; if((uword[0] == 0xe3 && (uword[1] == 0x80 || uword[1] == 0x81 || (uword[1] == 0x82 && uword[2] <= 0x9f))) || (uword[0] == 0xef && uword[1] >= 0xbc)) pt /= 2; if((uword[3] == 0xe3 && (uword[4] == 0x80 || uword[4] == 0x81 || (uword[4] == 0x82 && uword[5] <= 0x9f))) || (uword[3] == 0xef && uword[4] >= 0xbc)) pt /= 2; } } else if((uword[0] > '\0' && uword[0] <= '/') || (uword[0] >= '[' && uword[0] <= '`') || (uword[0] >= '{' && uword[0] <= '~')){ pt = ESTSCOREUNIT / 50; } else { switch(wsiz){ case 1: pt = ESTSCOREUNIT / 9; break; case 2: pt = ESTSCOREUNIT / 5; break; case 3: pt = ESTSCOREUNIT / 3; break; case 4: pt = ESTSCOREUNIT / 2; break; case 5: pt = ESTSCOREUNIT / 1.5; break; case 6: pt = ESTSCOREUNIT / 1.25; break; default: pt = ESTSCOREUNIT; break; } } est_uwords_add(uwords, (char *)uword, wsiz, pt); all += pt; } cblistclose(words); } scores = 
cbmalloc(cbmaprnum(uwords) * sizeof(ESTKEYSC) + 1); snum = 0; cbmapiterinit(uwords); while((word = cbmapiternext(uwords, &wsiz)) != NULL){ scores[snum].word = word; scores[snum].pt = *(int *)cbmapget(uwords, word, wsiz, NULL); snum++; } qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare); dval = log(all + ESTSCOREUNIT * 64); dval *= dval; dval /= 10; for(i = 0; i < snum && i < max; i++){ pt = (int)(scores[i].pt / dval); wsiz = sprintf(numbuf, "%d", pt > 1 ? pt : 1); cbmapput(kwords, scores[i].word, -1, numbuf, wsiz, FALSE); } free(scores); cbmapclose(uwords); return kwords; } /************************************************************************************************* * private objects *************************************************************************************************/ /* Add a record to a unique word map. `uwords' specifies a unique word map. `word' specifies a word. `wsiz' specifies size of the word. `pt' specifies point of the word. */ static void est_uwords_add(CBMAP *uwords, const char *word, int wsiz, int pt){ const char *vbuf; int num; assert(uwords && word && wsiz >= 0); if(wsiz > ESTWORDMAXLEN) return; num = (vbuf = cbmapget(uwords, word, wsiz, NULL)) ? *(int *)vbuf + pt : pt; if(num < 0) num = 0; cbmapput(uwords, word, wsiz, (char *)&num, sizeof(int), TRUE); } /* Compare two keywords by scores in descending order. `ap' specifies the pointer to one keyword. `bp' specifies the pointer to the other keyword. The return value is negative if one is small, positive if one is big, 0 if both are equal. */ static int est_keysc_compare(const void *ap, const void *bp){ assert(ap && bp); return ((ESTKEYSC *)bp)->pt - ((ESTKEYSC *)ap)->pt; } /* Analyze text with using mecab. `text' specifies a sentence whose encoding is EUC-JP. `uwords' specifies a unique word map. The return value is the total points of extracted words. 
*/


#if defined(_MYMECAB)


/* Tell whether the bytes at `bp' begin with 0xE3 0x80, 0xE3 0x81, or 0xE3 0x82 followed by a
   byte not above 0x9F.
   `bp' specifies the pointer to the first byte of a character in a NUL terminated word.
   The return value is true if the bytes match, else it is false.  In UTF-8 the matched range
   is U+3000 to U+309F.  Bytes after the first are inspected only when the preceding byte
   matched, so the NUL terminator of the word is never passed. */
static int est_mecab_is_minor_lead(const unsigned char *bp){
  if(bp[0] != 0xe3) return 0;
  if(bp[1] == 0x80 || bp[1] == 0x81) return 1;
  return bp[1] == 0x82 && bp[2] <= 0x9f;
}


/* Reduce the points of a content word according to its length and its leading characters.
   `uword' specifies the pointer to the NUL terminated word.
   `wsiz' specifies the size of the word in bytes.
   `pt' specifies the points before the reduction.
   The return value is the reduced points. */
static int est_mecab_tune_content(const unsigned char *uword, int wsiz, int pt){
  if(wsiz <= 3){
    pt /= 5;
    if(est_mecab_is_minor_lead(uword)) pt /= 5;
    return pt;
  }
  if(est_mecab_is_minor_lead(uword)) pt /= 5;
  /* offset 3 is inside the word because the size is more than 3 here */
  if(est_mecab_is_minor_lead(uword + 3)) pt /= 2.5;
  if(wsiz <= 6) pt /= 1.5;
  return pt;
}


/* Compute the points of one word from its part of speech.
   `word' specifies the pointer to the NUL terminated word, already converted to lower case.
   `wsiz' specifies the size of the word in bytes.  It must be positive.
   `sign' specifies the first feature of the word, which is empty for an unknown word.
   `info' specifies the second feature of the word.
   `swmap' specifies the map of stop words.
   The return value is the points of the word. */
static int est_mecab_word_score(const char *word, int wsiz, const char *sign, const char *info,
                                CBMAP *swmap){
  int pt;
  /* symbol */
  if(!strcmp(sign, "記号")) return ESTSCOREUNIT / 100;
  /* particle, auxiliary verb, prefix, conjunction, filler, others */
  if(!strcmp(sign, "助詞") || !strcmp(sign, "助動詞") || !strcmp(sign, "接頭詞") ||
     !strcmp(sign, "接続詞") || !strcmp(sign, "フィラー") || !strcmp(sign, "その他"))
    return ESTSCOREUNIT / 50;
  /* noun, verb: halved for the dependent or otherwise weak sub categories */
  if(!strcmp(sign, "名詞") || !strcmp(sign, "動詞")){
    pt = ESTSCOREUNIT;
    if(!strcmp(info, "非自立") || !strcmp(info, "特殊") || !strcmp(info, "引用文字列") ||
       !strcmp(info, "数") || !strcmp(info, "接続詞的") || !strcmp(info, "接尾") ||
       !strcmp(info, "代名詞") || !strcmp(info, "動詞非自立的")){
      pt /= 2;
    }
    return est_mecab_tune_content((const unsigned char *)word, wsiz, pt);
  }
  /* adjective, adverb, adnominal, interjection */
  if(!strcmp(sign, "形容詞") || !strcmp(sign, "副詞") || !strcmp(sign, "連体詞") ||
     !strcmp(sign, "感動詞"))
    return est_mecab_tune_content((const unsigned char *)word, wsiz, ESTSCOREUNIT / 2);
  /* word beginning with an ASCII control character, space or punctuation mark */
  if((word[0] > '\0' && word[0] <= '/') || (word[0] >= ':' && word[0] <= '@') ||
     (word[0] >= '[' && word[0] <= '`') || (word[0] >= '{' && word[0] <= '~')){
    pt = ESTSCOREUNIT / 50;
    if(wsiz <= 1) pt /= 2;
    return pt;
  }
  /* stop word */
  if(cbmapget(swmap, word, wsiz, NULL)){
    pt = ESTSCOREUNIT / 25;
    if(wsiz <= 1) pt /= 2;
    return pt;
  }
  /* any other word is graded by its length only */
  switch(wsiz){
  case 1: return ESTSCOREUNIT / 9;
  case 2: return ESTSCOREUNIT / 5;
  case 3: return ESTSCOREUNIT / 3;
  case 4: return ESTSCOREUNIT / 2;
  case 5: return (int)(ESTSCOREUNIT / 1.5);
  case 6: return (int)(ESTSCOREUNIT / 1.25);
  default: break;
  }
  return ESTSCOREUNIT;
}


/* Register the compound word made by appending a word to the pending prefix.
   `uwords' specifies a unique word map.
   `prefix' specifies the pending prefix, which must not be empty.  The word is appended to it.
   `word' specifies the word to be appended.
   `wsiz' specifies the size of the word.
   `pt' specifies the points of the word.
   `ppt' specifies the sum of the points of the words joined so far, including the word.
   `pnum' specifies the number of the words joined so far, including the word.  It must be
   positive.
   The return value is the amount to be added to the total points.
   The old prefix and the single word are penalized with negative points and the joined word
   is credited with 90 percent of the average points. */
static double est_mecab_join_compound(CBMAP *uwords, CBDATUM *prefix, const char *word, int wsiz,
                                      int pt, int ppt, int pnum){
  est_uwords_add(uwords, cbdatumptr(prefix), cbdatumsize(prefix),
                 (int)((double)ppt / pnum * -0.15 - 1));
  est_uwords_add(uwords, word, wsiz, (int)(pt * -0.15 - 1));
  cbdatumcat(prefix, word, wsiz);
  est_uwords_add(uwords, cbdatumptr(prefix), cbdatumsize(prefix),
                 (int)(((double)ppt / pnum) * 0.9));
  /* the average is taken by integer division on purpose, as the total is an integer */
  return (ppt / pnum) * 0.8;
}


#endif


/* Analyze text with MeCab and add the extracted words to a unique word map.
   `text' specifies a sentence whose encoding is EUC-JP.
   `uwords' specifies a unique word map.  Words are stored in UTF-8 and in lower case as for
   ASCII letters.
   The return value is the total points of extracted words.  It is 0 if the MeCab support is
   not compiled in, if MeCab cannot be started, or if the analysis or the conversion of its
   result fails.
   Each output line of MeCab holds the surface form, the first feature and the second feature
   separated by tabs.  Consecutive nouns and unknown words are also registered as compound
   words.  The MeCab handle and the stop word map are created on the first call and kept in
   static variables, so this function shares state between calls. */
static int est_mecab_analyze(const char *text, CBMAP *uwords){
#if defined(_MYMECAB)
  static mecab_t *mecab = NULL;
  static CBMAP *swmap = NULL;
  static char *args[] = {
    "mecab", "--node-format=%m\\t%f[0]\\t%f[1]\\n", "--unk-format=%m\\t\\t\\n", NULL
  };
  static char *stops[] = {
    "a", "an", "the", "there", "any", "no", "each", "all", "most", "more",
    "i", "my", "me", "mine", "you", "your", "yours", "we", "our", "us", "ours",
    "he", "his", "him", "she", "her", "hers", "it", "its", "they", "their", "them", "theirs",
    "this", "these", "that", "those",
    "what", "who", "which", "where", "when", "why", "whose", "how",
    "or", "and", "but", "not", "as", "if", "so", "because", "as", "unless", "until", "while",
    "to", "on", "of", "in", "for", "from", "by", "with", "at", "against", "via", "under", "over",
    "be", "am", "are", "is", "was", "were", "being", "wanna", "gonna", "gotta", "going",
    "do", "does", "did", "have", "has", "had", "get", "got", "go", "went", "come", "came",
    "will", "would", "ll", "ve", "can", "could", "may", "might", "shall", "should", "must",
    NULL
  };
  CBDATUM *prefix;
  const char *cres, *sign, *info;
  const unsigned char *uword;
  char *res, *line, *next, *word, *cp;
  int i, all, ppt, pnum, wsiz, pt, iseos;
  assert(text && uwords);
  if(!mecab){
    if(!(mecab = mecab_new(sizeof(args) / sizeof(char *) - 1, args))) return 0;
    /* The handle is never destroyed on purpose.  The original author disabled
       `cbglobalgc(mecab, (void (*)(void *))mecab_destroy)', suspecting a bug of MeCab. */
    swmap = cbmapopenex(sizeof(stops) / sizeof(char *) * 2 + 1);
    cbglobalgc(swmap, (void (*)(void *))cbmapclose);
    for(i = 0; stops[i]; i++){
      cbmapput(swmap, stops[i], -1, "", 0, FALSE);
    }
  }
  if(!(cres = mecab_sparse_tostr(mecab, (char *)text))) return 0;
  if(!(res = est_iconv(cres, -1, "EUC-JP", "UTF-8", NULL, NULL))) return 0;
  all = 0;
  prefix = cbdatumopen(NULL, -1);
  pnum = 0;
  ppt = 0;
  /* `next' is advanced only when a line feed was found, so that no arithmetic is ever done on
     a null pointer at the last line */
  for(line = res; line != NULL; line = next){
    if((next = strchr(line, '\n')) != NULL) *(next++) = '\0';
    word = line;
    iseos = FALSE;
    if((cp = strchr(word, '\t')) != NULL){
      *(cp++) = '\0';
      sign = cp;
      if((cp = strchr(cp, '\t')) != NULL){
        *(cp++) = '\0';
        info = cp;
      } else {
        info = "";
      }
    } else {
      sign = "";
      info = "";
      /* Every node is printed with two tabs by both formats above, so a line "EOS" with no tab
         can only be the end-of-sentence marker which MeCab prints by default because no
         `--eos-format' is given.  It must not be indexed as the word "eos". */
      iseos = !strcmp(word, "EOS");
    }
    if(iseos){
      cbdatumsetsize(prefix, 0);
      pnum = 0;
      ppt = 0;
      continue;
    }
    /* normalize ASCII upper case letters; bytes of multibyte characters are not affected */
    for(cp = word; *cp != '\0'; cp++){
      if(*cp >= 'A' && *cp <= 'Z') *cp += 'a' - 'A';
    }
    wsiz = cp - word;
    if(wsiz < 1){
      cbdatumsetsize(prefix, 0);
      pnum = 0;
      ppt = 0;
      continue;
    }
    uword = (const unsigned char *)word;
    pt = est_mecab_word_score(word, wsiz, sign, info, swmap);
    est_uwords_add(uwords, word, wsiz, pt);
    all += pt;
    if(!strcmp(sign, "名詞")){
      /* noun: extend the pending compound word or begin a new one */
      ppt += pt;
      pnum++;
      if(cbdatumsize(prefix) > 0){
        all += est_mecab_join_compound(uwords, prefix, word, wsiz, pt, ppt, pnum);
      } else {
        cbdatumcat(prefix, word, wsiz);
      }
    } else if(sign[0] == '\0' && (cbdatumsize(prefix) < 1 || uword[0] >= 0xc0)){
      /* unknown word: it counts for 80 percent, and it extends a pending compound word only if
         it begins with a multibyte character */
      ppt += pt * 0.8;
      pnum++;
      if(cbdatumsize(prefix) > 0){
        all += est_mecab_join_compound(uwords, prefix, word, wsiz, pt, ppt, pnum);
      } else if(wsiz > 1 || uword[0] >= 0x80 || (word[0] >= '0' && word[0] <= '9') ||
                (word[0] >= 'a' && word[0] <= 'z')){
        /* upper case letters need no test because the word is in lower case already */
        cbdatumcat(prefix, word, wsiz);
      } else {
        ppt = 0;
        pnum = 0;
      }
    } else {
      /* any other part of speech ends the pending compound word */
      cbdatumsetsize(prefix, 0);
      pnum = 0;
      ppt = 0;
    }
  }
  cbdatumclose(prefix);
  free(res);
  return all;
#else
  return 0;
#endif
}



/* END OF FILE */