", """, "\"", "'", "'", /* ISO-8859-1 */ " ", "\xc2\xa0", "¡", "\xc2\xa1", "¢", "\xc2\xa2", "£", "\xc2\xa3", "¤", "\xc2\xa4", "¥", "\xc2\xa5", "¦", "\xc2\xa6", "§", "\xc2\xa7", "¨", "\xc2\xa8", "©", "\xc2\xa9", "ª", "\xc2\xaa", "«", "\xc2\xab", "¬", "\xc2\xac", "", "\xc2\xad", "®", "\xc2\xae", "¯", "\xc2\xaf", "°", "\xc2\xb0", "±", "\xc2\xb1", "²", "\xc2\xb2", "³", "\xc2\xb3", "´", "\xc2\xb4", "µ", "\xc2\xb5", "¶", "\xc2\xb6", "·", "\xc2\xb7", "¸", "\xc2\xb8", "¹", "\xc2\xb9", "º", "\xc2\xba", "»", "\xc2\xbb", "¼", "\xc2\xbc", "½", "\xc2\xbd", "¾", "\xc2\xbe", "¿", "\xc2\xbf", "À", "\xc3\x80", "Á", "\xc3\x81", "Â", "\xc3\x82", "Ã", "\xc3\x83", "Ä", "\xc3\x84", "Å", "\xc3\x85", "Æ", "\xc3\x86", "Ç", "\xc3\x87", "È", "\xc3\x88", "É", "\xc3\x89", "Ê", "\xc3\x8a", "Ë", "\xc3\x8b", "Ì", "\xc3\x8c", "Í", "\xc3\x8d", "Î", "\xc3\x8e", "Ï", "\xc3\x8f", "Ð", "\xc3\x90", "Ñ", "\xc3\x91", "Ò", "\xc3\x92", "Ó", "\xc3\x93", "Ô", "\xc3\x94", "Õ", "\xc3\x95", "Ö", "\xc3\x96", "×", "\xc3\x97", "Ø", "\xc3\x98", "Ù", "\xc3\x99", "Ú", "\xc3\x9a", "Û", "\xc3\x9b", "Ü", "\xc3\x9c", "Ý", "\xc3\x9d", "Þ", "\xc3\x9e", "ß", "\xc3\x9f", "à", "\xc3\xa0", "á", "\xc3\xa1", "â", "\xc3\xa2", "ã", "\xc3\xa3", "ä", "\xc3\xa4", "å", "\xc3\xa5", "æ", "\xc3\xa6", "ç", "\xc3\xa7", "è", "\xc3\xa8", "é", "\xc3\xa9", "ê", "\xc3\xaa", "ë", "\xc3\xab", "ì", "\xc3\xac", "í", "\xc3\xad", "î", "\xc3\xae", "ï", "\xc3\xaf", "ð", "\xc3\xb0", "ñ", "\xc3\xb1", "ò", "\xc3\xb2", "ó", "\xc3\xb3", "ô", "\xc3\xb4", "õ", "\xc3\xb5", "ö", "\xc3\xb6", "÷", "\xc3\xb7", "ø", "\xc3\xb8", "ù", "\xc3\xb9", "ú", "\xc3\xba", "û", "\xc3\xbb", "ü", "\xc3\xbc", "ý", "\xc3\xbd", "þ", "\xc3\xbe", "ÿ", "\xc3\xbf", /* ISO-10646 */ "ƒ", "\xc6\x92", "Α", "\xce\x91", "Β", "\xce\x92", "Γ", "\xce\x93", "Δ", "\xce\x94", "Ε", "\xce\x95", "Ζ", "\xce\x96", "Η", "\xce\x97", "Θ", "\xce\x98", "Ι", "\xce\x99", "Κ", "\xce\x9a", "Λ", "\xce\x9b", "Μ", "\xce\x9c", "Ν", "\xce\x9d", "Ξ", "\xce\x9e", "Ο", "\xce\x9f", "Π", "\xce\xa0", "Ρ", "\xce\xa1", "Σ", "\xce\xa3", "Τ", "\xce\xa4", "Υ", "\xce\xa5", "Φ", "\xce\xa6", "Χ", "\xce\xa7", "Ψ", "\xce\xa8", "Ω", "\xce\xa9", "α", "\xce\xb1", "β", "\xce\xb2", "γ", "\xce\xb3", "δ", "\xce\xb4", "ε", "\xce\xb5", "ζ", "\xce\xb6", "η", "\xce\xb7", "θ", "\xce\xb8", "ι", "\xce\xb9", "κ", "\xce\xba", "λ", "\xce\xbb", "μ", "\xce\xbc", "ν", "\xce\xbd", "ξ", "\xce\xbe", "ο", "\xce\xbf", "π", "\xcf\x80", "ρ", "\xcf\x81", "ς", "\xcf\x82", "σ", "\xcf\x83", "τ", "\xcf\x84", "υ", "\xcf\x85", "φ", "\xcf\x86", "χ", "\xcf\x87", "ψ", "\xcf\x88", "ω", "\xcf\x89", "ϑ", "\xcf\x91", "ϒ", "\xcf\x92", "ϖ", "\xcf\x96", "•", "\xe2\x80\xa2", "…", "\xe2\x80\xa6", "′", "\xe2\x80\xb2", "″", "\xe2\x80\xb3", "‾", "\xe2\x80\xbe", "⁄", "\xe2\x81\x84", "℘", "\xe2\x84\x98", "ℑ", "\xe2\x84\x91", "ℜ", "\xe2\x84\x9c", "™", "\xe2\x84\xa2", "ℵ", "\xe2\x84\xb5", "←", "\xe2\x86\x90", "↑", "\xe2\x86\x91", "→", "\xe2\x86\x92", "↓", "\xe2\x86\x93", "↔", "\xe2\x86\x94", "↵", "\xe2\x86\xb5", "⇐", "\xe2\x87\x90", "⇑", "\xe2\x87\x91", "⇒", "\xe2\x87\x92", "⇓", "\xe2\x87\x93", "⇔", "\xe2\x87\x94", "∀", "\xe2\x88\x80", "∂", "\xe2\x88\x82", "∃", "\xe2\x88\x83", "∅", "\xe2\x88\x85", "∇", "\xe2\x88\x87", "∈", "\xe2\x88\x88", "∉", "\xe2\x88\x89", "∋", "\xe2\x88\x8b", "∏", "\xe2\x88\x8f", "∑", "\xe2\x88\x91", "−", "\xe2\x88\x92", "∗", "\xe2\x88\x97", "√", "\xe2\x88\x9a", "∝", "\xe2\x88\x9d", "∞", "\xe2\x88\x9e", "∠", "\xe2\x88\xa0", "∧", "\xe2\x88\xa7", "∨", "\xe2\x88\xa8", "∩", "\xe2\x88\xa9", "∪", "\xe2\x88\xaa", "∫", "\xe2\x88\xab", "∴", "\xe2\x88\xb4", "∼", "\xe2\x88\xbc", "≅", "\xe2\x89\x85", "≈", "\xe2\x89\x88", "≠", "\xe2\x89\xa0", "≡", "\xe2\x89\xa1", "≤", "\xe2\x89\xa4", "≥", "\xe2\x89\xa5", "⊂", "\xe2\x8a\x82", "⊃", "\xe2\x8a\x83", "⊄", "\xe2\x8a\x84", "⊆", "\xe2\x8a\x86", "⊇", "\xe2\x8a\x87", "⊕", "\xe2\x8a\x95", "⊗", "\xe2\x8a\x97", "⊥", "\xe2\x8a\xa5", "⋅", "\xe2\x8b\x85", "⌈", "\xe2\x8c\x88", "⌉", "\xe2\x8c\x89", "⌊", "\xe2\x8c\x8a", "⌋", "\xe2\x8c\x8b", "〈", "\xe2\x8c\xa9", "〉", "\xe2\x8c\xaa", "◊", "\xe2\x97\x8a", "♠", "\xe2\x99\xa0", "♣", "\xe2\x99\xa3", "♥", "\xe2\x99\xa5", "♦", "\xe2\x99\xa6", "Œ", "\xc5\x92", "œ", "\xc5\x93", "Š", "\xc5\xa0", "š", "\xc5\xa1", "Ÿ", "\xc5\xb8", "ˆ", "\xcb\x86", "˜", "\xcb\x9c", " ", "\xe2\x80\x82", " ", "\xe2\x80\x83", " ", "\xe2\x80\x89", "", "\xe2\x80\x8c", "", "\xe2\x80\x8d", "", "\xe2\x80\x8e", "", "\xe2\x80\x8f", "–", "\xe2\x80\x93", "—", "\xe2\x80\x94", "‘", "\xe2\x80\x98", "’", "\xe2\x80\x99", "‚", "\xe2\x80\x9a", "“", "\xe2\x80\x9c", "”", "\xe2\x80\x9d", "„", "\xe2\x80\x9e", "†", "\xe2\x80\xa0", "‡", "\xe2\x80\xa1", "‰", "\xe2\x80\xb0", "‹", "\xe2\x80\xb9", "›", "\xe2\x80\xba", "€", "\xe2\x82\xac", NULL }; char *raw, *wp, buf[2], *tmp; int i, j, hit, num, tsiz; assert(html); CB_MALLOC(raw, strlen(html) * 3 + 1); wp = raw; while(*html != '\0'){ if(*html == '&'){ if(*(html + 1) == '#'){ if(*(html + 2) == 'x' || *(html + 2) == 'X'){ num = strtol(html + 3, NULL, 16); } else { num = atoi(html + 2); } buf[0] = num / 256; buf[1] = num % 256; if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){ for(j = 0; j < tsiz; j++){ *wp = ((unsigned char *)tmp)[j]; wp++; } free(tmp); } while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){ html++; } if(*html == ';') html++; } else { hit = FALSE; for(i = 0; pairs[i] != NULL; i += 2){ if(cbstrfwmatch(html, pairs[i])){ wp += sprintf(wp, "%s", pairs[i+1]); html += strlen(pairs[i]); hit = TRUE; break; } } if(!hit){ *wp = *html; wp++; html++; } } } else { *wp = *html; wp++; html++; } } *wp = '\0'; return raw; } /* create a document object from MIME */ static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang, int bcheck){ ESTDOC *doc, *tdoc; CBMAP *attrs; const CBLIST *texts; CBLIST *parts, *lines; CBDATUM *datum; const char *key, *val, *bound, *part, *text, *line; char *body, *swap, numbuf[NUMBUFSIZ]; int i, j, bsiz, psiz, ssiz, mht; assert(buf && size >= 0); doc = est_doc_new(); attrs = cbmapopenex(MINIBNUM); body = cbmimebreak(buf, size, attrs, &bsiz); if((val = cbmapget(attrs, "subject", -1, NULL)) != NULL){ est_doc_add_attr_mime(doc, ESTDATTRTITLE, val); if((val = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) est_doc_add_hidden_text(doc, val); } if((val = cbmapget(attrs, "from", -1, NULL)) != NULL) est_doc_add_attr_mime(doc, ESTDATTRAUTHOR, val); if((val = cbmapget(attrs, "date", -1, NULL)) != NULL){ est_doc_add_attr_mime(doc, ESTDATTRCDATE, val); est_doc_add_attr_mime(doc, ESTDATTRMDATE, val); } est_doc_add_attr(doc, ESTDATTRTYPE, "message/rfc822"); sprintf(numbuf, "%d", size); est_doc_add_attr(doc, ESTDATTRSIZE, numbuf); cbmapiterinit(attrs); while((key = cbmapiternext(attrs, NULL)) != NULL){ if((key[0] >= 'A' && key[0] <= 'Z') || key[0] == '@' || key[0] == '_') continue; val = cbmapiterval(key, NULL); est_doc_add_attr_mime(doc, key, val); } if((key = cbmapget(attrs, "TYPE", -1, NULL)) != NULL && cbstrfwimatch(key, "multipart/")){ mht = cbstrfwimatch(key, "multipart/related"); if((bound = cbmapget(attrs, "BOUNDARY", -1, NULL)) != NULL){ parts = cbmimeparts(body, bsiz, bound); for(i = 0; i < CB_LISTNUM(parts) && i < 8; i++){ part = CB_LISTVAL2(parts, i, psiz); if((tdoc = est_doc_new_from_mime(part, psiz, penc, plang, bcheck)) != NULL){ if(mht){ if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL) est_doc_add_attr(doc, ESTDATTRTITLE, text); if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL) est_doc_add_attr(doc, ESTDATTRAUTHOR, text); } texts = est_doc_texts(tdoc); for(j = 0; j < CB_LISTNUM(texts); j++){ text = CB_LISTVAL(texts, j); est_doc_add_text(doc, text); } est_doc_delete(tdoc); } } CB_LISTCLOSE(parts); } } else { key = cbmapget(attrs, "content-transfer-encoding", -1, NULL); if(key && cbstrfwimatch(key, "base64")){ swap = cbbasedecode(body, &ssiz); free(body); body = swap; bsiz = ssiz; } else if(key && cbstrfwimatch(key, "quoted-printable")){ swap = cbquotedecode(body, &ssiz); free(body); body = swap; bsiz = ssiz; } key = cbmapget(attrs, "content-encoding", -1, NULL); if(key && (cbstrfwimatch(key, "x-gzip") || cbstrfwimatch(key, "gzip")) && (swap = cbgzdecode(body, bsiz, &ssiz)) != NULL){ free(body); body = swap; bsiz = ssiz; } else if(key && (cbstrfwimatch(key, "x-deflate") || cbstrfwimatch(key, "deflate")) && (swap = cbinflate(body, bsiz, &ssiz)) != NULL){ free(body); body = swap; bsiz = ssiz; } if(!(key = cbmapget(attrs, "TYPE", -1, NULL)) || cbstrfwimatch(key, "text/plain")){ if(!bcheck || !est_check_binary(body, bsiz)){ if(penc && (swap = est_iconv(body, bsiz, penc, "UTF-8", &ssiz, NULL)) != NULL){ free(body); body = swap; bsiz = ssiz; } else if((key = cbmapget(attrs, "CHARSET", -1, NULL)) != NULL && (swap = est_iconv(body, bsiz, key, "UTF-8", &ssiz, NULL)) != NULL){ free(body); body = swap; bsiz = ssiz; } lines = cbsplit(body, bsiz, "\n"); CB_DATUMOPEN(datum); for(i = 0; i < CB_LISTNUM(lines); i++){ line = CB_LISTVAL(lines, i); while(*line == ' ' || *line == '>' || *line == '|' || *line == '\t' || *line == '\r'){ line++; } if(line[0] == '\0'){ est_doc_add_text(doc, CB_DATUMPTR(datum)); CB_DATUMSETSIZE(datum, 0); } else { CB_DATUMCAT(datum, " ", 1); CB_DATUMCAT(datum, line, strlen(line)); } } est_doc_add_text(doc, CB_DATUMPTR(datum)); CB_DATUMCLOSE(datum); CB_LISTCLOSE(lines); } } else if(cbstrfwimatch(key, "text/html") || cbstrfwimatch(key, "application/xhtml+xml")){ if((tdoc = est_doc_new_from_html(body, bsiz, penc, plang, bcheck)) != NULL){ if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){ if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text); est_doc_add_text(doc, text); } if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){ if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text); est_doc_add_text(doc, text); } texts = est_doc_texts(tdoc); for(i = 0; i < CB_LISTNUM(texts); i++){ text = CB_LISTVAL(texts, i); est_doc_add_text(doc, text); } est_doc_delete(tdoc); } } else if(cbstrfwimatch(key, "message/rfc822")){ if((tdoc = est_doc_new_from_mime(body, bsiz, penc, plang, bcheck)) != NULL){ if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){ if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text); est_doc_add_text(doc, text); } if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){ if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text); est_doc_add_text(doc, text); } texts = est_doc_texts(tdoc); for(i = 0; i < CB_LISTNUM(texts); i++){ text = CB_LISTVAL(texts, i); est_doc_add_text(doc, text); } est_doc_delete(tdoc); } } else if(cbstrfwimatch(key, "text/")){ if((tdoc = est_doc_new_from_text(body, bsiz, penc, plang, bcheck)) != NULL){ texts = est_doc_texts(tdoc); for(i = 0; i < CB_LISTNUM(texts); i++){ text = CB_LISTVAL(texts, i); est_doc_add_text(doc, text); } est_doc_delete(tdoc); } } } free(body); cbmapclose(attrs); return doc; } /* set mime value as an attribute of a document */ static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value){ char enc[64], *ebuf, *rbuf; assert(doc && name && value); ebuf = cbmimedecode(value, enc); if((rbuf = est_iconv(ebuf, -1, enc, "UTF-8", NULL, NULL)) != NULL){ est_doc_add_attr(doc, name, rbuf); free(rbuf); } free(ebuf); } /* generate a document with random text */ static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode){ ESTDOC *doc; char *str; int i; doc = est_doc_new(); snum *= pow(est_random_nd() + 0.5, 3.0); if(mode == RD_RAND){ mode = est_random() * 100; if(mode < 20){ mode = RD_ENG; est_doc_add_attr(doc, "mode", "english"); } else if(mode < 40){ mode = RD_LAT; est_doc_add_attr(doc, "mode", "latin"); } else if(mode < 60){ mode = RD_EURO; est_doc_add_attr(doc, "mode", "euromix"); } else if(mode < 65){ mode = RD_ORI; est_doc_add_attr(doc, "mode", "oriental"); } else if(mode < 95){ mode = RD_JPN; est_doc_add_attr(doc, "mode", "japanese"); } else { mode = RD_CHAO; est_doc_add_attr(doc, "mode", "chaos"); } } switch(mode){ case RD_ENG: est_doc_add_attr(doc, "mode", "english"); break; case RD_LAT: est_doc_add_attr(doc, "mode", "latin"); break; case RD_ORI: est_doc_add_attr(doc, "mode", "oriental"); break; case RD_JPN: est_doc_add_attr(doc, "mode", "japanese"); break; case RD_EURO: est_doc_add_attr(doc, "mode", "euromix"); break; case RD_CHAO: est_doc_add_attr(doc, "mode", "chaos"); break; } for(i = 0; i <= snum; i++){ str = est_random_str(cnum, mode); if(est_random() < 0.05){ est_doc_add_hidden_text(doc, str); } else { est_doc_add_text(doc, str); } free(str); } return doc; } /* generate random string */ static char *est_random_str(int cnum, int mode){ const char echrs[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; CBDATUM *buf; char wc[2], *str; int i, c, wlen, dec, mm, big, n; CB_DATUMOPEN(buf); cnum *= pow(est_random_nd() + 0.5, 3.0); wlen = est_random_nd() * 8 + 4; dec = (int)(est_random() * INT_MAX) % 10; big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100; for(i = 0; i < cnum; i++){ switch(mode){ case RD_ENG: case RD_LAT: case RD_EURO: mm = (int)(est_random() * INT_MAX) % 100; if((mode == RD_LAT || mode == RD_EURO) && mm < 5){ c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0)); } else if(mode == RD_EURO && (mm < 30 || dec > 8)){ if(dec % 2 == 0){ c = 0x0391 + (int)(pow(est_random_nd(), 2.0) * (0x03d6 - 0x0391)); } else { c = 0x0400 + (int)(pow(est_random_nd(), 2.0) * (0x045f - 0x0400)); } } else if(mm < 95){ if((n = est_random_nd() * (sizeof(echrs) - 1)) == (sizeof(echrs) - 1)) n = 0; c = echrs[n]; } else { c = (int)(est_random() * ('@' - ' ')) + ' '; } if(--wlen < 1){ c = ' '; wlen = pow(est_random_nd(), 3.0) * 8 + 4; dec = (int)(est_random() * INT_MAX) % 10; } break; case RD_ORI: c = big + est_random_nd() * 0x100; if(--wlen < 1){ wlen = pow(est_random_nd(), 3.0) * 12 + 6; big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100; } break; case RD_JPN: if(dec < 4){ c = 0x3041 + pow(est_random_nd(), 3.0) * (0x3094 - 0x3041); } else if(dec < 7){ c = 0x30a1 + pow(est_random_nd(), 3.0) * (0x30fe - 0x30a1); } else if(dec < 9){ c = 0x4e00 + pow(est_random_nd(), 3.0) * (0x9faf - 0x4e00); } else { if(est_random() < 0.7){ c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0)); } else { c = 0x3041 + est_random() * (0xffef - 0x3041); } } if(--wlen < 1){ wlen = pow(est_random_nd(), 3.0) * 12 + 6; dec = (int)(est_random() * INT_MAX) % 10; } break; default: if(est_random() < 0.2){ c = 0x00a1 + (int)est_random() * (0x00ff - 0x00a0); } else { c = (int)(est_random() * 0x10000); } break; } if(c <= 0 || c >= 0x10000) c = 0x0020; wc[0] = c / 0x100; wc[1] = c % 0x100; CB_DATUMCAT(buf, wc, 2); } str = est_iconv(CB_DATUMPTR(buf), CB_DATUMSIZE(buf), "UTF-16BE", "UTF-8", NULL, NULL); CB_DATUMCLOSE(buf); return str; } /* compare two keywords by scores in descending order */ static int keysc_compare(const void *ap, const void *bp){ return ((KEYSC *)bp)->pt - ((KEYSC *)ap)->pt; } /* END OF FILE */