/* This file is part of libextractor. (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. libextractor is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with libextractor; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /** * Tool to build a bloomfilter from a dictionary. */ #include "platform.h" #include #include "bloomfilter.h" /** * Sets a bit active in the bitArray. Increment bit-specific * usage counter on disk only if below 4bit max (==15). 
*
 * @param bitArray memory area to set the bit in
 * @param bitIdx which bit to set
 */
static void setBit(unsigned char * bitArray,
		   unsigned int bitIdx) {
  unsigned int arraySlot;
  unsigned int targetBit;

  /* Only the in-memory bit is set here; this function does not touch
     any usage counter. */
  arraySlot = bitIdx / 8;            /* index of the byte holding the bit */
  targetBit = (1L << (bitIdx % 8));  /* mask selecting the bit inside that byte */
  bitArray[arraySlot] |= targetBit;
}

/**
 * Callback that sets one bit in the bit array of the given filter.
 * It is handed to iterateBits() by addToBloomfilter().
 *
 * @param bf the filter to manipulate
 * @param bit index of the bit to set in bf->bitArray
 * @param arg not used
 */
static void setBitCallback(Bloomfilter * bf,
			   unsigned int bit,
			   void * arg) {
  setBit(bf->bitArray,
	 bit);
}

/**
 * Add an element to the filter. A NULL filter is silently ignored.
 *
 * The work is delegated to iterateBits() with setBitCallback as the
 * callback. NOTE(review): iterateBits() is defined elsewhere;
 * presumably it invokes the callback once for each bit index derived
 * from the hash - confirm against its definition.
 *
 * @param bf the filter
 * @param e the element (its hash code)
 */
static void addToBloomfilter(Bloomfilter * bf,
			     const HashCode160 * e) {
  if (NULL == bf)
    return;
  iterateBits(bf,
	      &setBitCallback,
	      NULL,
	      e);
}

/* Value assigned to bf.addressesPerElement in main(). */
#define ADDR_PER_ELEMENT 46

/**
 * Entry point of the dictionary builder.
 *
 * Requires at least two arguments (argc >= 3); otherwise a usage message
 * is printed to stderr and the process exits with -1. The word list is
 * read from the file named argv[1] with ".txt" appended; if that file
 * cannot be opened, an error is printed and the process exits with -1.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments; argv[1] is the dictionary path
 *        without the ".txt" suffix
 */
int main(int argc,
	 char ** argv) {
  Bloomfilter bf;
  HashCode160 hc;
  int i;
  int j;
  int cnt;   /* number of words read so far */
  char * fn; /* heap copy of argv[1] with ".txt" appended */
  char ** words;
  /* NOTE(review): filled further down by fscanf with an unbounded "%s";
     a token longer than 2047 bytes overruns this buffer. */
  char line[2048];
  FILE *dictin;
  char * bn;
  char * charset = NULL;
  /* Capacity (in entries) of the words array allocated below.
     NOTE(review): the expansion is not parenthesised, so it is only safe
     in contexts where a bare 1024*1024 keeps the intended precedence. */
#define ALLOCSIZE 1024*1024

  if (argc<3) {
    fprintf(stderr,
	    _("Please provide the name of the language you are building\n"
	      "a dictionary for. For example:\n"));
    fprintf(stderr,
	    "$ ./dictionary-builder ./en en > en.c\n");
    exit(-1);
  }

  /* ".txt" plus the terminating NUL needs 5 extra bytes; 6 are reserved.
     NOTE(review): the malloc result is used without a NULL check. */
  fn = malloc(strlen(argv[1]) + 6);
  strcpy(fn, argv[1]);
  strcat(fn, ".txt");
  dictin=fopen(fn,"r");
  free(fn);
  if (dictin==NULL) {
    /* NOTE(review): the message shows argv[1], not the ".txt" path that
       was actually passed to fopen (fn has already been freed). */
    fprintf(stderr,
	    _("Error opening file `%s': %s\n"),
	    argv[1],strerror(errno));
    exit(-1);
  }

  /* Fixed-capacity table of word pointers, ALLOCSIZE entries. */
  words = malloc(sizeof(char*) * ALLOCSIZE); /* don't we LOVE constant size buffers?
*/ if (words == NULL) { fprintf(stderr, _("Error allocating: %s\n."), strerror(errno)); exit(-1); } cnt = 0; memset(&line[0], 0, 2048); fscanf(dictin, "%s", (char*)&line); charset = strdup(line); /* not used (yet) */ while (1 == fscanf(dictin, "%s", (char*)&line)) { words[cnt] = strdup(line); cnt++; memset(&line[0], 0, 2048); if (cnt >= ALLOCSIZE) { fprintf(stderr, _("Increase ALLOCSIZE (in %s).\n"), __FILE__); exit(-1); } } bf.addressesPerElement = ADDR_PER_ELEMENT; bf.bitArraySize = (1 + (cnt / SUBTABLES)) * sizeof(int) * SUBTABLES; bf.bitArray = malloc(bf.bitArraySize); memset(bf.bitArray, 0, bf.bitArraySize); for (i=0;i