/*************************************************************************
 * Bulgarian-English Dictionary
 * Copyright (C) 2000  Radostin Radnev
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "database.h"


//=== Class Database =====================================================
// Written by Radostin Radnev - radnev@yahoo.com
// $Id: database.cpp,v 1.9 2001/03/11 05:21:32 radnev Exp $
//
// This class represents the Database. It searches and reads data from a
// text file.
//
// The database file must be plain text, with '\0' as the delimiter
// between different records. This is convenient because C/C++ uses '\0'
// to terminate strings.
// The structure must be the following: the word must come first. It must
// be followed by the delimiter '\n'. ('\n' can also be used to divide the
// different meanings inside the translation data.) After the '\n' the
// translation data (result) starts, and it ends, of course, with '\0'.
// After that a new word starts. The data file must be sorted by the
// contents of the words. That means excluding spaces, dashes and other
// non-alpha chars and sorting the data without them. Also, the database
// cannot include two words with equal contents.
// That means "SECOND-HAND" and "SECOND HAND" are the same word, and it is
// "SECONDHAND". Also, the database must start and end with '\0'. In this
// way the search method does not need to check for the beginning and the
// end of the file.
//
// The method uses binary search to find a specified word. The problem is
// that the database does not have fixed-size records (positions of
// words), so each probe lands somewhere in the middle of the translation
// data. In this case the method reads data forward until it meets the
// data delimiter ('\0'). If it reaches the end position of the binary
// search, it then looks backward to ensure that there are no more words
// in the search interval. When it meets a data delimiter it compares the
// extracted word with the entered word and continues to search depending
// on the result, or stops if it has found the word. If the method does
// not find the word it returns the nearest similar word in the search
// interval.
//
// The forward search is sped up by using dataBuffer, but the backward
// search reads every byte separately. This is not a big problem because
// the method uses only the forward search in most cases.
//
// Currently engbul.dat and buleng.dat contain only " ", "-" and "'" as
// non-alpha chars in the word positions, so only these chars are deleted
// from the word contents. Also, all chars in the word positions in both
// databases are upper case, and the search method expects the search word
// to be upper case only.
//======================================================================== // Constant declaration const int Database::MAX_WORD_LEN = 50; const int Database::MAX_DATA_LEN = 7000; const char Database::WORD_SEPARATOR = '\n'; const char Database::DATA_SEPARATOR = '\0'; const char Database::NON_ALPHA_CHARS[] = "-' "; //=== Constructor ======================================================== // Allocate memory //======================================================================== Database::Database() { dataBuffer = new char[MAX_DATA_LEN]; wordBuffer = new char[MAX_WORD_LEN]; compBuffer = new char[MAX_WORD_LEN]; dataFile = NULL; } // End of Constructor //=== Destructor ========================================================= // Close Database and free memory //======================================================================== Database::~Database() { delete [] dataBuffer; delete [] wordBuffer; delete [] compBuffer; if (dataFile != NULL) { fclose(dataFile); } } // End of Destructor //=== Create Dictionary ================================================== // It is the real constructor of object // Open data file and set some variables // Return true if success, false if failed //======================================================================== bool Database::createDictionary(const char *fileName, const long fixedLastWordPointer) { // Ensure against invoking twice if (dataFile != NULL) { return false; } // Set buffers to zero length dataBuffer[0] = '\0'; wordBuffer[0] = '\0'; compBuffer[0] = '\0'; // Open DataFile dataFile = fopen(fileName, "r"); // Return false if failed if (dataFile == NULL) { return false; } // Set First Word Pointer firstWordPointer = 0; // Set Last Word Pointer if (fixedLastWordPointer > 0) { lastWordPointer = fixedLastWordPointer; } else { fseek(dataFile, -2L, SEEK_END); lastWordPointer = ftell(dataFile); do { lastWordPointer--; fseek(dataFile, lastWordPointer, SEEK_SET); fread(dataBuffer, 1, 1, dataFile); } while (dataBuffer[0] != 
DATA_SEPARATOR); } // Set Current and Next Word Pointer currentWordPointer = firstWordPointer; lastSearchWordPointer = firstWordPointer; // Read (load) data in buffers readData(); // Set data in compBuff onlyLetters(dataBuffer); return true; } // End of createDictionary //=== Go To First Word =================================================== // Point word pointer to first word and read (load) data in buffers //======================================================================== void Database::goToFirstWord() { currentWordPointer = firstWordPointer; readData(); } // End of goToFirstWord //=== Go To Last Word =+================================================== // Point word pointer to last word and read (load) data in buffers //======================================================================== void Database::goToLastWord() { currentWordPointer = lastWordPointer; readData(); } // End of goToLastWord //=== Find Word ========================================================== // Find specified word // Return true if found and false if does not found // Point pointer **result to the word (or nearest word) //======================================================================== bool Database::findWord(const char *word, char **result) { long b, e; // Begin and End position of search long m, rm; // Middle position for binary seacrh and remember middle variable int comp; // Here we store result of comparing two words int pos; // Position // Calculating the begin and the end of search area b = firstWordPointer; // Set first word pointer e = lastWordPointer; // Set last word pointer comp = strcmp(compBuffer, word); // Compare last searched word with new one if (comp < 0) { // We can narrow search in this way b = lastSearchWordPointer; // Set begin of the search area } // to the last search word pointer else if (comp > 0) { // Set end of the search area e = lastSearchWordPointer; // to the last search word pointer } // Ooops! 
good news else { // We have the same word for translation currentWordPointer = lastSearchWordPointer; // Set currentWordPointer readData(); // Read (load) data in buffers *result = dataBuffer; // Set *result pointer return true; // Stop execution and return true } // Search for word while (true) { // Main loop m = (b + e) / 2; // Get middle position of file rm = m; // Remember it for future use fseek(dataFile, m, SEEK_SET); // Set at middle fread(dataBuffer, 1, MAX_DATA_LEN, dataFile); // Read forward to meet new word pos = strlen(dataBuffer); // This is a offset to a new word m += pos; // Now we in a position of new word onlyLetters(dataBuffer + pos); // Get only letters of new word if (m == e) { // If current position = end pos comp = strcmp(compBuffer, word); // Compare word with entered word if (comp <= 0) { // If it equals or less that entered break; // we exit because found or not found } else { // Else we search for new word backward m = rm; // Restore middle position fseek(dataFile, m, SEEK_SET); // Set again at middle do { // Read backward to meet new word m--; fseek(dataFile, m, SEEK_SET); fread(dataBuffer, 1, 1, dataFile); } while (dataBuffer[0] != DATA_SEPARATOR); // Now we in a position of new word fread(dataBuffer, 1, MAX_WORD_LEN, dataFile); // Read data in buffer but only word onlyLetters(dataBuffer); // Get only letters of new word if (m == b) { // If current position = beginning pos comp = strcmp(compBuffer, word); // Compare word with entered word if (comp < 0) { // If greater than firts word we get last m = e; // get last word in search } break; // we exit because search ends } } } comp = strcmp(compBuffer, word); // Compare middle word with enetered if (comp < 0) { // If middle word less than eneterd b = m; // then beginning = middle } else if (comp > 0) { // If middle word greater than eneterd e = m; // then end = middle } else { // Else two words are equal break; // then we exit from loop } } lastSearchWordPointer = m; // Set 
lastSearchWordPointer currentWordPointer = m; // Set currentWordPointer readData(); // Read (load) data in buffers onlyLetters(dataBuffer); // Get only letters for the next search *result = dataBuffer; // Set *result pointer return (comp == 0); // Return found or not found } // End of findWord //=== Only Letters ======================================================= // Delete non letters chars from word // Store result in compBuffer //======================================================================== void Database::onlyLetters(const char *word) { int i = 0, j = 0; while (word[i] != WORD_SEPARATOR) { if (strchr(NON_ALPHA_CHARS, word[i]) == NULL) { compBuffer[j] = word[i]; j++; } i++; } compBuffer[j] = '\0'; } // End of onlyLetters //=== Read Data ========================================================== // Load data in buffers // Translational data in dataBuffer and word in wordBuffer //======================================================================== void Database::readData() { fseek(dataFile, currentWordPointer + 1, SEEK_SET); fread(dataBuffer, 1, MAX_DATA_LEN, dataFile); int i = 0; while (dataBuffer[i] != WORD_SEPARATOR) { wordBuffer[i] = dataBuffer[i]; i++; } wordBuffer[i] = '\0'; } // End of readData //=== Get Word =========================================================== // Gets word pointed by word pointer //======================================================================== char *Database::getWord() { return wordBuffer; } // End of getWord //=== Get Result ========================================================= // Gets result (translation data) pointed by word pointer //======================================================================== char *Database::getResult() { return dataBuffer; } // End of getResult //=== Go To Next Word ==================================================== // Search forward and point pointer to the next word in list // Return true if next word exist and false if word pointer is on last word 
//======================================================================== bool Database::goToNextWord() { bool ret = false; if (currentWordPointer < lastWordPointer) { ret = true; currentWordPointer += (strlen(dataBuffer) + 1); } readData(); return ret; } // End of goToNextWord //=== Get Next Random Word =============================================== // Gets the next random word // Move word pointer to the next random word and return word //======================================================================== char *Database::getNextRandomWord() { long pos = firstWordPointer + (long)((((double)lastWordPointer) * rand()) / (RAND_MAX + (double)firstWordPointer)); if (pos < firstWordPointer) { pos = firstWordPointer; } if (pos > lastWordPointer) { pos = lastWordPointer; } fseek(dataFile, pos, SEEK_SET); fread(dataBuffer, 1, MAX_DATA_LEN, dataFile); currentWordPointer = pos + strlen(dataBuffer); readData(); return wordBuffer; } // End of getNextRandomWord